diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index dff4483cc5505..70c3f9b0c3c83 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -69,12 +69,11 @@ jobs: # In order to get diff files with: fetch-depth: 0 - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -137,12 +136,11 @@ jobs: # In order to get diff files with: fetch-depth: 0 - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 832826333f090..9dc9d85520c2c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -76,26 +76,44 @@ jobs: id: set-outputs run: | if [ -z "${{ inputs.jobs }}" ]; then - pyspark=true; sparkr=true; tpcds=true; docker=true; pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` pyspark=`./dev/is-changed.py -m $pyspark_modules` - sparkr=`./dev/is-changed.py -m sparkr` - tpcds=`./dev/is-changed.py -m sql` - docker=`./dev/is-changed.py -m docker-integration-tests` - # 'build' and 'maven-build' are always true for now. - # It does not save significant time and most of PRs trigger the build. 
+ if [[ "${{ github.repository }}" != 'apache/spark' ]]; then + pandas=$pyspark + yarn=`./dev/is-changed.py -m yarn` + kubernetes=`./dev/is-changed.py -m kubernetes` + sparkr=`./dev/is-changed.py -m sparkr` + tpcds=`./dev/is-changed.py -m sql` + docker=`./dev/is-changed.py -m docker-integration-tests` + buf=true + ui=true + docs=true + else + pandas=false + yarn=false + kubernetes=false + sparkr=false + tpcds=false + docker=false + buf=false + ui=false + docs=false + fi + build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,protobuf,yarn,connect,sql,hive"` precondition=" { - \"build\": \"true\", + \"build\": \"$build\", \"pyspark\": \"$pyspark\", + \"pyspark-pandas\": \"$pandas\", \"sparkr\": \"$sparkr\", \"tpcds-1g\": \"$tpcds\", \"docker-integration-tests\": \"$docker\", - \"maven-build\": \"true\", \"lint\" : \"true\", - \"k8s-integration-tests\" : \"true\", - \"buf\" : \"true\", - \"ui\" : \"true\", + \"docs\" : \"$docs\", + \"yarn\" : \"$yarn\", + \"k8s-integration-tests\" : \"$kubernetes\", + \"buf\" : \"$buf\", + \"ui\" : \"$ui\", }" echo $precondition # For debugging # Remove `\n` to avoid "Invalid format" error @@ -123,7 +141,7 @@ jobs: needs: precondition if: fromJson(needs.precondition.outputs.required).build == 'true' runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 strategy: fail-fast: false matrix: @@ -145,9 +163,8 @@ jobs: mllib-local, mllib, graphx - >- streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, - kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf - - >- - yarn, connect + kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect + - yarn # Here, we split Hive and SQL tests into some of slow ones and the rest of them. included-tags: [""] excluded-tags: [""] @@ -185,14 +202,22 @@ jobs: hive: hive2.3 excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest comment: "- other tests" + exclude: + # Always run if yarn == 'true', even infra-image is skip (such as non-master job) + # In practice, the build will run in individual PR, but not against the individual commit + # in Apache Spark repository. + - modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }} env: MODULES_TO_TEST: ${{ matrix.modules }} EXCLUDED_TAGS: ${{ matrix.excluded-tags }} INCLUDED_TAGS: ${{ matrix.included-tags }} HADOOP_PROFILE: ${{ matrix.hadoop }} HIVE_PROFILE: ${{ matrix.hive }} + # GitHub Actions' default miniconda to use in pip packaging test. + CONDA_PREFIX: /usr/share/miniconda GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost + NOLINT_ON_COMPILE: true SKIP_UNIDOC: true SKIP_MIMA: true SKIP_PACKAGING: true @@ -212,12 +237,11 @@ jobs: git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -334,7 +358,7 @@ jobs: if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true' name: "Build modules: ${{ matrix.modules }}" runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 container: image: ${{ needs.precondition.outputs.image_url }} strategy: @@ -349,12 +373,12 @@ jobs: pyspark-core, pyspark-errors, pyspark-streaming - >- pyspark-mllib, pyspark-ml, pyspark-ml-connect + - >- + pyspark-connect - >- pyspark-pandas - >- pyspark-pandas-slow - - >- - pyspark-connect - >- pyspark-pandas-connect-part0 - >- @@ -363,11 +387,23 @@ jobs: pyspark-pandas-connect-part2 - >- pyspark-pandas-connect-part3 + exclude: + # Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job) + # In practice, the build will run in individual PR, but not against the individual commit + # in Apache Spark repository. + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }} + - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} env: MODULES_TO_TEST: ${{ matrix.modules }} - PYTHON_TO_TEST: 'python3.9' + PYTHON_TO_TEST: 'python3.11' HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 + # GitHub Actions' default miniconda to use in pip packaging test. + CONDA_PREFIX: /usr/share/miniconda GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_UNIDOC: true @@ -394,12 +430,11 @@ jobs: git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -432,19 +467,12 @@ jobs: echo $py $py -m pip list done - - name: Install Conda for pip packaging test - if: contains(matrix.modules, 'pyspark-errors') - run: | - curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - rm miniconda.sh # Run the tests. - name: Run tests env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then - export PATH=$PATH:$HOME/miniconda/bin export SKIP_PACKAGING=false echo "Python Packaging Tests Enabled!" 
fi @@ -482,7 +510,7 @@ jobs: if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true' name: "Build modules: sparkr" runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 container: image: ${{ needs.precondition.outputs.image_url }} env: @@ -512,12 +540,11 @@ jobs: git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -598,17 +625,18 @@ jobs: - name: Python CodeGen check run: ./dev/connect-check-protos.py - # Static analysis, and documentation build + # Static analysis lint: needs: [precondition, infra-image] # always run if lint == 'true', even infra-image is skip (such as non-master job) if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' - name: Linters, licenses, dependencies and documentation generation + name: Linters, licenses, and dependencies runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 env: LC_ALL: C.UTF-8 LANG: C.UTF-8 + NOLINT_ON_COMPILE: false PYSPARK_DRIVER_PYTHON: python3.9 PYSPARK_PYTHON: python3.9 GITHUB_PREV_SHA: ${{ github.event.before }} @@ -632,12 +660,11 @@ jobs: git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -741,7 +768,90 @@ jobs: Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" - name: Install R linter dependencies and SparkR run: ./R/install-dev.sh - # Should delete this section after SPARK 3.5 EOL. 
+ - name: R linter + run: ./dev/lint-r + + # Documentation build + docs: + needs: [precondition, infra-image] + # always run if lint == 'true', even infra-image is skip (such as non-master job) + if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' + name: Documentation generation + runs-on: ubuntu-latest + timeout-minutes: 180 + env: + LC_ALL: C.UTF-8 + LANG: C.UTF-8 + NOLINT_ON_COMPILE: false + PYSPARK_DRIVER_PYTHON: python3.9 + PYSPARK_PYTHON: python3.9 + GITHUB_PREV_SHA: ${{ github.event.before }} + container: + image: ${{ needs.precondition.outputs.image_url }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + repository: apache/spark + ref: ${{ inputs.branch }} + - name: Add GITHUB_WORKSPACE to git trust safe.directory + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Sync the current branch with the latest in Apache Spark + if: github.repository != 'apache/spark' + run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty + # Cache local repositories. Note that GitHub Actions cache has a 10G limit. + - name: Cache SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + docs-coursier- + - name: Cache Maven local repository + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: docs-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + docs-maven- + - name: Free up disk space + run: | + if [ -f ./dev/free_disk_space_container ]; then + ./dev/free_disk_space_container + fi + - name: Install Java ${{ inputs.java }} + uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: ${{ inputs.java }} + - name: Install Python dependencies for python linter and documentation generation + if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' + run: | + # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 + # See 'ipython_genutils' in SPARK-38517 + # See 'docutils<0.18.0' in SPARK-39421 + python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ + ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ + 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ + 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ + 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' + python3.9 -m pip list - name: Install dependencies for documentation generation for branch-3.4, branch-3.5 if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' 
run: | @@ -759,13 +869,16 @@ jobs: python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 - name: Install dependencies for documentation generation run: | + # Keep the version of Bundler here in sync with the following locations: + # - dev/create-release/spark-rm/Dockerfile + # - docs/README.md gem install bundler -v 2.4.22 cd docs bundle install - - name: R linter - run: ./dev/lint-r - name: Run documentation build run: | + # We need this link because the jekyll build calls `python`. + ln -s "$(which python3.9)" "/usr/local/bin/python" # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. cd docs; SKIP_API=1 bundle exec jekyll build; cd .. @@ -788,67 +901,6 @@ jobs: path: site.tar.bz2 retention-days: 1 - maven-build: - needs: precondition - if: fromJson(needs.precondition.outputs.required).maven-build == 'true' - name: Java ${{ matrix.java }} build with Maven (${{ matrix.os }}) - strategy: - fail-fast: false - matrix: - include: - - java: 17 - os: ubuntu-latest - - java: 21 - os: ubuntu-latest - - java: 21 - os: macos-14 - runs-on: ${{ matrix.os }} - timeout-minutes: 300 - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache Scala, SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Maven local repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - java${{ matrix.java }}-maven- - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ matrix.java }} - - name: Build with Maven - run: | - export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" - export MAVEN_CLI_OPTS="--no-transfer-progress" - export JAVA_VERSION=${{ matrix.java }} - # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414. 
- ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install - rm -rf ~/.m2/repository/org/apache/spark - # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well tpcds-1g: needs: precondition @@ -856,7 +908,7 @@ jobs: name: Run TPC-DS queries with SF=1 # Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation runs-on: ubuntu-20.04 - timeout-minutes: 300 + timeout-minutes: 180 env: SPARK_LOCAL_IP: localhost steps: @@ -872,12 +924,11 @@ jobs: git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -958,13 +1009,12 @@ jobs: if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' name: Run Docker integration tests runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 env: HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost - ORACLE_DOCKER_IMAGE_NAME: gvenzl/oracle-free:23.3 SKIP_UNIDOC: true SKIP_MIMA: true SKIP_PACKAGING: true @@ -982,12 +1032,11 @@ jobs: git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -1027,7 +1076,7 @@ jobs: if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' name: Run Spark on Kubernetes Integration test runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 steps: - name: Checkout Spark repository uses: actions/checkout@v4 @@ -1042,12 +1091,11 @@ jobs: git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -1086,6 +1134,11 @@ jobs: minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & kubectl create clusterrolebinding serviceaccounts-cluster-admin 
--clusterrole=cluster-admin --group=system:serviceaccounts || true kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true + if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then + kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true + else + kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true + fi eval $(minikube docker-env) build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" - name: Upload Spark on K8S integration tests log files @@ -1100,7 +1153,7 @@ jobs: if: fromJson(needs.precondition.outputs.required).ui == 'true' name: Run Spark UI tests runs-on: ubuntu-latest - timeout-minutes: 300 + timeout-minutes: 180 steps: - uses: actions/checkout@v4 - name: Use Node.js diff --git a/.github/workflows/build_branch34.yml b/.github/workflows/build_branch34.yml index deb43d82c9791..deb6c42407970 100644 --- a/.github/workflows/build_branch34.yml +++ b/.github/workflows/build_branch34.yml @@ -43,9 +43,9 @@ jobs: jobs: >- { "build": "true", - "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", "docker-integration-tests": "true", + "k8s-integration-tests": "true", "lint" : "true" } diff --git a/.github/workflows/cancel_duplicate_workflow_runs.yml b/.github/workflows/build_branch34_python.yml similarity index 58% rename from .github/workflows/cancel_duplicate_workflow_runs.yml rename to .github/workflows/build_branch34_python.yml index d41ca31190d94..c109ba2dc7922 100644 --- a/.github/workflows/cancel_duplicate_workflow_runs.yml +++ b/.github/workflows/build_branch34_python.yml @@ -17,22 +17,29 @@ # under the License. 
# -name: Cancelling Duplicates +name: "Build / Python-only (branch-3.4)" + on: - workflow_run: - workflows: - - 'Build' - types: ['requested'] + schedule: + - cron: '0 9 * * *' jobs: - cancel-duplicate-workflow-runs: - name: "Cancel duplicate workflow runs" - runs-on: ubuntu-latest - steps: - - uses: potiuk/cancel-workflow-runs@4723494a065d162f8e9efd071b98e0126e00f866 # @master - name: "Cancel duplicate workflow runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - sourceRunId: ${{ github.event.workflow_run.id }} - skipEventTypes: '["push", "schedule"]' + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: branch-3.4 + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_branch35.yml b/.github/workflows/build_branch35.yml index 9e6fe13c020e4..2ec080d5722c1 100644 --- a/.github/workflows/build_branch35.yml +++ b/.github/workflows/build_branch35.yml @@ -43,9 +43,9 @@ jobs: jobs: >- { "build": "true", - "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", "docker-integration-tests": "true", + "k8s-integration-tests": "true", "lint" : "true" } diff --git a/.github/workflows/build_branch35_python.yml b/.github/workflows/build_branch35_python.yml new file mode 100644 index 0000000000000..1585534d33ba9 --- /dev/null +++ b/.github/workflows/build_branch35_python.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build / Python-only (branch-3.5)" + +on: + schedule: + - cron: '0 11 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: branch-3.5 + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_java21.yml b/.github/workflows/build_java21.yml index b1ef5a3218356..871e1a9c07ef0 100644 --- a/.github/workflows/build_java21.yml +++ b/.github/workflows/build_java21.yml @@ -46,5 +46,9 @@ jobs: "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", - "docker-integration-tests": "true" + "docker-integration-tests": "true", + "yarn": "true", + "k8s-integration-tests": "true", + "buf": "true", + "ui": "true" } diff --git a/.github/workflows/build_maven_java21_macos14.yml b/.github/workflows/build_maven_java21_macos14.yml index 70b47fcecb260..fb5e609f4eae0 100644 --- a/.github/workflows/build_maven_java21_macos14.yml +++ b/.github/workflows/build_maven_java21_macos14.yml @@ -21,7 +21,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, macos-14)" on: schedule: - - cron: '0 20 * * *' + - cron: '0 20 */2 * *' jobs: run-build: diff --git a/.github/workflows/build_ansi.yml b/.github/workflows/build_non_ansi.yml similarity index 84% rename from .github/workflows/build_ansi.yml rename to .github/workflows/build_non_ansi.yml index d9f587ae203bb..4ac2a589f4f81 100644 --- a/.github/workflows/build_ansi.yml +++ b/.github/workflows/build_non_ansi.yml @@ -17,11 +17,11 @@ # under the License. # -name: "Build / ANSI (master, Hadoop 3, JDK 17, Scala 2.13)" +name: "Build / Non-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)" on: schedule: - - cron: '0 1,13 * * *' + - cron: '0 1 * * *' jobs: run-build: @@ -36,13 +36,15 @@ jobs: hadoop: hadoop3 envs: >- { - "SPARK_ANSI_SQL_MODE": "true", + "SPARK_ANSI_SQL_MODE": "false", } jobs: >- { "build": "true", + "docs": "true", "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", - "docker-integration-tests": "true" + "docker-integration-tests": "true", + "yarn": "true" } diff --git a/.github/workflows/build_python_3.10.yml b/.github/workflows/build_python_3.10.yml new file mode 100644 index 0000000000000..5ae37fbc9120e --- /dev/null +++ b/.github/workflows/build_python_3.10.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build / Python-only (master, Python 3.10)" + +on: + schedule: + - cron: '0 17 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "python3.10" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_python_3.12.yml b/.github/workflows/build_python_3.12.yml new file mode 100644 index 0000000000000..e1fd45a7d8838 --- /dev/null +++ b/.github/workflows/build_python_3.12.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / Python-only (master, Python 3.12)" + +on: + schedule: + - cron: '0 19 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "python3.12" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_python_connect.yml b/.github/workflows/build_python_connect.yml index 965e839b6b2bc..01d9d272d4366 100644 --- a/.github/workflows/build_python_connect.yml +++ b/.github/workflows/build_python_connect.yml @@ -33,12 +33,11 @@ jobs: steps: - name: Checkout Spark repository uses: actions/checkout@v4 - - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -63,7 +62,7 @@ jobs: architecture: x64 - name: Build Spark run: | - ./build/sbt -Phive test:package + ./build/sbt -Phive Test/package - name: Install pure Python package (pyspark-connect) env: SPARK_TESTING: 1 @@ -71,8 +70,8 @@ jobs: cd python python packaging/connect/setup.py sdist cd dist - pip install pyspark-connect-*.tar.gz - pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' torch torchvision torcheval deepspeed unittest-xml-reporting + pip install pyspark*connect-*.tar.gz + pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' torch torchvision torcheval deepspeed unittest-xml-reporting - name: Run tests env: SPARK_TESTING: 1 @@ -81,26 +80,46 @@ jobs: # Make less noisy cp conf/log4j2.properties.template conf/log4j2.properties sed -i 's/rootLogger.level = 
info/rootLogger.level = warn/g' conf/log4j2.properties - # Start a Spark Connect server - PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" --jars `find connector/connect/server/target -name spark-connect*SNAPSHOT.jar` - # Make sure running Python workers that contains pyspark.core once. They will be reused. - python -c "from pyspark.sql import SparkSession; _ = SparkSession.builder.remote('sc://localhost').getOrCreate().range(100).repartition(100).mapInPandas(lambda x: x, 'id INT').collect()" + + # Start a Spark Connect server for local + PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ + --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ + --jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" + + # Remove Py4J and PySpark zipped library to make sure there is no JVM connection - rm python/lib/* - rm -r python/pyspark + mv python/lib lib.back + mv python/pyspark pyspark.back + + # Several catalog-related tests need to run sequentially, e.g., writing a table in a listener. ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect # None of the Pandas API on Spark tests depend on each other, so run them in parallel ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3 + + # Stop Spark Connect server.
+ ./sbin/stop-connect-server.sh + mv lib.back python/lib + mv pyspark.back python/pyspark + + # Start a Spark Connect server for local-cluster + PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ + --master "local-cluster[2, 4, 1024]" \ + --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ + --jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" + + # Remove Py4J and PySpark zipped library to make sure there is no JVM connection + mv python/lib lib.back + mv python/pyspark pyspark.back + + ./python/run-tests --parallelism=1 --python-executables=python3 --testnames "pyspark.resource.tests.test_connect_resources,pyspark.sql.tests.connect.client.test_artifact,pyspark.sql.tests.connect.client.test_artifact_localcluster,pyspark.sql.tests.connect.test_resources" - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 with: name: test-results-spark-connect-python-only path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() + - name: Upload Spark Connect server log file + if: ${{ !success() }} uses: actions/upload-artifact@v4 with: name: unit-tests-log-spark-connect-python-only - path: "**/target/unit-tests.log" + path: logs/*.out diff --git a/.github/workflows/build_python_connect35.yml b/.github/workflows/build_python_connect35.yml new file mode 100644 index 0000000000000..abff471349a22 --- /dev/null +++ b/.github/workflows/build_python_connect35.yml @@ -0,0 +1,113 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# + +name: Build / Spark Connect Python-only (master-server, 35-client, Python 3.11) + +on: + schedule: + - cron: '0 21 * * *' + +jobs: + # Build: build Spark and run the tests for specified modules using SBT + build: + name: "Build modules: pyspark-connect" + runs-on: ubuntu-latest + timeout-minutes: 100 + if: github.repository == 'apache/spark' + steps: + - name: Checkout Spark repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Cache SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/*.jar + ~/.sbt + key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build-spark-connect-python-only- + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: coursier-build-spark-connect-python-only-${{ hashFiles('**/pom.xml') }} + restore-keys: | + coursier-build-spark-connect-python-only- + - name: Install Java 17 + uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: 17 + - name: Install Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + architecture: x64 + - name: Build Spark + run: | + ./build/sbt -Phive Test/package + - name: Install Python dependencies + run: | + pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' + + # Add Python deps for Spark Connect. + pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' 'graphviz==0.20.3' + + # Add torch as a testing dependency for TorchDistributor + pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval + - name: Run tests + env: + SPARK_TESTING: 1 + SPARK_SKIP_CONNECT_COMPAT_TESTS: 1 + SPARK_CONNECT_TESTING_REMOTE: sc://localhost + run: | + # Make less noisy + cp conf/log4j2.properties.template conf/log4j2.properties + sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties + + # Start a Spark Connect server for local + PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ + --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ + --jars "`find connector/connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" + + # Checkout to branch-3.5 to use the tests in branch-3.5. + cd .. + git clone --single-branch --branch branch-3.5 $GITHUB_SERVER_URL/$GITHUB_REPOSITORY spark-3.5 + cd spark-3.5 + + # Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener. 
+ # Run branch-3.5 tests + ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect + # None of tests are dependent on each other in Pandas API on Spark so run them in parallel + ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect + - name: Upload test results to report + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-spark-connect-python-only + path: "**/target/test-reports/*.xml" + - name: Upload Spark Connect server log file + if: ${{ !success() }} + uses: actions/upload-artifact@v4 + with: + name: unit-tests-log-spark-connect-python-only + path: logs/*.out diff --git a/.github/workflows/build_python.yml b/.github/workflows/build_python_pypy3.9.yml similarity index 79% rename from .github/workflows/build_python.yml rename to .github/workflows/build_python_pypy3.9.yml index 8e36227c08372..e05071ef034a0 100644 --- a/.github/workflows/build_python.yml +++ b/.github/workflows/build_python_pypy3.9.yml @@ -17,7 +17,7 @@ # under the License. # -name: "Build / Python-only (master, PyPy 3.8/Python 3.10/Python 3.11/Python 3.12)" +name: "Build / Python-only (master, PyPy 3.9)" on: schedule: @@ -25,10 +25,6 @@ on: jobs: run-build: - strategy: - fail-fast: false - matrix: - pyversion: ["pypy3", "python3.10", "python3.11", "python3.12"] permissions: packages: write name: Run @@ -40,9 +36,10 @@ jobs: hadoop: hadoop3 envs: >- { - "PYTHON_TO_TEST": "${{ matrix.pyversion }}" + "PYTHON_TO_TEST": "pypy3" } jobs: >- { - "pyspark": "true" + "pyspark": "true", + "pyspark-pandas": "true" } diff --git a/.github/workflows/build_rockdb_as_ui_backend.yml b/.github/workflows/build_rockdb_as_ui_backend.yml index e11ec85b8b176..96009c41dbbf9 100644 --- a/.github/workflows/build_rockdb_as_ui_backend.yml +++ b/.github/workflows/build_rockdb_as_ui_backend.yml @@ -43,6 +43,5 @@ jobs: "build": "true", "pyspark": "true", "sparkr": "true", - "tpcds-1g": "true", - "docker-integration-tests": "true" + "yarn": "true" } diff --git a/.github/workflows/build_sparkr_window.yml b/.github/workflows/build_sparkr_window.yml index 6debf0cd12235..e7db2b909f8f5 100644 --- a/.github/workflows/build_sparkr_window.yml +++ b/.github/workflows/build_sparkr_window.yml @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. 
# -name: "Build / SparkR-only (master, 4.3.3, windows-2019)" +name: "Build / SparkR-only (master, 4.4.0, windows-2022)" on: schedule: @@ -25,7 +25,7 @@ on: jobs: build: name: "Build module: sparkr" - runs-on: windows-2019 + runs-on: windows-2022 timeout-minutes: 300 if: github.repository == 'apache/spark' steps: @@ -35,7 +35,7 @@ jobs: repository: cdarlint/winutils - name: Move Hadoop winutil into home directory run: | - Move-Item -Path hadoop-3.3.5 -Destination ~\ + Move-Item -Path hadoop-3.3.6 -Destination ~\ - name: Checkout Spark repository uses: actions/checkout@v4 - name: Cache Maven local repository @@ -50,10 +50,10 @@ jobs: with: distribution: zulu java-version: 17 - - name: Install R 4.3.3 + - name: Install R 4.4.0 uses: r-lib/actions/setup-r@v2 with: - r-version: 4.3.3 + r-version: 4.4.0 - name: Install R dependencies run: | Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')" @@ -79,7 +79,7 @@ jobs: shell: cmd - name: Run SparkR tests run: | - set HADOOP_HOME=%USERPROFILE%\hadoop-3.3.5 + set HADOOP_HOME=%USERPROFILE%\hadoop-3.3.6 set PATH=%HADOOP_HOME%\bin;%PATH% .\bin\spark-submit2.cmd --driver-java-options "-Dlog4j.configurationFile=file:///%CD:\=/%/R/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R shell: cmd diff --git a/.github/workflows/maven_test.yml b/.github/workflows/maven_test.yml index b01f08a23e470..d23cea926a274 100644 --- a/.github/workflows/maven_test.yml +++ b/.github/workflows/maven_test.yml @@ -142,12 +142,11 @@ jobs: git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache Scala, SBT and Maven + - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* - build/scala-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} @@ -191,18 +190,18 @@ jobs: export ENABLE_KINESIS_TESTS=0 # Replace with the real module name, for example, connector#kafka-0-10 -> connector/kafka-0-10 export TEST_MODULES=`echo "$MODULES_TO_TEST" | sed -e "s%#%/%g"` - ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install + ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install if [[ "$INCLUDED_TAGS" != "" ]]; then - ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae + ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae elif [[ "$MODULES_TO_TEST" == "connect" ]]; then ./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,connector/connect/common,connector/connect/server test -fae elif [[ "$EXCLUDED_TAGS" != "" ]]; then - ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae + ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then # To avoid a compilation loop, for the `sql/hive-thriftserver` module, run `clean install` instead - ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install -fae + ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install -fae else - ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Pspark-ganglia-lgpl -Phadoop-cloud -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} test -fae + ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Pspark-ganglia-lgpl -Phadoop-cloud -Pjvm-profiler -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} test -fae fi - name: Clean up local Maven repository run: | diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml index d09babd372403..1b5bd0ba61288 100644 --- a/.github/workflows/publish_snapshot.yml +++ b/.github/workflows/publish_snapshot.yml @@ 
-17,11 +17,11 @@ # under the License. # -name: Publish Snapshot +name: Publish snapshot on: schedule: - - cron: '0 0,12 * * *' + - cron: '0 0 * * *' workflow_dispatch: inputs: branch: diff --git a/.gitignore b/.gitignore index 174f66c6064fe..787eb6180c35c 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ dev/create-release/*final dev/create-release/*txt dev/pr-deps/ dist/ +docs/_generated/ docs/_site/ docs/api docs/.local_ruby_bundle diff --git a/LICENSE-binary b/LICENSE-binary index 40271c9924bc4..b6971798e5577 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -204,171 +204,167 @@ This project bundles some components that are also licensed under the Apache License Version 2.0: -org.apache.zookeeper:zookeeper -oro:oro -commons-configuration:commons-configuration -commons-digester:commons-digester -com.chuusai:shapeless_2.13 -com.googlecode.javaewah:JavaEWAH -com.twitter:chill-java -com.twitter:chill_2.13 -com.univocity:univocity-parsers -javax.jdo:jdo-api -joda-time:joda-time -net.sf.opencsv:opencsv -org.apache.derby:derby -org.objenesis:objenesis -org.roaringbitmap:RoaringBitmap -org.scalanlp:breeze-macros_2.13 -org.scalanlp:breeze_2.13 -org.typelevel:macro-compat_2.13 -org.yaml:snakeyaml -org.apache.xbean:xbean-asm7-shaded -com.squareup.okhttp3:logging-interceptor -com.squareup.okhttp3:okhttp -com.squareup.okio:okio -org.apache.spark:spark-catalyst_2.13 -org.apache.spark:spark-kvstore_2.13 -org.apache.spark:spark-launcher_2.13 -org.apache.spark:spark-mllib-local_2.13 -org.apache.spark:spark-network-common_2.13 -org.apache.spark:spark-network-shuffle_2.13 -org.apache.spark:spark-sketch_2.13 -org.apache.spark:spark-tags_2.13 -org.apache.spark:spark-unsafe_2.13 -commons-httpclient:commons-httpclient -com.vlkan:flatbuffers -com.ning:compress-lzf -io.airlift:aircompressor -io.dropwizard.metrics:metrics-core -io.dropwizard.metrics:metrics-graphite -io.dropwizard.metrics:metrics-json -io.dropwizard.metrics:metrics-jvm -io.dropwizard.metrics:metrics-jmx -org.iq80.snappy:snappy com.clearspring.analytics:stream -com.jamesmurty.utils:java-xmlbuilder -commons-codec:commons-codec -commons-collections:commons-collections -io.fabric8:kubernetes-client -io.fabric8:kubernetes-model -io.fabric8:kubernetes-model-common -io.netty:netty-all -net.hydromatic:eigenbase-properties -net.sf.supercsv:super-csv -org.apache.arrow:arrow-format -org.apache.arrow:arrow-memory -org.apache.arrow:arrow-vector -org.apache.commons:commons-crypto -org.apache.commons:commons-lang3 -org.apache.hadoop:hadoop-annotations -org.apache.hadoop:hadoop-auth -org.apache.hadoop:hadoop-client -org.apache.hadoop:hadoop-common -org.apache.hadoop:hadoop-hdfs -org.apache.hadoop:hadoop-hdfs-client -org.apache.hadoop:hadoop-mapreduce-client-app -org.apache.hadoop:hadoop-mapreduce-client-common -org.apache.hadoop:hadoop-mapreduce-client-core -org.apache.hadoop:hadoop-mapreduce-client-jobclient -org.apache.hadoop:hadoop-mapreduce-client-shuffle -org.apache.hadoop:hadoop-yarn-api -org.apache.hadoop:hadoop-yarn-client -org.apache.hadoop:hadoop-yarn-common -org.apache.hadoop:hadoop-yarn-server-common -org.apache.hadoop:hadoop-yarn-server-web-proxy -org.apache.httpcomponents:httpclient -org.apache.httpcomponents:httpcore -org.apache.kerby:kerb-admin -org.apache.kerby:kerb-client -org.apache.kerby:kerb-common -org.apache.kerby:kerb-core -org.apache.kerby:kerb-crypto -org.apache.kerby:kerb-identity -org.apache.kerby:kerb-server -org.apache.kerby:kerb-simplekdc -org.apache.kerby:kerb-util -org.apache.kerby:kerby-asn1 
-org.apache.kerby:kerby-config -org.apache.kerby:kerby-pkix -org.apache.kerby:kerby-util -org.apache.kerby:kerby-xdr -org.apache.orc:orc-core -org.apache.orc:orc-mapreduce -org.mortbay.jetty:jetty -org.mortbay.jetty:jetty-util -com.jolbox:bonecp -org.json4s:json4s-ast_2.13 -org.json4s:json4s-core_2.13 -org.json4s:json4s-jackson_2.13 -org.json4s:json4s-scalap_2.13 -com.carrotsearch:hppc com.fasterxml.jackson.core:jackson-annotations com.fasterxml.jackson.core:jackson-core com.fasterxml.jackson.core:jackson-databind com.fasterxml.jackson.dataformat:jackson-dataformat-yaml -com.fasterxml.jackson.jaxrs:jackson-jaxrs-base -com.fasterxml.jackson.jaxrs:jackson-jaxrs-json-provider -com.fasterxml.jackson.module:jackson-module-jaxb-annotations -com.fasterxml.jackson.module:jackson-module-paranamer +com.fasterxml.jackson.datatype:jackson-datatype-jsr310 com.fasterxml.jackson.module:jackson-module-scala_2.13 -com.github.mifmif:generex +com.github.joshelser:dropwizard-metrics-hadoop-metrics2-reporter com.google.code.findbugs:jsr305 com.google.code.gson:gson +com.google.crypto.tink:tink com.google.flatbuffers:flatbuffers-java com.google.guava:guava -com.google.inject:guice -com.google.inject.extensions:guice-servlet -com.twitter:parquet-hadoop-bundle +com.jamesmurty.utils:java-xmlbuilder +com.ning:compress-lzf +com.squareup.okhttp3:logging-interceptor +com.squareup.okhttp3:okhttp +com.squareup.okio:okio +com.tdunning:json +com.twitter:chill-java +com.twitter:chill_2.13 +com.univocity:univocity-parsers +com.zaxxer.HikariCP commons-cli:commons-cli +commons-codec:commons-codec +commons-collections:commons-collections commons-dbcp:commons-dbcp commons-io:commons-io commons-lang:commons-lang -commons-net:commons-net commons-pool:commons-pool +io.airlift:aircompressor +io.dropwizard.metrics:metrics-core +io.dropwizard.metrics:metrics-graphite +io.dropwizard.metrics:metrics-jmx +io.dropwizard.metrics:metrics-json +io.dropwizard.metrics:metrics-jvm +io.fabric8:kubernetes-client +io.fabric8:kubernetes-client-api +io.fabric8:kubernetes-httpclient-okhttp +io.fabric8:kubernetes-model-admissionregistration +io.fabric8:kubernetes-model-apiextensions +io.fabric8:kubernetes-model-apps +io.fabric8:kubernetes-model-autoscaling +io.fabric8:kubernetes-model-batch +io.fabric8:kubernetes-model-certificates +io.fabric8:kubernetes-model-common +io.fabric8:kubernetes-model-coordination +io.fabric8:kubernetes-model-core +io.fabric8:kubernetes-model-discovery +io.fabric8:kubernetes-model-events +io.fabric8:kubernetes-model-extensions +io.fabric8:kubernetes-model-flowcontrol +io.fabric8:kubernetes-model-gatewayapi +io.fabric8:kubernetes-model-metrics +io.fabric8:kubernetes-model-networking +io.fabric8:kubernetes-model-node +io.fabric8:kubernetes-model-policy +io.fabric8:kubernetes-model-rbac +io.fabric8:kubernetes-model-resource +io.fabric8:kubernetes-model-scheduling +io.fabric8:kubernetes-model-storageclass io.fabric8:zjsonpatch -javax.inject:javax.inject -javax.validation:validation-api -log4j:apache-log4j-extras -log4j:log4j +io.github.java-diff-utils:java-diff-utils +io.netty:netty-all +io.netty:netty-buffer +io.netty:netty-codec +io.netty:netty-codec-http +io.netty:netty-codec-http2 +io.netty:netty-codec-socks +io.netty:netty-common +io.netty:netty-handler +io.netty:netty-handler-proxy +io.netty:netty-resolver +io.netty:netty-tcnative-boringssl-static +io.netty:netty-tcnative-classes +io.netty:netty-transport +io.netty:netty-transport-classes-epoll +io.netty:netty-transport-classes-kqueue 
+io.netty:netty-transport-native-epoll +io.netty:netty-transport-native-kqueue +io.netty:netty-transport-native-unix-common +jakarta.inject:jakarta.inject-api +jakarta.validation:jakarta.validation-api +javax.jdo:jdo-api +joda-time:joda-time +net.java.dev.jna:jna +net.sf.opencsv:opencsv +net.sf.supercsv:super-csv net.sf.jpam:jpam +org.apache.arrow:arrow-format +org.apache.arrow:arrow-memory-core +org.apache.arrow:arrow-memory-netty +org.apache.arrow:arrow-memory-netty-buffer-patch +org.apache.arrow:arrow-vector org.apache.avro:avro org.apache.avro:avro-ipc org.apache.avro:avro-mapred +org.apache.commons:commons-collections4 org.apache.commons:commons-compress +org.apache.commons:commons-crypto +org.apache.commons:commons-lang3 org.apache.commons:commons-math3 +org.apache.commons:commons-text org.apache.curator:curator-client org.apache.curator:curator-framework org.apache.curator:curator-recipes -org.apache.directory.api:api-asn1-api -org.apache.directory.api:api-util -org.apache.directory.server:apacheds-i18n -org.apache.directory.server:apacheds-kerberos-codec -org.apache.htrace:htrace-core +org.apache.derby:derby +org.apache.derby:derbyshared +org.apache.derby:derbytools +org.apache.datasketches:datasketches-java +org.apache.datasketches:datasketches-memory +org.apache.hadoop:hadoop-client-api +org.apache.hadoop:hadoop-client-runtime +org.apache.hive:hive-beeline +org.apache.hive:hive-cli +org.apache.hive:hive-common +org.apache.hive:hive-exec +org.apache.hive:hive-jdbc +org.apache.hive:hive-llap-common +org.apache.hive:hive-metastore +org.apache.hive:hive-serde +org.apache.hive:hive-service-rpc +org.apache.hive:hive-shims-0.23 +org.apache.hive:hive-shims +org.apache.hive:hive-shims-common +org.apache.hive:hive-shims-scheduler +org.apache.hive:hive-storage-api +org.apache.httpcomponents:httpclient +org.apache.httpcomponents:httpcore org.apache.ivy:ivy -=org.apache.parquet:parquet-column +org.apache.logging.log4j:log4j-1.2-api +org.apache.logging.log4j:log4j-api +org.apache.logging.log4j:log4j-core +org.apache.logging.log4j:log4j-layout-template-json +org.apache.logging.log4j:log4j-slf4j-impl +org.apache.orc:orc-core +org.apache.orc:orc-format +org.apache.orc:orc-mapreduce +org.apache.orc:orc-shims +org.apache.parquet:parquet-column org.apache.parquet:parquet-common org.apache.parquet:parquet-encoding -org.apache.parquet:parquet-format +org.apache.parquet:parquet-format-structures org.apache.parquet:parquet-hadoop org.apache.parquet:parquet-jackson org.apache.thrift:libfb303 org.apache.thrift:libthrift +org.apache.ws.xmlschema:xmlschema-core +org.apache.xbean:xbean-asm9-shaded +org.apache.yetus:audience-annotations +org.apache.zookeeper:zookeeper +org.apache.zookeeper:zookeeper-jute org.codehaus.jackson:jackson-core-asl org.codehaus.jackson:jackson-mapper-asl org.datanucleus:datanucleus-api-jdo org.datanucleus:datanucleus-core org.datanucleus:datanucleus-rdbms -org.lz4:lz4-java -org.xerial.snappy:snappy-java -stax:stax-api -xerces:xercesImpl -org.codehaus.jackson:jackson-jaxrs -org.codehaus.jackson:jackson-xc +org.datanucleus:javax.jdo org.eclipse.jetty:jetty-client org.eclipse.jetty:jetty-http org.eclipse.jetty:jetty-io -org.eclipse.jetty:jetty-jndi org.eclipse.jetty:jetty-plus org.eclipse.jetty:jetty-proxy org.eclipse.jetty:jetty-security @@ -376,43 +372,44 @@ org.eclipse.jetty:jetty-server org.eclipse.jetty:jetty-servlet org.eclipse.jetty:jetty-servlets org.eclipse.jetty:jetty-util -org.eclipse.jetty:jetty-webapp -org.eclipse.jetty:jetty-xml 
+org.glassfish.jersey.containers:jersey-container-servlet +org.glassfish.jersey.containers:jersey-container-servlet-core +org.glassfish.jersey.core:jersey-client +org.glassfish.jersey.core:jersey-common +org.glassfish.jersey.core:jersey-server +org.glassfish.jersey.inject:jersey-hk2 +org.json4s:json4s-ast_2.13 +org.json4s:json4s-core_2.13 +org.json4s:json4s-jackson-core_2.13 +org.json4s:json4s-jackson_2.13 +org.json4s:json4s-scalap_2.13 +org.lz4:lz4-java +org.objenesis:objenesis +org.roaringbitmap:RoaringBitmap +org.rocksdb:rocksdbjni org.scala-lang:scala-compiler org.scala-lang:scala-library org.scala-lang:scala-reflect +org.scala-lang.modules:scala-collection-compat_2.13 +org.scala-lang.modules:scala-parallel-collections_2.13 org.scala-lang.modules:scala-parser-combinators_2.13 org.scala-lang.modules:scala-xml_2.13 -com.github.joshelser:dropwizard-metrics-hadoop-metrics2-reporter -com.zaxxer.HikariCP -org.apache.hive:hive-beeline -org.apache.hive:hive-cli -org.apache.hive:hive-common -org.apache.hive:hive-exec -org.apache.hive:hive-jdbc -org.apache.hive:hive-llap-common -org.apache.hive:hive-metastore -org.apache.hive:hive-serde -org.apache.hive:hive-service-rpc -org.apache.hive:hive-shims-0.23 -org.apache.hive:hive-shims -org.apache.hive:hive-common -org.apache.hive:hive-shims-scheduler -org.apache.hive:hive-storage-api -org.apache.hive:hive-vector-code-gen -org.datanucleus:javax.jdo -com.tdunning:json -org.apache.velocity:velocity -org.apache.yetus:audience-annotations -com.google.cloud.bigdataoss:gcs-connector +org.scalanlp:breeze-macros_2.13 +org.scalanlp:breeze_2.13 +org.snakeyaml:snakeyaml-engine +org.xerial.snappy:snappy-java +org.yaml:snakeyaml +oro:oro +stax:stax-api +xerces:xercesImpl core/src/main/java/org/apache/spark/util/collection/TimSort.java core/src/main/resources/org/apache/spark/ui/static/bootstrap* core/src/main/resources/org/apache/spark/ui/static/vis* -docs/js/vendor/bootstrap.js core/src/main/resources/org/apache/spark/ui/static/d3-flamegraph.min.js core/src/main/resources/org/apache/spark/ui/static/d3-flamegraph.css + ------------------------------------------------------------------------------------ This product bundles various third-party components under other open source licenses. This section summarizes those components and their licenses. See licenses-binary/ @@ -421,45 +418,37 @@ for text of these licenses. 
Python Software Foundation License ---------------------------------- - python/pyspark/loose_version.py BSD 2-Clause ------------ - com.github.luben:zstd-jni +com.github.wendykierp:JTransforms javolution:javolution -com.esotericsoftware:kryo-shaded -com.esotericsoftware:minlog -com.esotericsoftware:reflectasm -org.codehaus.janino:commons-compiler -org.codehaus.janino:janino jline:jline org.jodd:jodd-core -com.github.wendykierp:JTransforms pl.edu.icm:JLargeArrays BSD 3-Clause ------------ - +com.esotericsoftware:kryo-shaded +com.esotericsoftware:minlog +com.esotericsoftware:reflectasm com.google.protobuf:protobuf-java -dk.brics.automaton:automaton -org.antlr:antlr-runtime +com.thoughtworks.paranamer:paranamer +net.sf.py4j:py4j +net.sourceforge.f2j:arpack_combined_all org.antlr:ST4 -org.antlr:stringtemplate +org.antlr:antlr-runtime org.antlr:antlr4-runtime -antlr:antlr -com.thoughtworks.paranamer:paranamer +org.codehaus.janino:commons-compiler +org.codehaus.janino:janino org.fusesource.leveldbjni:leveldbjni-all -net.sourceforge.f2j:arpack_combined_all -xmlenc:xmlenc -net.sf.py4j:py4j +org.jline:jline org.jpmml:pmml-model -org.jpmml:pmml-schema org.threeten:threeten-extra -org.jdom:jdom2 python/lib/py4j-*-src.zip python/pyspark/cloudpickle.py @@ -472,95 +461,68 @@ is distributed under the 3-Clause BSD license. MIT License ----------- - -com.microsoft.sqlserver:mssql-jdbc +com.github.scopt:scopt_2.13 +dev.ludovic.netlib:blas +dev.ludovic.netlib:arpack +dev.ludovic.netlib:lapack +net.razorvine:pickle +org.checkerframework:checker-qual +org.typelevel:algebra_2.13:jar +org.typelevel:cats-kernel_2.13 org.typelevel:spire_2.13 org.typelevel:spire-macros_2.13 org.typelevel:spire-platform_2.13 org.typelevel:spire-util_2.13 -org.typelevel:algebra_2.13:jar -org.typelevel:cats-kernel_2.13 -org.typelevel:machinist_2.13 -net.razorvine:pickle org.slf4j:jcl-over-slf4j org.slf4j:jul-to-slf4j org.slf4j:slf4j-api -org.slf4j:slf4j-log4j12 -com.github.scopt:scopt_2.13 -dev.ludovic.netlib:blas -dev.ludovic.netlib:arpack -dev.ludovic.netlib:lapack core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js core/src/main/resources/org/apache/spark/ui/static/*dataTables* core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js core/src/main/resources/org/apache/spark/ui/static/jquery* core/src/main/resources/org/apache/spark/ui/static/sorttable.js -docs/js/vendor/anchor.min.js -docs/js/vendor/jquery* -docs/js/vendor/modernizer* + ISC License ----------- - core/src/main/resources/org/apache/spark/ui/static/d3.min.js Common Development and Distribution License (CDDL) 1.0 ------------------------------------------------------ - javax.activation:activation http://www.oracle.com/technetwork/java/javase/tech/index-jsp-138795.html -javax.xml.stream:stax-api https://jcp.org/en/jsr/detail?id=173 -javax.transaction:javax.transaction-api -javax.xml.bind:jaxb-api +javax.transaction:transaction-api Common Development and Distribution License (CDDL) 1.1 ------------------------------------------------------ - -javax.el:javax.el-api https://javaee.github.io/uel-ri/ -javax.servlet.jsp:jsp-api javax.transaction:jta http://www.oracle.com/technetwork/java/index.html javax.xml.bind:jaxb-api https://github.com/javaee/jaxb-v2 -org.glassfish.hk2:hk2-api https://github.com/javaee/glassfish -org.glassfish.hk2:hk2-locator (same) -org.glassfish.hk2:hk2-utils -org.glassfish.hk2:osgi-resource-locator -org.glassfish.hk2.external:aopalliance-repackaged -org.glassfish.hk2.external:javax.inject 
-org.glassfish.jersey.bundles.repackaged:jersey-guava -org.glassfish.jersey.containers:jersey-container-servlet -org.glassfish.jersey.containers:jersey-container-servlet-core -org.glassfish.jersey.core:jersey-client -org.glassfish.jersey.core:jersey-common -org.glassfish.jersey.core:jersey-server -org.glassfish.jersey.media:jersey-media-jaxb + Eclipse Distribution License (EDL) 1.0 -------------------------------------- - -org.glassfish.jaxb:jaxb-runtime -jakarta.activation:jakarta.activation-api -jakarta.xml.bind:jakarta.xml.bind-api com.sun.istack:istack-commons-runtime - +jakarta.xml.bind:jakarta.xml.bind-api +org.glassfish.jaxb:jaxb-runtime +org.glassfish.jaxb:txw2 Eclipse Public License (EPL) 2.0 -------------------------------- - -jakarta.annotation:jakarta-annotation-api https://projects.eclipse.org/projects/ee4j.ca +jakarta.annotation:jakarta.annotation-api https://projects.eclipse.org/projects/ee4j.ca jakarta.servlet:jakarta.servlet-api https://projects.eclipse.org/projects/ee4j.servlet jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api -org.glassfish.hk2.external:jakarta.inject -com.github.jnr:jnr-posix +org.glassfish.hk2.external:aopalliance-repackaged +org.glassfish.hk2:hk2-api +org.glassfish.hk2:hk2-locator +org.glassfish.hk2:hk2-utils +org.glassfish.hk2:osgi-resource-locator Public Domain ------------- - -aopalliance:aopalliance -net.iharder:base64 org.tukaani:xz @@ -573,3 +535,7 @@ data/mllib/images/kittens/54893.jpg data/mllib/images/kittens/DP153539.jpg data/mllib/images/kittens/DP802813.jpg data/mllib/images/multi-channel/chr30.4.184.jpg + +Unicode/ICU License +------------------- +com.ibm.icu:icu4j diff --git a/NOTICE-binary b/NOTICE-binary index 5f1c1c617c36f..c4cfe0e9f8b31 100644 --- a/NOTICE-binary +++ b/NOTICE-binary @@ -33,11 +33,12 @@ services. // Version 2.0, in this case for // ------------------------------------------------------------------ -Hive Beeline -Copyright 2016 The Apache Software Foundation +=== NOTICE FOR com.clearspring.analytics:streams === +stream-api +Copyright 2016 AddThis -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). +This product includes software developed by AddThis. +=== END OF NOTICE FOR com.clearspring.analytics:streams === Apache Avro Copyright 2009-2014 The Apache Software Foundation @@ -131,14 +132,6 @@ been derived from the works by JSR-166 EG, Doug Lea, and Jason T. Greene: * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ -This product contains a modified version of Robert Harder's Public Domain -Base64 Encoder and Decoder, which can be obtained at: - - * LICENSE: - * license/LICENSE.base64.txt (Public Domain) - * HOMEPAGE: - * http://iharder.sourceforge.net/current/java/base64/ - This product contains a modified portion of 'Webbit', an event based WebSocket and HTTP server, which can be obtained at: @@ -338,120 +331,102 @@ which has the following notices: Copyright 2002-2012 Ramnivas Laddad, Juergen Hoeller, Chris Beams The binary distribution of this product bundles binaries of -Jetty 6.1.26, +Jetty 11.0.20, which has the following notices: - * ============================================================== - Jetty Web Container - Copyright 1995-2016 Mort Bay Consulting Pty Ltd. - ============================================================== - - The Jetty Web Container is Copyright Mort Bay Consulting Pty Ltd - unless otherwise noted. 
- - Jetty is dual licensed under both - - * The Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0.html - - and - - * The Eclipse Public 1.0 License - http://www.eclipse.org/legal/epl-v10.html - - Jetty may be distributed under either license. - - ------ - Eclipse - - The following artifacts are EPL. - * org.eclipse.jetty.orbit:org.eclipse.jdt.core +========================= +Notices for Eclipse Jetty +========================= +This content is produced and maintained by the Eclipse Jetty project. - The following artifacts are EPL and ASL2. - * org.eclipse.jetty.orbit:javax.security.auth.message +Project home: https://eclipse.dev/jetty/ - The following artifacts are EPL and CDDL 1.0. - * org.eclipse.jetty.orbit:javax.mail.glassfish +Trademarks +---------- +Eclipse Jetty, and Jetty are trademarks of the Eclipse Foundation. - ------ - Oracle +Copyright +--------- +All contributions are the property of the respective authors or of +entities to which copyright has been assigned by the authors (eg. employer). - The following artifacts are CDDL + GPLv2 with classpath exception. - https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html +Declared Project Licenses +------------------------- +This artifacts of this project are made available under the terms of: - * javax.servlet:javax.servlet-api - * javax.annotation:javax.annotation-api - * javax.transaction:javax.transaction-api - * javax.websocket:javax.websocket-api + * the Eclipse Public License v2.0 + https://www.eclipse.org/legal/epl-2.0 + SPDX-License-Identifier: EPL-2.0 - ------ - Oracle OpenJDK + or - If ALPN is used to negotiate HTTP/2 connections, then the following - artifacts may be included in the distribution or downloaded when ALPN - module is selected. + * the Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0 + SPDX-License-Identifier: Apache-2.0 - * java.sun.security.ssl +The following dependencies are EPL. + * org.eclipse.jetty.orbit:org.eclipse.jdt.core - These artifacts replace/modify OpenJDK classes. The modififications - are hosted at github and both modified and original are under GPL v2 with - classpath exceptions. - http://openjdk.java.net/legal/gplv2+ce.html +The following dependencies are EPL and ASL2. + * org.eclipse.jetty.orbit:javax.security.auth.message - ------ - OW2 +The following dependencies are EPL and CDDL 1.0. + * org.eclipse.jetty.orbit:javax.mail.glassfish - The following artifacts are licensed by the OW2 Foundation according to the - terms of http://asm.ow2.org/license.html +The following dependencies are CDDL + GPLv2 with classpath exception. +https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html - org.ow2.asm:asm-commons - org.ow2.asm:asm + * jakarta.servlet:jakarta.servlet-api + * javax.annotation:javax.annotation-api + * javax.transaction:javax.transaction-api + * javax.websocket:javax.websocket-api - ------ - Apache +The following dependencies are licensed by the OW2 Foundation according to the +terms of http://asm.ow2.org/license.html - The following artifacts are ASL2 licensed. + * org.ow2.asm:asm-commons + * org.ow2.asm:asm - org.apache.taglibs:taglibs-standard-spec - org.apache.taglibs:taglibs-standard-impl +The following dependencies are ASL2 licensed. - ------ - MortBay + * org.apache.taglibs:taglibs-standard-spec + * org.apache.taglibs:taglibs-standard-impl - The following artifacts are ASL2 licensed. Based on selected classes from - following Apache Tomcat jars, all ASL2 licensed. +The following dependencies are ASL2 licensed. 
Based on selected classes from +following Apache Tomcat jars, all ASL2 licensed. - org.mortbay.jasper:apache-jsp - org.apache.tomcat:tomcat-jasper - org.apache.tomcat:tomcat-juli - org.apache.tomcat:tomcat-jsp-api - org.apache.tomcat:tomcat-el-api - org.apache.tomcat:tomcat-jasper-el - org.apache.tomcat:tomcat-api - org.apache.tomcat:tomcat-util-scan - org.apache.tomcat:tomcat-util + * org.mortbay.jasper:apache-jsp + * org.apache.tomcat:tomcat-jasper + * org.apache.tomcat:tomcat-juli + * org.apache.tomcat:tomcat-jsp-api + * org.apache.tomcat:tomcat-el-api + * org.apache.tomcat:tomcat-jasper-el + * org.apache.tomcat:tomcat-api + * org.apache.tomcat:tomcat-util-scan + * org.apache.tomcat:tomcat-util + * org.mortbay.jasper:apache-el + * org.apache.tomcat:tomcat-jasper-el + * org.apache.tomcat:tomcat-el-api - org.mortbay.jasper:apache-el - org.apache.tomcat:tomcat-jasper-el - org.apache.tomcat:tomcat-el-api +The following artifacts are CDDL + GPLv2 with classpath exception. +https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html - ------ - Mortbay + * org.eclipse.jetty.toolchain:jetty-schemas - The following artifacts are CDDL + GPLv2 with classpath exception. +Cryptography +------------ +Content may contain encryption software. The country in which you are currently +may have restrictions on the import, possession, and use, and/or re-export to +another country, of encryption software. BEFORE using any encryption software, +please check the country's laws, regulations and policies concerning the import, +possession, or use, and re-export of encryption software, to see if this is +permitted. - https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html - - org.eclipse.jetty.toolchain:jetty-schemas - - ------ - Assorted - - The UnixCrypt.java code implements the one way cryptography used by - Unix systems for simple password protection. Copyright 1996 Aki Yoshida, - modified April 2001 by Iris Van den Broeke, Daniel Deville. - Permission to use, copy, modify and distribute UnixCrypt - for non-commercial or commercial purposes and without fee is - granted provided that the copyright notice appears in all copies./ +The UnixCrypt.java code implements the one way cryptography used by +Unix systems for simple password protection. Copyright 1996 Aki Yoshida, +modified April 2001 by Iris Van den Broeke, Daniel Deville. +Permission to use, copy, modify and distribute UnixCrypt +for non-commercial or commercial purposes and without fee is +granted provided that the copyright notice appears in all copies. The binary distribution of this product bundles binaries of Snappy for Java 1.0.4.1, @@ -506,36 +481,9 @@ Copyright 2001-2006 The Apache Software Foundation Apache Commons BeanUtils Copyright 2000-2008 The Apache Software Foundation -ApacheDS Protocol Kerberos Codec -Copyright 2003-2013 The Apache Software Foundation - -ApacheDS I18n -Copyright 2003-2013 The Apache Software Foundation - -Apache Directory API ASN.1 API -Copyright 2003-2013 The Apache Software Foundation - -Apache Directory LDAP API Utilities -Copyright 2003-2013 The Apache Software Foundation - Curator Client Copyright 2011-2015 The Apache Software Foundation -htrace-core -Copyright 2015 The Apache Software Foundation - - ========================================================================= - == NOTICE file corresponding to section 4(d) of the Apache License, == - == Version 2.0, in this case for the Apache Xerces Java distribution. 
== - ========================================================================= - - Portions of this software were originally based on the following: - - software copyright (c) 1999, IBM Corporation., http://www.ibm.com. - - software copyright (c) 1999, Sun Microsystems., http://www.sun.com. - - voluntary contributions made by Paul Eng on behalf of the - Apache Software Foundation that were originally developed at iClick, Inc., - software copyright (c) 1999. - # Jackson JSON processor Jackson is a high-performance, Free/Open Source JSON processing library. @@ -656,21 +604,12 @@ Copyright 2001-2009 The Apache Software Foundation Apache Commons Daemon Copyright 1999-2019 The Apache Software Foundation -Google Guice - Extensions - Servlet -Copyright 2006-2011 Google, Inc. - Apache Commons IO Copyright 2002-2012 The Apache Software Foundation -Google Guice - Core Library -Copyright 2006-2011 Google, Inc. - Apache Parquet Hadoop Bundle (Incubating) Copyright 2015 The Apache Software Foundation -Hive Query Language -Copyright 2016 The Apache Software Foundation - Apache Extras Companion for log4j 1.2. Copyright 2007 The Apache Software Foundation @@ -884,18 +823,6 @@ Some data files (under analysis/icu/src/data) are derived from Unicode data such as the Unicode Character Database. See http://unicode.org/copyright.html for more details. -Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is -BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ - -The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were -automatically generated with the moman/finenight FSA library, created by -Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, -see http://sites.google.com/site/rrettesite/moman and -http://bitbucket.org/jpbarrette/moman/overview/ - -The class org.apache.lucene.util.WeakIdentityMap was derived from -the Apache CXF project and is Apache License 2.0. - The Google Code Prettify is Apache License 2.0. See http://code.google.com/p/google-code-prettify/ @@ -1378,13 +1305,6 @@ Copyright (C) 2010 The Android Open Source Project This product includes software developed by The Android Open Source Project -Apache Velocity - -Copyright (C) 2000-2007 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - Apache Yetus - Audience Annotations Copyright 2015-2017 The Apache Software Foundation @@ -1404,102 +1324,6 @@ This product includes software developed at The Apache Software Foundation (http://www.apache.org/). -Kerby-kerb Admin -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby-kerb Client -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby-kerb Common -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby-kerb core -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby-kerb Crypto -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). 
- - -Kerby-kerb Identity -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby-kerb Server -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerb Simple Kdc -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby-kerb Util -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby ASN1 Project -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby Config -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby PKIX Project -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby Util -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Kerby XDR Project -Copyright 2014-2017 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). Token provider diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 2523104268d36..f7dd261c10fd2 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -21,7 +21,7 @@ Suggests: testthat, e1071, survival, - arrow (>= 1.0.0) + arrow (>= 10.0.0) Collate: 'schema.R' 'generics.R' diff --git a/assembly/pom.xml b/assembly/pom.xml index 6c31ec745b5bd..58e7ae5bb0c7f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -136,10 +136,6 @@ spark-yarn_${scala.binary.version} ${project.version} - - org.apache.hadoop - hadoop-yarn-server-web-proxy - diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd index 800ec0c02c22f..8703f5a86f109 100755 --- a/bin/spark-class2.cmd +++ b/bin/spark-class2.cmd @@ -61,14 +61,24 @@ if not "x%JAVA_HOME%"=="x" ( ) ) +rem SPARK-23015: We create a temporary text file when launching Spark. +rem This file must be given a unique name or else we risk a race condition when launching multiple instances close together. +rem The best way to create a unique file name is to add a GUID to the file name. Use Powershell to generate the GUID. +where powershell.exe >nul 2>&1 +if %errorlevel%==0 ( + FOR /F %%a IN ('POWERSHELL -COMMAND "$([guid]::NewGuid().ToString())"') DO (set RANDOM_SUFFIX=%%a) +) else ( + rem If Powershell is not installed, try to create a random file name suffix using the Windows %RANDOM%. + rem %RANDOM% is seeded with 1-second granularity so it is highly likely that two Spark instances + rem launched within the same second will fail to start. + rem Note that Powershell is automatically installed on all Windows OS from Windows 7/Windows Server 2008 R2 and onward. + set RANDOM_SUFFIX=%RANDOM% +) + rem The launcher library prints the command to be executed in a single line suitable for being rem executed by the batch interpreter. So read all the output of the launcher into a variable. 
-:gen -set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM%.txt -rem SPARK-28302: %RANDOM% would return the same number if we call it instantly after last call, -rem so we should make it sure to generate unique file to avoid process collision of writing into -rem the same file concurrently. -if exist %LAUNCHER_OUTPUT% goto :gen +set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM_SUFFIX%.txt + rem unset SHELL to indicate non-bash environment to launcher/Main set SHELL= "%RUNNER%" -Xmx128m -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > %LAUNCHER_OUTPUT% diff --git a/build/mvn b/build/mvn index 3179099304c7a..3735461562e54 100755 --- a/build/mvn +++ b/build/mvn @@ -146,30 +146,6 @@ install_mvn() { fi } -# Determine the Scala version from the root pom.xml file, set the Scala URL, -# and, with that, download the specific version of Scala necessary under -# the build/ folder -install_scala() { - # determine the Scala version used in Spark - local scala_binary_version=`grep "scala.binary.version" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` - local scala_version=`grep "scala.version" "${_DIR}/../pom.xml" | grep ${scala_binary_version} | head -n1 | awk -F '[<>]' '{print $3}'` - local scala_bin="${_DIR}/scala-${scala_version}/bin/scala" - local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com} - local SCALA_TARBALL="scala-${scala_version}.tgz" - - install_app \ - "${TYPESAFE_MIRROR}" \ - "scala/${scala_version}/${SCALA_TARBALL}" \ - "" \ - "" \ - ${SCALA_TARBALL} \ - "scala-${scala_version}/bin/scala" - - SCALA_COMPILER="$(cd "$(dirname "${scala_bin}")/../lib" && pwd)/scala-compiler.jar" - SCALA_LIBRARY="$(cd "$(dirname "${scala_bin}")/../lib" && pwd)/scala-library.jar" -} - -install_scala install_mvn # Reset the current working directory diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 3820d1b8e395c..046648e9c2aec 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -40,6 +40,12 @@ spark-tags_${scala.binary.version} + + org.apache.spark + spark-common-utils_${scala.binary.version} + ${project.version} + + com.google.guava guava diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java index b830e6afc6172..69757fdc65d68 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java @@ -29,8 +29,9 @@ import com.google.common.base.Preconditions; import com.google.common.base.Throwables; import org.iq80.leveldb.DBIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; class LevelDBIterator implements KVStoreIterator { @@ -302,7 +303,7 @@ static int compare(byte[] a, byte[] b) { } static class ResourceCleaner implements Runnable { - private static final Logger LOG = LoggerFactory.getLogger(ResourceCleaner.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(ResourceCleaner.class); private final DBIterator dbIterator; diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java index daedd56890a68..72c3690d1a187 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java +++ 
b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java @@ -32,8 +32,10 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +// checkstyle.off: RegexpSinglelineJava import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava import static org.junit.jupiter.api.Assertions.*; public abstract class DBIteratorSuite { diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java index 3158c18f9e1d3..ff6db8fc34c96 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java @@ -34,7 +34,9 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +// checkstyle.off: RegexpSinglelineJava import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava import static org.junit.jupiter.api.Assertions.*; /** diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java index e18a3c3b1c288..c1b8009e97e66 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java @@ -34,7 +34,9 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +// checkstyle.off: RegexpSinglelineJava import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava import static org.junit.jupiter.api.Assertions.*; /** diff --git a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java index 9f3b9c59256b9..e8ce6840e3fc3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java +++ b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java @@ -34,9 +34,9 @@ import io.netty.handler.stream.ChunkedWriteHandler; import io.netty.handler.timeout.IdleStateHandler; import io.netty.handler.codec.MessageToMessageEncoder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportClientBootstrap; import org.apache.spark.network.client.TransportClientFactory; @@ -73,7 +73,7 @@ * processes to send messages back to the client on an existing channel. 
*/ public class TransportContext implements Closeable { - private static final Logger logger = LoggerFactory.getLogger(TransportContext.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(TransportContext.class); private static final NettyLogger nettyLogger = new NettyLogger(); private final TransportConf conf; diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index 40825e06b82fd..4c144a73a9299 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -35,9 +35,11 @@ import io.netty.util.concurrent.GenericFutureListener; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringStyle; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NioManagedBuffer; import org.apache.spark.network.protocol.*; @@ -71,7 +73,7 @@ * Concurrency: thread safe and can be called from multiple threads. */ public class TransportClient implements Closeable { - private static final Logger logger = LoggerFactory.getLogger(TransportClient.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(TransportClient.class); private final Channel channel; private final TransportResponseHandler handler; @@ -364,11 +366,13 @@ public void operationComplete(Future future) throws Exception { getRemoteAddress(channel), timeTaken); } } else { - String errorMsg = String.format("Failed to send RPC %s to %s: %s", requestId, - getRemoteAddress(channel), future.cause()); - logger.error(errorMsg, future.cause()); + logger.error("Failed to send RPC {} to {}", future.cause(), + MDC.of(LogKeys.REQUEST_ID$.MODULE$, requestId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel))); channel.close(); try { + String errorMsg = String.format("Failed to send RPC %s to %s: %s", requestId, + getRemoteAddress(channel), future.cause()); handleFailure(errorMsg, future.cause()); } catch (Exception e) { logger.error("Uncaught exception in RPC response callback handler!", e); diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java index fd48020caac7f..e1f19f956cc0a 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java @@ -42,9 +42,11 @@ import io.netty.handler.ssl.SslHandler; import io.netty.util.concurrent.Future; import io.netty.util.concurrent.GenericFutureListener; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.TransportContext; import org.apache.spark.network.server.TransportChannelHandler; import org.apache.spark.network.util.*; @@ -77,7 +79,8 @@ private 
static class ClientPool { } } - private static final Logger logger = LoggerFactory.getLogger(TransportClientFactory.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(TransportClientFactory.class); private final TransportContext context; private final TransportConf conf; @@ -169,8 +172,10 @@ public TransportClient createClient(String remoteHost, int remotePort, boolean f // this code was able to update things. TransportChannelHandler handler = cachedClient.getChannel().pipeline() .get(TransportChannelHandler.class); - synchronized (handler) { - handler.getResponseHandler().updateTimeOfLastRequest(); + if (handler != null) { + synchronized (handler) { + handler.getResponseHandler().updateTimeOfLastRequest(); + } } if (cachedClient.isActive()) { @@ -188,7 +193,9 @@ public TransportClient createClient(String remoteHost, int remotePort, boolean f final String resolvMsg = resolvedAddress.isUnresolved() ? "failed" : "succeed"; if (hostResolveTimeMs > 2000) { logger.warn("DNS resolution {} for {} took {} ms", - resolvMsg, resolvedAddress, hostResolveTimeMs); + MDC.of(LogKeys.STATUS$.MODULE$, resolvMsg), + MDC.of(LogKeys.HOST_PORT$.MODULE$, resolvedAddress), + MDC.of(LogKeys.TIME$.MODULE$, hostResolveTimeMs)); } else { logger.trace("DNS resolution {} for {} took {} ms", resolvMsg, resolvedAddress, hostResolveTimeMs); @@ -202,7 +209,8 @@ public TransportClient createClient(String remoteHost, int remotePort, boolean f logger.trace("Returning cached connection to {}: {}", resolvedAddress, cachedClient); return cachedClient; } else { - logger.info("Found inactive connection to {}, creating a new one.", resolvedAddress); + logger.info("Found inactive connection to {}, creating a new one.", + MDC.of(LogKeys.HOST_PORT$.MODULE$, resolvedAddress)); } } // If this connection should fast fail when last connection failed in last fast fail time @@ -305,8 +313,8 @@ public void operationComplete(final Future handshakeFuture) { if (handshakeFuture.isSuccess()) { logger.debug("{} successfully completed TLS handshake to ", address); } else { - logger.info( - "failed to complete TLS handshake to " + address, handshakeFuture.cause()); + logger.info("failed to complete TLS handshake to {}", handshakeFuture.cause(), + MDC.of(LogKeys.HOST_PORT$.MODULE$, address)); cf.channel().close(); } } @@ -331,14 +339,17 @@ public void operationComplete(final Future handshakeFuture) { } } catch (Exception e) { // catch non-RuntimeExceptions too as bootstrap may be written in Scala long bootstrapTimeMs = (System.nanoTime() - preBootstrap) / 1000000; - logger.error("Exception while bootstrapping client after " + bootstrapTimeMs + " ms", e); + logger.error("Exception while bootstrapping client after {} ms", e, + MDC.of(LogKeys.BOOTSTRAP_TIME$.MODULE$, bootstrapTimeMs)); client.close(); throw Throwables.propagate(e); } long postBootstrap = System.nanoTime(); logger.info("Successfully created connection to {} after {} ms ({} ms spent in bootstraps)", - address, (postBootstrap - preConnect) / 1000000, (postBootstrap - preBootstrap) / 1000000); + MDC.of(LogKeys.HOST_PORT$.MODULE$, address), + MDC.of(LogKeys.ELAPSED_TIME$.MODULE$, (postBootstrap - preConnect) / 1000000), + MDC.of(LogKeys.BOOTSTRAP_TIME$.MODULE$, (postBootstrap - preBootstrap) / 1000000)); return client; } diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java index 
9041678435106..be4cf4a58abeb 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java @@ -28,9 +28,11 @@ import io.netty.channel.Channel; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.protocol.ChunkFetchFailure; import org.apache.spark.network.protocol.ChunkFetchSuccess; import org.apache.spark.network.protocol.MergedBlockMetaSuccess; @@ -51,7 +53,8 @@ * Concurrency: thread safe and can be called from multiple threads. */ public class TransportResponseHandler extends MessageHandler { - private static final Logger logger = LoggerFactory.getLogger(TransportResponseHandler.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(TransportResponseHandler.class); private final Channel channel; @@ -143,7 +146,8 @@ public void channelInactive() { if (hasOutstandingRequests()) { String remoteAddress = getRemoteAddress(channel); logger.error("Still have {} requests outstanding when connection from {} is closed", - numOutstandingRequests(), remoteAddress); + MDC.of(LogKeys.COUNT$.MODULE$, numOutstandingRequests()), + MDC.of(LogKeys.HOST_PORT$.MODULE$, remoteAddress)); failOutstandingRequests(new IOException("Connection from " + remoteAddress + " closed")); } } @@ -153,7 +157,8 @@ public void exceptionCaught(Throwable cause) { if (hasOutstandingRequests()) { String remoteAddress = getRemoteAddress(channel); logger.error("Still have {} requests outstanding when connection from {} is closed", - numOutstandingRequests(), remoteAddress); + MDC.of(LogKeys.COUNT$.MODULE$, numOutstandingRequests()), + MDC.of(LogKeys.HOST_PORT$.MODULE$, remoteAddress)); failOutstandingRequests(cause); } } @@ -164,7 +169,8 @@ public void handle(ResponseMessage message) throws Exception { ChunkReceivedCallback listener = outstandingFetches.get(resp.streamChunkId); if (listener == null) { logger.warn("Ignoring response for block {} from {} since it is not outstanding", - resp.streamChunkId, getRemoteAddress(channel)); + MDC.of(LogKeys.STREAM_CHUNK_ID$.MODULE$, resp.streamChunkId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel))); resp.body().release(); } else { outstandingFetches.remove(resp.streamChunkId); @@ -175,7 +181,9 @@ public void handle(ResponseMessage message) throws Exception { ChunkReceivedCallback listener = outstandingFetches.get(resp.streamChunkId); if (listener == null) { logger.warn("Ignoring response for block {} from {} ({}) since it is not outstanding", - resp.streamChunkId, getRemoteAddress(channel), resp.errorString); + MDC.of(LogKeys.STREAM_CHUNK_ID$.MODULE$, resp.streamChunkId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel)), + MDC.of(LogKeys.ERROR$.MODULE$, resp.errorString)); } else { outstandingFetches.remove(resp.streamChunkId); listener.onFailure(resp.streamChunkId.chunkIndex(), new ChunkFetchFailureException( @@ -185,7 +193,9 @@ public void handle(ResponseMessage message) throws Exception { RpcResponseCallback listener = (RpcResponseCallback) outstandingRpcs.get(resp.requestId); if (listener == null) { logger.warn("Ignoring response for RPC {} from {} ({} bytes) 
since it is not outstanding", - resp.requestId, getRemoteAddress(channel), resp.body().size()); + MDC.of(LogKeys.REQUEST_ID$.MODULE$, resp.requestId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel)), + MDC.of(LogKeys.RESPONSE_BODY_SIZE$.MODULE$, resp.body().size())); resp.body().release(); } else { outstandingRpcs.remove(resp.requestId); @@ -199,7 +209,9 @@ public void handle(ResponseMessage message) throws Exception { BaseResponseCallback listener = outstandingRpcs.get(resp.requestId); if (listener == null) { logger.warn("Ignoring response for RPC {} from {} ({}) since it is not outstanding", - resp.requestId, getRemoteAddress(channel), resp.errorString); + MDC.of(LogKeys.REQUEST_ID$.MODULE$, resp.requestId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel)), + MDC.of(LogKeys.ERROR$.MODULE$, resp.errorString)); } else { outstandingRpcs.remove(resp.requestId); listener.onFailure(new RuntimeException(resp.errorString)); @@ -209,9 +221,11 @@ public void handle(ResponseMessage message) throws Exception { MergedBlockMetaResponseCallback listener = (MergedBlockMetaResponseCallback) outstandingRpcs.get(resp.requestId); if (listener == null) { - logger.warn( - "Ignoring response for MergedBlockMetaRequest {} from {} ({} bytes) since it is not" - + " outstanding", resp.requestId, getRemoteAddress(channel), resp.body().size()); + logger.warn("Ignoring response for MergedBlockMetaRequest {} from {} ({} bytes) since " + + "it is not outstanding", + MDC.of(LogKeys.REQUEST_ID$.MODULE$, resp.requestId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel)), + MDC.of(LogKeys.RESPONSE_BODY_SIZE$.MODULE$, resp.body().size())); } else { outstandingRpcs.remove(resp.requestId); listener.onSuccess(resp.getNumChunks(), resp.body()); @@ -255,7 +269,8 @@ public void handle(ResponseMessage message) throws Exception { logger.warn("Error in stream failure handler.", ioe); } } else { - logger.warn("Stream failure with unknown callback: {}", resp.error); + logger.warn("Stream failure with unknown callback: {}", + MDC.of(LogKeys.ERROR$.MODULE$, resp.error)); } } else { throw new IllegalStateException("Unknown response type: " + message.type()); diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java index b55541017c9d2..08e2c084fe67b 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java @@ -26,9 +26,9 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.channel.Channel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportClientBootstrap; import org.apache.spark.network.sasl.SaslClientBootstrap; @@ -47,7 +47,7 @@ */ public class AuthClientBootstrap implements TransportClientBootstrap { - private static final Logger LOG = LoggerFactory.getLogger(AuthClientBootstrap.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(AuthClientBootstrap.class); private final TransportConf conf; private final String appId; diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java 
b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java index cb68cfb5a0e88..8449a774a404a 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java @@ -45,6 +45,8 @@ class AuthEngine implements Closeable { public static final byte[] INPUT_IV_INFO = "inputIv".getBytes(UTF_8); public static final byte[] OUTPUT_IV_INFO = "outputIv".getBytes(UTF_8); private static final String MAC_ALGORITHM = "HMACSHA256"; + private static final String LEGACY_CIPHER_ALGORITHM = "AES/CTR/NoPadding"; + private static final String CIPHER_ALGORITHM = "AES/GCM/NoPadding"; private static final int AES_GCM_KEY_SIZE_BYTES = 16; private static final byte[] EMPTY_TRANSCRIPT = new byte[0]; private static final int UNSAFE_SKIP_HKDF_VERSION = 1; @@ -227,12 +229,19 @@ private TransportCipher generateTransportCipher( OUTPUT_IV_INFO, // This is the HKDF info field used to differentiate IV values AES_GCM_KEY_SIZE_BYTES); SecretKeySpec sessionKey = new SecretKeySpec(derivedKey, "AES"); - return new TransportCipher( - cryptoConf, - conf.cipherTransformation(), - sessionKey, - isClient ? clientIv : serverIv, // If it's the client, use the client IV first - isClient ? serverIv : clientIv); + if (LEGACY_CIPHER_ALGORITHM.equalsIgnoreCase(conf.cipherTransformation())) { + return new CtrTransportCipher( + cryptoConf, + sessionKey, + isClient ? clientIv : serverIv, // If it's the client, use the client IV first + isClient ? serverIv : clientIv); + } else if (CIPHER_ALGORITHM.equalsIgnoreCase(conf.cipherTransformation())) { + return new GcmTransportCipher(sessionKey); + } else { + throw new IllegalArgumentException( + String.format("Unsupported cipher mode: %s. %s and %s are supported.", + conf.cipherTransformation(), CIPHER_ALGORITHM, LEGACY_CIPHER_ALGORITHM)); + } } private byte[] getTranscript(AuthMessage... encryptedPublicKeys) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java index 9a7ce8b7b31d6..65367743e24f9 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java @@ -25,9 +25,11 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.channel.Channel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.sasl.SecretKeyHolder; @@ -46,7 +48,7 @@ * authenticated. A connection may be authenticated at most once. */ class AuthRpcHandler extends AbstractAuthRpcHandler { - private static final Logger LOG = LoggerFactory.getLogger(AuthRpcHandler.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(AuthRpcHandler.class); /** Transport configuration. 
*/ private final TransportConf conf; @@ -91,7 +93,7 @@ protected boolean doAuthChallenge( } catch (RuntimeException e) { if (conf.saslFallback()) { LOG.warn("Failed to parse new auth challenge, reverting to SASL for client {}.", - channel.remoteAddress()); + MDC.of(LogKeys.HOST_PORT$.MODULE$, channel.remoteAddress())); saslHandler = new SaslRpcHandler(conf, channel, null, secretKeyHolder); message.position(position); message.limit(limit); diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/CtrTransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/CtrTransportCipher.java new file mode 100644 index 0000000000000..85b893751b39c --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/CtrTransportCipher.java @@ -0,0 +1,381 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.crypto; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.ReadableByteChannel; +import java.nio.channels.WritableByteChannel; +import java.security.GeneralSecurityException; +import java.util.Properties; +import javax.crypto.spec.SecretKeySpec; +import javax.crypto.spec.IvParameterSpec; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import io.netty.channel.*; +import org.apache.commons.crypto.stream.CryptoInputStream; +import org.apache.commons.crypto.stream.CryptoOutputStream; + +import org.apache.spark.network.util.AbstractFileRegion; +import org.apache.spark.network.util.ByteArrayReadableChannel; +import org.apache.spark.network.util.ByteArrayWritableChannel; + +/** + * Cipher for encryption and decryption. + */ +public class CtrTransportCipher implements TransportCipher { + @VisibleForTesting + static final String ENCRYPTION_HANDLER_NAME = "CtrTransportEncryption"; + private static final String DECRYPTION_HANDLER_NAME = "CtrTransportDecryption"; + @VisibleForTesting + static final int STREAM_BUFFER_SIZE = 1024 * 32; + + private final Properties conf; + private static final String CIPHER_ALGORITHM = "AES/CTR/NoPadding"; + private final SecretKeySpec key; + private final byte[] inIv; + private final byte[] outIv; + + public CtrTransportCipher( + Properties conf, + SecretKeySpec key, + byte[] inIv, + byte[] outIv) { + this.conf = conf; + this.key = key; + this.inIv = inIv; + this.outIv = outIv; + } + + /* + * This method is for testing purposes only. 
+ */ + @VisibleForTesting + public String getKeyId() throws GeneralSecurityException { + return TransportCipherUtil.getKeyId(key); + } + + @VisibleForTesting + SecretKeySpec getKey() { + return key; + } + + /** The IV for the input channel (i.e. output channel of the remote side). */ + public byte[] getInputIv() { + return inIv; + } + + /** The IV for the output channel (i.e. input channel of the remote side). */ + public byte[] getOutputIv() { + return outIv; + } + + @VisibleForTesting + CryptoOutputStream createOutputStream(WritableByteChannel ch) throws IOException { + return new CryptoOutputStream(CIPHER_ALGORITHM, conf, ch, key, new IvParameterSpec(outIv)); + } + + @VisibleForTesting + CryptoInputStream createInputStream(ReadableByteChannel ch) throws IOException { + return new CryptoInputStream(CIPHER_ALGORITHM, conf, ch, key, new IvParameterSpec(inIv)); + } + + /** + * Add handlers to channel. + * + * @param ch the channel for adding handlers + * @throws IOException + */ + public void addToChannel(Channel ch) throws IOException { + ch.pipeline() + .addFirst(ENCRYPTION_HANDLER_NAME, new EncryptionHandler(this)) + .addFirst(DECRYPTION_HANDLER_NAME, new DecryptionHandler(this)); + } + + @VisibleForTesting + static class EncryptionHandler extends ChannelOutboundHandlerAdapter { + private final ByteArrayWritableChannel byteEncChannel; + private final CryptoOutputStream cos; + private final ByteArrayWritableChannel byteRawChannel; + private boolean isCipherValid; + + EncryptionHandler(CtrTransportCipher cipher) throws IOException { + byteEncChannel = new ByteArrayWritableChannel(STREAM_BUFFER_SIZE); + cos = cipher.createOutputStream(byteEncChannel); + byteRawChannel = new ByteArrayWritableChannel(STREAM_BUFFER_SIZE); + isCipherValid = true; + } + + @Override + public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise) + throws Exception { + ctx.write(createEncryptedMessage(msg), promise); + } + + @VisibleForTesting + EncryptedMessage createEncryptedMessage(Object msg) { + return new EncryptedMessage(this, cos, msg, byteEncChannel, byteRawChannel); + } + + @Override + public void close(ChannelHandlerContext ctx, ChannelPromise promise) throws Exception { + try { + if (isCipherValid) { + cos.close(); + } + } finally { + super.close(ctx, promise); + } + } + + /** + * SPARK-25535. Workaround for CRYPTO-141. Avoid further interaction with the underlying cipher + * after an error occurs. + */ + void reportError() { + this.isCipherValid = false; + } + + boolean isCipherValid() { + return isCipherValid; + } + } + + private static class DecryptionHandler extends ChannelInboundHandlerAdapter { + private final CryptoInputStream cis; + private final ByteArrayReadableChannel byteChannel; + private boolean isCipherValid; + + DecryptionHandler(CtrTransportCipher cipher) throws IOException { + byteChannel = new ByteArrayReadableChannel(); + cis = cipher.createInputStream(byteChannel); + isCipherValid = true; + } + + @Override + public void channelRead(ChannelHandlerContext ctx, Object data) throws Exception { + ByteBuf buffer = (ByteBuf) data; + + try { + if (!isCipherValid) { + throw new IOException("Cipher is in invalid state."); + } + byte[] decryptedData = new byte[buffer.readableBytes()]; + byteChannel.feedData(buffer); + + int offset = 0; + while (offset < decryptedData.length) { + // SPARK-25535: workaround for CRYPTO-141. 
+ try { + offset += cis.read(decryptedData, offset, decryptedData.length - offset); + } catch (InternalError ie) { + isCipherValid = false; + throw ie; + } + } + + ctx.fireChannelRead(Unpooled.wrappedBuffer(decryptedData, 0, decryptedData.length)); + } finally { + buffer.release(); + } + } + + @Override + public void handlerRemoved(ChannelHandlerContext ctx) throws Exception { + // We do the closing of the stream / channel in handlerRemoved(...) as + // this method will be called in all cases: + // + // - when the Channel becomes inactive + // - when the handler is removed from the ChannelPipeline + try { + if (isCipherValid) { + cis.close(); + } + } finally { + super.handlerRemoved(ctx); + } + } + } + + @VisibleForTesting + static class EncryptedMessage extends AbstractFileRegion { + private final boolean isByteBuf; + private final ByteBuf buf; + private final FileRegion region; + private final CryptoOutputStream cos; + private final EncryptionHandler handler; + private final long count; + private long transferred; + + // Due to streaming issue CRYPTO-125: https://issues.apache.org/jira/browse/CRYPTO-125, it has + // to utilize two helper ByteArrayWritableChannel for streaming. One is used to receive raw data + // from upper handler, another is used to store encrypted data. + private final ByteArrayWritableChannel byteEncChannel; + private final ByteArrayWritableChannel byteRawChannel; + + private ByteBuffer currentEncrypted; + + EncryptedMessage( + EncryptionHandler handler, + CryptoOutputStream cos, + Object msg, + ByteArrayWritableChannel byteEncChannel, + ByteArrayWritableChannel byteRawChannel) { + Preconditions.checkArgument(msg instanceof ByteBuf || msg instanceof FileRegion, + "Unrecognized message type: %s", msg.getClass().getName()); + this.handler = handler; + this.isByteBuf = msg instanceof ByteBuf; + this.buf = isByteBuf ? (ByteBuf) msg : null; + this.region = isByteBuf ? null : (FileRegion) msg; + this.transferred = 0; + this.cos = cos; + this.byteEncChannel = byteEncChannel; + this.byteRawChannel = byteRawChannel; + this.count = isByteBuf ? buf.readableBytes() : region.count(); + } + + @Override + public long count() { + return count; + } + + @Override + public long position() { + return 0; + } + + @Override + public long transferred() { + return transferred; + } + + @Override + public EncryptedMessage touch(Object o) { + super.touch(o); + if (region != null) { + region.touch(o); + } + if (buf != null) { + buf.touch(o); + } + return this; + } + + @Override + public EncryptedMessage retain(int increment) { + super.retain(increment); + if (region != null) { + region.retain(increment); + } + if (buf != null) { + buf.retain(increment); + } + return this; + } + + @Override + public boolean release(int decrement) { + if (region != null) { + region.release(decrement); + } + if (buf != null) { + buf.release(decrement); + } + return super.release(decrement); + } + + @Override + public long transferTo(WritableByteChannel target, long position) throws IOException { + Preconditions.checkArgument(position == transferred(), "Invalid position."); + + if (transferred == count) { + return 0; + } + + long totalBytesWritten = 0L; + do { + if (currentEncrypted == null) { + encryptMore(); + } + + long remaining = currentEncrypted.remaining(); + if (remaining == 0) { + // Just for safety to avoid endless loop. It usually won't happen, but since the + // underlying `region.transferTo` is allowed to transfer 0 bytes, we should handle it for + // safety. 
+ currentEncrypted = null; + byteEncChannel.reset(); + return totalBytesWritten; + } + + long bytesWritten = target.write(currentEncrypted); + totalBytesWritten += bytesWritten; + transferred += bytesWritten; + if (bytesWritten < remaining) { + // break as the underlying buffer in "target" is full + break; + } + currentEncrypted = null; + byteEncChannel.reset(); + } while (transferred < count); + + return totalBytesWritten; + } + + private void encryptMore() throws IOException { + if (!handler.isCipherValid()) { + throw new IOException("Cipher is in invalid state."); + } + byteRawChannel.reset(); + + if (isByteBuf) { + int copied = byteRawChannel.write(buf.nioBuffer()); + buf.skipBytes(copied); + } else { + region.transferTo(byteRawChannel, region.transferred()); + } + + try { + cos.write(byteRawChannel.getData(), 0, byteRawChannel.length()); + cos.flush(); + } catch (InternalError ie) { + handler.reportError(); + throw ie; + } + + currentEncrypted = ByteBuffer.wrap(byteEncChannel.getData(), + 0, byteEncChannel.length()); + } + + @Override + protected void deallocate() { + byteRawChannel.reset(); + byteEncChannel.reset(); + if (region != null) { + region.release(); + } + if (buf != null) { + buf.release(); + } + } + } + +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/GcmTransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/GcmTransportCipher.java new file mode 100644 index 0000000000000..c3540838bef09 --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/GcmTransportCipher.java @@ -0,0 +1,410 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.network.crypto; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.primitives.Longs; +import com.google.crypto.tink.subtle.*; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import io.netty.channel.*; +import io.netty.util.ReferenceCounted; +import org.apache.spark.network.util.AbstractFileRegion; +import org.apache.spark.network.util.ByteBufferWriteableChannel; + +import javax.crypto.spec.SecretKeySpec; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.security.GeneralSecurityException; +import java.security.InvalidAlgorithmParameterException; + +public class GcmTransportCipher implements TransportCipher { + private static final String HKDF_ALG = "HmacSha256"; + private static final int LENGTH_HEADER_BYTES = 8; + @VisibleForTesting + static final int CIPHERTEXT_BUFFER_SIZE = 32 * 1024; // 32KB + private final SecretKeySpec aesKey; + + public GcmTransportCipher(SecretKeySpec aesKey) { + this.aesKey = aesKey; + } + + AesGcmHkdfStreaming getAesGcmHkdfStreaming() throws InvalidAlgorithmParameterException { + return new AesGcmHkdfStreaming( + aesKey.getEncoded(), + HKDF_ALG, + aesKey.getEncoded().length, + CIPHERTEXT_BUFFER_SIZE, + 0); + } + + /* + * This method is for testing purposes only. + */ + @VisibleForTesting + public String getKeyId() throws GeneralSecurityException { + return TransportCipherUtil.getKeyId(aesKey); + } + + @VisibleForTesting + EncryptionHandler getEncryptionHandler() throws GeneralSecurityException { + return new EncryptionHandler(); + } + + @VisibleForTesting + DecryptionHandler getDecryptionHandler() throws GeneralSecurityException { + return new DecryptionHandler(); + } + + public void addToChannel(Channel ch) throws GeneralSecurityException { + ch.pipeline() + .addFirst("GcmTransportEncryption", getEncryptionHandler()) + .addFirst("GcmTransportDecryption", getDecryptionHandler()); + } + + @VisibleForTesting + class EncryptionHandler extends ChannelOutboundHandlerAdapter { + private final ByteBuffer plaintextBuffer; + private final ByteBuffer ciphertextBuffer; + private final AesGcmHkdfStreaming aesGcmHkdfStreaming; + + EncryptionHandler() throws InvalidAlgorithmParameterException { + aesGcmHkdfStreaming = getAesGcmHkdfStreaming(); + plaintextBuffer = ByteBuffer.allocate(aesGcmHkdfStreaming.getPlaintextSegmentSize()); + ciphertextBuffer = ByteBuffer.allocate(aesGcmHkdfStreaming.getCiphertextSegmentSize()); + } + + @Override + public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise) + throws Exception { + GcmEncryptedMessage encryptedMessage = new GcmEncryptedMessage( + aesGcmHkdfStreaming, + msg, + plaintextBuffer, + ciphertextBuffer); + ctx.write(encryptedMessage, promise); + } + } + + static class GcmEncryptedMessage extends AbstractFileRegion { + private final Object plaintextMessage; + private final ByteBuffer plaintextBuffer; + private final ByteBuffer ciphertextBuffer; + private final ByteBuffer headerByteBuffer; + private final long bytesToRead; + private long bytesRead = 0; + private final StreamSegmentEncrypter encrypter; + private long transferred = 0; + private final long encryptedCount; + + GcmEncryptedMessage(AesGcmHkdfStreaming aesGcmHkdfStreaming, + Object plaintextMessage, + ByteBuffer plaintextBuffer, + ByteBuffer ciphertextBuffer) throws GeneralSecurityException { + Preconditions.checkArgument( + plaintextMessage instanceof 
ByteBuf || plaintextMessage instanceof FileRegion, + "Unrecognized message type: %s", plaintextMessage.getClass().getName()); + this.plaintextMessage = plaintextMessage; + this.plaintextBuffer = plaintextBuffer; + this.ciphertextBuffer = ciphertextBuffer; + // If the ciphertext buffer cannot be fully written to the target, transferTo may + // return with it containing some unwritten data. On the initial call we explicitly + // set its limit to 0 to indicate the first call to transferTo. + this.ciphertextBuffer.limit(0); + + this.bytesToRead = getReadableBytes(); + this.encryptedCount = + LENGTH_HEADER_BYTES + aesGcmHkdfStreaming.expectedCiphertextSize(bytesToRead); + byte[] lengthAad = Longs.toByteArray(encryptedCount); + this.encrypter = aesGcmHkdfStreaming.newStreamSegmentEncrypter(lengthAad); + this.headerByteBuffer = createHeaderByteBuffer(); + } + + // The format of the output is: + // [8 byte length][Internal IV and header][Ciphertext][Auth Tag] + private ByteBuffer createHeaderByteBuffer() { + ByteBuffer encrypterHeader = encrypter.getHeader(); + return ByteBuffer + .allocate(encrypterHeader.remaining() + LENGTH_HEADER_BYTES) + .putLong(encryptedCount) + .put(encrypterHeader) + .flip(); + } + + @Override + public long position() { + return 0; + } + + @Override + public long transferred() { + return transferred; + } + + @Override + public long count() { + return encryptedCount; + } + + @Override + public GcmEncryptedMessage touch(Object o) { + super.touch(o); + if (plaintextMessage instanceof ByteBuf byteBuf) { + byteBuf.touch(o); + } else if (plaintextMessage instanceof FileRegion fileRegion) { + fileRegion.touch(o); + } + return this; + } + + @Override + public GcmEncryptedMessage retain(int increment) { + super.retain(increment); + if (plaintextMessage instanceof ByteBuf byteBuf) { + byteBuf.retain(increment); + } else if (plaintextMessage instanceof FileRegion fileRegion) { + fileRegion.retain(increment); + } + return this; + } + + @Override + public boolean release(int decrement) { + if (plaintextMessage instanceof ByteBuf byteBuf) { + byteBuf.release(decrement); + } else if (plaintextMessage instanceof FileRegion fileRegion) { + fileRegion.release(decrement); + } + return super.release(decrement); + } + + @Override + public long transferTo(WritableByteChannel target, long position) throws IOException { + int transferredThisCall = 0; + // If the header is not empty, try to write it out to the target. + if (headerByteBuffer.hasRemaining()) { + int written = target.write(headerByteBuffer); + transferredThisCall += written; + this.transferred += written; + if (headerByteBuffer.hasRemaining()) { + return written; + } + } + // If the ciphertext buffer is not empty, try to write it to the target. 
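+ // This is ciphertext left over from a previous call that the target could not fully accept.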
+ if (ciphertextBuffer.hasRemaining()) { + int written = target.write(ciphertextBuffer); + transferredThisCall += written; + this.transferred += written; + if (ciphertextBuffer.hasRemaining()) { + return transferredThisCall; + } + } + while (bytesRead < bytesToRead) { + long readableBytes = getReadableBytes(); + int readLimit = + (int) Math.min(readableBytes, plaintextBuffer.remaining()); + if (plaintextMessage instanceof ByteBuf byteBuf) { + Preconditions.checkState(0 == plaintextBuffer.position()); + plaintextBuffer.limit(readLimit); + byteBuf.readBytes(plaintextBuffer); + Preconditions.checkState(readLimit == plaintextBuffer.position()); + } else if (plaintextMessage instanceof FileRegion fileRegion) { + ByteBufferWriteableChannel plaintextChannel = + new ByteBufferWriteableChannel(plaintextBuffer); + long plaintextRead = + fileRegion.transferTo(plaintextChannel, fileRegion.transferred()); + if (plaintextRead < readLimit) { + // If we do not read a full plaintext buffer or all the available + // readable bytes, return what was transferred this call. + return transferredThisCall; + } + } + boolean lastSegment = getReadableBytes() == 0; + plaintextBuffer.flip(); + bytesRead += plaintextBuffer.remaining(); + ciphertextBuffer.clear(); + try { + encrypter.encryptSegment(plaintextBuffer, lastSegment, ciphertextBuffer); + } catch (GeneralSecurityException e) { + throw new IllegalStateException("GeneralSecurityException from encrypter", e); + } + plaintextBuffer.clear(); + ciphertextBuffer.flip(); + int written = target.write(ciphertextBuffer); + transferredThisCall += written; + this.transferred += written; + if (ciphertextBuffer.hasRemaining()) { + // In this case, upon calling transferTo again, it will try to write the + // remaining ciphertext buffer in the conditional before this loop. 
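+ // The caller is expected to invoke transferTo again once the target can accept more bytes.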
+ return transferredThisCall; + } + } + return transferredThisCall; + } + + private long getReadableBytes() { + if (plaintextMessage instanceof ByteBuf byteBuf) { + return byteBuf.readableBytes(); + } else if (plaintextMessage instanceof FileRegion fileRegion) { + return fileRegion.count() - fileRegion.transferred(); + } else { + throw new IllegalArgumentException("Unsupported message type: " + + plaintextMessage.getClass().getName()); + } + } + + @Override + protected void deallocate() { + if (plaintextMessage instanceof ReferenceCounted referenceCounted) { + referenceCounted.release(); + } + plaintextBuffer.clear(); + ciphertextBuffer.clear(); + } + } + + @VisibleForTesting + class DecryptionHandler extends ChannelInboundHandlerAdapter { + private final ByteBuffer expectedLengthBuffer; + private final ByteBuffer headerBuffer; + private final ByteBuffer ciphertextBuffer; + private final AesGcmHkdfStreaming aesGcmHkdfStreaming; + private final StreamSegmentDecrypter decrypter; + private final int plaintextSegmentSize; + private boolean decrypterInit = false; + private boolean completed = false; + private int segmentNumber = 0; + private long expectedLength = -1; + private long ciphertextRead = 0; + + DecryptionHandler() throws GeneralSecurityException { + aesGcmHkdfStreaming = getAesGcmHkdfStreaming(); + expectedLengthBuffer = ByteBuffer.allocate(LENGTH_HEADER_BYTES); + headerBuffer = ByteBuffer.allocate(aesGcmHkdfStreaming.getHeaderLength()); + ciphertextBuffer = + ByteBuffer.allocate(aesGcmHkdfStreaming.getCiphertextSegmentSize()); + decrypter = aesGcmHkdfStreaming.newStreamSegmentDecrypter(); + plaintextSegmentSize = aesGcmHkdfStreaming.getPlaintextSegmentSize(); + } + + private boolean initalizeExpectedLength(ByteBuf ciphertextNettyBuf) { + if (expectedLength < 0) { + ciphertextNettyBuf.readBytes(expectedLengthBuffer); + if (expectedLengthBuffer.hasRemaining()) { + // We did not read enough bytes to initialize the expected length. + return false; + } + expectedLengthBuffer.flip(); + expectedLength = expectedLengthBuffer.getLong(); + if (expectedLength < 0) { + throw new IllegalStateException("Invalid expected ciphertext length."); + } + ciphertextRead += LENGTH_HEADER_BYTES; + } + return true; + } + + private boolean initalizeDecrypter(ByteBuf ciphertextNettyBuf) + throws GeneralSecurityException { + // Check if the ciphertext header has been read. This contains + // the IV and other internal metadata. + if (!decrypterInit) { + ciphertextNettyBuf.readBytes(headerBuffer); + if (headerBuffer.hasRemaining()) { + // We did not read enough bytes to initialize the header. + return false; + } + headerBuffer.flip(); + byte[] lengthAad = Longs.toByteArray(expectedLength); + decrypter.init(headerBuffer, lengthAad); + decrypterInit = true; + ciphertextRead += aesGcmHkdfStreaming.getHeaderLength(); + if (expectedLength == ciphertextRead) { + // If the expected length is just the header, the ciphertext is 0 length. 
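+ // Mark the stream complete; there are no ciphertext segments left to decrypt.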
+ completed = true; + } + } + return true; + } + + @Override + public void channelRead(ChannelHandlerContext ctx, Object ciphertextMessage) + throws GeneralSecurityException { + Preconditions.checkArgument(ciphertextMessage instanceof ByteBuf, + "Unrecognized message type: %s", + ciphertextMessage.getClass().getName()); + ByteBuf ciphertextNettyBuf = (ByteBuf) ciphertextMessage; + // The format of the output is: + // [8 byte length][Internal IV and header][Ciphertext][Auth Tag] + try { + if (!initalizeExpectedLength(ciphertextNettyBuf)) { + // We have not read enough bytes to initialize the expected length. + return; + } + if (!initalizeDecrypter(ciphertextNettyBuf)) { + // We have not read enough bytes to initialize a header, needed to + // initialize a decrypter. + return; + } + int nettyBufReadableBytes = ciphertextNettyBuf.readableBytes(); + while (nettyBufReadableBytes > 0 && !completed) { + // Read the ciphertext into the local buffer + int readableBytes = Integer.min( + nettyBufReadableBytes, + ciphertextBuffer.remaining()); + int expectedRemaining = (int) (expectedLength - ciphertextRead); + int bytesToRead = Integer.min(readableBytes, expectedRemaining); + // The smallest ciphertext size is 16 bytes for the auth tag + ciphertextBuffer.limit(ciphertextBuffer.position() + bytesToRead); + ciphertextNettyBuf.readBytes(ciphertextBuffer); + ciphertextRead += bytesToRead; + // Check if this is the last segment + if (ciphertextRead == expectedLength) { + completed = true; + } else if (ciphertextRead > expectedLength) { + throw new IllegalStateException("Read more ciphertext than expected."); + } + // If the ciphertext buffer is full, or this is the last segment, + // then decrypt it and fire a read. + if (ciphertextBuffer.limit() == ciphertextBuffer.capacity() || completed) { + ByteBuffer plaintextBuffer = ByteBuffer.allocate(plaintextSegmentSize); + ciphertextBuffer.flip(); + decrypter.decryptSegment( + ciphertextBuffer, + segmentNumber, + completed, + plaintextBuffer); + segmentNumber++; + // Clear the ciphertext buffer because it's been read + ciphertextBuffer.clear(); + plaintextBuffer.flip(); + ctx.fireChannelRead(Unpooled.wrappedBuffer(plaintextBuffer)); + } else { + // Set the ciphertext buffer up to read the next chunk + ciphertextBuffer.limit(ciphertextBuffer.capacity()); + } + nettyBufReadableBytes = ciphertextNettyBuf.readableBytes(); + } + } finally { + ciphertextNettyBuf.release(); + } + } + } +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java index b507f911fe11a..355c552720185 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java @@ -17,362 +17,32 @@ package org.apache.spark.network.crypto; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.ReadableByteChannel; -import java.nio.channels.WritableByteChannel; -import java.util.Properties; -import javax.crypto.spec.SecretKeySpec; -import javax.crypto.spec.IvParameterSpec; - import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import io.netty.buffer.ByteBuf; -import io.netty.buffer.Unpooled; -import io.netty.channel.*; -import org.apache.commons.crypto.stream.CryptoInputStream; -import org.apache.commons.crypto.stream.CryptoOutputStream; - -import 
org.apache.spark.network.util.AbstractFileRegion; -import org.apache.spark.network.util.ByteArrayReadableChannel; -import org.apache.spark.network.util.ByteArrayWritableChannel; - -/** - * Cipher for encryption and decryption. - */ -public class TransportCipher { - @VisibleForTesting - static final String ENCRYPTION_HANDLER_NAME = "TransportEncryption"; - private static final String DECRYPTION_HANDLER_NAME = "TransportDecryption"; - @VisibleForTesting - static final int STREAM_BUFFER_SIZE = 1024 * 32; - - private final Properties conf; - private final String cipher; - private final SecretKeySpec key; - private final byte[] inIv; - private final byte[] outIv; - - public TransportCipher( - Properties conf, - String cipher, - SecretKeySpec key, - byte[] inIv, - byte[] outIv) { - this.conf = conf; - this.cipher = cipher; - this.key = key; - this.inIv = inIv; - this.outIv = outIv; - } - - public String getCipherTransformation() { - return cipher; - } - - @VisibleForTesting - SecretKeySpec getKey() { - return key; - } - - /** The IV for the input channel (i.e. output channel of the remote side). */ - public byte[] getInputIv() { - return inIv; - } - - /** The IV for the output channel (i.e. input channel of the remote side). */ - public byte[] getOutputIv() { - return outIv; - } - - @VisibleForTesting - CryptoOutputStream createOutputStream(WritableByteChannel ch) throws IOException { - return new CryptoOutputStream(cipher, conf, ch, key, new IvParameterSpec(outIv)); - } - - @VisibleForTesting - CryptoInputStream createInputStream(ReadableByteChannel ch) throws IOException { - return new CryptoInputStream(cipher, conf, ch, key, new IvParameterSpec(inIv)); - } - - /** - * Add handlers to channel. - * - * @param ch the channel for adding handlers - * @throws IOException - */ - public void addToChannel(Channel ch) throws IOException { - ch.pipeline() - .addFirst(ENCRYPTION_HANDLER_NAME, new EncryptionHandler(this)) - .addFirst(DECRYPTION_HANDLER_NAME, new DecryptionHandler(this)); - } - - @VisibleForTesting - static class EncryptionHandler extends ChannelOutboundHandlerAdapter { - private final ByteArrayWritableChannel byteEncChannel; - private final CryptoOutputStream cos; - private final ByteArrayWritableChannel byteRawChannel; - private boolean isCipherValid; - - EncryptionHandler(TransportCipher cipher) throws IOException { - byteEncChannel = new ByteArrayWritableChannel(STREAM_BUFFER_SIZE); - cos = cipher.createOutputStream(byteEncChannel); - byteRawChannel = new ByteArrayWritableChannel(STREAM_BUFFER_SIZE); - isCipherValid = true; - } +import com.google.crypto.tink.subtle.Hex; +import com.google.crypto.tink.subtle.Hkdf; +import io.netty.channel.Channel; - @Override - public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise) - throws Exception { - ctx.write(createEncryptedMessage(msg), promise); - } - - @VisibleForTesting - EncryptedMessage createEncryptedMessage(Object msg) { - return new EncryptedMessage(this, cos, msg, byteEncChannel, byteRawChannel); - } +import javax.crypto.spec.SecretKeySpec; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.security.GeneralSecurityException; - @Override - public void close(ChannelHandlerContext ctx, ChannelPromise promise) throws Exception { - try { - if (isCipherValid) { - cos.close(); - } - } finally { - super.close(ctx, promise); - } - } +interface TransportCipher { + String getKeyId() throws GeneralSecurityException; + void addToChannel(Channel channel) throws IOException, 
GeneralSecurityException; +} - /** - * SPARK-25535. Workaround for CRYPTO-141. Avoid further interaction with the underlying cipher - * after an error occurs. +class TransportCipherUtil { + /* + * This method is used for testing to verify key derivation. */ - void reportError() { - this.isCipherValid = false; - } - - boolean isCipherValid() { - return isCipherValid; - } - } - - private static class DecryptionHandler extends ChannelInboundHandlerAdapter { - private final CryptoInputStream cis; - private final ByteArrayReadableChannel byteChannel; - private boolean isCipherValid; - - DecryptionHandler(TransportCipher cipher) throws IOException { - byteChannel = new ByteArrayReadableChannel(); - cis = cipher.createInputStream(byteChannel); - isCipherValid = true; - } - - @Override - public void channelRead(ChannelHandlerContext ctx, Object data) throws Exception { - ByteBuf buffer = (ByteBuf) data; - - try { - if (!isCipherValid) { - throw new IOException("Cipher is in invalid state."); - } - byte[] decryptedData = new byte[buffer.readableBytes()]; - byteChannel.feedData(buffer); - - int offset = 0; - while (offset < decryptedData.length) { - // SPARK-25535: workaround for CRYPTO-141. - try { - offset += cis.read(decryptedData, offset, decryptedData.length - offset); - } catch (InternalError ie) { - isCipherValid = false; - throw ie; - } - } - - ctx.fireChannelRead(Unpooled.wrappedBuffer(decryptedData, 0, decryptedData.length)); - } finally { - buffer.release(); - } - } - - @Override - public void handlerRemoved(ChannelHandlerContext ctx) throws Exception { - // We do the closing of the stream / channel in handlerRemoved(...) as - // this method will be called in all cases: - // - // - when the Channel becomes inactive - // - when the handler is removed from the ChannelPipeline - try { - if (isCipherValid) { - cis.close(); - } - } finally { - super.handlerRemoved(ctx); - } - } - } - - @VisibleForTesting - static class EncryptedMessage extends AbstractFileRegion { - private final boolean isByteBuf; - private final ByteBuf buf; - private final FileRegion region; - private final CryptoOutputStream cos; - private final EncryptionHandler handler; - private final long count; - private long transferred; - - // Due to streaming issue CRYPTO-125: https://issues.apache.org/jira/browse/CRYPTO-125, it has - // to utilize two helper ByteArrayWritableChannel for streaming. One is used to receive raw data - // from upper handler, another is used to store encrypted data. - private final ByteArrayWritableChannel byteEncChannel; - private final ByteArrayWritableChannel byteRawChannel; - - private ByteBuffer currentEncrypted; - - EncryptedMessage( - EncryptionHandler handler, - CryptoOutputStream cos, - Object msg, - ByteArrayWritableChannel byteEncChannel, - ByteArrayWritableChannel byteRawChannel) { - Preconditions.checkArgument(msg instanceof ByteBuf || msg instanceof FileRegion, - "Unrecognized message type: %s", msg.getClass().getName()); - this.handler = handler; - this.isByteBuf = msg instanceof ByteBuf; - this.buf = isByteBuf ? (ByteBuf) msg : null; - this.region = isByteBuf ? null : (FileRegion) msg; - this.transferred = 0; - this.cos = cos; - this.byteEncChannel = byteEncChannel; - this.byteRawChannel = byteRawChannel; - this.count = isByteBuf ? 
buf.readableBytes() : region.count(); - } - - @Override - public long count() { - return count; - } - - @Override - public long position() { - return 0; - } - - @Override - public long transferred() { - return transferred; - } - - @Override - public EncryptedMessage touch(Object o) { - super.touch(o); - if (region != null) { - region.touch(o); - } - if (buf != null) { - buf.touch(o); - } - return this; - } - - @Override - public EncryptedMessage retain(int increment) { - super.retain(increment); - if (region != null) { - region.retain(increment); - } - if (buf != null) { - buf.retain(increment); - } - return this; - } - - @Override - public boolean release(int decrement) { - if (region != null) { - region.release(decrement); - } - if (buf != null) { - buf.release(decrement); - } - return super.release(decrement); - } - - @Override - public long transferTo(WritableByteChannel target, long position) throws IOException { - Preconditions.checkArgument(position == transferred(), "Invalid position."); - - if (transferred == count) { - return 0; - } - - long totalBytesWritten = 0L; - do { - if (currentEncrypted == null) { - encryptMore(); - } - - long remaining = currentEncrypted.remaining(); - if (remaining == 0) { - // Just for safety to avoid endless loop. It usually won't happen, but since the - // underlying `region.transferTo` is allowed to transfer 0 bytes, we should handle it for - // safety. - currentEncrypted = null; - byteEncChannel.reset(); - return totalBytesWritten; - } - - long bytesWritten = target.write(currentEncrypted); - totalBytesWritten += bytesWritten; - transferred += bytesWritten; - if (bytesWritten < remaining) { - // break as the underlying buffer in "target" is full - break; - } - currentEncrypted = null; - byteEncChannel.reset(); - } while (transferred < count); - - return totalBytesWritten; - } - - private void encryptMore() throws IOException { - if (!handler.isCipherValid()) { - throw new IOException("Cipher is in invalid state."); - } - byteRawChannel.reset(); - - if (isByteBuf) { - int copied = byteRawChannel.write(buf.nioBuffer()); - buf.skipBytes(copied); - } else { - region.transferTo(byteRawChannel, region.transferred()); - } - - try { - cos.write(byteRawChannel.getData(), 0, byteRawChannel.length()); - cos.flush(); - } catch (InternalError ie) { - handler.reportError(); - throw ie; - } - - currentEncrypted = ByteBuffer.wrap(byteEncChannel.getData(), - 0, byteEncChannel.length()); - } - - @Override - protected void deallocate() { - byteRawChannel.reset(); - byteEncChannel.reset(); - if (region != null) { - region.release(); - } - if (buf != null) { - buf.release(); - } + @VisibleForTesting + static String getKeyId(SecretKeySpec key) throws GeneralSecurityException { + byte[] keyIdBytes = Hkdf.computeHkdf("HmacSha256", + key.getEncoded(), + null, + "keyID".getBytes(StandardCharsets.UTF_8), + 32); + return Hex.encode(keyIdBytes); } - } - } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/EncryptedMessageWithHeader.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/EncryptedMessageWithHeader.java index d9f83ce8bac35..321ac13881c2a 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/EncryptedMessageWithHeader.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/EncryptedMessageWithHeader.java @@ -1,4 +1,3 @@ - /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -15,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.network.protocol; import java.io.EOFException; diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java index 29369f6c20600..a9b700a7800e0 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java @@ -23,8 +23,9 @@ import io.netty.channel.ChannelHandler; import io.netty.channel.ChannelHandlerContext; import io.netty.handler.codec.MessageToMessageDecoder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; /** * Decoder used by the client side to encode server-to-client responses. @@ -33,7 +34,7 @@ @ChannelHandler.Sharable public final class MessageDecoder extends MessageToMessageDecoder { - private static final Logger logger = LoggerFactory.getLogger(MessageDecoder.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(MessageDecoder.class); public static final MessageDecoder INSTANCE = new MessageDecoder(); diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java index 00de47dc9fc2d..ab20fb908eb42 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java @@ -23,8 +23,11 @@ import io.netty.channel.ChannelHandler; import io.netty.channel.ChannelHandlerContext; import io.netty.handler.codec.MessageToMessageEncoder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.MDC; /** * Encoder used by the server side to encode server-to-client responses. @@ -33,7 +36,7 @@ @ChannelHandler.Sharable public final class MessageEncoder extends MessageToMessageEncoder { - private static final Logger logger = LoggerFactory.getLogger(MessageEncoder.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(MessageEncoder.class); public static final MessageEncoder INSTANCE = new MessageEncoder(); @@ -62,8 +65,9 @@ public void encode(ChannelHandlerContext ctx, Message in, List out) thro if (in instanceof AbstractResponseMessage resp) { // Re-encode this message as a failure response. String error = e.getMessage() != null ? 
e.getMessage() : "null"; - logger.error(String.format("Error processing %s for client %s", - in, ctx.channel().remoteAddress()), e); + logger.error("Error processing {} for client {}", e, + MDC.of(LogKeys.MESSAGE$.MODULE$, in), + MDC.of(LogKeys.HOST_PORT$.MODULE$, ctx.channel().remoteAddress())); encode(ctx, resp.createFailureResponse(error), out); } else { throw e; diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/SslMessageEncoder.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/SslMessageEncoder.java index 3177971a95d57..abe6ccca7bfd6 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/SslMessageEncoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/SslMessageEncoder.java @@ -26,8 +26,10 @@ import io.netty.handler.codec.MessageToMessageEncoder; import io.netty.handler.stream.ChunkedStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * Encoder used by the server side to encode secure (SSL) server-to-client responses. @@ -36,7 +38,7 @@ @ChannelHandler.Sharable public final class SslMessageEncoder extends MessageToMessageEncoder { - private static final Logger logger = LoggerFactory.getLogger(SslMessageEncoder.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(SslMessageEncoder.class); private SslMessageEncoder() {} @@ -68,8 +70,9 @@ public void encode(ChannelHandlerContext ctx, Message in, List out) thro if (in instanceof AbstractResponseMessage resp) { // Re-encode this message as a failure response. String error = e.getMessage() != null ? e.getMessage() : "null"; - logger.error(String.format("Error processing %s for client %s", - in, ctx.channel().remoteAddress()), e); + logger.error("Error processing {} for client {}", e, + MDC.of(LogKeys.MESSAGE$.MODULE$, in), + MDC.of(LogKeys.HOST_PORT$.MODULE$, ctx.channel().remoteAddress())); encode(ctx, resp.createFailureResponse(error), out); } else { throw e; diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslClientBootstrap.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslClientBootstrap.java index 7f29af3688eec..0a355d28c3668 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslClientBootstrap.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslClientBootstrap.java @@ -26,9 +26,9 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.channel.Channel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportClientBootstrap; import org.apache.spark.network.util.JavaUtils; @@ -39,7 +39,7 @@ * server should be setup with a {@link SaslRpcHandler} with matching keys for the given appId. 
*/ public class SaslClientBootstrap implements TransportClientBootstrap { - private static final Logger logger = LoggerFactory.getLogger(SaslClientBootstrap.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(SaslClientBootstrap.class); private final TransportConf conf; private final String appId; diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java index cc9e88fcf98e7..b5fffe583ec63 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java @@ -24,9 +24,9 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.channel.Channel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.server.AbstractAuthRpcHandler; @@ -43,7 +43,7 @@ * which are individual RPCs. */ public class SaslRpcHandler extends AbstractAuthRpcHandler { - private static final Logger logger = LoggerFactory.getLogger(SaslRpcHandler.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(SaslRpcHandler.class); /** Transport configuration. */ private final TransportConf conf; diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java index 524ff0a310655..3600c1045dbf4 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslClient.java @@ -31,8 +31,9 @@ import com.google.common.base.Throwables; import com.google.common.collect.ImmutableMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import static org.apache.spark.network.sasl.SparkSaslServer.*; @@ -42,7 +43,7 @@ * firstToken, which is then followed by a set of challenges and responses. 
*/ public class SparkSaslClient implements SaslEncryptionBackend { - private static final Logger logger = LoggerFactory.getLogger(SparkSaslClient.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(SparkSaslClient.class); private final String secretKeyId; private final SecretKeyHolder secretKeyHolder; diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java index 26e5718cb4a70..b897650afe832 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java @@ -36,8 +36,9 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.handler.codec.base64.Base64; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; /** * A SASL Server for Spark which simply keeps track of the state of a single SASL session, from the @@ -45,7 +46,7 @@ * connections on some socket.) */ public class SparkSaslServer implements SaslEncryptionBackend { - private static final Logger logger = LoggerFactory.getLogger(SparkSaslServer.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(SparkSaslServer.class); /** * This is passed as the server name when creating the sasl client/server. diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java index e49141c7b9679..cc0bed7ed5b6d 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java @@ -25,9 +25,11 @@ import io.netty.channel.ChannelFutureListener; import io.netty.channel.ChannelHandlerContext; import io.netty.channel.SimpleChannelInboundHandler; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.protocol.ChunkFetchFailure; @@ -49,7 +51,8 @@ * registering executors, or waiting for response for an OpenBlocks messages. 
*/ public class ChunkFetchRequestHandler extends SimpleChannelInboundHandler { - private static final Logger logger = LoggerFactory.getLogger(ChunkFetchRequestHandler.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ChunkFetchRequestHandler.class); private final TransportClient client; private final StreamManager streamManager; @@ -70,7 +73,8 @@ public ChunkFetchRequestHandler( @Override public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception { - logger.warn("Exception in connection from " + getRemoteAddress(ctx.channel()), cause); + logger.warn("Exception in connection from {}", cause, + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(ctx.channel()))); ctx.close(); } @@ -92,7 +96,8 @@ public void processFetchRequest( long chunksBeingTransferred = streamManager.chunksBeingTransferred(); if (chunksBeingTransferred >= maxChunksBeingTransferred) { logger.warn("The number of chunks being transferred {} is above {}, close the connection.", - chunksBeingTransferred, maxChunksBeingTransferred); + MDC.of(LogKeys.NUM_CHUNKS$.MODULE$, chunksBeingTransferred), + MDC.of(LogKeys.MAX_NUM_CHUNKS$.MODULE$, maxChunksBeingTransferred)); channel.close(); return; } @@ -105,8 +110,9 @@ public void processFetchRequest( throw new IllegalStateException("Chunk was not found"); } } catch (Exception e) { - logger.error(String.format("Error opening block %s for request from %s", - msg.streamChunkId, getRemoteAddress(channel)), e); + logger.error("Error opening block {} for request from {}", e, + MDC.of(LogKeys.STREAM_CHUNK_ID$.MODULE$, msg.streamChunkId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel))); respond(channel, new ChunkFetchFailure(msg.streamChunkId, Throwables.getStackTraceAsString(e))); return; @@ -145,8 +151,10 @@ private ChannelFuture respond( if (future.isSuccess()) { logger.trace("Sent result {} to client {}", result, remoteAddress); } else { - logger.error(String.format("Error sending result %s to %s; closing connection", - result, remoteAddress), future.cause()); + logger.error("Error sending result {} to {}; closing connection", + future.cause(), + MDC.of(LogKeys.RESULT$.MODULE$, result), + MDC.of(LogKeys.HOST_PORT$.MODULE$, remoteAddress)); channel.close(); } }); diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java b/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java index ace409eb3f48d..f322293782dee 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java @@ -28,9 +28,9 @@ import io.netty.channel.Channel; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.TransportClient; @@ -39,7 +39,8 @@ * individually fetched as chunks by the client. Each registered buffer is one chunk. 
*/ public class OneForOneStreamManager extends StreamManager { - private static final Logger logger = LoggerFactory.getLogger(OneForOneStreamManager.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(OneForOneStreamManager.class); private final AtomicLong nextStreamId; private final ConcurrentHashMap streams; diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java index 0b894277561fd..a7c38917d17f6 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/RpcHandler.java @@ -19,9 +19,8 @@ import java.nio.ByteBuffer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.client.MergedBlockMetaResponseCallback; import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.StreamCallbackWithID; @@ -123,7 +122,7 @@ public void exceptionCaught(Throwable cause, TransportClient client) { } private static class OneWayRpcCallback implements RpcResponseCallback { - private static final Logger logger = LoggerFactory.getLogger(OneWayRpcCallback.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(OneWayRpcCallback.class); @Override public void onSuccess(ByteBuffer response) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java index a504e8c20a7f7..283f0f0a431fd 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java @@ -22,9 +22,11 @@ import io.netty.handler.timeout.IdleState; import io.netty.handler.timeout.IdleStateEvent; import org.apache.spark.network.TransportContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportResponseHandler; import org.apache.spark.network.protocol.ChunkFetchRequest; @@ -51,7 +53,8 @@ * timeout if the client is continuously sending but getting no responses, for simplicity. 
*/ public class TransportChannelHandler extends SimpleChannelInboundHandler { - private static final Logger logger = LoggerFactory.getLogger(TransportChannelHandler.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(TransportChannelHandler.class); private final TransportClient client; private final TransportResponseHandler responseHandler; @@ -84,8 +87,8 @@ public TransportClient getClient() { @Override public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception { - logger.warn("Exception in connection from " + getRemoteAddress(ctx.channel()), - cause); + logger.warn("Exception in connection from {}", cause, + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(ctx.channel()))); requestHandler.exceptionCaught(cause); responseHandler.exceptionCaught(cause); ctx.close(); @@ -165,7 +168,9 @@ public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exc logger.error("Connection to {} has been quiet for {} ms while there are outstanding " + "requests. Assuming connection is dead; please adjust" + " spark.{}.io.connectionTimeout if this is wrong.", - address, requestTimeoutNs / 1000 / 1000, transportContext.getConf().getModuleName()); + MDC.of(LogKeys.HOST_PORT$.MODULE$, address), + MDC.of(LogKeys.TIMEOUT$.MODULE$, requestTimeoutNs / 1000 / 1000), + MDC.of(LogKeys.MODULE_NAME$.MODULE$, transportContext.getConf().getModuleName())); client.timeOut(); ctx.close(); } else if (closeIdleConnections) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java index c5e6da4cf6c7d..687c3040ed083 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java @@ -25,9 +25,10 @@ import io.netty.channel.Channel; import io.netty.channel.ChannelFuture; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NioManagedBuffer; import org.apache.spark.network.client.*; @@ -45,7 +46,8 @@ */ public class TransportRequestHandler extends MessageHandler { - private static final Logger logger = LoggerFactory.getLogger(TransportRequestHandler.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(TransportRequestHandler.class); /** The Netty channel that this handler is associated with. 
*/ private final Channel channel; @@ -130,7 +132,8 @@ private void processStreamRequest(final StreamRequest req) { long chunksBeingTransferred = streamManager.chunksBeingTransferred(); if (chunksBeingTransferred >= maxChunksBeingTransferred) { logger.warn("The number of chunks being transferred {} is above {}, close the connection.", - chunksBeingTransferred, maxChunksBeingTransferred); + MDC.of(LogKeys.NUM_CHUNKS$.MODULE$, chunksBeingTransferred), + MDC.of(LogKeys.MAX_NUM_CHUNKS$.MODULE$, maxChunksBeingTransferred)); channel.close(); return; } @@ -139,8 +142,9 @@ private void processStreamRequest(final StreamRequest req) { try { buf = streamManager.openStream(req.streamId); } catch (Exception e) { - logger.error(String.format( - "Error opening stream %s for request from %s", req.streamId, getRemoteAddress(channel)), e); + logger.error("Error opening stream {} for request from {}", e, + MDC.of(LogKeys.STREAM_ID$.MODULE$, req.streamId), + MDC.of(LogKeys.HOST_PORT$.MODULE$, getRemoteAddress(channel))); respond(new StreamFailure(req.streamId, Throwables.getStackTraceAsString(e))); return; } @@ -172,7 +176,8 @@ public void onFailure(Throwable e) { } }); } catch (Exception e) { - logger.error("Error while invoking RpcHandler#receive() on RPC id " + req.requestId, e); + logger.error("Error while invoking RpcHandler#receive() on RPC id {}", e, + MDC.of(LogKeys.REQUEST_ID$.MODULE$, req.requestId)); respond(new RpcFailure(req.requestId, Throwables.getStackTraceAsString(e))); } finally { req.body().release(); @@ -257,7 +262,8 @@ public String getID() { respond(new RpcResponse(req.requestId, new NioManagedBuffer(blockPushNonFatalFailure.getResponse()))); } else { - logger.error("Error while invoking RpcHandler#receive() on RPC id " + req.requestId, e); + logger.error("Error while invoking RpcHandler#receive() on RPC id {}", e, + MDC.of(LogKeys.REQUEST_ID$.MODULE$, req.requestId)); respond(new RpcFailure(req.requestId, Throwables.getStackTraceAsString(e))); } // We choose to totally fail the channel, rather than trying to recover as we do in other @@ -298,7 +304,9 @@ public void onFailure(Throwable e) { }); } catch (Exception e) { logger.error("Error while invoking receiveMergeBlockMetaReq() for appId {} shuffleId {} " - + "reduceId {}", req.appId, req.shuffleId, req.appId, e); + + "reduceId {}", e, MDC.of(LogKeys.APP_ID$.MODULE$, req.appId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, req.shuffleId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, req.reduceId)); respond(new RpcFailure(req.requestId, Throwables.getStackTraceAsString(e))); } } @@ -313,8 +321,9 @@ private ChannelFuture respond(Encodable result) { if (future.isSuccess()) { logger.trace("Sent result {} to client {}", result, remoteAddress); } else { - logger.error(String.format("Error sending result %s to %s; closing connection", - result, remoteAddress), future.cause()); + logger.error("Error sending result {} to {}; closing connection", future.cause(), + MDC.of(LogKeys.RESULT$.MODULE$, result), + MDC.of(LogKeys.HOST_PORT$.MODULE$, remoteAddress)); channel.close(); } }); diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java index 6f2e4b8a502a2..d1a19652f5649 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java @@ -34,9 +34,9 @@ import io.netty.channel.EventLoopGroup; import 
io.netty.channel.socket.SocketChannel; import org.apache.commons.lang3.SystemUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.TransportContext; import org.apache.spark.network.util.*; @@ -44,7 +44,7 @@ * Server for the efficient, low-level streaming service. */ public class TransportServer implements Closeable { - private static final Logger logger = LoggerFactory.getLogger(TransportServer.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(TransportServer.class); private final TransportContext context; private final TransportConf conf; diff --git a/common/network-common/src/main/java/org/apache/spark/network/ssl/ReloadingX509TrustManager.java b/common/network-common/src/main/java/org/apache/spark/network/ssl/ReloadingX509TrustManager.java index 18618e7d5c8be..09609d0ac8ad9 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/ssl/ReloadingX509TrustManager.java +++ b/common/network-common/src/main/java/org/apache/spark/network/ssl/ReloadingX509TrustManager.java @@ -30,9 +30,8 @@ import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; /** * A {@link TrustManager} implementation that reloads its configuration when @@ -46,7 +45,8 @@ public final class ReloadingX509TrustManager implements X509TrustManager, Runnable { - private static final Logger logger = LoggerFactory.getLogger(ReloadingX509TrustManager.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ReloadingX509TrustManager.class); private final String type; private final File file; @@ -211,13 +211,13 @@ public void run() { this.reloadCount += 1; } catch (Exception ex) { logger.warn( - "Could not load truststore (keep using existing one) : " + ex.toString(), + "Could not load truststore (keep using existing one) : ", ex ); } } } catch (IOException ex) { - logger.warn("Could not check whether truststore needs reloading: " + ex.toString(), ex); + logger.warn("Could not check whether truststore needs reloading: ", ex); } needsReloadCheckCounts++; } diff --git a/common/network-common/src/main/java/org/apache/spark/network/ssl/SSLFactory.java b/common/network-common/src/main/java/org/apache/spark/network/ssl/SSLFactory.java index 82951d2130112..a2e42e3eb39f6 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/ssl/SSLFactory.java +++ b/common/network-common/src/main/java/org/apache/spark/network/ssl/SSLFactory.java @@ -49,13 +49,12 @@ import io.netty.handler.ssl.SslContextBuilder; import io.netty.handler.ssl.SslProvider; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.util.JavaUtils; public class SSLFactory { - private static final Logger logger = LoggerFactory.getLogger(SSLFactory.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(SSLFactory.class); /** * For a configuration specifying keystore/truststore files @@ -136,7 +135,7 @@ public void destroy() { try { manager.destroy(); } catch (InterruptedException ex) { - logger.info("Interrupted while destroying trust manager: " + ex.toString(), ex); + logger.info("Interrupted while destroying trust manager: ", ex); } 
} } diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/ByteBufferWriteableChannel.java b/common/network-common/src/main/java/org/apache/spark/network/util/ByteBufferWriteableChannel.java new file mode 100644 index 0000000000000..b20240cfcaa6d --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/util/ByteBufferWriteableChannel.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.util; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.ClosedChannelException; +import java.nio.channels.WritableByteChannel; + +public class ByteBufferWriteableChannel implements WritableByteChannel { + private final ByteBuffer destination; + private boolean open; + + public ByteBufferWriteableChannel(ByteBuffer destination) { + this.destination = destination; + this.open = true; + } + + @Override + public int write(ByteBuffer src) throws IOException { + if (!isOpen()) { + throw new ClosedChannelException(); + } + int bytesToWrite = Math.min(src.remaining(), destination.remaining()); + // Destination buffer is full + if (bytesToWrite == 0) { + return 0; + } + ByteBuffer temp = src.slice().limit(bytesToWrite); + destination.put(temp); + src.position(src.position() + bytesToWrite); + return bytesToWrite; + } + + @Override + public boolean isOpen() { + return open; + } + + @Override + public void close() { + open = false; + } +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/DBProvider.java b/common/network-common/src/main/java/org/apache/spark/network/util/DBProvider.java index 5a25bdda23355..94a64b3f4037c 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/DBProvider.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/DBProvider.java @@ -21,9 +21,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.shuffledb.DB; import org.apache.spark.network.shuffledb.DBBackend; import org.apache.spark.network.shuffledb.LevelDB; @@ -31,7 +31,7 @@ import org.apache.spark.network.shuffledb.StoreVersion; public class DBProvider { - private static final Logger logger = LoggerFactory.getLogger(DBProvider.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(DBProvider.class); public static DB initDB( DBBackend dbBackend, File dbFile, diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/LevelDBProvider.java 
b/common/network-common/src/main/java/org/apache/spark/network/util/LevelDBProvider.java index aa8be0c663bc2..391931961a474 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/LevelDBProvider.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/LevelDBProvider.java @@ -26,16 +26,18 @@ import org.fusesource.leveldbjni.internal.NativeDB; import org.iq80.leveldb.DB; import org.iq80.leveldb.Options; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.shuffledb.StoreVersion; /** * LevelDB utility class available in the network package. */ public class LevelDBProvider { - private static final Logger logger = LoggerFactory.getLogger(LevelDBProvider.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(LevelDBProvider.class); public static DB initLevelDB(File dbFile, StoreVersion version, ObjectMapper mapper) throws IOException { @@ -48,7 +50,7 @@ public static DB initLevelDB(File dbFile, StoreVersion version, ObjectMapper map tmpDb = JniDBFactory.factory.open(dbFile, options); } catch (NativeDB.DBException e) { if (e.isNotFound() || e.getMessage().contains(" does not exist ")) { - logger.info("Creating state database at " + dbFile); + logger.info("Creating state database at {}", MDC.of(LogKeys.PATH$.MODULE$, dbFile)); options.createIfMissing(true); try { tmpDb = JniDBFactory.factory.open(dbFile, options); @@ -58,17 +60,17 @@ public static DB initLevelDB(File dbFile, StoreVersion version, ObjectMapper map } else { // the leveldb file seems to be corrupt somehow. Lets just blow it away and create a new // one, so we can keep processing new apps - logger.error("error opening leveldb file {}. Creating new file, will not be able to " + - "recover state for existing applications", dbFile, e); + logger.error("error opening leveldb file {}. 
Creating new file, will not be able to " + + "recover state for existing applications", e, MDC.of(LogKeys.PATH$.MODULE$, dbFile)); if (dbFile.isDirectory()) { for (File f : dbFile.listFiles()) { if (!f.delete()) { - logger.warn("error deleting {}", f.getPath()); + logger.warn("error deleting {}", MDC.of(LogKeys.PATH$.MODULE$, f.getPath())); } } } if (!dbFile.delete()) { - logger.warn("error deleting {}", dbFile.getPath()); + logger.warn("error deleting {}", MDC.of(LogKeys.PATH$.MODULE$, dbFile.getPath())); } options.createIfMissing(true); try { @@ -99,7 +101,7 @@ static DB initLevelDB(File file) throws IOException { } private static class LevelDBLogger implements org.iq80.leveldb.Logger { - private static final Logger LOG = LoggerFactory.getLogger(LevelDBLogger.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(LevelDBLogger.class); @Override public void log(String message) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/NettyLogger.java b/common/network-common/src/main/java/org/apache/spark/network/util/NettyLogger.java index cb66784e41918..a7063151fae89 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/NettyLogger.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/NettyLogger.java @@ -25,11 +25,12 @@ import io.netty.channel.ChannelHandlerContext; import io.netty.handler.logging.LoggingHandler; import io.netty.handler.logging.LogLevel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; public class NettyLogger { - private static final Logger logger = LoggerFactory.getLogger(NettyLogger.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(NettyLogger.class); /** A Netty LoggingHandler which does not dump the message contents. 
*/ private static class NoContentLoggingHandler extends LoggingHandler { diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/RocksDBProvider.java b/common/network-common/src/main/java/org/apache/spark/network/util/RocksDBProvider.java index f3b7b48355a06..1753c124c9935 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/RocksDBProvider.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/RocksDBProvider.java @@ -24,9 +24,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; import org.rocksdb.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.shuffledb.StoreVersion; /** @@ -38,7 +40,7 @@ public class RocksDBProvider { org.rocksdb.RocksDB.loadLibrary(); } - private static final Logger logger = LoggerFactory.getLogger(RocksDBProvider.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(RocksDBProvider.class); public static RocksDB initRockDB(File dbFile, StoreVersion version, ObjectMapper mapper) throws IOException { @@ -65,7 +67,7 @@ public static RocksDB initRockDB(File dbFile, StoreVersion version, ObjectMapper tmpDb = RocksDB.open(dbOptions, dbFile.toString()); } catch (RocksDBException e) { if (e.getStatus().getCode() == Status.Code.NotFound) { - logger.info("Creating state database at " + dbFile); + logger.info("Creating state database at {}", MDC.of(LogKeys.PATH$.MODULE$, dbFile)); dbOptions.setCreateIfMissing(true); try { tmpDb = RocksDB.open(dbOptions, dbFile.toString()); @@ -76,16 +78,16 @@ public static RocksDB initRockDB(File dbFile, StoreVersion version, ObjectMapper // the RocksDB file seems to be corrupt somehow. Let's just blow it away and create // a new one, so we can keep processing new apps logger.error("error opening rocksdb file {}. 
Creating new file, will not be able to " + - "recover state for existing applications", dbFile, e); + "recover state for existing applications", e, MDC.of(LogKeys.PATH$.MODULE$, dbFile)); if (dbFile.isDirectory()) { for (File f : Objects.requireNonNull(dbFile.listFiles())) { if (!f.delete()) { - logger.warn("error deleting {}", f.getPath()); + logger.warn("error deleting {}", MDC.of(LogKeys.PATH$.MODULE$, f.getPath())); } } } if (!dbFile.delete()) { - logger.warn("error deleting {}", dbFile.getPath()); + logger.warn("error deleting {}", MDC.of(LogKeys.PATH$.MODULE$, dbFile.getPath())); } dbOptions.setCreateIfMissing(true); try { @@ -133,10 +135,10 @@ static RocksDB initRocksDB(File file) throws IOException { } private static class RocksDBLogger extends org.rocksdb.Logger { - private static final Logger LOG = LoggerFactory.getLogger(RocksDBLogger.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(RocksDBLogger.class); RocksDBLogger(Options options) { - super(options); + super(options.infoLogLevel()); } @Override diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java index e9846be20c9b0..628de9e780337 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java @@ -18,75 +18,76 @@ package org.apache.spark.network.crypto; import java.nio.ByteBuffer; -import java.nio.channels.WritableByteChannel; import java.security.GeneralSecurityException; -import java.util.Collections; -import java.util.Random; +import java.util.Map; +import com.google.common.collect.ImmutableMap; import com.google.crypto.tink.subtle.Hex; -import io.netty.buffer.ByteBuf; -import io.netty.buffer.Unpooled; -import io.netty.channel.FileRegion; -import org.apache.spark.network.util.ByteArrayWritableChannel; -import org.apache.spark.network.util.ConfigProvider; -import org.apache.spark.network.util.MapConfigProvider; -import org.apache.spark.network.util.TransportConf; +import org.apache.spark.network.util.*; + import static org.junit.jupiter.api.Assertions.*; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.mockito.Mockito.*; -import org.mockito.invocation.InvocationOnMock; -import org.mockito.stubbing.Answer; - -public class AuthEngineSuite { - private static final String clientPrivate = - "efe6b68b3fce92158e3637f6ef9d937e75558928dd4b401de04b43d300a73186"; - private static final String clientChallengeHex = - "fb00000005617070496400000010890b6e960f48e998777267a7e4e623220000003c48ad7dc7ec9466da9" + - "3bda9f11488dc9404050e02c661d87d67c782444944c6e369b27e0a416c30845a2d9e64271511ca98b41d" + - "65f8c426e18ff380f6"; - private static final String serverResponseHex = - "fb00000005617070496400000010708451c9dd2792c97c1ca66e6df449ef0000003c64fe899ecdaf458d4" + - "e25e9d5c5a380b8e6d1a184692fac065ed84f8592c18e9629f9c636809dca2ffc041f20346eb53db78738" + - "08ecad08b46b5ee3ff"; - private static final String derivedKey = "2d6e7a9048c8265c33a8f3747bfcc84c"; +abstract class AuthEngineSuite { + static final String clientPrivate = + "efe6b68b3fce92158e3637f6ef9d937e75558928dd4b401de04b43d300a73186"; + static final String clientChallengeHex = + "fb00000005617070496400000010890b6e960f48e998777267a7e4e623220000003c48ad7dc7ec9466da9" + + 
"3bda9f11488dc9404050e02c661d87d67c782444944c6e369b27e0a416c30845a2d9e64271511ca98b41d" + + "65f8c426e18ff380f6"; + static final String serverResponseHex = + "fb00000005617070496400000010708451c9dd2792c97c1ca66e6df449ef0000003c64fe899ecdaf458d4" + + "e25e9d5c5a380b8e6d1a184692fac065ed84f8592c18e9629f9c636809dca2ffc041f20346eb53db78738" + + "08ecad08b46b5ee3ff"; + static final String derivedKeyId = + "de04fd52d71040ed9d260579dacfdf4f5695f991ce8ddb1dde05a7335880906e"; // This key would have been derived for version 1.0 protocol that did not run a final HKDF round. - private static final String unsafeDerivedKey = - "31963f15a320d5c90333f7ecf5cf3a31c7eaf151de07fef8494663a9f47cfd31"; - - private static final String inputIv = "fc6a5dc8b90a9dad8f54f08b51a59ed2"; - private static final String outputIv = "a72709baf00785cad6329ce09f631f71"; - private static TransportConf conf; - - @BeforeAll - public static void setUp() { - ConfigProvider v2Provider = new MapConfigProvider(Collections.singletonMap( - "spark.network.crypto.authEngineVersion", "2")); - conf = new TransportConf("rpc", v2Provider); + static final String unsafeDerivedKey = + "31963f15a320d5c90333f7ecf5cf3a31c7eaf151de07fef8494663a9f47cfd31"; + static TransportConf conf; + + static TransportConf getConf(int authEngineVerison, boolean useCtr) { + String authEngineVersion = (authEngineVerison == 1) ? "1" : "2"; + String mode = useCtr ? "AES/CTR/NoPadding" : "AES/GCM/NoPadding"; + Map confMap = ImmutableMap.of( + "spark.network.crypto.enabled", "true", + "spark.network.crypto.authEngineVersion", authEngineVersion, + "spark.network.crypto.cipher", mode + ); + ConfigProvider v2Provider = new MapConfigProvider(confMap); + return new TransportConf("rpc", v2Provider); } @Test public void testAuthEngine() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); AuthMessage serverResponse = server.response(clientChallenge); client.deriveSessionCipher(clientChallenge, serverResponse); - TransportCipher serverCipher = server.sessionCipher(); TransportCipher clientCipher = client.sessionCipher(); + assertEquals(clientCipher.getKeyId(), serverCipher.getKeyId()); + } + } - assertArrayEquals(serverCipher.getInputIv(), clientCipher.getOutputIv()); - assertArrayEquals(serverCipher.getOutputIv(), clientCipher.getInputIv()); - assertEquals(serverCipher.getKey(), clientCipher.getKey()); + @Test + public void testFixedChallengeResponse() throws Exception { + try (AuthEngine client = new AuthEngine("appId", "secret", conf)) { + byte[] clientPrivateKey = Hex.decode(clientPrivate); + client.setClientPrivateKey(clientPrivateKey); + AuthMessage clientChallenge = + AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(clientChallengeHex))); + AuthMessage serverResponse = + AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(serverResponseHex))); + // Verify that the client will accept an old transcript. 
+ client.deriveSessionCipher(clientChallenge, serverResponse); + assertEquals(client.sessionCipher().getKeyId(), derivedKeyId); } } @Test public void testCorruptChallengeAppId() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); @@ -98,7 +99,6 @@ public void testCorruptChallengeAppId() throws Exception { @Test public void testCorruptChallengeSalt() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); @@ -109,7 +109,6 @@ public void testCorruptChallengeSalt() throws Exception { @Test public void testCorruptChallengeCiphertext() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); @@ -120,7 +119,6 @@ public void testCorruptChallengeCiphertext() throws Exception { @Test public void testCorruptResponseAppId() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); @@ -134,20 +132,18 @@ public void testCorruptResponseAppId() throws Exception { @Test public void testCorruptResponseSalt() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); AuthMessage serverResponse = server.response(clientChallenge); serverResponse.salt()[0] ^= 1; assertThrows(GeneralSecurityException.class, - () -> client.deriveSessionCipher(clientChallenge, serverResponse)); + () -> client.deriveSessionCipher(clientChallenge, serverResponse)); } } @Test public void testCorruptServerCiphertext() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); AuthEngine server = new AuthEngine("appId", "secret", conf)) { AuthMessage clientChallenge = client.challenge(); @@ -169,45 +165,6 @@ public void testFixedChallenge() throws Exception { } } - @Test - public void testFixedChallengeResponse() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf)) { - byte[] clientPrivateKey = Hex.decode(clientPrivate); - client.setClientPrivateKey(clientPrivateKey); - AuthMessage clientChallenge = - AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(clientChallengeHex))); - AuthMessage serverResponse = - AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(serverResponseHex))); - // Verify that the client will accept an old transcript. 
- client.deriveSessionCipher(clientChallenge, serverResponse); - TransportCipher clientCipher = client.sessionCipher(); - assertEquals(Hex.encode(clientCipher.getKey().getEncoded()), derivedKey); - assertEquals(Hex.encode(clientCipher.getInputIv()), inputIv); - assertEquals(Hex.encode(clientCipher.getOutputIv()), outputIv); - } - } - - @Test - public void testFixedChallengeResponseUnsafeVersion() throws Exception { - ConfigProvider v1Provider = new MapConfigProvider(Collections.singletonMap( - "spark.network.crypto.authEngineVersion", "1")); - TransportConf v1Conf = new TransportConf("rpc", v1Provider); - try (AuthEngine client = new AuthEngine("appId", "secret", v1Conf)) { - byte[] clientPrivateKey = Hex.decode(clientPrivate); - client.setClientPrivateKey(clientPrivateKey); - AuthMessage clientChallenge = - AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(clientChallengeHex))); - AuthMessage serverResponse = - AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(serverResponseHex))); - // Verify that the client will accept an old transcript. - client.deriveSessionCipher(clientChallenge, serverResponse); - TransportCipher clientCipher = client.sessionCipher(); - assertEquals(Hex.encode(clientCipher.getKey().getEncoded()), unsafeDerivedKey); - assertEquals(Hex.encode(clientCipher.getInputIv()), inputIv); - assertEquals(Hex.encode(clientCipher.getOutputIv()), outputIv); - } - } - @Test public void testMismatchedSecret() throws Exception { try (AuthEngine client = new AuthEngine("appId", "secret", conf); @@ -216,70 +173,4 @@ public void testMismatchedSecret() throws Exception { assertThrows(GeneralSecurityException.class, () -> server.response(clientChallenge)); } } - - @Test - public void testEncryptedMessage() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); - AuthEngine server = new AuthEngine("appId", "secret", conf)) { - AuthMessage clientChallenge = client.challenge(); - AuthMessage serverResponse = server.response(clientChallenge); - client.deriveSessionCipher(clientChallenge, serverResponse); - - TransportCipher cipher = server.sessionCipher(); - TransportCipher.EncryptionHandler handler = new TransportCipher.EncryptionHandler(cipher); - - byte[] data = new byte[TransportCipher.STREAM_BUFFER_SIZE + 1]; - new Random().nextBytes(data); - ByteBuf buf = Unpooled.wrappedBuffer(data); - - ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); - TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(buf); - while (emsg.transferred() < emsg.count()) { - emsg.transferTo(channel, emsg.transferred()); - } - assertEquals(data.length, channel.length()); - } - } - - @Test - public void testEncryptedMessageWhenTransferringZeroBytes() throws Exception { - try (AuthEngine client = new AuthEngine("appId", "secret", conf); - AuthEngine server = new AuthEngine("appId", "secret", conf)) { - AuthMessage clientChallenge = client.challenge(); - AuthMessage serverResponse = server.response(clientChallenge); - client.deriveSessionCipher(clientChallenge, serverResponse); - - TransportCipher cipher = server.sessionCipher(); - TransportCipher.EncryptionHandler handler = new TransportCipher.EncryptionHandler(cipher); - - int testDataLength = 4; - FileRegion region = mock(FileRegion.class); - when(region.count()).thenReturn((long) testDataLength); - // Make `region.transferTo` do nothing in first call and transfer 4 bytes in the second one. 
- when(region.transferTo(any(), anyLong())).thenAnswer(new Answer() { - - private boolean firstTime = true; - - @Override - public Long answer(InvocationOnMock invocationOnMock) throws Throwable { - if (firstTime) { - firstTime = false; - return 0L; - } else { - WritableByteChannel channel = invocationOnMock.getArgument(0); - channel.write(ByteBuffer.wrap(new byte[testDataLength])); - return (long) testDataLength; - } - } - }); - - TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(region); - ByteArrayWritableChannel channel = new ByteArrayWritableChannel(testDataLength); - // "transferTo" should act correctly when the underlying FileRegion transfers 0 bytes. - assertEquals(0L, emsg.transferTo(channel, emsg.transferred())); - assertEquals(testDataLength, emsg.transferTo(channel, emsg.transferred())); - assertEquals(emsg.transferred(), emsg.count()); - assertEquals(4, channel.length()); - } - } } diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java index 90f6c874a6c84..cb5929f7c65b4 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java @@ -49,7 +49,7 @@ public class AuthIntegrationSuite { private AuthTestCtx ctx; @AfterEach - public void cleanUp() throws Exception { + public void cleanUp() { if (ctx != null) { ctx.close(); } @@ -57,8 +57,8 @@ public void cleanUp() throws Exception { } @Test - public void testNewAuth() throws Exception { - ctx = new AuthTestCtx(); + public void testNewCtrAuth() throws Exception { + ctx = new AuthTestCtx(new DummyRpcHandler(), "AES/CTR/NoPadding"); ctx.createServer("secret"); ctx.createClient("secret"); @@ -68,8 +68,28 @@ public void testNewAuth() throws Exception { } @Test - public void testAuthFailure() throws Exception { - ctx = new AuthTestCtx(); + public void testNewGcmAuth() throws Exception { + ctx = new AuthTestCtx(new DummyRpcHandler(), "AES/GCM/NoPadding"); + ctx.createServer("secret"); + ctx.createClient("secret"); + ByteBuffer reply = ctx.client.sendRpcSync(JavaUtils.stringToBytes("Ping"), 5000); + assertEquals("Pong", JavaUtils.bytesToString(reply)); + assertNull(ctx.authRpcHandler.saslHandler); + } + + @Test + public void testCtrAuthFailure() throws Exception { + ctx = new AuthTestCtx(new DummyRpcHandler(), "AES/CTR/NoPadding"); + ctx.createServer("server"); + + assertThrows(Exception.class, () -> ctx.createClient("client")); + assertFalse(ctx.authRpcHandler.isAuthenticated()); + assertFalse(ctx.serverChannel.isActive()); + } + + @Test + public void testGcmAuthFailure() throws Exception { + ctx = new AuthTestCtx(new DummyRpcHandler(), "AES/GCM/NoPadding"); ctx.createServer("server"); assertThrows(Exception.class, () -> ctx.createClient("client")); @@ -100,7 +120,7 @@ public void testSaslClientFallback() throws Exception { } @Test - public void testAuthReplay() throws Exception { + public void testCtrAuthReplay() throws Exception { // This test covers the case where an attacker replays a challenge message sniffed from the // network, but doesn't know the actual secret. The server should close the connection as // soon as a message is sent after authentication is performed. 
This is emulated by removing @@ -110,16 +130,16 @@ public void testAuthReplay() throws Exception { ctx.createClient("secret"); assertNotNull(ctx.client.getChannel().pipeline() - .remove(TransportCipher.ENCRYPTION_HANDLER_NAME)); + .remove(CtrTransportCipher.ENCRYPTION_HANDLER_NAME)); assertThrows(Exception.class, () -> ctx.client.sendRpcSync(JavaUtils.stringToBytes("Ping"), 5000)); assertTrue(ctx.authRpcHandler.isAuthenticated()); } @Test - public void testLargeMessageEncryption() throws Exception { + public void testLargeCtrMessageEncryption() throws Exception { // Use a big length to create a message that cannot be put into the encryption buffer completely - final int testErrorMessageLength = TransportCipher.STREAM_BUFFER_SIZE; + final int testErrorMessageLength = CtrTransportCipher.STREAM_BUFFER_SIZE; ctx = new AuthTestCtx(new RpcHandler() { @Override public void receive( @@ -157,6 +177,23 @@ public void testValidMergedBlockMetaReqHandler() throws Exception { assertNotNull(ctx.authRpcHandler.getMergedBlockMetaReqHandler()); } + private static class DummyRpcHandler extends RpcHandler { + @Override + public void receive( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { + String messageString = JavaUtils.bytesToString(message); + assertEquals("Ping", messageString); + callback.onSuccess(JavaUtils.stringToBytes("Pong")); + } + + @Override + public StreamManager getStreamManager() { + return null; + } + } + private static class AuthTestCtx { private final String appId = "testAppId"; @@ -169,25 +206,17 @@ private static class AuthTestCtx { volatile AuthRpcHandler authRpcHandler; AuthTestCtx() throws Exception { - this(new RpcHandler() { - @Override - public void receive( - TransportClient client, - ByteBuffer message, - RpcResponseCallback callback) { - assertEquals("Ping", JavaUtils.bytesToString(message)); - callback.onSuccess(JavaUtils.stringToBytes("Pong")); - } - - @Override - public StreamManager getStreamManager() { - return null; - } - }); + this(new DummyRpcHandler()); } AuthTestCtx(RpcHandler rpcHandler) throws Exception { - Map testConf = ImmutableMap.of("spark.network.crypto.enabled", "true"); + this(rpcHandler, "AES/CTR/NoPadding"); + } + + AuthTestCtx(RpcHandler rpcHandler, String mode) throws Exception { + Map testConf = ImmutableMap.of( + "spark.network.crypto.enabled", "true", + "spark.network.crypto.cipher", mode); this.conf = new TransportConf("rpc", new MapConfigProvider(testConf)); this.ctx = new TransportContext(conf, rpcHandler); } diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/CtrAuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/CtrAuthEngineSuite.java new file mode 100644 index 0000000000000..c353ee392ff4f --- /dev/null +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/CtrAuthEngineSuite.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.crypto; + +import com.google.crypto.tink.subtle.Hex; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import io.netty.channel.FileRegion; +import org.apache.spark.network.util.ByteArrayWritableChannel; +import org.apache.spark.network.util.TransportConf; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; + +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class CtrAuthEngineSuite extends AuthEngineSuite { + private static final String inputIv = "fc6a5dc8b90a9dad8f54f08b51a59ed2"; + private static final String outputIv = "a72709baf00785cad6329ce09f631f71"; + + @BeforeAll + public static void setUp() { + conf = getConf(2, true); + } + + @Test + public void testAuthEngine() throws Exception { + try (AuthEngine client = new AuthEngine("appId", "secret", conf); + AuthEngine server = new AuthEngine("appId", "secret", conf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + + TransportCipher serverCipher = server.sessionCipher(); + TransportCipher clientCipher = client.sessionCipher(); + assert(clientCipher instanceof CtrTransportCipher); + assert(serverCipher instanceof CtrTransportCipher); + CtrTransportCipher ctrClient = (CtrTransportCipher) clientCipher; + CtrTransportCipher ctrServer = (CtrTransportCipher) serverCipher; + assertArrayEquals(ctrServer.getInputIv(), ctrClient.getOutputIv()); + assertArrayEquals(ctrServer.getOutputIv(), ctrClient.getInputIv()); + assertEquals(ctrServer.getKey(), ctrClient.getKey()); + } + } + + @Test + public void testCtrFixedChallengeIvResponse() throws Exception { + try (AuthEngine client = new AuthEngine("appId", "secret", conf)) { + byte[] clientPrivateKey = Hex.decode(clientPrivate); + client.setClientPrivateKey(clientPrivateKey); + AuthMessage clientChallenge = + AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(clientChallengeHex))); + AuthMessage serverResponse = + AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(serverResponseHex))); + // Verify that the client will accept an old transcript. 
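+      // The inputIv/outputIv constants above, together with derivedKeyId from the base suite, are
+      // fixed expected values for this v2 AES/CTR transcript: if the key or IV derivation ever
+      // changed, these assertions would fail even though a freshly negotiated client/server pair
+      // might still agree with each other.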
+ client.deriveSessionCipher(clientChallenge, serverResponse); + TransportCipher clientCipher = client.sessionCipher(); + assertEquals(clientCipher.getKeyId(), derivedKeyId); + assert(clientCipher instanceof CtrTransportCipher); + CtrTransportCipher ctrTransportCipher = (CtrTransportCipher) clientCipher; + assertEquals(Hex.encode(ctrTransportCipher.getInputIv()), inputIv); + assertEquals(Hex.encode(ctrTransportCipher.getOutputIv()), outputIv); + } + } + + @Test + public void testFixedChallengeResponseUnsafeVersion() throws Exception { + TransportConf v1Conf = getConf(1, true); + try (AuthEngine client = new AuthEngine("appId", "secret", v1Conf)) { + byte[] clientPrivateKey = Hex.decode(clientPrivate); + client.setClientPrivateKey(clientPrivateKey); + AuthMessage clientChallenge = + AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(clientChallengeHex))); + AuthMessage serverResponse = + AuthMessage.decodeMessage(ByteBuffer.wrap(Hex.decode(serverResponseHex))); + // Verify that the client will accept an old transcript. + client.deriveSessionCipher(clientChallenge, serverResponse); + TransportCipher clientCipher = client.sessionCipher(); + assert(clientCipher instanceof CtrTransportCipher); + CtrTransportCipher ctrTransportCipher = (CtrTransportCipher) clientCipher; + assertEquals(Hex.encode(ctrTransportCipher.getKey().getEncoded()), unsafeDerivedKey); + assertEquals(Hex.encode(ctrTransportCipher.getInputIv()), inputIv); + assertEquals(Hex.encode(ctrTransportCipher.getOutputIv()), outputIv); + } + } + + @Test + public void testCtrEncryptedMessage() throws Exception { + try (AuthEngine client = new AuthEngine("appId", "secret", conf); + AuthEngine server = new AuthEngine("appId", "secret", conf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + + TransportCipher clientCipher = server.sessionCipher(); + assert(clientCipher instanceof CtrTransportCipher); + CtrTransportCipher ctrTransportCipher = (CtrTransportCipher) clientCipher; + CtrTransportCipher.EncryptionHandler handler = + new CtrTransportCipher.EncryptionHandler(ctrTransportCipher); + + byte[] data = new byte[CtrTransportCipher.STREAM_BUFFER_SIZE + 1]; + new Random().nextBytes(data); + ByteBuf buf = Unpooled.wrappedBuffer(data); + + ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); + CtrTransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(buf); + while (emsg.transferred() < emsg.count()) { + emsg.transferTo(channel, emsg.transferred()); + } + assertEquals(data.length, channel.length()); + } + } + + @Test + public void testCtrEncryptedMessageWhenTransferringZeroBytes() throws Exception { + try (AuthEngine client = new AuthEngine("appId", "secret", conf); + AuthEngine server = new AuthEngine("appId", "secret", conf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + TransportCipher clientCipher = server.sessionCipher(); + assert(clientCipher instanceof CtrTransportCipher); + CtrTransportCipher ctrTransportCipher = (CtrTransportCipher) clientCipher; + CtrTransportCipher.EncryptionHandler handler = + new CtrTransportCipher.EncryptionHandler(ctrTransportCipher); + int testDataLength = 4; + FileRegion region = mock(FileRegion.class); + when(region.count()).thenReturn((long) testDataLength); + // Make `region.transferTo` do nothing 
in first call and transfer 4 bytes in the second one. + when(region.transferTo(any(), anyLong())).thenAnswer(new Answer() { + + private boolean firstTime = true; + + @Override + public Long answer(InvocationOnMock invocationOnMock) throws Throwable { + if (firstTime) { + firstTime = false; + return 0L; + } else { + WritableByteChannel channel = invocationOnMock.getArgument(0); + channel.write(ByteBuffer.wrap(new byte[testDataLength])); + return (long) testDataLength; + } + } + }); + + CtrTransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(region); + ByteArrayWritableChannel channel = new ByteArrayWritableChannel(testDataLength); + // "transferTo" should act correctly when the underlying FileRegion transfers 0 bytes. + assertEquals(0L, emsg.transferTo(channel, emsg.transferred())); + assertEquals(testDataLength, emsg.transferTo(channel, emsg.transferred())); + assertEquals(emsg.transferred(), emsg.count()); + assertEquals(4, channel.length()); + } + } +} diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/GcmAuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/GcmAuthEngineSuite.java new file mode 100644 index 0000000000000..20efb8d57dcbf --- /dev/null +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/GcmAuthEngineSuite.java @@ -0,0 +1,339 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.network.crypto; + +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import io.netty.channel.ChannelHandlerContext; +import io.netty.channel.ChannelPromise; +import org.apache.spark.network.util.*; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; + +import javax.crypto.AEADBadTagException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class GcmAuthEngineSuite extends AuthEngineSuite { + + @BeforeAll + public static void setUp() { + // Uses GCM mode + conf = getConf(2, false); + } + + @Test + public void testGcmEncryptedMessage() throws Exception { + TransportConf gcmConf = getConf(2, false); + try (AuthEngine client = new AuthEngine("appId", "secret", gcmConf); + AuthEngine server = new AuthEngine("appId", "secret", gcmConf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + TransportCipher clientCipher = server.sessionCipher(); + // Verify that it derives a GcmTransportCipher + assert (clientCipher instanceof GcmTransportCipher); + GcmTransportCipher gcmTransportCipher = (GcmTransportCipher) clientCipher; + GcmTransportCipher.EncryptionHandler encryptionHandler = + gcmTransportCipher.getEncryptionHandler(); + GcmTransportCipher.DecryptionHandler decryptionHandler = + gcmTransportCipher.getDecryptionHandler(); + // Allocating 1.5x the buffer size to test multiple segments and a fractional segment. + int plaintextSegmentSize = GcmTransportCipher.CIPHERTEXT_BUFFER_SIZE - 16; + byte[] data = new byte[plaintextSegmentSize + (plaintextSegmentSize / 2)]; + // Just writing some bytes. 
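For context, the GCM tests below drain the captured GcmEncryptedMessage into a ByteBuffer through the ByteBufferWriteableChannel class added earlier in this diff. A standalone sketch of just that channel's behaviour, independent of the cipher classes (buffer sizes are arbitrary):

    import java.nio.ByteBuffer;
    import org.apache.spark.network.util.ByteBufferWriteableChannel;

    public class ByteBufferChannelExample {
      public static void main(String[] args) throws Exception {
        ByteBuffer destination = ByteBuffer.allocate(8);
        ByteBufferWriteableChannel channel = new ByteBufferWriteableChannel(destination);
        ByteBuffer src = ByteBuffer.wrap(new byte[] {1, 2, 3, 4});
        int written = channel.write(src); // copies min(src.remaining(), destination.remaining())
        assert written == 4 && src.remaining() == 0;
        channel.close();                  // further writes would throw ClosedChannelException
        destination.flip();               // destination now holds the 4 written bytes
      }
    }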
+ data[0] = 'a'; + data[data.length / 2] = 'b'; + data[data.length - 10] = 'c'; + ByteBuf buf = Unpooled.wrappedBuffer(data); + + // Mock the context and capture the arguments passed to it + ChannelHandlerContext ctx = mock(ChannelHandlerContext.class); + ChannelPromise promise = mock(ChannelPromise.class); + ArgumentCaptor captorWrappedEncrypted = + ArgumentCaptor.forClass(GcmTransportCipher.GcmEncryptedMessage.class); + encryptionHandler.write(ctx, buf, promise); + verify(ctx).write(captorWrappedEncrypted.capture(), eq(promise)); + + // Get the encrypted value and pass it to the decryption handler + GcmTransportCipher.GcmEncryptedMessage encrypted = + captorWrappedEncrypted.getValue(); + ByteBuffer ciphertextBuffer = + ByteBuffer.allocate((int) encrypted.count()); + ByteBufferWriteableChannel channel = + new ByteBufferWriteableChannel(ciphertextBuffer); + encrypted.transferTo(channel, 0); + ciphertextBuffer.flip(); + ByteBuf ciphertext = Unpooled.wrappedBuffer(ciphertextBuffer); + + // Capture the decrypted values and verify them + ArgumentCaptor captorPlaintext = ArgumentCaptor.forClass(ByteBuf.class); + decryptionHandler.channelRead(ctx, ciphertext); + verify(ctx, times(2)) + .fireChannelRead(captorPlaintext.capture()); + ByteBuf lastPlaintextSegment = captorPlaintext.getValue(); + assertEquals(plaintextSegmentSize/2, + lastPlaintextSegment.readableBytes()); + assertEquals('c', + lastPlaintextSegment.getByte((plaintextSegmentSize/2) - 10)); + } + } + + static class FakeRegion extends AbstractFileRegion { + private final ByteBuffer[] source; + private int sourcePosition; + private final long count; + + FakeRegion(ByteBuffer... source) { + this.source = source; + sourcePosition = 0; + count = remaining(); + } + + private long remaining() { + long remaining = 0; + for (ByteBuffer buffer : source) { + remaining += buffer.remaining(); + } + return remaining; + } + + @Override + public long position() { + return 0; + } + + @Override + public long transferred() { + return count - remaining(); + } + + @Override + public long count() { + return count; + } + + @Override + public long transferTo(WritableByteChannel target, long position) throws IOException { + if (sourcePosition < source.length) { + ByteBuffer currentBuffer = source[sourcePosition]; + long written = target.write(currentBuffer); + if (!currentBuffer.hasRemaining()) { + sourcePosition++; + } + return written; + } else { + return 0; + } + } + + @Override + protected void deallocate() { + } + } + + private static ByteBuffer getTestByteBuf(int size, byte fill) { + byte[] data = new byte[size]; + Arrays.fill(data, fill); + return ByteBuffer.wrap(data); + } + + @Test + public void testGcmEncryptedMessageFileRegion() throws Exception { + TransportConf gcmConf = getConf(2, false); + try (AuthEngine client = new AuthEngine("appId", "secret", gcmConf); + AuthEngine server = new AuthEngine("appId", "secret", gcmConf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + TransportCipher clientCipher = server.sessionCipher(); + // Verify that it derives a GcmTransportCipher + assert (clientCipher instanceof GcmTransportCipher); + GcmTransportCipher gcmTransportCipher = (GcmTransportCipher) clientCipher; + GcmTransportCipher.EncryptionHandler encryptionHandler = + gcmTransportCipher.getEncryptionHandler(); + GcmTransportCipher.DecryptionHandler decryptionHandler = + gcmTransportCipher.getDecryptionHandler(); + // 
Allocating 1.5x the buffer size to test multiple segments and a fractional segment. + int plaintextSegmentSize = GcmTransportCipher.CIPHERTEXT_BUFFER_SIZE - 16; + int halfSegmentSize = plaintextSegmentSize / 2; + int totalSize = plaintextSegmentSize + halfSegmentSize; + + // Set up some fragmented segments to test + ByteBuffer halfSegment = getTestByteBuf(halfSegmentSize, (byte) 'a'); + int smallFragmentSize = 128; + ByteBuffer smallFragment = getTestByteBuf(smallFragmentSize, (byte) 'b'); + int remainderSize = totalSize - halfSegmentSize - smallFragmentSize; + ByteBuffer remainder = getTestByteBuf(remainderSize, (byte) 'c'); + FakeRegion fakeRegion = new FakeRegion(halfSegment, smallFragment, remainder); + assertEquals(totalSize, fakeRegion.count()); + + ChannelHandlerContext ctx = mock(ChannelHandlerContext.class); + ChannelPromise promise = mock(ChannelPromise.class); + ArgumentCaptor captorWrappedEncrypted = + ArgumentCaptor.forClass(GcmTransportCipher.GcmEncryptedMessage.class); + encryptionHandler.write(ctx, fakeRegion, promise); + verify(ctx).write(captorWrappedEncrypted.capture(), eq(promise)); + + // Get the encrypted value and pass it to the decryption handler + GcmTransportCipher.GcmEncryptedMessage encrypted = + captorWrappedEncrypted.getValue(); + ByteBuffer ciphertextBuffer = + ByteBuffer.allocate((int) encrypted.count()); + ByteBufferWriteableChannel channel = + new ByteBufferWriteableChannel(ciphertextBuffer); + + // We'll simulate the FileRegion only transferring half a segment. + // The encrypted message should buffer the partial segment plaintext. + long ciphertextTransferred = 0; + while (ciphertextTransferred < encrypted.count()) { + long chunkTransferred = encrypted.transferTo(channel, 0); + ciphertextTransferred += chunkTransferred; + } + assertEquals(encrypted.count(), ciphertextTransferred); + + ciphertextBuffer.flip(); + ByteBuf ciphertext = Unpooled.wrappedBuffer(ciphertextBuffer); + + // Capture the decrypted values and verify them + ArgumentCaptor captorPlaintext = ArgumentCaptor.forClass(ByteBuf.class); + decryptionHandler.channelRead(ctx, ciphertext); + verify(ctx, times(2)).fireChannelRead(captorPlaintext.capture()); + ByteBuf plaintext = captorPlaintext.getValue(); + // We expect this to be the last partial plaintext segment + int expectedLength = totalSize % plaintextSegmentSize; + assertEquals(expectedLength, plaintext.readableBytes()); + // This will be the "remainder" segment that is filled to 'c' + assertEquals('c', plaintext.getByte(0)); + } + } + + + @Test + public void testGcmUnalignedDecryption() throws Exception { + TransportConf gcmConf = getConf(2, false); + try (AuthEngine client = new AuthEngine("appId", "secret", gcmConf); + AuthEngine server = new AuthEngine("appId", "secret", gcmConf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + TransportCipher clientCipher = server.sessionCipher(); + // Verify that it derives a GcmTransportCipher + assert (clientCipher instanceof GcmTransportCipher); + GcmTransportCipher gcmTransportCipher = (GcmTransportCipher) clientCipher; + GcmTransportCipher.EncryptionHandler encryptionHandler = + gcmTransportCipher.getEncryptionHandler(); + GcmTransportCipher.DecryptionHandler decryptionHandler = + gcmTransportCipher.getDecryptionHandler(); + // Allocating 1.5x the buffer size to test multiple segments and a fractional segment. 
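+      // Sizing sanity check (assuming a 16-byte GCM tag per segment, which is why the plaintext
+      // segment is CIPHERTEXT_BUFFER_SIZE - 16): 1.5 segments of plaintext encrypt to two
+      // ciphertext segments, so the decryption handler fires channelRead twice and the last
+      // plaintext chunk carries plaintextSegmentSize / 2 bytes -- which is what the assertions
+      // at the end of this test check.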
+ int plaintextSegmentSize = GcmTransportCipher.CIPHERTEXT_BUFFER_SIZE - 16; + int plaintextSize = plaintextSegmentSize + (plaintextSegmentSize / 2); + byte[] data = new byte[plaintextSize]; + Arrays.fill(data, (byte) 'x'); + ByteBuf buf = Unpooled.wrappedBuffer(data); + + // Mock the context and capture the arguments passed to it + ChannelHandlerContext ctx = mock(ChannelHandlerContext.class); + ChannelPromise promise = mock(ChannelPromise.class); + ArgumentCaptor captorWrappedEncrypted = + ArgumentCaptor.forClass(GcmTransportCipher.GcmEncryptedMessage.class); + encryptionHandler.write(ctx, buf, promise); + verify(ctx).write(captorWrappedEncrypted.capture(), eq(promise)); + + // Get the encrypted value and pass it to the decryption handler + GcmTransportCipher.GcmEncryptedMessage encrypted = + captorWrappedEncrypted.getValue(); + ByteBuffer ciphertextBuffer = + ByteBuffer.allocate((int) encrypted.count()); + ByteBufferWriteableChannel channel = + new ByteBufferWriteableChannel(ciphertextBuffer); + encrypted.transferTo(channel, 0); + ciphertextBuffer.flip(); + ByteBuf ciphertext = Unpooled.wrappedBuffer(ciphertextBuffer); + + // Split up the ciphertext into some different sized chunks + int firstChunkSize = plaintextSize / 2; + ByteBuf mockCiphertext = spy(ciphertext); + when(mockCiphertext.readableBytes()) + .thenReturn(firstChunkSize, firstChunkSize).thenCallRealMethod(); + + // Capture the decrypted values and verify them + ArgumentCaptor captorPlaintext = ArgumentCaptor.forClass(ByteBuf.class); + decryptionHandler.channelRead(ctx, mockCiphertext); + verify(ctx, times(2)).fireChannelRead(captorPlaintext.capture()); + ByteBuf lastPlaintextSegment = captorPlaintext.getValue(); + assertEquals(plaintextSegmentSize/2, + lastPlaintextSegment.readableBytes()); + assertEquals('x', + lastPlaintextSegment.getByte((plaintextSegmentSize/2) - 10)); + } + } + + @Test + public void testCorruptGcmEncryptedMessage() throws Exception { + TransportConf gcmConf = getConf(2, false); + + try (AuthEngine client = new AuthEngine("appId", "secret", gcmConf); + AuthEngine server = new AuthEngine("appId", "secret", gcmConf)) { + AuthMessage clientChallenge = client.challenge(); + AuthMessage serverResponse = server.response(clientChallenge); + client.deriveSessionCipher(clientChallenge, serverResponse); + + TransportCipher clientCipher = server.sessionCipher(); + assert (clientCipher instanceof GcmTransportCipher); + + GcmTransportCipher gcmTransportCipher = (GcmTransportCipher) clientCipher; + GcmTransportCipher.EncryptionHandler encryptionHandler = + gcmTransportCipher.getEncryptionHandler(); + GcmTransportCipher.DecryptionHandler decryptionHandler = + gcmTransportCipher.getDecryptionHandler(); + byte[] zeroData = new byte[1024 * 32]; + // Just writing some bytes. 
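The corruption test below flips one byte of the GCM ciphertext and expects AEADBadTagException from the decryption handler. The same property can be demonstrated with plain JCE, independent of GcmTransportCipher; this is only an illustrative sketch of AEAD tamper detection, not Spark's code path:

    import java.security.SecureRandom;
    import javax.crypto.AEADBadTagException;
    import javax.crypto.Cipher;
    import javax.crypto.KeyGenerator;
    import javax.crypto.SecretKey;
    import javax.crypto.spec.GCMParameterSpec;

    public class GcmTamperDemo {
      public static void main(String[] args) throws Exception {
        SecretKey key = KeyGenerator.getInstance("AES").generateKey();
        byte[] iv = new byte[12];
        new SecureRandom().nextBytes(iv);

        Cipher enc = Cipher.getInstance("AES/GCM/NoPadding");
        enc.init(Cipher.ENCRYPT_MODE, key, new GCMParameterSpec(128, iv));
        byte[] ciphertext = enc.doFinal(new byte[32]);

        ciphertext[0] ^= 1; // flip one bit of the ciphertext

        Cipher dec = Cipher.getInstance("AES/GCM/NoPadding");
        dec.init(Cipher.DECRYPT_MODE, key, new GCMParameterSpec(128, iv));
        try {
          dec.doFinal(ciphertext);              // authentication fails
        } catch (AEADBadTagException expected) {
          System.out.println("tamper detected: " + expected);
        }
      }
    }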
+ ByteBuf buf = Unpooled.wrappedBuffer(zeroData); + + // Mock the context and capture the arguments passed to it + ChannelHandlerContext ctx = mock(ChannelHandlerContext.class); + ChannelPromise promise = mock(ChannelPromise.class); + ArgumentCaptor captorWrappedEncrypted = + ArgumentCaptor.forClass(GcmTransportCipher.GcmEncryptedMessage.class); + encryptionHandler.write(ctx, buf, promise); + verify(ctx).write(captorWrappedEncrypted.capture(), eq(promise)); + + GcmTransportCipher.GcmEncryptedMessage encrypted = + captorWrappedEncrypted.getValue(); + ByteBuffer ciphertextBuffer = + ByteBuffer.allocate((int) encrypted.count()); + ByteBufferWriteableChannel channel = + new ByteBufferWriteableChannel(ciphertextBuffer); + encrypted.transferTo(channel, 0); + ciphertextBuffer.flip(); + ByteBuf ciphertext = Unpooled.wrappedBuffer(ciphertextBuffer); + + byte b = ciphertext.getByte(100); + // Inverting the bits of the 100th bit + ciphertext.setByte(100, ~b & 0xFF); + assertThrows(AEADBadTagException.class, () -> decryptionHandler.channelRead(ctx, ciphertext)); + } + } +} diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java index da62d3b2de31a..8977f29034fe0 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java @@ -41,10 +41,10 @@ public class TransportCipherSuite { @Test - public void testBufferNotLeaksOnInternalError() throws IOException { + public void testCtrBufferNotLeaksOnInternalError() throws IOException { String algorithm = "TestAlgorithm"; TransportConf conf = new TransportConf("Test", MapConfigProvider.EMPTY); - TransportCipher cipher = new TransportCipher(conf.cryptoConf(), conf.cipherTransformation(), + CtrTransportCipher cipher = new CtrTransportCipher(conf.cryptoConf(), new SecretKeySpec(new byte[256], algorithm), new byte[0], new byte[0]) { @Override diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/EncryptedMessageWithHeaderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/EncryptedMessageWithHeaderSuite.java index 7478fa1db7113..2865d411bf673 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/protocol/EncryptedMessageWithHeaderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/EncryptedMessageWithHeaderSuite.java @@ -116,7 +116,7 @@ public void testChunkedStream() throws Exception { // Validate we read data correctly assertEquals(bodyResult.readableBytes(), chunkSize); - assert(bodyResult.readableBytes() < (randomData.length - readIndex)); + assertTrue(bodyResult.readableBytes() < (randomData.length - readIndex)); while (bodyResult.readableBytes() > 0) { assertEquals(bodyResult.readByte(), randomData[readIndex++]); } diff --git a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java index aa3891450a933..025be80c5ce4b 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java @@ -27,8 +27,10 @@ import io.netty.channel.ChannelHandlerContext; import 
org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; +// checkstyle.off: RegexpSinglelineJava import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java b/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java index 7253101f41df6..d67f2a3099d35 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java @@ -20,8 +20,10 @@ import java.nio.ByteBuffer; import java.util.concurrent.ConcurrentHashMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.util.JavaUtils; @@ -29,7 +31,8 @@ * A class that manages shuffle secret used by the external shuffle service. */ public class ShuffleSecretManager implements SecretKeyHolder { - private static final Logger logger = LoggerFactory.getLogger(ShuffleSecretManager.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ShuffleSecretManager.class); private final ConcurrentHashMap shuffleSecretMap; @@ -51,7 +54,8 @@ public void registerApp(String appId, String shuffleSecret) { // Otherwise we have to specifically look at the application attempt in addition // to the applicationId since the secrets change between application attempts on yarn. shuffleSecretMap.put(appId, shuffleSecret); - logger.info("Registered shuffle secret for application {}", appId); + logger.info("Registered shuffle secret for application {}", + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } /** @@ -67,7 +71,8 @@ public void registerApp(String appId, ByteBuffer shuffleSecret) { */ public void unregisterApp(String appId) { shuffleSecretMap.remove(appId); - logger.info("Unregistered shuffle secret for application {}", appId); + logger.info("Unregistered shuffle secret for application {}", + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } /** diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/BlockStoreClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/BlockStoreClient.java index 32222e910df06..dcb0a52b0d66c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/BlockStoreClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/BlockStoreClient.java @@ -26,9 +26,11 @@ import java.util.concurrent.CompletableFuture; import com.codahale.metrics.MetricSet; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.TransportClient; @@ -42,7 +44,7 @@ * or external service. 
*/ public abstract class BlockStoreClient implements Closeable { - protected final Logger logger = LoggerFactory.getLogger(this.getClass()); + protected final SparkLogger logger = SparkLoggerFactory.getLogger(this.getClass()); protected volatile TransportClientFactory clientFactory; protected String appId; @@ -170,16 +172,16 @@ public void onSuccess(ByteBuffer response) { hostLocalDirsCompletable.complete( ((LocalDirsForExecutors) msgObj).getLocalDirsByExec()); } catch (Throwable t) { - logger.warn("Error while trying to get the host local dirs for " + - Arrays.toString(getLocalDirsMessage.execIds), t.getCause()); + logger.warn("Error while trying to get the host local dirs for {}", t.getCause(), + MDC.of(LogKeys.EXECUTOR_IDS$.MODULE$, Arrays.toString(getLocalDirsMessage.execIds))); hostLocalDirsCompletable.completeExceptionally(t); } } @Override public void onFailure(Throwable t) { - logger.warn("Error while trying to get the host local dirs for " + - Arrays.toString(getLocalDirsMessage.execIds), t.getCause()); + logger.warn("Error while trying to get the host local dirs for {}", t.getCause(), + MDC.of(LogKeys.EXECUTOR_IDS$.MODULE$, Arrays.toString(getLocalDirsMessage.execIds))); hostLocalDirsCompletable.completeExceptionally(t); } }); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java index 137572da108a4..5d33bfb345a9e 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java @@ -37,9 +37,11 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.MergedBlockMetaResponseCallback; import org.apache.spark.network.client.RpcResponseCallback; @@ -65,7 +67,8 @@ */ public class ExternalBlockHandler extends RpcHandler implements RpcHandler.MergedBlockMetaReqHandler { - private static final Logger logger = LoggerFactory.getLogger(ExternalBlockHandler.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ExternalBlockHandler.class); private static final String SHUFFLE_MERGER_IDENTIFIER = "shuffle-push-merger"; private static final String SHUFFLE_BLOCK_ID = "shuffle"; private static final String SHUFFLE_CHUNK_ID = "shuffleChunk"; @@ -221,7 +224,9 @@ protected void handleMessage( } else if (msgObj instanceof RemoveShuffleMerge msg) { checkAuth(client, msg.appId); logger.info("Removing shuffle merge data for application {} shuffle {} shuffleMerge {}", - msg.appId, msg.shuffleId, msg.shuffleMergeId); + MDC.of(LogKeys.APP_ID$.MODULE$, msg.appId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, msg.shuffleId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, msg.shuffleMergeId)); mergeManager.removeShuffleMerge(msg); } else if (msgObj instanceof DiagnoseCorruption msg) { checkAuth(client, msg.appId); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java 
b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java index 1451d5712812d..97723f77723d4 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java @@ -29,6 +29,8 @@ import com.codahale.metrics.MetricSet; import com.google.common.collect.Lists; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.TransportContext; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.MergedBlockMetaResponseCallback; @@ -103,7 +105,8 @@ private void setComparableAppAttemptId(String appAttemptId) { this.comparableAppAttemptId = Integer.parseInt(appAttemptId); } catch (NumberFormatException e) { logger.warn("Push based shuffle requires comparable application attemptId, " + - "but the appAttemptId {} cannot be parsed to Integer", appAttemptId, e); + "but the appAttemptId {} cannot be parsed to Integer", e, + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, appAttemptId)); } } @@ -217,8 +220,9 @@ public void onFailure(Throwable e) { } }); } catch (Exception e) { - logger.error("Exception while sending finalizeShuffleMerge request to {}:{}", - host, port, e); + logger.error("Exception while sending finalizeShuffleMerge request to {}:{}", e, + MDC.of(LogKeys.HOST$.MODULE$, host), + MDC.of(LogKeys.PORT$.MODULE$, port)); listener.onShuffleMergeFailure(e); } } @@ -316,16 +320,19 @@ public void onSuccess(ByteBuffer response) { BlockTransferMessage msgObj = BlockTransferMessage.Decoder.fromByteBuffer(response); numRemovedBlocksFuture.complete(((BlocksRemoved) msgObj).numRemovedBlocks); } catch (Throwable t) { - logger.warn("Error trying to remove blocks " + Arrays.toString(blockIds) + - " via external shuffle service from executor: " + execId, t); + logger.warn("Error trying to remove blocks {} via external shuffle service from " + + "executor: {}", t, + MDC.of(LogKeys.BLOCK_IDS$.MODULE$, Arrays.toString(blockIds)), + MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, execId)); numRemovedBlocksFuture.complete(0); } } @Override public void onFailure(Throwable e) { - logger.warn("Error trying to remove blocks " + Arrays.toString(blockIds) + - " via external shuffle service from executor: " + execId, e); + logger.warn("Error trying to remove blocks {} via external shuffle service from " + + "executor: {}", e, MDC.of(LogKeys.BLOCK_IDS$.MODULE$, Arrays.toString(blockIds)), + MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, execId)); numRemovedBlocksFuture.complete(0); } }); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java index 429e5f03b9eaf..e43eedd8b25eb 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java @@ -38,9 +38,11 @@ import com.google.common.cache.LoadingCache; import com.google.common.cache.Weigher; import com.google.common.collect.Maps; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import 
org.apache.spark.network.buffer.FileSegmentManagedBuffer; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.shuffle.checksum.Cause; @@ -62,7 +64,8 @@ * from Spark's IndexShuffleBlockResolver. */ public class ExternalShuffleBlockResolver { - private static final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockResolver.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ExternalShuffleBlockResolver.class); private static final ObjectMapper mapper = new ObjectMapper(); @@ -131,7 +134,8 @@ public ShuffleIndexInformation load(String filePath) throws IOException { db = DBProvider.initDB(dbBackend, this.registeredExecutorFile, CURRENT_VERSION, mapper); if (db != null) { logger.info("Use {} as the implementation of {}", - dbBackend, Constants.SHUFFLE_SERVICE_DB_BACKEND); + MDC.of(LogKeys.SHUFFLE_DB_BACKEND_NAME$.MODULE$, dbBackend), + MDC.of(LogKeys.SHUFFLE_DB_BACKEND_KEY$.MODULE$, Constants.SHUFFLE_SERVICE_DB_BACKEND)); executors = reloadRegisteredExecutors(db); } else { executors = Maps.newConcurrentMap(); @@ -149,7 +153,9 @@ public void registerExecutor( String execId, ExecutorShuffleInfo executorInfo) { AppExecId fullId = new AppExecId(appId, execId); - logger.info("Registered executor {} with {}", fullId, executorInfo); + logger.info("Registered executor {} with {}", + MDC.of(LogKeys.APP_EXECUTOR_ID$.MODULE$, fullId), + MDC.of(LogKeys.EXECUTOR_SHUFFLE_INFO$.MODULE$, executorInfo)); try { if (db != null && AppsWithRecoveryDisabled.isRecoveryEnabledForApp(appId)) { byte[] key = dbAppExecKey(fullId); @@ -214,7 +220,9 @@ public ManagedBuffer getRddBlockData( * this method. */ public void applicationRemoved(String appId, boolean cleanupLocalDirs) { - logger.info("Application {} removed, cleanupLocalDirs = {}", appId, cleanupLocalDirs); + logger.info("Application {} removed, cleanupLocalDirs = {}", + MDC.of(LogKeys.APP_ID$.MODULE$, appId), + MDC.of(LogKeys.CLEANUP_LOCAL_DIRS$.MODULE$, cleanupLocalDirs)); Iterator> it = executors.entrySet().iterator(); while (it.hasNext()) { Map.Entry entry = it.next(); @@ -228,12 +236,15 @@ public void applicationRemoved(String appId, boolean cleanupLocalDirs) { try { db.delete(dbAppExecKey(fullId)); } catch (IOException e) { - logger.error("Error deleting {} from executor state db", appId, e); + logger.error("Error deleting {} from executor state db", e, + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } } if (cleanupLocalDirs) { - logger.info("Cleaning up executor {}'s {} local dirs", fullId, executor.localDirs.length); + logger.info("Cleaning up executor {}'s {} local dirs", + MDC.of(LogKeys.APP_EXECUTOR_ID$.MODULE$, fullId), + MDC.of(LogKeys.NUM_LOCAL_DIRS$.MODULE$, executor.localDirs.length)); // Execute the actual deletion in a different thread, as it may take some time. directoryCleaner.execute(() -> deleteExecutorDirs(executor.localDirs)); @@ -248,15 +259,18 @@ public void applicationRemoved(String appId, boolean cleanupLocalDirs) { */ public void executorRemoved(String executorId, String appId) { logger.info("Clean up non-shuffle and non-RDD files associated with the finished executor {}", - executorId); + MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, executorId)); AppExecId fullId = new AppExecId(appId, executorId); final ExecutorShuffleInfo executor = executors.get(fullId); if (executor == null) { // Executor not registered, skip clean up of the local directories. 
- logger.info("Executor is not registered (appId={}, execId={})", appId, executorId); + logger.info("Executor is not registered (appId={}, execId={})", + MDC.of(LogKeys.APP_ID$.MODULE$, appId), + MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, executorId)); } else { logger.info("Cleaning up non-shuffle and non-RDD files in executor {}'s {} local dirs", - fullId, executor.localDirs.length); + MDC.of(LogKeys.APP_EXECUTOR_ID$.MODULE$, fullId), + MDC.of(LogKeys.NUM_LOCAL_DIRS$.MODULE$, executor.localDirs.length)); // Execute the actual deletion in a different thread, as it may take some time. directoryCleaner.execute(() -> deleteNonShuffleServiceServedFiles(executor.localDirs)); @@ -273,7 +287,8 @@ private void deleteExecutorDirs(String[] dirs) { JavaUtils.deleteRecursively(new File(localDir)); logger.debug("Successfully cleaned up directory: {}", localDir); } catch (Exception e) { - logger.error("Failed to delete directory: " + localDir, e); + logger.error("Failed to delete directory: {}", e, + MDC.of(LogKeys.PATH$.MODULE$, localDir)); } } } @@ -295,8 +310,8 @@ private void deleteNonShuffleServiceServedFiles(String[] dirs) { logger.debug("Successfully cleaned up files not served by shuffle service in directory: {}", localDir); } catch (Exception e) { - logger.error("Failed to delete files not served by shuffle service in directory: " - + localDir, e); + logger.error("Failed to delete files not served by shuffle service in directory: {}", e, + MDC.of(LogKeys.PATH$.MODULE$, localDir)); } } } @@ -368,7 +383,8 @@ public int removeBlocks(String appId, String execId, String[] blockIds) { if (file.delete()) { numRemovedBlocks++; } else { - logger.warn("Failed to delete block: " + file.getAbsolutePath()); + logger.warn("Failed to delete block: {}", + MDC.of(LogKeys.PATH$.MODULE$, file.getAbsolutePath())); } } return numRemovedBlocks; @@ -472,7 +488,8 @@ static ConcurrentMap reloadRegisteredExecutors(D break; } AppExecId id = parseDbAppExecKey(key); - logger.info("Reloading registered executors: " + id.toString()); + logger.info("Reloading registered executors: {}", + MDC.of(LogKeys.APP_EXECUTOR_ID$.MODULE$, id)); ExecutorShuffleInfo shuffleInfo = mapper.readValue(e.getValue(), ExecutorShuffleInfo.class); registeredExecutors.put(id, shuffleInfo); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index b93db3f570b86..c5c6ab313e193 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -26,9 +26,9 @@ import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.ChunkReceivedCallback; import org.apache.spark.network.client.RpcResponseCallback; @@ -53,7 +53,8 @@ * {@link org.apache.spark.network.server.OneForOneStreamManager} on the server side. 
*/ public class OneForOneBlockFetcher { - private static final Logger logger = LoggerFactory.getLogger(OneForOneBlockFetcher.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(OneForOneBlockFetcher.class); private static final String SHUFFLE_BLOCK_PREFIX = "shuffle_"; private static final String SHUFFLE_CHUNK_PREFIX = "shuffleChunk_"; private static final String SHUFFLE_BLOCK_SPLIT = "shuffle"; diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockPusher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockPusher.java index 8885dc9f2e2c5..d90ca1a88a267 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockPusher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockPusher.java @@ -22,9 +22,9 @@ import java.util.Map; import com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NioManagedBuffer; import org.apache.spark.network.client.RpcResponseCallback; @@ -44,7 +44,8 @@ * @since 3.1.0 */ public class OneForOneBlockPusher { - private static final Logger logger = LoggerFactory.getLogger(OneForOneBlockPusher.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(OneForOneBlockPusher.class); private static final ErrorHandler PUSH_ERROR_HANDLER = new ErrorHandler.BlockPushErrorHandler(); public static final String SHUFFLE_PUSH_BLOCK_PREFIX = "shufflePush"; diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java index 5f9576843b476..02a38eac5b409 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java @@ -64,9 +64,11 @@ import com.google.common.primitives.Longs; import org.roaringbitmap.RoaringBitmap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.FileSegmentManagedBuffer; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.StreamCallbackWithID; @@ -96,7 +98,8 @@ public class RemoteBlockPushResolver implements MergedShuffleFileManager { private static final Cleaner CLEANER = Cleaner.create(); - private static final Logger logger = LoggerFactory.getLogger(RemoteBlockPushResolver.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(RemoteBlockPushResolver.class); public static final String MERGED_SHUFFLE_FILE_NAME_PREFIX = "shuffleMerged"; public static final String SHUFFLE_META_DELIMITER = ":"; @@ -184,7 +187,8 @@ public ShuffleIndexInformation load(String filePath) throws IOException { db = DBProvider.initDB(dbBackend, this.recoveryFile, CURRENT_VERSION, mapper); if (db != null) { logger.info("Use {} as the implementation of {}", - dbBackend, Constants.SHUFFLE_SERVICE_DB_BACKEND); + MDC.of(LogKeys.SHUFFLE_DB_BACKEND_NAME$.MODULE$, 
dbBackend), + MDC.of(LogKeys.SHUFFLE_DB_BACKEND_KEY$.MODULE$, Constants.SHUFFLE_SERVICE_DB_BACKEND)); reloadAndCleanUpAppShuffleInfo(db); } this.pushMergeMetrics = new PushMergeMetrics(); @@ -229,8 +233,11 @@ AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo( shuffles.compute(shuffleId, (id, mergePartitionsInfo) -> { if (mergePartitionsInfo == null) { logger.info("{} attempt {} shuffle {} shuffleMerge {}: creating a new shuffle " + - "merge metadata", appShuffleInfo.appId, appShuffleInfo.attemptId, shuffleId, - shuffleMergeId); + "merge metadata", + MDC.of(LogKeys.APP_ID$.MODULE$, appShuffleInfo.appId), + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, appShuffleInfo.attemptId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, shuffleId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId)); return new AppShuffleMergePartitionsInfo(shuffleMergeId, false); } else { int latestShuffleMergeId = mergePartitionsInfo.shuffleMergeId; @@ -248,8 +255,11 @@ AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo( new AppAttemptShuffleMergeId(appShuffleInfo.appId, appShuffleInfo.attemptId, shuffleId, latestShuffleMergeId); logger.info("{}: creating a new shuffle merge metadata since received " + - "shuffleMergeId {} is higher than latest shuffleMergeId {}", - currrentAppAttemptShuffleMergeId, shuffleMergeId, latestShuffleMergeId); + "shuffleMergeId {} is higher than latest shuffleMergeId {}", + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, + currrentAppAttemptShuffleMergeId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId), + MDC.of(LogKeys.LATEST_SHUFFLE_MERGE_ID$.MODULE$, latestShuffleMergeId)); submitCleanupTask(() -> closeAndDeleteOutdatedPartitions(currrentAppAttemptShuffleMergeId, mergePartitionsInfo.shuffleMergePartitions)); @@ -282,9 +292,14 @@ AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo( dataFile, indexFile, metaFile); } catch (IOException e) { logger.error("{} attempt {} shuffle {} shuffleMerge {}: cannot create merged shuffle " + - "partition with data file {}, index file {}, and meta file {}", appShuffleInfo.appId, - appShuffleInfo.attemptId, shuffleId, shuffleMergeId, dataFile.getAbsolutePath(), - indexFile.getAbsolutePath(), metaFile.getAbsolutePath()); + "partition with data file {}, index file {}, and meta file {}", + MDC.of(LogKeys.APP_ID$.MODULE$, appShuffleInfo.appId), + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, appShuffleInfo.attemptId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, shuffleId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId), + MDC.of(LogKeys.DATA_FILE$.MODULE$, dataFile.getAbsolutePath()), + MDC.of(LogKeys.INDEX_FILE$.MODULE$, indexFile.getAbsolutePath()), + MDC.of(LogKeys.META_FILE$.MODULE$, metaFile.getAbsolutePath())); throw new RuntimeException( String.format("Cannot initialize merged shuffle partition for appId %s shuffleId %s " + "shuffleMergeId %s reduceId %s", appShuffleInfo.appId, shuffleId, shuffleMergeId, @@ -395,7 +410,9 @@ private void removeOldApplicationAttemptsFromDb(AppShuffleInfo info) { @Override public void applicationRemoved(String appId, boolean cleanupLocalDirs) { - logger.info("Application {} removed, cleanupLocalDirs = {}", appId, cleanupLocalDirs); + logger.info("Application {} removed, cleanupLocalDirs = {}", + MDC.of(LogKeys.APP_ID$.MODULE$, appId), + MDC.of(LogKeys.CLEANUP_LOCAL_DIRS$.MODULE$, cleanupLocalDirs)); // Cleanup the DB within critical section to gain the consistency between // DB and in-memory hashmap. 
AtomicReference ref = new AtomicReference<>(null); @@ -505,8 +522,8 @@ void removeAppAttemptPathInfoFromDB(String appId, int attemptId) { byte[] key = getDbAppAttemptPathsKey(appAttemptId); db.delete(key); } catch (Exception e) { - logger.error("Failed to remove the application attempt {} local path in DB", - appAttemptId, e); + logger.error("Failed to remove the application attempt {} local path in DB", e, + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, appAttemptId)); } } } @@ -576,7 +593,10 @@ void deleteMergedFiles( } } logger.info("Delete {} data files, {} index files, {} meta files for {}", - dataFilesDeleteCnt, indexFilesDeleteCnt, metaFilesDeleteCnt, appAttemptShuffleMergeId); + MDC.of(LogKeys.NUM_DATA_FILES$.MODULE$, dataFilesDeleteCnt), + MDC.of(LogKeys.NUM_INDEX_FILES$.MODULE$, indexFilesDeleteCnt), + MDC.of(LogKeys.NUM_META_FILES$.MODULE$, metaFilesDeleteCnt), + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId)); } /** @@ -588,8 +608,8 @@ void removeAppShufflePartitionInfoFromDB(AppAttemptShuffleMergeId appAttemptShuf try { db.delete(getDbAppAttemptShufflePartitionKey(appAttemptShuffleMergeId)); } catch (Exception e) { - logger.error("Error deleting {} from application shuffle merged partition info in DB", - appAttemptShuffleMergeId, e); + logger.error("Error deleting {} from application shuffle merged partition info in DB", e, + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId)); } } } @@ -608,7 +628,8 @@ void deleteExecutorDirs(AppShuffleInfo appShuffleInfo) { logger.debug("Successfully cleaned up directory: {}", localDir); } } catch (Exception e) { - logger.error("Failed to delete directory: {}", localDir, e); + logger.error("Failed to delete directory: {}", e, + MDC.of(LogKeys.PATH$.MODULE$, localDir)); } } } @@ -738,7 +759,10 @@ public ByteBuffer getCompletionResponse() { @Override public MergeStatuses finalizeShuffleMerge(FinalizeShuffleMerge msg) { logger.info("{} attempt {} shuffle {} shuffleMerge {}: finalize shuffle merge", - msg.appId, msg.appAttemptId, msg.shuffleId, msg.shuffleMergeId); + MDC.of(LogKeys.APP_ID$.MODULE$, msg.appId), + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, msg.appAttemptId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, msg.shuffleId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, msg.shuffleMergeId)); AppShuffleInfo appShuffleInfo = validateAndGetAppShuffleInfo(msg.appId); if (appShuffleInfo.attemptId != msg.appAttemptId) { // If finalizeShuffleMerge from a former application attempt, it is considered late, @@ -821,9 +845,13 @@ public MergeStatuses finalizeShuffleMerge(FinalizeShuffleMerge msg) { } } catch (IOException ioe) { logger.warn("{} attempt {} shuffle {} shuffleMerge {}: exception while " + - "finalizing shuffle partition {}. Exception message: {}", msg.appId, - msg.appAttemptId, msg.shuffleId, msg.shuffleMergeId, partition.reduceId, - ioe.getMessage()); + "finalizing shuffle partition {}. 
Exception message: {}", + MDC.of(LogKeys.APP_ID$.MODULE$, msg.appId), + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, msg.appAttemptId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, msg.shuffleId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, msg.shuffleMergeId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, partition.reduceId), + MDC.of(LogKeys.EXCEPTION$.MODULE$, ioe.getMessage())); } finally { partition.cleanable.clean(); } @@ -835,7 +863,10 @@ public MergeStatuses finalizeShuffleMerge(FinalizeShuffleMerge msg) { appShuffleInfo.shuffles.get(msg.shuffleId).setReduceIds(Ints.toArray(reduceIds)); } logger.info("{} attempt {} shuffle {} shuffleMerge {}: finalization of shuffle merge completed", - msg.appId, msg.appAttemptId, msg.shuffleId, msg.shuffleMergeId); + MDC.of(LogKeys.APP_ID$.MODULE$, msg.appId), + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, msg.appAttemptId), + MDC.of(LogKeys.SHUFFLE_ID$.MODULE$, msg.shuffleId), + MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, msg.shuffleMergeId)); return mergeStatuses; } @@ -903,7 +934,8 @@ public void registerExecutor(String appId, ExecutorShuffleInfo executorInfo) { if (originalAppShuffleInfo.get() != null) { AppShuffleInfo appShuffleInfo = originalAppShuffleInfo.get(); logger.warn("Cleanup shuffle info and merged shuffle files for {}_{} as new " + - "application attempt registered", appId, appShuffleInfo.attemptId); + "application attempt registered", MDC.of(LogKeys.APP_ID$.MODULE$, appId), + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, appShuffleInfo.attemptId)); // Clean up all the merge shuffle related information in the DB for the former attempt submitCleanupTask( () -> closeAndDeletePartitionsIfNeeded(appShuffleInfo, true) @@ -959,11 +991,13 @@ private void shutdownMergedShuffleCleanerNow() { try { List unfinishedTasks = mergedShuffleCleaner.shutdownNow(); logger.warn("There are still {} tasks not completed in mergedShuffleCleaner " + - "after {} seconds.", unfinishedTasks.size(), cleanerShutdownTimeout); + "after {} ms.", + MDC.of(LogKeys.COUNT$.MODULE$, unfinishedTasks.size()), + MDC.of(LogKeys.TIMEOUT$.MODULE$, cleanerShutdownTimeout * 1000L)); // Wait a while for tasks to respond to being cancelled if (!mergedShuffleCleaner.awaitTermination(cleanerShutdownTimeout, TimeUnit.SECONDS)) { - logger.warn("mergedShuffleCleaner did not terminate in {} seconds.", - cleanerShutdownTimeout); + logger.warn("mergedShuffleCleaner did not terminate in {} ms.", + MDC.of(LogKeys.TIMEOUT$.MODULE$, cleanerShutdownTimeout * 1000L)); } } catch (InterruptedException ignored) { Thread.currentThread().interrupt(); @@ -982,7 +1016,8 @@ private void writeAppPathsInfoToDb(String appId, int attemptId, AppPathsInfo app byte[] value = valueStr.getBytes(StandardCharsets.UTF_8); db.put(key, value); } catch (Exception e) { - logger.error("Error saving registered app paths info for {}", appAttemptId, e); + logger.error("Error saving registered app paths info for {}", e, + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, appAttemptId)); } } } @@ -999,7 +1034,8 @@ private void writeAppAttemptShuffleMergeInfoToDB( byte[] dbKey = getDbAppAttemptShufflePartitionKey(appAttemptShuffleMergeId); db.put(dbKey, new byte[0]); } catch (Exception e) { - logger.error("Error saving active app shuffle partition {}", appAttemptShuffleMergeId, e); + logger.error("Error saving active app shuffle partition {}", e, + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId)); } } } @@ -1100,7 +1136,8 @@ List reloadActiveAppAttemptsPathInfo(DB db) throws IOException { // Add the former outdated DB key to deletion 
list dbKeysToBeRemoved.add(getDbAppAttemptPathsKey(existingAppAttemptId)); } catch (IOException e) { - logger.error("Failed to get the DB key for {}", existingAppAttemptId, e); + logger.error("Failed to get the DB key for {}", e, + MDC.of(LogKeys.APP_ATTEMPT_ID$.MODULE$, existingAppAttemptId)); } } return new AppShuffleInfo( @@ -1149,8 +1186,8 @@ List reloadFinalizedAppAttemptsShuffleMergeInfo(DB db) throws IOExceptio dbKeysToBeRemoved.add( getDbAppAttemptShufflePartitionKey(appAttemptShuffleMergeId)); } catch (Exception e) { - logger.error("Error getting the DB key for {}", - appAttemptShuffleMergeId, e); + logger.error("Error getting the DB key for {}", e, MDC.of( + LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId)); } } return new AppShuffleMergePartitionsInfo(partitionId.shuffleMergeId, true); @@ -1178,7 +1215,8 @@ void removeOutdatedKeyValuesInDB(List dbKeysToBeRemoved) { try { db.delete(key); } catch (Exception e) { - logger.error("Error deleting dangling key {} in DB", key, e); + logger.error("Error deleting dangling key {} in DB", e, + MDC.of(LogKeys.KEY$.MODULE$, key)); } } ); @@ -1560,7 +1598,8 @@ public void onComplete(String streamId) throws IOException { @Override public void onFailure(String streamId, Throwable throwable) throws IOException { if (ERROR_HANDLER.shouldLogError(throwable)) { - logger.error("Encountered issue when merging {}", streamId, throwable); + logger.error("Encountered issue when merging {}", throwable, + MDC.of(LogKeys.STREAM_ID$.MODULE$, streamId)); } else { logger.debug("Encountered issue when merging {}", streamId, throwable); } @@ -1821,7 +1860,8 @@ void updateChunkInfo(long chunkOffset, int mapIndex) throws IOException { indexMetaUpdateFailed = false; } catch (IOException ioe) { logger.warn("{} reduceId {} update to index/meta failed", - appAttemptShuffleMergeId, reduceId); + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, reduceId)); indexMetaUpdateFailed = true; // Any exception here is propagated to the caller and the caller can decide whether to // abort or not. 
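One detail worth calling out from the RemoteBlockPushResolver hunks just above: only the error/warn/info calls are rewritten to MDC-tagged arguments, while the debug call in the same onFailure handler keeps the plain slf4j-style parameter list. A short sketch of that split, with placeholder names:

import org.apache.spark.internal.LogKeys;
import org.apache.spark.internal.MDC;
import org.apache.spark.internal.SparkLogger;
import org.apache.spark.internal.SparkLoggerFactory;

public class LevelSplitSketch {  // placeholder name, not part of the patch
  private static final SparkLogger logger =
    SparkLoggerFactory.getLogger(LevelSplitSketch.class);

  void onMergeFailure(String streamId, Throwable t, boolean shouldLogError) {
    if (shouldLogError) {
      // Structured form: throwable first, then the MDC binding for the placeholder.
      logger.error("Encountered issue when merging {}", t,
        MDC.of(LogKeys.STREAM_ID$.MODULE$, streamId));
    } else {
      // The debug-level call is left untouched by the patch: positional args, throwable last.
      logger.debug("Encountered issue when merging {}", streamId, t);
    }
  }
}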
@@ -1873,7 +1913,8 @@ private void finalizePartition() throws IOException { private void deleteAllFiles() { if (!dataFile.delete()) { logger.info("Error deleting data file for {} reduceId {}", - appAttemptShuffleMergeId, reduceId); + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, reduceId)); } metaFile.delete(); indexFile.delete(); @@ -1942,19 +1983,22 @@ private void closeAllFiles( } } catch (IOException ioe) { logger.warn("Error closing data channel for {} reduceId {}", - appAttemptShuffleMergeId, reduceId); + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, reduceId)); } try { metaFile.close(); } catch (IOException ioe) { logger.warn("Error closing meta file for {} reduceId {}", - appAttemptShuffleMergeId, reduceId); + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, reduceId)); } try { indexFile.close(); } catch (IOException ioe) { logger.warn("Error closing index file for {} reduceId {}", - appAttemptShuffleMergeId, reduceId); + MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$, appAttemptShuffleMergeId), + MDC.of(LogKeys.REDUCE_ID$.MODULE$, reduceId)); } } } @@ -1999,7 +2043,9 @@ private AppPathsInfo( this.subDirsPerLocalDir = subDirsPerLocalDir; if (logger.isInfoEnabled()) { logger.info("Updated active local dirs {} and sub dirs {} for application {}", - Arrays.toString(activeLocalDirs),subDirsPerLocalDir, appId); + MDC.of(LogKeys.PATHS$.MODULE$, Arrays.toString(activeLocalDirs)), + MDC.of(LogKeys.NUM_SUB_DIRS$.MODULE$, subDirsPerLocalDir), + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockTransferor.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockTransferor.java index c628b201b2027..31c454f63a92e 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockTransferor.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockTransferor.java @@ -28,9 +28,11 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Sets; import com.google.common.util.concurrent.Uninterruptibles; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.sasl.SaslTimeoutException; import org.apache.spark.network.util.NettyUtils; @@ -68,7 +70,8 @@ void createAndStart(String[] blockIds, BlockTransferListener listener) private static final ExecutorService executorService = Executors.newCachedThreadPool( NettyUtils.createThreadFactory("Block Transfer Retry")); - private static final Logger logger = LoggerFactory.getLogger(RetryingBlockTransferor.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(RetryingBlockTransferor.class); /** Used to initiate new Block transfer on our remaining blocks. 
*/ private final BlockTransferStarter transferStarter; @@ -177,10 +180,16 @@ private void transferAllOutstanding() { try { transferStarter.createAndStart(blockIdsToTransfer, myListener); } catch (Exception e) { - logger.error(String.format("Exception while beginning %s of %s outstanding blocks %s", - listener.getTransferType(), blockIdsToTransfer.length, - numRetries > 0 ? "(after " + numRetries + " retries)" : ""), e); - + if (numRetries > 0) { + logger.error("Exception while beginning {} of {} outstanding blocks (after {} retries)", e, + MDC.of(LogKeys.TRANSFER_TYPE$.MODULE$, listener.getTransferType()), + MDC.of(LogKeys.NUM_BLOCKS$.MODULE$, blockIdsToTransfer.length), + MDC.of(LogKeys.NUM_RETRY$.MODULE$, numRetries)); + } else { + logger.error("Exception while beginning {} of {} outstanding blocks", e, + MDC.of(LogKeys.TRANSFER_TYPE$.MODULE$, listener.getTransferType()), + MDC.of(LogKeys.NUM_BLOCKS$.MODULE$, blockIdsToTransfer.length)); + } if (shouldRetry(e) && initiateRetry(e)) { // successfully initiated a retry return; @@ -207,8 +216,11 @@ synchronized boolean initiateRetry(Throwable e) { currentListener = new RetryingBlockTransferListener(); logger.info("Retrying {} ({}/{}) for {} outstanding blocks after {} ms", - listener.getTransferType(), retryCount, maxRetries, outstandingBlocksIds.size(), - retryWaitTime); + MDC.of(LogKeys.TRANSFER_TYPE$.MODULE$, listener.getTransferType()), + MDC.of(LogKeys.NUM_RETRY$.MODULE$, retryCount), + MDC.of(LogKeys.MAX_ATTEMPTS$.MODULE$, maxRetries), + MDC.of(LogKeys.NUM_BLOCKS$.MODULE$, outstandingBlocksIds.size()), + MDC.of(LogKeys.RETRY_WAIT_TIME$.MODULE$, retryWaitTime)); try { executorService.execute(() -> { @@ -298,9 +310,10 @@ private void handleBlockTransferFailure(String blockId, Throwable exception) { } } else { if (errorHandler.shouldLogError(exception)) { - logger.error( - String.format("Failed to %s block %s, and will not retry (%s retries)", - listener.getTransferType(), blockId, retryCount), exception); + logger.error("Failed to {} block {}, and will not retry ({} retries)", exception, + MDC.of(LogKeys.TRANSFER_TYPE$.MODULE$, listener.getTransferType()), + MDC.of(LogKeys.BLOCK_ID$.MODULE$, blockId), + MDC.of(LogKeys.NUM_RETRY$.MODULE$,retryCount)); } else { logger.debug( String.format("Failed to %s block %s, and will not retry (%s retries)", diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleTransportContext.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleTransportContext.java index a0794113a080d..705d47aab3b50 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleTransportContext.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleTransportContext.java @@ -29,9 +29,8 @@ import io.netty.channel.socket.SocketChannel; import io.netty.handler.codec.MessageToMessageDecoder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.network.TransportContext; import org.apache.spark.network.protocol.Message; import org.apache.spark.network.protocol.MessageDecoder; @@ -52,7 +51,8 @@ * are processed in the separate handlers. 
* */ public class ShuffleTransportContext extends TransportContext { - private static final Logger logger = LoggerFactory.getLogger(ShuffleTransportContext.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ShuffleTransportContext.class); private static final ShuffleMessageDecoder SHUFFLE_DECODER = new ShuffleMessageDecoder(MessageDecoder.INSTANCE); private final EventLoopGroup finalizeWorkers; @@ -158,7 +158,7 @@ record RpcRequestInternal(BlockTransferMessage.Type messageType, RpcRequest rpcR } static class FinalizedHandler extends SimpleChannelInboundHandler { - private static final Logger logger = LoggerFactory.getLogger(FinalizedHandler.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(FinalizedHandler.class); public static final String HANDLER_NAME = "finalizeHandler"; private final TransportRequestHandler transportRequestHandler; diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java index 1feac49752c8f..f9c0c60c2f2c6 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/checksum/ShuffleChecksumHelper.java @@ -25,9 +25,11 @@ import java.util.zip.Checksum; import com.google.common.io.ByteStreams; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.annotation.Private; import org.apache.spark.network.buffer.ManagedBuffer; @@ -36,8 +38,8 @@ */ @Private public class ShuffleChecksumHelper { - private static final Logger logger = - LoggerFactory.getLogger(ShuffleChecksumHelper.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ShuffleChecksumHelper.class); public static final int CHECKSUM_CALCULATION_BUFFER = 8192; public static final Checksum[] EMPTY_CHECKSUM = new Checksum[0]; @@ -149,7 +151,8 @@ public static Cause diagnoseCorruption( cause = Cause.UNSUPPORTED_CHECKSUM_ALGORITHM; } catch (FileNotFoundException e) { // Even if checksum is enabled, a checksum file may not exist if error throws during writing. 
- logger.warn("Checksum file " + checksumFile.getName() + " doesn't exit"); + logger.warn("Checksum file {} doesn't exit", + MDC.of(LogKeys.PATH$.MODULE$, checksumFile.getName())); cause = Cause.UNKNOWN_ISSUE; } catch (Exception e) { logger.warn("Unable to diagnose shuffle block corruption", e); @@ -162,7 +165,9 @@ public static Cause diagnoseCorruption( checksumByReader, checksumByWriter, checksumByReCalculation); } else { logger.info("Shuffle corruption diagnosis took {} ms, checksum file {}, cause {}", - duration, checksumFile.getAbsolutePath(), cause); + MDC.of(LogKeys.TIME$.MODULE$, duration), + MDC.of(LogKeys.PATH$.MODULE$, checksumFile.getAbsolutePath()), + MDC.of(LogKeys.REASON$.MODULE$, cause)); } return cause; } diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RemoteBlockPushResolverSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RemoteBlockPushResolverSuite.java index fbde165fb39c9..edd5e1961a501 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RemoteBlockPushResolverSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RemoteBlockPushResolverSuite.java @@ -47,8 +47,10 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.roaringbitmap.RoaringBitmap; +// checkstyle.off: RegexpSinglelineJava import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava import static org.junit.jupiter.api.Assertions.*; diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockTransferorSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockTransferorSuite.java index 3725973ae7333..84c8b1b3353f2 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockTransferorSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockTransferorSuite.java @@ -288,7 +288,7 @@ public void testRetryOnSaslTimeout() throws IOException, InterruptedException { verify(listener, timeout(5000)).onBlockTransferSuccess("b0", block0); verify(listener).getTransferType(); verifyNoMoreInteractions(listener); - assert(_retryingBlockTransferor.getRetryCount() == 0); + assertEquals(0, _retryingBlockTransferor.getRetryCount()); } @Test @@ -310,7 +310,7 @@ public void testRepeatedSaslRetryFailures() throws IOException, InterruptedExcep verify(listener, timeout(5000)).onBlockTransferFailure("b0", saslTimeoutException); verify(listener, times(3)).getTransferType(); verifyNoMoreInteractions(listener); - assert(_retryingBlockTransferor.getRetryCount() == MAX_RETRIES); + assertEquals(MAX_RETRIES, _retryingBlockTransferor.getRetryCount()); } @Test @@ -339,7 +339,7 @@ public void testBlockTransferFailureAfterSasl() throws IOException, InterruptedE // This should be equal to 1 because after the SASL exception is retried, // retryCount should be set back to 0. Then after that b1 encounters an // exception that is retried. 
- assert(_retryingBlockTransferor.getRetryCount() == 1); + assertEquals(1, _retryingBlockTransferor.getRetryCount()); } @Test @@ -368,7 +368,7 @@ public void testIOExceptionFailsConnectionEvenWithSaslException() verify(listener, timeout(5000)).onBlockTransferFailure("b0", saslExceptionFinal); verify(listener, atLeastOnce()).getTransferType(); verifyNoMoreInteractions(listener); - assert(_retryingBlockTransferor.getRetryCount() == MAX_RETRIES); + assertEquals(MAX_RETRIES, _retryingBlockTransferor.getRetryCount()); } @Test diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java index 4068224665597..eeb936773aaad 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java @@ -28,8 +28,10 @@ import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; import org.apache.spark.network.util.JavaUtils; import org.junit.jupiter.api.Assertions; +// checkstyle.off: RegexpSinglelineJava import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava /** * Manages some sort-shuffle data, including the creation diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index 4cfbd8e96ac61..e0af3c5ae2468 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -53,9 +53,11 @@ import org.apache.spark.network.shuffledb.StoreVersion; import org.apache.spark.network.util.DBProvider; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.network.TransportContext; import org.apache.spark.network.crypto.AuthServerBootstrap; import org.apache.spark.network.sasl.ShuffleSecretManager; @@ -100,8 +102,9 @@ * This {@code classpath} configuration is only supported on YARN versions >= 2.9.0. 
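The RetryingBlockTransferorSuite hunks above also replace bare Java assert statements with JUnit's assertEquals. That is more than a style change: assert is skipped entirely unless the JVM runs with -ea, and on failure it reports nothing about the values involved, whereas assertEquals always executes and prints the expected and actual retry counts. A minimal sketch (the class and method are placeholders, not part of the patch):

import static org.junit.jupiter.api.Assertions.assertEquals;

public class RetryCountAssertionSketch {  // placeholder
  void checkRetryCount(int actualRetryCount) {
    // assert actualRetryCount == 0;    // silently skipped without -ea, opaque on failure
    assertEquals(0, actualRetryCount);  // always runs, reports expected vs. actual
  }
}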
*/ public class YarnShuffleService extends AuxiliaryService { - private static final Logger defaultLogger = LoggerFactory.getLogger(YarnShuffleService.class); - private Logger logger = defaultLogger; + private static final SparkLogger defaultSparkLogger = + SparkLoggerFactory.getLogger(YarnShuffleService.class); + private SparkLogger logger = defaultSparkLogger; // Port on which the shuffle server listens for fetch requests private static final String SPARK_SHUFFLE_SERVICE_PORT_KEY = "spark.shuffle.service.port"; @@ -237,14 +240,14 @@ protected void serviceInit(Configuration externalConf) throws Exception { .getResource(SHUFFLE_SERVICE_CONF_OVERLAY_RESOURCE_NAME); if (confOverlayUrl != null) { logger.info("Initializing Spark YARN shuffle service with configuration overlay from {}", - confOverlayUrl); + MDC.of(LogKeys.SHUFFLE_SERVICE_CONF_OVERLAY_URL$.MODULE$, confOverlayUrl)); _conf.addResource(confOverlayUrl); } String logsNamespace = _conf.get(SPARK_SHUFFLE_SERVICE_LOGS_NAMESPACE_KEY, ""); if (!logsNamespace.isEmpty()) { String className = YarnShuffleService.class.getName(); - logger = LoggerFactory.getLogger(className + "." + logsNamespace); + logger = SparkLoggerFactory.getLogger(className + "." + logsNamespace); } super.serviceInit(_conf); @@ -262,7 +265,8 @@ protected void serviceInit(Configuration externalConf) throws Exception { DBBackend.ROCKSDB.name()); dbBackend = DBBackend.byName(dbBackendName); logger.info("Use {} as the implementation of {}", - dbBackend, Constants.SHUFFLE_SERVICE_DB_BACKEND); + MDC.of(LogKeys.SHUFFLE_DB_BACKEND_NAME$.MODULE$, dbBackend), + MDC.of(LogKeys.SHUFFLE_DB_BACKEND_KEY$.MODULE$, Constants.SHUFFLE_SERVICE_DB_BACKEND)); } try { @@ -326,11 +330,12 @@ protected void serviceInit(Configuration externalConf) throws Exception { "PushBasedShuffleMergeManager", "Metrics on the push-based shuffle merge manager", mergeManagerMetrics); logger.info("Registered metrics with Hadoop's DefaultMetricsSystem using namespace '{}'", - metricsNamespace); + MDC.of(LogKeys.SHUFFLE_SERVICE_METRICS_NAMESPACE$.MODULE$, metricsNamespace)); - logger.info("Started YARN shuffle service for Spark on port {}. " + - "Authentication is {}. Registered executor file is {}", port, authEnabledString, - registeredExecutorFile); + logger.info("Started YARN shuffle service for Spark on port {}. Authentication is {}. 
" + + "Registered executor file is {}", MDC.of(LogKeys.PORT$.MODULE$, port), + MDC.of(LogKeys.AUTH_ENABLED$.MODULE$, authEnabledString), + MDC.of(LogKeys.REGISTERED_EXECUTOR_FILE$.MODULE$, registeredExecutorFile)); } catch (Exception e) { if (stopOnFailure) { throw e; @@ -363,7 +368,8 @@ static MergedShuffleFileManager newMergedShuffleFileManagerInstance( return mergeManagerSubClazz.getConstructor(TransportConf.class, File.class) .newInstance(conf, mergeManagerFile); } catch (Exception e) { - defaultLogger.error("Unable to create an instance of {}", mergeManagerImplClassName); + defaultSparkLogger.error("Unable to create an instance of {}", + MDC.of(LogKeys.CLASS_NAME$.MODULE$, mergeManagerImplClassName)); return new NoOpMergedShuffleFileManager(conf, mergeManagerFile); } } @@ -426,7 +432,8 @@ public void initializeApplication(ApplicationInitializationContext context) { Object metadataStorageVal = metaInfo.get(SPARK_SHUFFLE_SERVER_RECOVERY_DISABLED); if (metadataStorageVal != null && (Boolean) metadataStorageVal) { AppsWithRecoveryDisabled.disableRecoveryOfApp(appId); - logger.info("Disabling metadata persistence for application {}", appId); + logger.info("Disabling metadata persistence for application {}", + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } } catch (IOException ioe) { logger.warn("Unable to parse application data for service: " + payload); @@ -449,7 +456,8 @@ public void initializeApplication(ApplicationInitializationContext context) { secretManager.registerApp(appId, shuffleSecret); } } catch (Exception e) { - logger.error("Exception when initializing application {}", appId, e); + logger.error("Exception when initializing application {}", e, + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } } @@ -463,14 +471,16 @@ public void stopApplication(ApplicationTerminationContext context) { try { db.delete(dbAppKey(fullId)); } catch (IOException e) { - logger.error("Error deleting {} from executor state db", appId, e); + logger.error("Error deleting {} from executor state db", e, + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } } secretManager.unregisterApp(appId); } blockHandler.applicationRemoved(appId, false /* clean up local dirs */); } catch (Exception e) { - logger.error("Exception when stopping application {}", appId, e); + logger.error("Exception when stopping application {}", e, + MDC.of(LogKeys.APP_ID$.MODULE$, appId)); } finally { AppsWithRecoveryDisabled.removeApp(appId); } @@ -479,13 +489,13 @@ public void stopApplication(ApplicationTerminationContext context) { @Override public void initializeContainer(ContainerInitializationContext context) { ContainerId containerId = context.getContainerId(); - logger.info("Initializing container {}", containerId); + logger.info("Initializing container {}", MDC.of(LogKeys.CONTAINER_ID$.MODULE$, containerId)); } @Override public void stopContainer(ContainerTerminationContext context) { ContainerId containerId = context.getContainerId(); - logger.info("Stopping container {}", containerId); + logger.info("Stopping container {}", MDC.of(LogKeys.CONTAINER_ID$.MODULE$, containerId)); } /** @@ -566,8 +576,9 @@ protected File initRecoveryDb(String dbName) { fs.rename(copyFrom, newLoc); } catch (Exception e) { // Fail to move recovery file to new path, just continue on with new DB location - logger.error("Failed to move recovery file {} to the path {}", - dbName, _recoveryPath.toString(), e); + logger.error("Failed to move recovery file {} to the path {}", e, + MDC.of(LogKeys.SHUFFLE_MERGE_RECOVERY_FILE$.MODULE$, dbName), + MDC.of(LogKeys.PATH$.MODULE$, 
_recoveryPath.toString())); } } return new File(newLoc.toUri().getPath()); diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java new file mode 100644 index 0000000000000..272a8aa128141 --- /dev/null +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -0,0 +1,814 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.util; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.StringSearch; +import com.ibm.icu.util.ULocale; + +import org.apache.spark.unsafe.UTF8StringBuilder; +import org.apache.spark.unsafe.types.UTF8String; + +import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; +import static org.apache.spark.unsafe.Platform.copyMemory; + +import java.util.HashMap; +import java.util.Map; + +/** + * Utility class for collation-aware UTF8String operations. + */ +public class CollationAwareUTF8String { + + /** + * The constant value to indicate that the match is not found when searching for a pattern + * string in a target string. + */ + private static final int MATCH_NOT_FOUND = -1; + + /** + * Returns whether the target string starts with the specified prefix, starting from the + * specified position (0-based index referring to character position in UTF8String), with respect + * to the UTF8_LCASE collation. The method assumes that the prefix is already lowercased + * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the + * same prefix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return whether the target string starts with the specified prefix in UTF8_LCASE + */ + public static boolean lowercaseMatchFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that starts with the specified + * prefix, starting from the specified position (0-based index referring to character position + * in UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the + * prefix is already lowercased. The method only considers the part of target string that + * starts from the specified (inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). 
If the prefix is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return length of the target substring that starts with the specified prefix in lowercase + */ + private static int lowercaseMatchLengthFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int len = 0; len <= target.numChars() - startPos; ++len) { + if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target + */ + private static int lowercaseFind( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int i = startPos; i <= target.numChars(); ++i) { + if (lowercaseMatchFrom(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns whether the target string ends with the specified suffix, ending at the specified + * position (0-based index referring to character position in UTF8String), with respect to the + * UTF8_LCASE collation. The method assumes that the suffix is already lowercased prior + * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same + * suffix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return whether the target string ends with the specified suffix in lowercase + */ + public static boolean lowercaseMatchUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that ends with the specified + * suffix, ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the + * suffix is already lowercased. The method only considers the part of target string that ends + * at the specified (non-inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). If the suffix is not found, + * MATCH_NOT_FOUND is returned. 
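To make the matching semantics above concrete, here is a small illustrative call to the public lowercaseMatchFrom helper. The values are made up for the example; the only assumptions are what the javadoc and implementation above already state, namely that the pattern must be lowercased by the caller and that matching starts at a character index.

import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
import org.apache.spark.unsafe.types.UTF8String;

public class LowercaseMatchExample {  // illustrative only, not part of the patch
  public static void main(String[] args) {
    UTF8String target = UTF8String.fromString("Hello World");
    UTF8String pattern = UTF8String.fromString("world");  // already lowercased by the caller
    // "World" starting at character position 6 lowercases to "world", so this prints true.
    System.out.println(CollationAwareUTF8String.lowercaseMatchFrom(target, pattern, 6));
  }
}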
+ * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + private static int lowercaseMatchLengthUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int len = 0; len <= endPos; ++len) { + if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the last occurrence of the pattern string in the target string, + * ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return the position of the last occurrence of pattern in target + */ + private static int lowercaseRFind( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int i = endPos; i >= 0; --i) { + if (lowercaseMatchUntil(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Lowercase UTF8String comparison used for UTF8_LCASE collation. While the default + * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this + * method uses code points to compare the strings in a case-insensitive manner using ICU rules, + * as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints). + * + * @param left The first UTF8String to compare. + * @param right The second UTF8String to compare. + * @return An integer representing the comparison result. + */ + public static int compareLowerCase(final UTF8String left, final UTF8String right) { + // Only if both strings are ASCII, we can use faster comparison (no string allocations). + if (left.isFullAscii() && right.isFullAscii()) { + return compareLowerCaseAscii(left, right); + } + return compareLowerCaseSlow(left, right); + } + + /** + * Fast version of the `compareLowerCase` method, used when both arguments are ASCII strings. + * + * @param left The first ASCII UTF8String to compare. + * @param right The second ASCII UTF8String to compare. + * @return An integer representing the comparison result. + */ + private static int compareLowerCaseAscii(final UTF8String left, final UTF8String right) { + int leftBytes = left.numBytes(), rightBytes = right.numBytes(); + for (int curr = 0; curr < leftBytes && curr < rightBytes; curr++) { + int lowerLeftByte = Character.toLowerCase(left.getByte(curr)); + int lowerRightByte = Character.toLowerCase(right.getByte(curr)); + if (lowerLeftByte != lowerRightByte) { + return lowerLeftByte - lowerRightByte; + } + } + return leftBytes - rightBytes; + } + + /** + * Slow version of the `compareLowerCase` method, used when both arguments are non-ASCII strings. + * + * @param left The first non-ASCII UTF8String to compare. + * @param right The second non-ASCII UTF8String to compare. + * @return An integer representing the comparison result. 
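The ASCII fast path in compareLowerCase can be illustrated with a tiny example; both inputs below are ASCII, so the byte-wise comparison applies and no lowercased copies are allocated. Names and values are illustrative only.

import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
import org.apache.spark.unsafe.types.UTF8String;

public class CompareLowerCaseExample {  // illustrative only, not part of the patch
  public static void main(String[] args) {
    // 'a' == 'a', 'b' == 'b', then 'c' < 'd', so the result is negative.
    int cmp = CollationAwareUTF8String.compareLowerCase(
      UTF8String.fromString("ABC"), UTF8String.fromString("abd"));
    System.out.println(cmp < 0);  // true
  }
}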
+ */ + private static int compareLowerCaseSlow(final UTF8String left, final UTF8String right) { + return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right)); + } + + /* + * Performs string replacement for ICU collations by searching for instances of the search + * string in the `src` string, with respect to the specified collation, and then replacing + * them with the replace string. The method returns a new UTF8String with all instances of the + * search string replaced using the replace string. Similar to UTF8String.findInSet behavior + * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty. + * + * @param src the string to be searched in + * @param search the string to be searched for + * @param replace the string to be used as replacement + * @param collationId the collation ID to use for string search + * @return the position of the first occurrence of `match` in `set` + */ + public static UTF8String replace(final UTF8String src, final UTF8String search, + final UTF8String replace, final int collationId) { + // This collation aware implementation is based on existing implementation on UTF8String + if (src.numBytes() == 0 || search.numBytes() == 0) { + return src; + } + + StringSearch stringSearch = CollationFactory.getStringSearch(src, search, collationId); + + // Find the first occurrence of the search string. + int end = stringSearch.next(); + if (end == StringSearch.DONE) { + // Search string was not found, so string is unchanged. + return src; + } + + // Initialize byte positions + int c = 0; + int byteStart = 0; // position in byte + int byteEnd = 0; // position in byte + while (byteEnd < src.numBytes() && c < end) { + byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd)); + c += 1; + } + + // At least one match was found. Estimate space needed for result. + // The 16x multiplier here is chosen to match commons-lang3's implementation. + int increase = Math.max(0, Math.abs(replace.numBytes() - search.numBytes())) * 16; + final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase); + while (end != StringSearch.DONE) { + buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart); + buf.append(replace); + + // Move byteStart to the beginning of the current match + byteStart = byteEnd; + int cs = c; + // Move cs to the end of the current match + // This is necessary because the search string may contain 'multi-character' characters + while (byteStart < src.numBytes() && cs < c + stringSearch.getMatchLength()) { + byteStart += UTF8String.numBytesForFirstByte(src.getByte(byteStart)); + cs += 1; + } + // Go to next match + end = stringSearch.next(); + // Update byte positions + while (byteEnd < src.numBytes() && c < end) { + byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd)); + c += 1; + } + } + buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, + src.numBytes() - byteStart); + return buf.build(); + } + + /* + * Performs string replacement for UTF8_LCASE collation by searching for instances of the search + * string in the src string, with respect to lowercased string versions, and then replacing + * them with the replace string. The method returns a new UTF8String with all instances of the + * search string replaced using the replace string. Similar to UTF8String.findInSet behavior + * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty. 
+ * + * @param src the string to be searched in + * @param search the string to be searched for + * @param replace the string to be used as replacement + * @param collationId the collation ID to use for string search + * @return the position of the first occurrence of `match` in `set` + */ + public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search, + final UTF8String replace) { + if (src.numBytes() == 0 || search.numBytes() == 0) { + return src; + } + + // TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase. + UTF8String lowercaseSearch = search.toLowerCase(); + + int start = 0; + int end = lowercaseFind(src, lowercaseSearch, start); + if (end == -1) { + // Search string was not found, so string is unchanged. + return src; + } + + // At least one match was found. Estimate space needed for result. + // The 16x multiplier here is chosen to match commons-lang3's implementation. + int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16; + final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase); + while (end != -1) { + buf.append(src.substring(start, end)); + buf.append(replace); + // Update character positions + start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end); + end = lowercaseFind(src, lowercaseSearch, start); + } + buf.append(src.substring(start, src.numChars())); + return buf.build(); + } + + /** + * Convert the input string to uppercase using the ICU root locale rules. + * + * @param target the input string + * @return the uppercase string + */ + public static UTF8String toUpperCase(final UTF8String target) { + if (target.isFullAscii()) return target.toUpperCaseAscii(); + return toUpperCaseSlow(target); + } + + private static UTF8String toUpperCaseSlow(final UTF8String target) { + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to + // convert the string to uppercase, which only accepts a Java strings as input. + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + return UTF8String.fromString(UCharacter.toUpperCase(target.toString())); + } + + /** + * Convert the input string to uppercase using the specified ICU collation rules. + * + * @param target the input string + * @return the uppercase string + */ + public static UTF8String toUpperCase(final UTF8String target, final int collationId) { + if (target.isFullAscii()) return target.toUpperCaseAscii(); + return toUpperCaseSlow(target, collationId); + } + + private static UTF8String toUpperCaseSlow(final UTF8String target, final int collationId) { + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to + // convert the string to uppercase, which only accepts a Java strings as input. + ULocale locale = CollationFactory.fetchCollation(collationId) + .collator.getLocale(ULocale.ACTUAL_LOCALE); + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + return UTF8String.fromString(UCharacter.toUpperCase(locale, target.toString())); + } + + /** + * Convert the input string to lowercase using the ICU root locale rules. 
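A quick worked example of lowercaseReplace, based only on the implementation above: lowercaseFind locates matches against the lowercased search string, and the matched span of the original string is swapped for the replacement. The inputs are made up for illustration.

import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
import org.apache.spark.unsafe.types.UTF8String;

public class LowercaseReplaceExample {  // illustrative only, not part of the patch
  public static void main(String[] args) {
    // "WORLD" is lowercased to "world" internally, which matches "World" at position 6,
    // so the result is "Hello Spark".
    UTF8String out = CollationAwareUTF8String.lowercaseReplace(
      UTF8String.fromString("Hello World"),
      UTF8String.fromString("WORLD"),
      UTF8String.fromString("Spark"));
    System.out.println(out);
  }
}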
* + * @param target the input string + * @return the lowercase string + */ + public static UTF8String toLowerCase(final UTF8String target) { + if (target.isFullAscii()) return target.toLowerCaseAscii(); + return toLowerCaseSlow(target); + } + + private static UTF8String toLowerCaseSlow(final UTF8String target) { + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to + // convert the string to lowercase, which only accepts Java strings as input. + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + return UTF8String.fromString(UCharacter.toLowerCase(target.toString())); + } + + /** + * Convert the input string to lowercase using the specified ICU collation rules. + * + * @param target the input string + * @return the lowercase string + */ + public static UTF8String toLowerCase(final UTF8String target, final int collationId) { + if (target.isFullAscii()) return target.toLowerCaseAscii(); + return toLowerCaseSlow(target, collationId); + } + + private static UTF8String toLowerCaseSlow(final UTF8String target, final int collationId) { + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to + // convert the string to lowercase, which only accepts Java strings as input. + ULocale locale = CollationFactory.fetchCollation(collationId) + .collator.getLocale(ULocale.ACTUAL_LOCALE); + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + return UTF8String.fromString(UCharacter.toLowerCase(locale, target.toString())); + } + + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and + * context-sensitive case mappings (i.e. characters that would map to different characters + * depending on their position in the string relative to other characters), which are handled + * here in a context-insensitive manner. + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { + if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); + } + else if (codePoint == 0x03C2) { + // Greek final and non-final small letter sigma are mapped to the same lowercase character. + sb.appendCodePoint(0x03C3); + } + else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); + } + } + + /** + * Converts an entire string to lowercase using ICU rules, code point by code point, with + * special handling for one-to-many case mappings (i.e. characters that map to multiple + * characters in lowercase). This method also discards context-sensitive case mapping + * information by relying on the context-unaware handling in the `lowercaseCodePoint` method. + * + * @param target The target string to convert to lowercase. + * @return The string converted to lowercase in a context-unaware manner.
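+ * For example (illustrative only): "İ" (U+0130) maps to "i" followed by a combining dot above + * (U+0069 U+0307), and final sigma "ς" (U+03C2) maps to "σ" (U+03C3), matching non-final sigma.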
*/ + public static UTF8String lowerCaseCodePoints(final UTF8String target) { + if (target.isFullAscii()) return target.toLowerCaseAscii(); + return lowerCaseCodePointsSlow(target); + } + + private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) { + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + String targetString = target.toString(); + StringBuilder sb = new StringBuilder(); + // Advance by whole code points so that supplementary characters are lowercased correctly. + for (int i = 0; i < targetString.length(); ) { + int codePoint = targetString.codePointAt(i); + lowercaseCodePoint(codePoint, sb); + i += Character.charCount(codePoint); + } + return UTF8String.fromString(sb.toString()); + } + + /** + * Convert the input string to titlecase using the ICU root locale rules. + */ + public static UTF8String toTitleCase(final UTF8String target) { + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to + // convert the string to titlecase, which only accepts Java strings as input. + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + return UTF8String.fromString(UCharacter.toTitleCase(target.toString(), + BreakIterator.getWordInstance())); + } + + /** + * Convert the input string to titlecase using the specified ICU collation rules. + */ + public static UTF8String toTitleCase(final UTF8String target, final int collationId) { + ULocale locale = CollationFactory.fetchCollation(collationId) + .collator.getLocale(ULocale.ACTUAL_LOCALE); + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + return UTF8String.fromString(UCharacter.toTitleCase(locale, target.toString(), + BreakIterator.getWordInstance(locale))); + } + + /** + * Returns the position of the first occurrence of the match string in the set string, + * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner, + * with respect to the specified collation ID. Similar to the UTF8String.findInSet behavior used + * for UTF8_BINARY collation, the method returns 0 if the match string contains a comma. + * + * @param match the string to be searched for + * @param set the string to be searched in + * @param collationId the collation ID to use for string comparison + * @return the position of the first occurrence of `match` in `set` + */ + public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { + // If the "word" string contains a comma, FindInSet should return 0. + if (match.contains(UTF8String.fromString(","))) { + return 0; + } + // Otherwise, search for commas in "set" and compare each substring with "word". + int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1; + while (byteIndex < set.numBytes()) { + byte nextByte = set.getByte(byteIndex); + if (nextByte == (byte) ',') { + if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) { + return wordCount; + } + lastComma = charIndex; + ++wordCount; + } + byteIndex += UTF8String.numBytesForFirstByte(nextByte); + ++charIndex; + } + if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) { + return wordCount; + } + // If no match is found, return 0. + return 0; + } + + /** + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_LCASE collation. If the pattern is not found, + * MATCH_NOT_FOUND is returned.
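+ * For example (illustrative only): searching for "WORLD" in "Hello World" from position 0 + * returns 6, since UTF8_LCASE matching is case-insensitive.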
+ * + * @param target the string to be searched in + * @param pattern the string to be searched for + * @param start the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target + */ + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return target.indexOfEmpty(start); + return lowercaseFind(target, pattern.toLowerCase(), start); + } + + public static int indexOf(final UTF8String target, final UTF8String pattern, + final int start, final int collationId) { + if (pattern.numBytes() == 0) return target.indexOfEmpty(start); + if (target.numBytes() == 0) return MATCH_NOT_FOUND; + + StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); + stringSearch.setIndex(start); + + return stringSearch.next(); + } + + public static int find(UTF8String target, UTF8String pattern, int start, + int collationId) { + assert (pattern.numBytes() > 0); + + StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); + // Set search start position (start from character at start position) + stringSearch.setIndex(target.bytePosToChar(start)); + + // Return either the byte position or -1 if not found + return target.charPosToByte(stringSearch.next()); + } + + public static UTF8String subStringIndex(final UTF8String string, final UTF8String delimiter, + int count, final int collationId) { + if (delimiter.numBytes() == 0 || count == 0 || string.numBytes() == 0) { + return UTF8String.EMPTY_UTF8; + } + if (count > 0) { + int idx = -1; + while (count > 0) { + idx = find(string, delimiter, idx + 1, collationId); + if (idx >= 0) { + count --; + } else { + // can not find enough delim + return string; + } + } + if (idx == 0) { + return UTF8String.EMPTY_UTF8; + } + byte[] bytes = new byte[idx]; + copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); + return UTF8String.fromBytes(bytes); + + } else { + count = -count; + + StringSearch stringSearch = CollationFactory + .getStringSearch(string, delimiter, collationId); + + int start = string.numChars() - 1; + int lastMatchLength = 0; + int prevStart = -1; + while (count > 0) { + stringSearch.reset(); + prevStart = -1; + int matchStart = stringSearch.next(); + lastMatchLength = stringSearch.getMatchLength(); + while (matchStart <= start) { + if (matchStart != StringSearch.DONE) { + // Found a match, update the start position + prevStart = matchStart; + matchStart = stringSearch.next(); + } else { + break; + } + } + + if (prevStart == -1) { + // can not find enough delim + return string; + } else { + start = prevStart - 1; + count--; + } + } + + int resultStart = prevStart + lastMatchLength; + if (resultStart == string.numChars()) { + return UTF8String.EMPTY_UTF8; + } + + return string.substring(resultStart, string.numChars()); + } + } + + public static UTF8String lowercaseSubStringIndex(final UTF8String string, + final UTF8String delimiter, int count) { + if (delimiter.numBytes() == 0 || count == 0) { + return UTF8String.EMPTY_UTF8; + } + + UTF8String lowercaseDelimiter = delimiter.toLowerCase(); + + if (count > 0) { + // Search left to right (note: the start code point is inclusive). + int matchLength = -1; + while (count > 0) { + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter. 
+ else return string; // Cannot find enough delimiters in the string. + } + return string.substring(0, matchLength); + } else { + // Search right to left (note: the end code point is exclusive). + int matchLength = string.numChars() + 1; + count = -count; + while (count > 0) { + matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); + if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter. + else return string; // Cannot find enough delimiters in the string. + } + return string.substring(matchLength, string.numChars()); + } + } + + public static Map getCollationAwareDict(UTF8String string, + Map dict, int collationId) { + // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` + String srcStr = string.toString(); + + Map collationAwareDict = new HashMap<>(); + for (String key : dict.keySet()) { + StringSearch stringSearch = + CollationFactory.getStringSearch(string, UTF8String.fromString(key), collationId); + + int pos = 0; + while ((pos = stringSearch.next()) != StringSearch.DONE) { + int codePoint = srcStr.codePointAt(pos); + int charCount = Character.charCount(codePoint); + String newKey = srcStr.substring(pos, pos + charCount); + + boolean exists = false; + for (String existingKey : collationAwareDict.keySet()) { + if (stringSearch.getCollator().compare(existingKey, newKey) == 0) { + collationAwareDict.put(newKey, collationAwareDict.get(existingKey)); + exists = true; + break; + } + } + + if (!exists) { + collationAwareDict.put(newKey, dict.get(key)); + } + } + } + + return collationAwareDict; + } + + public static UTF8String lowercaseTrim( + final UTF8String srcString, + final UTF8String trimString) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString); + return lowercaseTrimRight(leftTrimmed, trimString); + } + + public static UTF8String lowercaseTrimLeft( + final UTF8String srcString, + final UTF8String trimString) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + // The searching byte position in the srcString. + int searchIdx = 0; + // The byte position of a first non-matching character in the srcString. + int trimByteIdx = 0; + // Number of bytes in srcString. + int numBytes = srcString.numBytes(); + // Convert trimString to lowercase, so it can be searched properly. + UTF8String lowercaseTrimString = trimString.toLowerCase(); + + while (searchIdx < numBytes) { + UTF8String searchChar = srcString.copyUTF8String( + searchIdx, + searchIdx + UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1); + int searchCharBytes = searchChar.numBytes(); + + // Try to find the matching for the searchChar in the trimString. + if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { + trimByteIdx += searchCharBytes; + searchIdx += searchCharBytes; + } else { + // No matching, exit the search. + break; + } + } + + if (searchIdx == 0) { + // Nothing trimmed - return original string (not converted to lowercase). + return srcString; + } + if (trimByteIdx >= numBytes) { + // Everything trimmed. + return UTF8String.EMPTY_UTF8; + } + return srcString.copyUTF8String(trimByteIdx, numBytes - 1); + } + + public static UTF8String lowercaseTrimRight( + final UTF8String srcString, + final UTF8String trimString) { + // Matching UTF8String behavior for null `trimString`. 
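+ // Note: the implementation below first records the byte position and length of every character + // in srcString, and then walks characters from the right, trimming while each (lowercased) + // character is found in the lowercased trim string.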
+ if (trimString == null) { + return null; + } + + // Number of bytes iterated from the srcString. + int byteIdx = 0; + // Number of characters iterated from the srcString. + int numChars = 0; + // Number of bytes in srcString. + int numBytes = srcString.numBytes(); + // Array of character lengths for the srcString. + int[] stringCharLen = new int[numBytes]; + // Array of the first byte position for each character in the srcString. + int[] stringCharPos = new int[numBytes]; + // Convert trimString to lowercase, so it can be searched properly. + UTF8String lowercaseTrimString = trimString.toLowerCase(); + + // Build the position and length array. + while (byteIdx < numBytes) { + stringCharPos[numChars] = byteIdx; + stringCharLen[numChars] = UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx)); + byteIdx += stringCharLen[numChars]; + numChars++; + } + + // Index trimByteIdx points to the first non-matching byte position from the right side of + // the source string. + int trimByteIdx = numBytes - 1; + + while (numChars > 0) { + UTF8String searchChar = srcString.copyUTF8String( + stringCharPos[numChars - 1], + stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1); + + if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { + trimByteIdx -= stringCharLen[numChars - 1]; + numChars--; + } else { + break; + } + } + + if (trimByteIdx == numBytes - 1) { + // Nothing trimmed. + return srcString; + } + if (trimByteIdx < 0) { + // Everything trimmed. + return UTF8String.EMPTY_UTF8; + } + return srcString.copyUTF8String(0, trimByteIdx); + } + + // TODO: Add more collation-aware UTF8String operations here. + +} diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 9786c559da44b..f13f66e384e0f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -19,12 +19,15 @@ import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; import java.util.function.BiFunction; import java.util.function.ToLongFunction; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.StringSearch; import com.ibm.icu.util.ULocale; +import com.ibm.icu.text.CollationKey; import com.ibm.icu.text.Collator; import org.apache.spark.SparkException; @@ -35,11 +38,62 @@ * Provides functionality to the UTF8String object which respects defined collation settings. */ public final class CollationFactory { + + /** + * Identifier for a single collation.
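+ * For example (illustrative only), a full identifier string may look like + * "icu.UNICODE_CI.153.120.0.0" (provider.name.version), while the version-less form is + * "icu.UNICODE_CI".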
+ */ + public static class CollationIdentifier { + private final String provider; + private final String name; + private final String version; + + public CollationIdentifier(String provider, String collationName, String version) { + this.provider = provider; + this.name = collationName; + this.version = version; + } + + public static CollationIdentifier fromString(String identifier) { + long numDots = identifier.chars().filter(ch -> ch == '.').count(); + assert(numDots > 0); + + if (numDots == 1) { + String[] parts = identifier.split("\\.", 2); + return new CollationIdentifier(parts[0], parts[1], null); + } + + String[] parts = identifier.split("\\.", 3); + return new CollationIdentifier(parts[0], parts[1], parts[2]); + } + + /** + * Returns the identifier's string value without the version. + * This is used for the table schema as the schema doesn't care about the version, + * only the statistics do. + */ + public String toStringWithoutVersion() { + return String.format("%s.%s", provider, name); + } + + public String getProvider() { + return provider; + } + + public String getName() { + return name; + } + + public Optional getVersion() { + return Optional.ofNullable(version); + } + } + /** * Entry encapsulating all information about a collation. */ public static class Collation { public final String collationName; + public final String provider; public final Collator collator; public final Comparator comparator; @@ -81,13 +135,14 @@ public static class Collation { /** * Support for Lowercase Equality implies that it is possible to check equality on * byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments. - * This allows custom collation support for UTF8_BINARY_LCASE collation in various Spark + * This allows custom collation support for UTF8_LCASE collation in various Spark * expressions, as this particular collation is not supported by the external ICU library. */ public final boolean supportsLowercaseEquality; public Collation( String collationName, + String provider, Collator collator, Comparator comparator, String version, @@ -96,6 +151,7 @@ public Collation( boolean supportsBinaryOrdering, boolean supportsLowercaseEquality) { this.collationName = collationName; + this.provider = provider; this.collator = collator; this.comparator = comparator; this.version = version; @@ -109,6 +165,8 @@ public Collation( // No Collation can simultaneously support binary equality and lowercase equality assert(!supportsBinaryEquality || !supportsLowercaseEquality); + assert(SUPPORTED_PROVIDERS.contains(provider)); + if (supportsBinaryEquality) { this.equalsFunction = UTF8String::equals; } else { @@ -117,76 +175,576 @@ public Collation( } /** - * Constructor with comparators that are inherited from the given collator. + * Collation ID is defined as 32-bit integer. We specify binary layouts for different classes of + * collations. Classes of collations are differentiated by most significant 3 bits (bit 31, 30 + * and 29), bit 31 being most significant and bit 0 being least significant. + * --- + * General collation ID binary layout: + * bit 31: 1 for INDETERMINATE (requires all other bits to be 1 as well), 0 otherwise. + * bit 30: 0 for predefined, 1 for user-defined. + * Following bits are specified for predefined collations: + * bit 29: 0 for UTF8_BINARY, 1 for ICU collations. + * bit 28-24: Reserved. + * bit 23-22: Reserved for version. + * bit 21-18: Reserved for space trimming. + * bit 17-0: Depend on collation family. 
+ * --- + * INDETERMINATE collation ID binary layout: + * bit 31-0: 1 + * INDETERMINATE collation ID is equal to -1. + * --- + * User-defined collation ID binary layout: + * bit 31: 0 + * bit 30: 1 + * bit 29-0: Undefined, reserved for future use. + * --- + * UTF8_BINARY collation ID binary layout: + * bit 31-24: Zeroes. + * bit 23-22: Zeroes, reserved for version. + * bit 21-18: Zeroes, reserved for space trimming. + * bit 17-3: Zeroes. + * bit 2: 0, reserved for accent sensitivity. + * bit 1: 0, reserved for uppercase and case-insensitive. + * bit 0: 0 = case-sensitive, 1 = lowercase. + * --- + * ICU collation ID binary layout: + * bit 31-30: Zeroes. + * bit 29: 1 + * bit 28-24: Zeroes. + * bit 23-22: Zeroes, reserved for version. + * bit 21-18: Zeroes, reserved for space trimming. + * bit 17: 0 = case-sensitive, 1 = case-insensitive. + * bit 16: 0 = accent-sensitive, 1 = accent-insensitive. + * bit 15-14: Zeroes, reserved for punctuation sensitivity. + * bit 13-12: Zeroes, reserved for first letter preference. + * bit 11-0: Locale ID as specified in `ICULocaleToId` mapping. + * --- + * Some illustrative examples of collation name to ID mapping: + * - UTF8_BINARY -> 0 + * - UTF8_LCASE -> 1 + * - UNICODE -> 0x20000000 + * - UNICODE_AI -> 0x20010000 + * - UNICODE_CI -> 0x20020000 + * - UNICODE_CI_AI -> 0x20030000 + * - af -> 0x20000001 + * - af_CI_AI -> 0x20030001 */ - public Collation( - String collationName, - Collator collator, - String version, - boolean supportsBinaryEquality, - boolean supportsBinaryOrdering, - boolean supportsLowercaseEquality) { - this( - collationName, - collator, - (s1, s2) -> collator.compare(s1.toString(), s2.toString()), - version, - s -> (long)collator.getCollationKey(s.toString()).hashCode(), - supportsBinaryEquality, - supportsBinaryOrdering, - supportsLowercaseEquality); + private abstract static class CollationSpec { + + /** + * Bit 30 in collation ID having value 0 for predefined and 1 for user-defined collation. + */ + private enum DefinitionOrigin { + PREDEFINED, USER_DEFINED + } + + /** + * Bit 29 in collation ID having value 0 for UTF8_BINARY family and 1 for ICU family of + * collations. + */ + protected enum ImplementationProvider { + UTF8_BINARY, ICU + } + + /** + * Offset in binary collation ID layout. + */ + private static final int DEFINITION_ORIGIN_OFFSET = 30; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + private static final int DEFINITION_ORIGIN_MASK = 0b1; + + /** + * Offset in binary collation ID layout. + */ + protected static final int IMPLEMENTATION_PROVIDER_OFFSET = 29; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + protected static final int IMPLEMENTATION_PROVIDER_MASK = 0b1; + + private static final int INDETERMINATE_COLLATION_ID = -1; + + /** + * Thread-safe cache mapping collation IDs to corresponding `Collation` instances. + * We add entries to this cache lazily as new `Collation` instances are requested. + */ + private static final Map collationMap = new ConcurrentHashMap<>(); + + /** + * Utility function to retrieve `ImplementationProvider` enum instance from collation ID. + */ + private static ImplementationProvider getImplementationProvider(int collationId) { + return ImplementationProvider.values()[SpecifierUtils.getSpecValue(collationId, + IMPLEMENTATION_PROVIDER_OFFSET, IMPLEMENTATION_PROVIDER_MASK)]; + } + + /** + * Utility function to retrieve `DefinitionOrigin` enum instance from collation ID. 
+ */ + private static DefinitionOrigin getDefinitionOrigin(int collationId) { + return DefinitionOrigin.values()[SpecifierUtils.getSpecValue(collationId, + DEFINITION_ORIGIN_OFFSET, DEFINITION_ORIGIN_MASK)]; + } + + /** + * Main entry point for retrieving `Collation` instance from collation ID. + */ + private static Collation fetchCollation(int collationId) { + // User-defined collations and INDETERMINATE collations cannot produce a `Collation` + // instance. + assert (collationId >= 0 && getDefinitionOrigin(collationId) + == DefinitionOrigin.PREDEFINED); + if (collationId == UTF8_BINARY_COLLATION_ID) { + // Skip cache. + return CollationSpecUTF8.UTF8_BINARY_COLLATION; + } else if (collationMap.containsKey(collationId)) { + // Already in cache. + return collationMap.get(collationId); + } else { + // Build `Collation` instance and put into cache. + CollationSpec spec; + ImplementationProvider implementationProvider = getImplementationProvider(collationId); + if (implementationProvider == ImplementationProvider.UTF8_BINARY) { + spec = CollationSpecUTF8.fromCollationId(collationId); + } else { + spec = CollationSpecICU.fromCollationId(collationId); + } + Collation collation = spec.buildCollation(); + collationMap.put(collationId, collation); + return collation; + } + } + + /** + * Method for constructing errors thrown on providing invalid collation name. + */ + protected static SparkException collationInvalidNameException(String collationName) { + Map params = new HashMap<>(); + final int maxSuggestions = 3; + params.put("collationName", collationName); + params.put("proposals", getClosestSuggestionsOnInvalidName(collationName, maxSuggestions)); + return new SparkException("COLLATION_INVALID_NAME", + SparkException.constructMessageParams(params), null); + } + + private static int collationNameToId(String collationName) throws SparkException { + // Collation names provided by user are treated as case-insensitive. + String collationNameUpper = collationName.toUpperCase(); + if (collationNameUpper.startsWith("UTF8_")) { + return CollationSpecUTF8.collationNameToId(collationName, collationNameUpper); + } else { + return CollationSpecICU.collationNameToId(collationName, collationNameUpper); + } + } + + protected abstract Collation buildCollation(); } - } - private static final Collation[] collationTable = new Collation[4]; - private static final HashMap collationNameToIdMap = new HashMap<>(); - - public static final int UTF8_BINARY_COLLATION_ID = 0; - public static final int UTF8_BINARY_LCASE_COLLATION_ID = 1; - - static { - // Binary comparison. This is the default collation. - // No custom comparators will be used for this collation. - // Instead, we rely on byte for byte comparison. - collationTable[0] = new Collation( - "UTF8_BINARY", - null, - UTF8String::binaryCompare, - "1.0", - s -> (long)s.hashCode(), - true, - true, - false); - - // Case-insensitive UTF8 binary collation. - // TODO: Do in place comparisons instead of creating new strings. - collationTable[1] = new Collation( - "UTF8_BINARY_LCASE", - null, - UTF8String::compareLowerCase, - "1.0", - (s) -> (long)s.toLowerCase().hashCode(), - false, - false, - true); - - // UNICODE case sensitive comparison (ROOT locale, in ICU). - collationTable[2] = new Collation( - "UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, false, false); - collationTable[2].collator.setStrength(Collator.TERTIARY); - collationTable[2].collator.freeze(); - - // UNICODE case-insensitive comparison (ROOT locale, in ICU + Secondary strength). 
- collationTable[3] = new Collation( - "UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, false, false); - collationTable[3].collator.setStrength(Collator.SECONDARY); - collationTable[3].collator.freeze(); - - for (int i = 0; i < collationTable.length; i++) { - collationNameToIdMap.put(collationTable[i].collationName, i); + private static class CollationSpecUTF8 extends CollationSpec { + + /** + * Bit 0 in collation ID having value 0 for plain UTF8_BINARY and 1 for UTF8_LCASE + * collation. + */ + private enum CaseSensitivity { + UNSPECIFIED, LCASE + } + + /** + * Offset in binary collation ID layout. + */ + private static final int CASE_SENSITIVITY_OFFSET = 0; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + private static final int CASE_SENSITIVITY_MASK = 0b1; + + private static final int UTF8_BINARY_COLLATION_ID = + new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).collationId; + private static final int UTF8_LCASE_COLLATION_ID = + new CollationSpecUTF8(CaseSensitivity.LCASE).collationId; + protected static Collation UTF8_BINARY_COLLATION = + new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).buildCollation(); + protected static Collation UTF8_LCASE_COLLATION = + new CollationSpecUTF8(CaseSensitivity.LCASE).buildCollation(); + + private final int collationId; + + private CollationSpecUTF8(CaseSensitivity caseSensitivity) { + this.collationId = + SpecifierUtils.setSpecValue(0, CASE_SENSITIVITY_OFFSET, caseSensitivity); + } + + private static int collationNameToId(String originalName, String collationName) + throws SparkException { + if (UTF8_BINARY_COLLATION.collationName.equals(collationName)) { + return UTF8_BINARY_COLLATION_ID; + } else if (UTF8_LCASE_COLLATION.collationName.equals(collationName)) { + return UTF8_LCASE_COLLATION_ID; + } else { + // Throw exception with original (before case conversion) collation name. + throw collationInvalidNameException(originalName); + } + } + + private static CollationSpecUTF8 fromCollationId(int collationId) { + // Extract case sensitivity from collation ID. + int caseConversionOrdinal = SpecifierUtils.getSpecValue(collationId, + CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); + // Verify only case sensitivity bits were set settable in UTF8_BINARY family of collations. + assert (SpecifierUtils.removeSpec(collationId, + CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK) == 0); + return new CollationSpecUTF8(CaseSensitivity.values()[caseConversionOrdinal]); + } + + @Override + protected Collation buildCollation() { + if (collationId == UTF8_BINARY_COLLATION_ID) { + return new Collation( + "UTF8_BINARY", + PROVIDER_SPARK, + null, + UTF8String::binaryCompare, + "1.0", + s -> (long) s.hashCode(), + /* supportsBinaryEquality = */ true, + /* supportsBinaryOrdering = */ true, + /* supportsLowercaseEquality = */ false); + } else { + return new Collation( + "UTF8_LCASE", + PROVIDER_SPARK, + null, + CollationAwareUTF8String::compareLowerCase, + "1.0", + s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s).hashCode(), + /* supportsBinaryEquality = */ false, + /* supportsBinaryOrdering = */ false, + /* supportsLowercaseEquality = */ true); + } + } + } + + private static class CollationSpecICU extends CollationSpec { + + /** + * Bit 17 in collation ID having value 0 for case-sensitive and 1 for case-insensitive + * collation. + */ + private enum CaseSensitivity { + CS, CI + } + + /** + * Bit 16 in collation ID having value 0 for accent-sensitive and 1 for accent-insensitive + * collation. 
+ */ + private enum AccentSensitivity { + AS, AI + } + + /** + * Offset in binary collation ID layout. + */ + private static final int CASE_SENSITIVITY_OFFSET = 17; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + private static final int CASE_SENSITIVITY_MASK = 0b1; + + /** + * Offset in binary collation ID layout. + */ + private static final int ACCENT_SENSITIVITY_OFFSET = 16; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + private static final int ACCENT_SENSITIVITY_MASK = 0b1; + + /** + * Array of locale names, each locale ID corresponds to the index in this array. + */ + private static final String[] ICULocaleNames; + + /** + * Mapping of locale names to corresponding `ULocale` instance. + */ + private static final Map ICULocaleMap = new HashMap<>(); + + /** + * Used to parse user input collation names which are converted to uppercase. + */ + private static final Map ICULocaleMapUppercase = new HashMap<>(); + + /** + * Reverse mapping of `ICULocaleNames`. + */ + private static final Map ICULocaleToId = new HashMap<>(); + + /** + * ICU library Collator version passed to `Collation` instance. + */ + private static final String ICU_COLLATOR_VERSION = "153.120.0.0"; + + static { + ICULocaleMap.put("UNICODE", ULocale.ROOT); + // ICU-implemented `ULocale`s which have corresponding `Collator` installed. + ULocale[] locales = Collator.getAvailableULocales(); + // Build locale names in format: language["_" optional script]["_" optional country code]. + // Examples: en, en_USA, sr_Cyrl_SRB + for (ULocale locale : locales) { + // Skip variants. + if (locale.getVariant().isEmpty()) { + String language = locale.getLanguage(); + // Require non-empty language as first component of locale name. + assert (!language.isEmpty()); + StringBuilder builder = new StringBuilder(language); + // Script tag. + String script = locale.getScript(); + if (!script.isEmpty()) { + builder.append('_'); + builder.append(script); + } + // 3-letter country code. + String country = locale.getISO3Country(); + if (!country.isEmpty()) { + builder.append('_'); + builder.append(country); + } + String localeName = builder.toString(); + // Verify locale names are unique. + assert (!ICULocaleMap.containsKey(localeName)); + ICULocaleMap.put(localeName, locale); + } + } + // Construct uppercase-normalized locale name mapping. + for (String localeName : ICULocaleMap.keySet()) { + String localeUppercase = localeName.toUpperCase(); + // Locale names are unique case-insensitively. + assert (!ICULocaleMapUppercase.containsKey(localeUppercase)); + ICULocaleMapUppercase.put(localeUppercase, localeName); + } + // Construct locale name to ID mapping. Locale ID is defined as index in `ICULocaleNames`. + ICULocaleNames = ICULocaleMap.keySet().toArray(new String[0]); + Arrays.sort(ICULocaleNames); + // Maximum number of locale IDs as defined by binary layout. 
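+ // Bits 11-0 of the collation ID hold the locale ID, so the binary layout allows for at most + // 1 << 12 = 4096 distinct ICU locales.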
+ assert (ICULocaleNames.length <= (1 << 12)); + for (int i = 0; i < ICULocaleNames.length; ++i) { + ICULocaleToId.put(ICULocaleNames[i], i); + } + } + + private static final int UNICODE_COLLATION_ID = + new CollationSpecICU("UNICODE", CaseSensitivity.CS, AccentSensitivity.AS).collationId; + private static final int UNICODE_CI_COLLATION_ID = + new CollationSpecICU("UNICODE", CaseSensitivity.CI, AccentSensitivity.AS).collationId; + + private final CaseSensitivity caseSensitivity; + private final AccentSensitivity accentSensitivity; + private final String locale; + private final int collationId; + + private CollationSpecICU(String locale, CaseSensitivity caseSensitivity, + AccentSensitivity accentSensitivity) { + this.locale = locale; + this.caseSensitivity = caseSensitivity; + this.accentSensitivity = accentSensitivity; + // Construct collation ID from locale, case-sensitivity and accent-sensitivity specifiers. + int collationId = ICULocaleToId.get(locale); + // Mandatory ICU implementation provider. + collationId = SpecifierUtils.setSpecValue(collationId, IMPLEMENTATION_PROVIDER_OFFSET, + ImplementationProvider.ICU); + collationId = SpecifierUtils.setSpecValue(collationId, CASE_SENSITIVITY_OFFSET, + caseSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, ACCENT_SENSITIVITY_OFFSET, + accentSensitivity); + this.collationId = collationId; + } + + private static int collationNameToId( + String originalName, String collationName) throws SparkException { + // Search for the longest locale match because specifiers are designed to be different from + // script tag and country code, meaning the only valid locale name match can be the longest + // one. + int lastPos = -1; + for (int i = 1; i <= collationName.length(); i++) { + String localeName = collationName.substring(0, i); + if (ICULocaleMapUppercase.containsKey(localeName)) { + lastPos = i; + } + } + if (lastPos == -1) { + throw collationInvalidNameException(originalName); + } else { + String locale = collationName.substring(0, lastPos); + int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); + + // Try all combinations of AS/AI and CS/CI. + CaseSensitivity caseSensitivity; + AccentSensitivity accentSensitivity; + if (collationName.equals(locale) || + collationName.equals(locale + "_AS") || + collationName.equals(locale + "_CS") || + collationName.equals(locale + "_AS_CS") || + collationName.equals(locale + "_CS_AS") + ) { + caseSensitivity = CaseSensitivity.CS; + accentSensitivity = AccentSensitivity.AS; + } else if (collationName.equals(locale + "_CI") || + collationName.equals(locale + "_AS_CI") || + collationName.equals(locale + "_CI_AS")) { + caseSensitivity = CaseSensitivity.CI; + accentSensitivity = AccentSensitivity.AS; + } else if (collationName.equals(locale + "_AI") || + collationName.equals(locale + "_CS_AI") || + collationName.equals(locale + "_AI_CS")) { + caseSensitivity = CaseSensitivity.CS; + accentSensitivity = AccentSensitivity.AI; + } else if (collationName.equals(locale + "_AI_CI") || + collationName.equals(locale + "_CI_AI")) { + caseSensitivity = CaseSensitivity.CI; + accentSensitivity = AccentSensitivity.AI; + } else { + throw collationInvalidNameException(originalName); + } + + // Build collation ID from computed specifiers. 
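+ // For example, per the illustrative mapping in the collation ID layout above, "UNICODE_CI_AI" + // combines locale ID 0 (UNICODE) with the ICU, CI and AI bits, yielding 0x20030000.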
+ collationId = SpecifierUtils.setSpecValue(collationId, + IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); + collationId = SpecifierUtils.setSpecValue(collationId, + CASE_SENSITIVITY_OFFSET, caseSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, + ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + return collationId; + } + } + + private static CollationSpecICU fromCollationId(int collationId) { + // Parse specifiers from collation ID. + int caseSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId, + CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); + int accentSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId, + ACCENT_SENSITIVITY_OFFSET, ACCENT_SENSITIVITY_MASK); + collationId = SpecifierUtils.removeSpec(collationId, + IMPLEMENTATION_PROVIDER_OFFSET, IMPLEMENTATION_PROVIDER_MASK); + collationId = SpecifierUtils.removeSpec(collationId, + CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); + collationId = SpecifierUtils.removeSpec(collationId, + ACCENT_SENSITIVITY_OFFSET, ACCENT_SENSITIVITY_MASK); + // Locale ID remains after removing all other specifiers. + int localeId = collationId; + // Verify locale ID is valid against `ICULocaleNames` array. + assert(localeId >= 0 && localeId < ICULocaleNames.length); + CaseSensitivity caseSensitivity = CaseSensitivity.values()[caseSensitivityOrdinal]; + AccentSensitivity accentSensitivity = AccentSensitivity.values()[accentSensitivityOrdinal]; + String locale = ICULocaleNames[localeId]; + return new CollationSpecICU(locale, caseSensitivity, accentSensitivity); + } + + @Override + protected Collation buildCollation() { + ULocale.Builder builder = new ULocale.Builder(); + builder.setLocale(ICULocaleMap.get(locale)); + // Compute unicode locale keyword for all combinations of case/accent sensitivity. + if (caseSensitivity == CaseSensitivity.CS && + accentSensitivity == AccentSensitivity.AS) { + builder.setUnicodeLocaleKeyword("ks", "level3"); + } else if (caseSensitivity == CaseSensitivity.CS && + accentSensitivity == AccentSensitivity.AI) { + builder + .setUnicodeLocaleKeyword("ks", "level1") + .setUnicodeLocaleKeyword("kc", "true"); + } else if (caseSensitivity == CaseSensitivity.CI && + accentSensitivity == AccentSensitivity.AS) { + builder.setUnicodeLocaleKeyword("ks", "level2"); + } else if (caseSensitivity == CaseSensitivity.CI && + accentSensitivity == AccentSensitivity.AI) { + builder.setUnicodeLocaleKeyword("ks", "level1"); + } + ULocale resultLocale = builder.build(); + Collator collator = Collator.getInstance(resultLocale); + // Freeze ICU collator to ensure thread safety. + collator.freeze(); + return new Collation( + collationName(), + PROVIDER_ICU, + collator, + (s1, s2) -> collator.compare(s1.toString(), s2.toString()), + ICU_COLLATOR_VERSION, + s -> (long) collator.getCollationKey(s.toString()).hashCode(), + /* supportsBinaryEquality = */ false, + /* supportsBinaryOrdering = */ false, + /* supportsLowercaseEquality = */ false); + } + + /** + * Compute normalized collation name. Components of collation name are given in order: + * - Locale name + * - Optional case sensitivity when non-default preceded by underscore + * - Optional accent sensitivity when non-default preceded by underscore + * Examples: en, en_USA_CI_AI, sr_Cyrl_SRB_AI. 
+ */ + private String collationName() { + StringBuilder builder = new StringBuilder(); + builder.append(locale); + if (caseSensitivity != CaseSensitivity.CS) { + builder.append('_'); + builder.append(caseSensitivity.toString()); + } + if (accentSensitivity != AccentSensitivity.AS) { + builder.append('_'); + builder.append(accentSensitivity.toString()); + } + return builder.toString(); + } + } + + /** + * Utility class for manipulating conversions between collation IDs and specifier enums/locale + * IDs. Scope bitwise operations here to avoid confusion. + */ + private static class SpecifierUtils { + private static int getSpecValue(int collationId, int offset, int mask) { + return (collationId >> offset) & mask; + } + + private static int removeSpec(int collationId, int offset, int mask) { + return collationId & ~(mask << offset); + } + + private static int setSpecValue(int collationId, int offset, Enum spec) { + return collationId | (spec.ordinal() << offset); + } + } + + /** Returns the collation identifier. */ + public CollationIdentifier identifier() { + return new CollationIdentifier(provider, collationName, version); } } + public static final String PROVIDER_SPARK = "spark"; + public static final String PROVIDER_ICU = "icu"; + public static final List SUPPORTED_PROVIDERS = List.of(PROVIDER_SPARK, PROVIDER_ICU); + + public static final int UTF8_BINARY_COLLATION_ID = + Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION_ID; + public static final int UTF8_LCASE_COLLATION_ID = + Collation.CollationSpecUTF8.UTF8_LCASE_COLLATION_ID; + public static final int UNICODE_COLLATION_ID = + Collation.CollationSpecICU.UNICODE_COLLATION_ID; + public static final int UNICODE_CI_COLLATION_ID = + Collation.CollationSpecICU.UNICODE_CI_COLLATION_ID; + public static final int INDETERMINATE_COLLATION_ID = + Collation.CollationSpec.INDETERMINATE_COLLATION_ID; + /** * Returns a StringSearch object for the given pattern and target strings, under collation * rules corresponding to the given collationId. The external ICU library StringSearch object can @@ -196,33 +754,27 @@ public static StringSearch getStringSearch( final UTF8String targetUTF8String, final UTF8String patternUTF8String, final int collationId) { - String pattern = patternUTF8String.toString(); - CharacterIterator target = new StringCharacterIterator(targetUTF8String.toString()); - Collator collator = CollationFactory.fetchCollation(collationId).collator; - return new StringSearch(pattern, target, (RuleBasedCollator) collator); - } - - /** - * Returns if the given collationName is valid one. - */ - public static boolean isValidCollation(String collationName) { - return collationNameToIdMap.containsKey(collationName.toUpperCase()); + return getStringSearch(targetUTF8String.toString(), patternUTF8String.toString(), collationId); } /** - * Returns closest valid name to collationName + * Returns a StringSearch object for the given pattern and target strings, under collation + * rules corresponding to the given collationId. The external ICU library StringSearch object can + * be used to find occurrences of the pattern in the target string, while respecting collation. 
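+ * For example, callers typically iterate matches by calling stringSearch.next() until it + * returns StringSearch.DONE, as done in CollationAwareUTF8String.replace above.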
*/ - public static String getClosestCollation(String collationName) { - Collation suggestion = Collections.min(List.of(collationTable), Comparator.comparingInt( - c -> UTF8String.fromString(c.collationName).levenshteinDistance( - UTF8String.fromString(collationName.toUpperCase())))); - return suggestion.collationName; + public static StringSearch getStringSearch( + final String targetString, + final String patternString, + final int collationId) { + CharacterIterator target = new StringCharacterIterator(targetString); + Collator collator = CollationFactory.fetchCollation(collationId).collator; + return new StringSearch(patternString, target, (RuleBasedCollator) collator); } /** * Returns a collation-unaware StringSearch object for the given pattern and target strings. * While this object does not respect collation, it can be used to find occurrences of the pattern - * in the target string for UTF8_BINARY or UTF8_BINARY_LCASE (if arguments are lowercased). + * in the target string for UTF8_BINARY or UTF8_LCASE (if arguments are lowercased). */ public static StringSearch getStringSearch( final UTF8String targetUTF8String, @@ -231,32 +783,139 @@ public static StringSearch getStringSearch( } /** - * Returns the collation id for the given collation name. + * Returns the collation ID for the given collation name. */ public static int collationNameToId(String collationName) throws SparkException { - String normalizedName = collationName.toUpperCase(); - if (collationNameToIdMap.containsKey(normalizedName)) { - return collationNameToIdMap.get(normalizedName); - } else { - Collation suggestion = Collections.min(List.of(collationTable), Comparator.comparingInt( - c -> UTF8String.fromString(c.collationName).levenshteinDistance( - UTF8String.fromString(normalizedName)))); + return Collation.CollationSpec.collationNameToId(collationName); + } - Map params = new HashMap<>(); - params.put("collationName", collationName); - params.put("proposal", suggestion.collationName); + public static void assertValidProvider(String provider) throws SparkException { + if (!SUPPORTED_PROVIDERS.contains(provider.toLowerCase())) { + Map params = Map.of( + "provider", provider, + "supportedProviders", String.join(", ", SUPPORTED_PROVIDERS) + ); throw new SparkException( - "COLLATION_INVALID_NAME", SparkException.constructMessageParams(params), null); + "COLLATION_INVALID_PROVIDER", SparkException.constructMessageParams(params), null); } } public static Collation fetchCollation(int collationId) { - return collationTable[collationId]; + return Collation.CollationSpec.fetchCollation(collationId); } public static Collation fetchCollation(String collationName) throws SparkException { - int collationId = collationNameToId(collationName); - return collationTable[collationId]; + return fetchCollation(collationNameToId(collationName)); + } + + public static String[] getICULocaleNames() { + return Collation.CollationSpecICU.ICULocaleNames; + } + + public static UTF8String getCollationKey(UTF8String input, int collationId) { + Collation collation = fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return input; + } else if (collation.supportsLowercaseEquality) { + return input.toLowerCase(); + } else { + CollationKey collationKey = collation.collator.getCollationKey(input.toString()); + return UTF8String.fromBytes(collationKey.toByteArray()); + } + } + + public static byte[] getCollationKeyBytes(UTF8String input, int collationId) { + Collation collation = fetchCollation(collationId); + if 
(collation.supportsBinaryEquality) { + return input.getBytes(); + } else if (collation.supportsLowercaseEquality) { + return input.toLowerCase().getBytes(); + } else { + return collation.collator.getCollationKey(input.toString()).toByteArray(); + } + } + + /** + * Returns same string if collation name is valid or the closest suggestion if it is invalid. + */ + public static String getClosestSuggestionsOnInvalidName( + String collationName, int maxSuggestions) { + String[] validRootNames; + String[] validModifiers; + if (collationName.startsWith("UTF8_")) { + validRootNames = new String[]{ + Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION.collationName, + Collation.CollationSpecUTF8.UTF8_LCASE_COLLATION.collationName + }; + validModifiers = new String[0]; + } else { + validRootNames = getICULocaleNames(); + validModifiers = new String[]{"_CI", "_AI", "_CS", "_AS"}; + } + + // Split modifiers and locale name. + final int MODIFIER_LENGTH = 3; + String localeName = collationName.toUpperCase(); + List modifiers = new ArrayList<>(); + while (Arrays.stream(validModifiers).anyMatch(localeName::endsWith)) { + modifiers.add(localeName.substring(localeName.length() - MODIFIER_LENGTH)); + localeName = localeName.substring(0, localeName.length() - MODIFIER_LENGTH); + } + + // Suggest version with unique modifiers. + Collections.reverse(modifiers); + modifiers = modifiers.stream().distinct().toList(); + + // Remove conflicting settings. + if (modifiers.contains("_CI") && modifiers.contains(("_CS"))) { + modifiers = modifiers.stream().filter(m -> !m.equals("_CI")).toList(); + } + + if (modifiers.contains("_AI") && modifiers.contains(("_AS"))) { + modifiers = modifiers.stream().filter(m -> !m.equals("_AI")).toList(); + } + + final String finalLocaleName = localeName; + Comparator distanceComparator = (c1, c2) -> { + int distance1 = UTF8String.fromString(c1.toUpperCase()) + .levenshteinDistance(UTF8String.fromString(finalLocaleName)); + int distance2 = UTF8String.fromString(c2.toUpperCase()) + .levenshteinDistance(UTF8String.fromString(finalLocaleName)); + return Integer.compare(distance1, distance2); + }; + + String[] rootNamesByDistance = Arrays.copyOf(validRootNames, validRootNames.length); + Arrays.sort(rootNamesByDistance, distanceComparator); + Function isCollationNameValid = name -> { + try { + collationNameToId(name); + return true; + } catch (SparkException e) { + return false; + } + }; + + final int suggestionThreshold = 3; + final ArrayList suggestions = new ArrayList<>(maxSuggestions); + for (int i = 0; i < maxSuggestions; i++) { + // Add at least one suggestion. + // Add others if distance from the original is lower than threshold. 
+ String suggestion = rootNamesByDistance[i] + String.join("", modifiers); + assert(isCollationNameValid.apply(suggestion)); + if (suggestions.isEmpty()) { + suggestions.add(suggestion); + } else { + int distance = UTF8String.fromString(suggestion.toUpperCase()) + .levenshteinDistance(UTF8String.fromString(collationName.toUpperCase())); + if (distance < suggestionThreshold) { + suggestions.add(suggestion); + } else { + break; + } + } + } + + return String.join(", ", suggestions); } } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index fe1952921b7fb..fa4a40b74ab24 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -20,6 +20,11 @@ import org.apache.spark.unsafe.types.UTF8String; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + /** * Static entry point for collation-aware expressions (StringExpressions, RegexpExpressions, and * other expressions that require custom collation support), as well as private utility methods for @@ -31,6 +36,62 @@ public final class CollationSupport { * Collation-aware string expressions. */ + public static class StringSplitSQL { + public static UTF8String[] exec(final UTF8String s, final UTF8String d, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(s, d); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(s, d); + } else { + return execICU(s, d, collationId); + } + } + public static String genCode(final String s, final String d, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringSplitSQL.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", s, d); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s, %s)", s, d); + } else { + return String.format(expr + "ICU(%s, %s, %d)", s, d, collationId); + } + } + public static UTF8String[] execBinary(final UTF8String string, final UTF8String delimiter) { + return string.splitSQL(delimiter, -1); + } + public static UTF8String[] execLowercase(final UTF8String string, final UTF8String delimiter) { + if (delimiter.numBytes() == 0) return new UTF8String[] { string }; + if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 }; + Pattern pattern = Pattern.compile(Pattern.quote(delimiter.toString()), + CollationSupport.lowercaseRegexFlags); + String[] splits = pattern.split(string.toString(), -1); + UTF8String[] res = new UTF8String[splits.length]; + for (int i = 0; i < res.length; i++) { + res[i] = UTF8String.fromString(splits[i]); + } + return res; + } + public static UTF8String[] execICU(final UTF8String string, final UTF8String delimiter, + final int collationId) { + if (delimiter.numBytes() == 0) return new UTF8String[] { string }; + if (string.numBytes() == 0) return new UTF8String[] { UTF8String.EMPTY_UTF8 }; + List strings = new ArrayList<>(); + String target = string.toString(), pattern = delimiter.toString(); + StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); + int start = 0, end; + while ((end 
= stringSearch.next()) != StringSearch.DONE) { + strings.add(UTF8String.fromString(target.substring(start, end))); + start = end + stringSearch.getMatchLength(); + } + if (start <= target.length()) { + strings.add(UTF8String.fromString(target.substring(start))); + } + return strings.toArray(new UTF8String[0]); + } + } + public static class Contains { public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); @@ -57,7 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.toLowerCase().contains(r.toLowerCase()); + return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -95,11 +156,14 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.startsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.toLowerCase().startsWith(r.toLowerCase()); + return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { - return CollationAwareUTF8String.matchAt(l, r, 0, collationId); + if (r.numBytes() == 0) return true; + if (l.numBytes() == 0) return false; + StringSearch stringSearch = CollationFactory.getStringSearch(l, r, collationId); + return stringSearch.first() == 0; } } @@ -129,46 +193,582 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.endsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.toLowerCase().endsWith(r.toLowerCase()); + return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars()); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { - return CollationAwareUTF8String.matchAt(l, r, l.numBytes() - r.numBytes(), collationId); + if (r.numBytes() == 0) return true; + if (l.numBytes() == 0) return false; + StringSearch stringSearch = CollationFactory.getStringSearch(l, r, collationId); + int endIndex = stringSearch.getTarget().getEndIndex(); + return stringSearch.last() == endIndex - stringSearch.getMatchLength(); } } - // TODO: Add more collation-aware string expressions. + public static class Upper { + public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return useICU ? execBinaryICU(v) : execBinary(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); + } else { + return execICU(v, collationId); + } + } + public static String genCode(final String v, final int collationId, boolean useICU) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.Upper.exec"; + if (collation.supportsBinaryEquality) { + String funcName = useICU ? 
"BinaryICU" : "Binary"; + return String.format(expr + "%s(%s)", funcName, v); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); + } else { + return String.format(expr + "ICU(%s, %d)", v, collationId); + } + } + public static UTF8String execBinary(final UTF8String v) { + return v.toUpperCase(); + } + public static UTF8String execBinaryICU(final UTF8String v) { + return CollationAwareUTF8String.toUpperCase(v); + } + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toUpperCase(v); + } + public static UTF8String execICU(final UTF8String v, final int collationId) { + return CollationAwareUTF8String.toUpperCase(v, collationId); + } + } - /** - * Collation-aware regexp expressions. - */ + public static class Lower { + public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return useICU ? execBinaryICU(v) : execBinary(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); + } else { + return execICU(v, collationId); + } + } + public static String genCode(final String v, final int collationId, boolean useICU) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.Lower.exec"; + if (collation.supportsBinaryEquality) { + String funcName = useICU ? "BinaryICU" : "Binary"; + return String.format(expr + "%s(%s)", funcName, v); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); + } else { + return String.format(expr + "ICU(%s, %d)", v, collationId); + } + } + public static UTF8String execBinary(final UTF8String v) { + return v.toLowerCase(); + } + public static UTF8String execBinaryICU(final UTF8String v) { + return CollationAwareUTF8String.toLowerCase(v); + } + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toLowerCase(v); + } + public static UTF8String execICU(final UTF8String v, final int collationId) { + return CollationAwareUTF8String.toLowerCase(v, collationId); + } + } - // TODO: Add more collation-aware regexp expressions. + public static class InitCap { + public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return useICU ? execBinaryICU(v) : execBinary(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); + } else { + return execICU(v, collationId); + } + } - /** - * Other collation-aware expressions. - */ + public static String genCode(final String v, final int collationId, boolean useICU) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.InitCap.exec"; + if (collation.supportsBinaryEquality) { + String funcName = useICU ? 
"BinaryICU" : "Binary"; + return String.format(expr + "%s(%s)", funcName, v); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); + } else { + return String.format(expr + "ICU(%s, %d)", v, collationId); + } + } + public static UTF8String execBinary(final UTF8String v) { + return v.toLowerCase().toTitleCase(); + } + public static UTF8String execBinaryICU(final UTF8String v) { + return CollationAwareUTF8String.toLowerCase(v).toTitleCaseICU(); + } + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toTitleCase(v); + } + public static UTF8String execICU(final UTF8String v, final int collationId) { + return CollationAwareUTF8String.toTitleCase(v, collationId); + } + } - // TODO: Add other collation-aware expressions. + public static class FindInSet { + public static int exec(final UTF8String word, final UTF8String set, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(word, set); + } else { + return execCollationAware(word, set, collationId); + } + } + public static String genCode(final String word, final String set, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.FindInSet.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", word, set); + } else { + return String.format(expr + "execCollationAware(%s, %s, %d)", word, set, collationId); + } + } + public static int execBinary(final UTF8String word, final UTF8String set) { + return set.findInSet(word); + } + public static int execCollationAware(final UTF8String word, final UTF8String set, + final int collationId) { + return CollationAwareUTF8String.findInSet(word, set, collationId); + } + } - /** - * Utility class for collation-aware UTF8String operations. 
- */ + public static class StringInstr { + public static int exec(final UTF8String string, final UTF8String substring, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(string, substring); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(string, substring); + } else { + return execICU(string, substring, collationId); + } + } + public static String genCode(final String string, final String substring, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringInstr.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", string, substring); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s, %s)", string, substring); + } else { + return String.format(expr + "ICU(%s, %s, %d)", string, substring, collationId); + } + } + public static int execBinary(final UTF8String string, final UTF8String substring) { + return string.indexOf(substring, 0); + } + public static int execLowercase(final UTF8String string, final UTF8String substring) { + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0); + } + public static int execICU(final UTF8String string, final UTF8String substring, + final int collationId) { + return CollationAwareUTF8String.indexOf(string, substring, 0, collationId); + } + } + + public static class StringReplace { + public static UTF8String exec(final UTF8String src, final UTF8String search, + final UTF8String replace, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(src, search, replace); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(src, search, replace); + } else { + return execICU(src, search, replace, collationId); + } + } + public static String genCode(final String src, final String search, final String replace, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringReplace.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s, %s)", src, search, replace); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s, %s, %s)", src, search, replace); + } else { + return String.format(expr + "ICU(%s, %s, %s, %d)", src, search, replace, collationId); + } + } + public static UTF8String execBinary(final UTF8String src, final UTF8String search, + final UTF8String replace) { + return src.replace(search, replace); + } + public static UTF8String execLowercase(final UTF8String src, final UTF8String search, + final UTF8String replace) { + return CollationAwareUTF8String.lowercaseReplace(src, search, replace); + } + public static UTF8String execICU(final UTF8String src, final UTF8String search, + final UTF8String replace, final int collationId) { + return CollationAwareUTF8String.replace(src, search, replace, collationId); + } + } + + public static class StringLocate { + public static int exec(final UTF8String string, final UTF8String substring, final int start, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return 
execBinary(string, substring, start); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(string, substring, start); + } else { + return execICU(string, substring, start, collationId); + } + } + public static String genCode(final String string, final String substring, final int start, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringLocate.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s, %d)", string, substring, start); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s, %s, %d)", string, substring, start); + } else { + return String.format(expr + "ICU(%s, %s, %d, %d)", string, substring, start, collationId); + } + } + public static int execBinary(final UTF8String string, final UTF8String substring, + final int start) { + return string.indexOf(substring, start); + } + public static int execLowercase(final UTF8String string, final UTF8String substring, + final int start) { + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start); + } + public static int execICU(final UTF8String string, final UTF8String substring, final int start, + final int collationId) { + return CollationAwareUTF8String.indexOf(string, substring, start, collationId); + } + } + + public static class SubstringIndex { + public static UTF8String exec(final UTF8String string, final UTF8String delimiter, + final int count, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(string, delimiter, count); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(string, delimiter, count); + } else { + return execICU(string, delimiter, count, collationId); + } + } + public static String genCode(final String string, final String delimiter, + final int count, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.SubstringIndex.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s, %d)", string, delimiter, count); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s, %s, %d)", string, delimiter, count); + } else { + return String.format(expr + "ICU(%s, %s, %d, %d)", string, delimiter, count, collationId); + } + } + public static UTF8String execBinary(final UTF8String string, final UTF8String delimiter, + final int count) { + return string.subStringIndex(delimiter, count); + } + public static UTF8String execLowercase(final UTF8String string, final UTF8String delimiter, + final int count) { + return CollationAwareUTF8String.lowercaseSubStringIndex(string, delimiter, count); + } + public static UTF8String execICU(final UTF8String string, final UTF8String delimiter, + final int count, final int collationId) { + return CollationAwareUTF8String.subStringIndex(string, delimiter, count, + collationId); + } + } + + public static class StringTranslate { + public static UTF8String exec(final UTF8String source, Map dict, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(source, dict); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(source, dict); + } else 
{ + return execICU(source, dict, collationId); + } + } + public static String genCode(final String source, final String dict, final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTranslate.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", source, dict); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s, %s)", source, dict); + } else { + return String.format(expr + "ICU(%s, %s, %d)", source, dict, collationId); + } + } + public static UTF8String execBinary(final UTF8String source, Map dict) { + return source.translate(dict); + } + public static UTF8String execLowercase(final UTF8String source, Map dict) { + String srcStr = source.toString(); + StringBuilder sb = new StringBuilder(); + int charCount = 0; + for (int k = 0; k < srcStr.length(); k += charCount) { + int codePoint = srcStr.codePointAt(k); + charCount = Character.charCount(codePoint); + String subStr = srcStr.substring(k, k + charCount); + String translated = dict.get(subStr.toLowerCase()); + if (null == translated) { + sb.append(subStr); + } else if (!"\0".equals(translated)) { + sb.append(translated); + } + } + return UTF8String.fromString(sb.toString()); + } + public static UTF8String execICU(final UTF8String source, Map dict, + final int collationId) { + return source.translate(CollationAwareUTF8String.getCollationAwareDict( + source, dict, collationId)); + } + } + + public static class StringTrim { + public static UTF8String exec( + final UTF8String srcString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(srcString); + } else { + return execLowercase(srcString); + } + } + public static UTF8String exec( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(srcString, trimString); + } else { + return execLowercase(srcString, trimString); + } + } + public static String genCode( + final String srcString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTrim.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s)", srcString); + } else { + return String.format(expr + "Lowercase(%s)", srcString); + } + } + public static String genCode( + final String srcString, + final String trimString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTrim.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } + } + public static UTF8String execBinary( + final UTF8String srcString) { + return srcString.trim(); + } + public static UTF8String execBinary( + final UTF8String srcString, + final UTF8String trimString) { + return srcString.trim(trimString); + } + public static UTF8String execLowercase( + final UTF8String srcString) { + return srcString.trim(); + } + public static UTF8String execLowercase( + final UTF8String srcString, + final UTF8String trimString) { + return 
CollationAwareUTF8String.lowercaseTrim(srcString, trimString); + } + } - private static class CollationAwareUTF8String { + public static class StringTrimLeft { + public static UTF8String exec( + final UTF8String srcString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(srcString); + } else { + return execLowercase(srcString); + } + } + public static UTF8String exec( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(srcString, trimString); + } else { + return execLowercase(srcString, trimString); + } + } + public static String genCode( + final String srcString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTrimLeft.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s)", srcString); + } else { + return String.format(expr + "Lowercase(%s)", srcString); + } + } + public static String genCode( + final String srcString, + final String trimString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTrimLeft.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } + } + public static UTF8String execBinary( + final UTF8String srcString) { + return srcString.trimLeft(); + } + public static UTF8String execBinary( + final UTF8String srcString, + final UTF8String trimString) { + return srcString.trimLeft(trimString); + } + public static UTF8String execLowercase( + final UTF8String srcString) { + return srcString.trimLeft(); + } + public static UTF8String execLowercase( + final UTF8String srcString, + final UTF8String trimString) { + return CollationAwareUTF8String.lowercaseTrimLeft(srcString, trimString); + } + } - private static boolean matchAt(final UTF8String target, final UTF8String pattern, - final int pos, final int collationId) { - if (pattern.numChars() + pos > target.numChars() || pos < 0) { - return false; + public static class StringTrimRight { + public static UTF8String exec( + final UTF8String srcString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(srcString); + } else { + return execLowercase(srcString); + } + } + public static UTF8String exec( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return execBinary(srcString, trimString); + } else { + return execLowercase(srcString, trimString); } - if (pattern.numBytes() == 0 || target.numBytes() == 0) { - return pattern.numBytes() == 0; + } + public static String genCode( + final String srcString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTrimRight.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + 
"Binary(%s)", srcString); + } else { + return String.format(expr + "Lowercase(%s)", srcString); } - return CollationFactory.getStringSearch(target.substring( - pos, pos + pattern.numChars()), pattern, collationId).last() == 0; } + public static String genCode( + final String srcString, + final String trimString, + final int collationId) { + CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); + String expr = "CollationSupport.StringTrimRight.exec"; + if (collation.supportsBinaryEquality) { + return String.format(expr + "Binary(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } + } + public static UTF8String execBinary( + final UTF8String srcString) { + return srcString.trimRight(); + } + public static UTF8String execBinary( + final UTF8String srcString, + final UTF8String trimString) { + return srcString.trimRight(trimString); + } + public static UTF8String execLowercase( + final UTF8String srcString) { + return srcString.trimRight(); + } + public static UTF8String execLowercase( + final UTF8String srcString, + final UTF8String trimString) { + return CollationAwareUTF8String.lowercaseTrimRight(srcString, trimString); + } + } + + // TODO: Add more collation-aware string expressions. + + /** + * Collation-aware regexp expressions. + */ + + public static boolean supportsLowercaseRegex(final int collationId) { + // for regex, only Unicode case-insensitive matching is possible, + // so UTF8_LCASE is treated as UNICODE_CI in this context + return CollationFactory.fetchCollation(collationId).supportsLowercaseEquality; + } + private static final int lowercaseRegexFlags = Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE; + public static int collationAwareRegexFlags(final int collationId) { + return supportsLowercaseRegex(collationId) ? lowercaseRegexFlags : 0; } + private static final UTF8String lowercaseRegexPrefix = UTF8String.fromString("(?ui)"); + public static UTF8String lowercaseRegex(final UTF8String regex) { + return UTF8String.concat(lowercaseRegexPrefix, regex); + } + public static UTF8String collationAwareRegex(final UTF8String regex, final int collationId) { + return supportsLowercaseRegex(collationId) ? lowercaseRegex(regex) : regex; + } + + /** + * Other collation-aware expressions. + */ + + // TODO: Add other collation-aware expressions. 
+ } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 2009f1d20442c..a2372d28a6c41 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -21,7 +21,10 @@ import java.io.*; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.Arrays; +import java.util.function.Function; +import java.util.Iterator; import java.util.Map; import java.util.regex.Pattern; @@ -29,6 +32,7 @@ import com.esotericsoftware.kryo.KryoSerializable; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; +import com.ibm.icu.lang.UCharacter; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.unsafe.Platform; @@ -56,6 +60,7 @@ public final class UTF8String implements Comparable, Externalizable, private Object base; private long offset; private int numBytes; + private volatile int numChars = -1; public Object getBaseObject() { return base; } public long getBaseOffset() { return offset; } @@ -100,6 +105,8 @@ public final class UTF8String implements Comparable, Externalizable, private static final UTF8String COMMA_UTF8 = UTF8String.fromString(","); public static final UTF8String EMPTY_UTF8 = UTF8String.fromString(""); + public static final UTF8String ZERO_UTF8 = UTF8String.fromString("0"); + /** * Creates an UTF8String from byte array, which should be encoded in UTF-8. @@ -114,6 +121,14 @@ public static UTF8String fromBytes(byte[] bytes) { } } + private static UTF8String fromBytes(ArrayList bytes) { + byte[] byteArray = new byte[bytes.size()]; + for (int i = 0; i < bytes.size(); i++) { + byteArray[i] = bytes.get(i); + } + return fromBytes(byteArray); + } + /** * Creates an UTF8String from byte array, which should be encoded in UTF-8. * @@ -224,7 +239,7 @@ public void writeTo(OutputStream out) throws IOException { * Returns the number of bytes for a code point with the first byte as `b` * @param b The first byte of a code point */ - private static int numBytesForFirstByte(final byte b) { + public static int numBytesForFirstByte(final byte b) { final int offset = b & 0xFF; byte numBytes = bytesOfCodePointInUTF8[offset]; return (numBytes == 0) ? 1: numBytes; // Skip the first byte disallowed in UTF-8 @@ -241,6 +256,16 @@ public int numBytes() { * Returns the number of code points in it. */ public int numChars() { + if (numChars == -1) numChars = getNumChars(); + return numChars; + } + + /** + * Private helper method to calculate the number of code points in the UTF-8 string. Counting + * the code points is a linear time operation, as we need to scan the entire UTF-8 string. + * Hence, this method should generally only be called once for non-empty UTF-8 strings. + */ + private int getNumChars() { int len = 0; for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByte(i))) { len += 1; @@ -270,6 +295,228 @@ public byte[] getBytes() { } } + /** + * Utility methods and constants for UTF-8 string validation. 
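+ * For example, a continuation byte must lie in [0x80, 0xBF], while the second byte of a sequence starting with 0xE0 is further restricted to [0xA0, 0xBF] (Unicode 15, Table 3-7).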
+ */ + + private static boolean isValidContinuationByte(byte b) { + return b >= (byte) 0x80 && b <= (byte) 0xBF; + } + + private static boolean isValidSecondByte(byte b, byte firstByte) { + return switch (firstByte) { + case (byte) 0xE0 -> b >= (byte) 0xA0 && b <= (byte) 0xBF; + case (byte) 0xED -> b >= (byte) 0x80 && b <= (byte) 0x9F; + case (byte) 0xF0 -> b >= (byte) 0x90 && b <= (byte) 0xBF; + case (byte) 0xF4 -> b >= (byte) 0x80 && b <= (byte) 0x8F; + default -> isValidContinuationByte(b); + }; + } + + private static final byte[] UNICODE_REPLACEMENT_CHARACTER = + new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD }; + + private static void appendReplacementCharacter(ArrayList bytes) { + for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b); + } + + /** + * Returns a validated version of the current UTF-8 string by replacing invalid UTF-8 sequences + * with the Unicode replacement character (U+FFFD), as per the rules defined in the Unicode + * standard 15, Section 3.9, Paragraph D86, Table 3-7. This behaviour is consistent with the + * behaviour of `UnicodeString` in ICU4C. + * + * @return A new UTF8String that is a valid UTF8 string. + */ + public UTF8String makeValid() { + ArrayList bytes = new ArrayList<>(); + int byteIndex = 0; + while (byteIndex < numBytes) { + // Read the first byte. + byte firstByte = getByte(byteIndex); + int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF]; + int codePointLen = Math.min(expectedLen, numBytes - byteIndex); + // 0B UTF-8 sequence (invalid first byte). + if (codePointLen == 0) { + appendReplacementCharacter(bytes); + ++byteIndex; + continue; + } + // 1B UTF-8 sequence (ASCII or truncated). + if (codePointLen == 1) { + if (firstByte >= 0) bytes.add(firstByte); + else appendReplacementCharacter(bytes); + ++byteIndex; + continue; + } + // Read the second byte. + byte secondByte = getByte(byteIndex + 1); + if (!isValidSecondByte(secondByte, firstByte)) { + appendReplacementCharacter(bytes); + ++byteIndex; + continue; + } + // Read remaining continuation bytes. + int continuationBytes = 2; + for (; continuationBytes < codePointLen; ++continuationBytes) { + byte nextByte = getByte(byteIndex + continuationBytes); + if (!isValidContinuationByte(nextByte)) { + break; + } + } + // Invalid UTF-8 sequence (not enough continuation bytes). + if (continuationBytes < expectedLen) { + appendReplacementCharacter(bytes); + byteIndex += continuationBytes; + continue; + } + // Valid UTF-8 sequence. + for (int i = 0; i < codePointLen; ++i) { + bytes.add(getByte(byteIndex + i)); + } + byteIndex += codePointLen; + } + return UTF8String.fromBytes(bytes); + } + + /** + * Checks if the current UTF8String is valid. + * + * @return If string represents a valid UTF8 string. + */ + public boolean isValid() { + int byteIndex = 0; + while (byteIndex < numBytes) { + // Read the first byte. + byte firstByte = getByte(byteIndex); + int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF]; + int codePointLen = Math.min(expectedLen, numBytes - byteIndex); + // 0B UTF-8 sequence (invalid first byte). + if (codePointLen == 0) return false; + // 1B UTF-8 sequence (ASCII or truncated). + if (codePointLen == 1) { + if (firstByte >= 0) { + ++byteIndex; + continue; + } + else return false; + } + // Read the second byte. + byte secondByte = getByte(byteIndex + 1); + if (!isValidSecondByte(secondByte, firstByte)) return false; + // Read remaining continuation bytes. 
+ int continuationBytes = 2; + for (; continuationBytes < codePointLen; ++continuationBytes) { + byte nextByte = getByte(byteIndex + continuationBytes); + if (!isValidContinuationByte(nextByte)) return false; + } + // Invalid UTF-8 sequence (not enough continuation bytes). + if (continuationBytes < expectedLen) return false; + // Valid UTF-8 sequence. + byteIndex += codePointLen; + } + return true; + } + + /** + * Code point iteration over a UTF8String can be done using one of two modes: + * 1. CODE_POINT_ITERATOR_ASSUME_VALID: The caller ensures that the UTF8String is valid and does + * not contain any invalid UTF-8 byte sequences. In this case, the code point iterator will + * return the code points in the current string one by one, as integers. If an invalid code + * point is found within the string during iteration, an exception will be thrown. This mode + * is more dangerous, but faster - since no scan is needed prior to beginning iteration. + * 2. CODE_POINT_ITERATOR_MAKE_VALID: The caller does not ensure that the UTF8String is valid, + * but instead expects the code point iterator to first check whether the current UTF8String + * is valid, then perform the invalid byte sequence replacement using `makeValid`, and finally + * begin the code point iteration over the resulting valid UTF8String. However, the original + * UTF8String stays unchanged. This mode is safer, but slower - due to initial validation. + * The default mode is CODE_POINT_ITERATOR_ASSUME_VALID. + */ + public enum CodePointIteratorType { + CODE_POINT_ITERATOR_ASSUME_VALID, // USE ONLY WITH VALID STRINGS + CODE_POINT_ITERATOR_MAKE_VALID + } + + /** + * Returns a code point iterator for this UTF8String. + */ + public Iterator codePointIterator() { + return codePointIterator(CodePointIteratorType.CODE_POINT_ITERATOR_ASSUME_VALID); + } + + public Iterator codePointIterator(CodePointIteratorType iteratorMode) { + if (iteratorMode == CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID && !isValid()) { + return makeValid().codePointIterator(); + } + return new CodePointIterator(); + } + + /** + * Code point iterator implementation for the UTF8String class. The iterator will return code + * points in the current string one by one, as integers. However, the code point iterator is only + * guaranteed to work if the current UTF8String does not contain any invalid UTF-8 byte sequences. + * If the current string contains any invalid UTF-8 byte sequences, exceptions will be thrown. + */ + private class CodePointIterator implements Iterator { + // Byte index used to iterate over the current UTF8String. + private int byteIndex = 0; + + @Override + public boolean hasNext() { + return byteIndex < numBytes; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new IndexOutOfBoundsException(); + } + int codePoint = codePointFrom(byteIndex); + byteIndex += numBytesForFirstByte(getByte(byteIndex)); + return codePoint; + } + } + + /** + * Reverse version of the code point iterator for this UTF8String, returns code points in the + * current string one by one, as integers, in reverse order. The logic is similar to the above. 
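+ * For example, reverse iteration over "aü" yields the code point of 'ü' (U+00FC) first and then 'a', stepping backwards over the UTF-8 continuation byte of 'ü'.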
+ */ + + public Iterator reverseCodePointIterator() { + return reverseCodePointIterator(CodePointIteratorType.CODE_POINT_ITERATOR_ASSUME_VALID); + } + + public Iterator reverseCodePointIterator(CodePointIteratorType iteratorMode) { + if (iteratorMode == CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID && !isValid()) { + return makeValid().reverseCodePointIterator(); + } + return new ReverseCodePointIterator(); + } + + private class ReverseCodePointIterator implements Iterator { + private int byteIndex = numBytes - 1; + + @Override + public boolean hasNext() { + return byteIndex >= 0; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new IndexOutOfBoundsException(); + } + while (byteIndex > 0 && isContinuationByte(getByte(byteIndex))) { + --byteIndex; + } + return codePointFrom(byteIndex--); + } + + private boolean isContinuationByte(byte b) { + return (b & 0xC0) == 0x80; + } + } + /** * Returns a substring of this. * @param start the position of first code point @@ -342,10 +589,53 @@ public boolean contains(final UTF8String substring) { } /** - * Returns the byte at position `i`. + * Returns the byte at (byte) position `byteIndex`. If byte index is invalid, returns 0. */ - private byte getByte(int i) { - return Platform.getByte(base, offset + i); + public byte getByte(int byteIndex) { + return Platform.getByte(base, offset + byteIndex); + } + + /** + * Returns the code point at (char) position `charIndex`. If char index is invalid, throws + * exception. Note that this method is not efficient as it needs to traverse the UTF-8 string. + * If `byteIndex` of the first byte in the code point is known, use `codePointFrom` instead. + */ + public int getChar(int charIndex) { + if (charIndex < 0 || charIndex >= numChars()) { + throw new IndexOutOfBoundsException(); + } + int charCount = 0, byteCount = 0; + while (charCount < charIndex) { + byteCount += numBytesForFirstByte(getByte(byteCount)); + charCount += 1; + } + return codePointFrom(byteCount); + } + + /** + * Returns the code point starting from the byte at position `byteIndex`. + * If byte index is invalid, throws exception. + */ + public int codePointFrom(int byteIndex) { + if (byteIndex < 0 || byteIndex >= numBytes) { + throw new IndexOutOfBoundsException(); + } + byte b = getByte(byteIndex); + int numBytes = numBytesForFirstByte(b); + return switch (numBytes) { + case 1 -> + b & 0x7F; + case 2 -> + ((b & 0x1F) << 6) | (getByte(byteIndex + 1) & 0x3F); + case 3 -> + ((b & 0x0F) << 12) | ((getByte(byteIndex + 1) & 0x3F) << 6) | + (getByte(byteIndex + 2) & 0x3F); + case 4 -> + ((b & 0x07) << 18) | ((getByte(byteIndex + 1) & 0x3F) << 12) | + ((getByte(byteIndex + 2) & 0x3F) << 6) | (getByte(byteIndex + 3) & 0x3F); + default -> + throw new IllegalStateException("Error in UTF-8 code point"); + }; } public boolean matchAt(final UTF8String s, int pos) { @@ -364,56 +654,34 @@ public boolean endsWith(final UTF8String suffix) { } /** - * Returns the upper case of this string + * Method for ASCII character conversion using a functional interface for chars. 
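+ * For example, convertAscii(Character::toUpperCase) maps every byte through the given converter; callers are expected to verify isFullAscii() first, since the conversion is applied byte by byte.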
*/ - public UTF8String toUpperCase() { - if (numBytes == 0) { - return EMPTY_UTF8; - } - // Optimization - do char level uppercase conversion in case of chars in ASCII range - for (int i = 0; i < numBytes; i++) { - if (getByte(i) < 0) { - // non-ASCII - return toUpperCaseSlow(); - } - } + + private UTF8String convertAscii(Function charConverter) { byte[] bytes = new byte[numBytes]; for (int i = 0; i < numBytes; i++) { - bytes[i] = (byte) Character.toUpperCase(getByte(i)); + bytes[i] = (byte) charConverter.apply((char) getByte(i)).charValue(); } return fromBytes(bytes); } - private UTF8String toUpperCaseSlow() { - return fromString(toString().toUpperCase()); - } - /** - * Optimized lowercase comparison for UTF8_BINARY_LCASE collation - * a.compareLowerCase(b) is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()) + * Returns the upper case of this string */ - public int compareLowerCase(UTF8String other) { - int curr; - for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) { - byte left, right; - if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) { - return compareLowerCaseSuffixSlow(other, curr); - } - int lowerLeft = Character.toLowerCase(left); - int lowerRight = Character.toLowerCase(right); - if (lowerLeft != lowerRight) { - return lowerLeft - lowerRight; - } + public UTF8String toUpperCase() { + if (numBytes == 0) { + return EMPTY_UTF8; } - return numBytes - other.numBytes; + + return isFullAscii() ? toUpperCaseAscii() : toUpperCaseSlow(); + } + + public UTF8String toUpperCaseAscii() { + return convertAscii(Character::toUpperCase); } - private int compareLowerCaseSuffixSlow(UTF8String other, int pref) { - UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref, - numBytes - pref); - UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset + pref, - other.numBytes - pref); - return suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow()); + private UTF8String toUpperCaseSlow() { + return fromString(toString().toUpperCase()); } /** @@ -423,43 +691,57 @@ public UTF8String toLowerCase() { if (numBytes == 0) { return EMPTY_UTF8; } - // Optimization - do char level lowercase conversion in case of chars in ASCII range - for (int i = 0; i < numBytes; i++) { + + return isFullAscii() ? toLowerCaseAscii() : toLowerCaseSlow(); + } + + public boolean isFullAscii() { + for (var i = 0; i < numBytes; i++) { if (getByte(i) < 0) { - // non-ASCII - return toLowerCaseSlow(); + return false; } } - byte[] bytes = new byte[numBytes]; - for (int i = 0; i < numBytes; i++) { - bytes[i] = (byte) Character.toLowerCase(getByte(i)); - } - return fromBytes(bytes); + return true; } private UTF8String toLowerCaseSlow() { return fromString(toString().toLowerCase()); } + public UTF8String toLowerCaseAscii() { + return convertAscii(Character::toLowerCase); + } + /** - * Returns the title case of this string, that could be used as title. + * Returns the title case of this string, that could be used as title. There are essentially two + * different version of this method - one using the JVM case mapping rules, and the other using + * the ICU case mapping rules. ASCII implementation is the same for both, but please refer to the + * respective methods for the slow (non-ASCII) implementation for more details on the differences. 
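+ * For example, both variants map "spark sql" to "Spark Sql"; results can only differ for non-ASCII characters whose JVM and ICU title-case mappings disagree.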
*/ public UTF8String toTitleCase() { if (numBytes == 0) { return EMPTY_UTF8; } - // Optimization - in case of ASCII chars we can skip copying the data to and from StringBuilder - byte prev = ' ', curr; - for (int i = 0; i < numBytes; i++) { - curr = getByte(i); - if (prev == ' ' && curr < 0) { - // non-ASCII - return toTitleCaseSlow(); - } - prev = curr; + + return isFullAscii() ? toTitleCaseAscii() : toTitleCaseSlow(); + } + + public UTF8String toTitleCaseICU() { + if (numBytes == 0) { + return EMPTY_UTF8; } + + return isFullAscii() ? toTitleCaseAscii() : toTitleCaseSlowICU(); + } + + /* + * Fast path to return the title case of this string, given that all characters are ASCII. + * This implementation essentially works for all collations currently supported in Spark. + * This method is more efficient, because it skips copying the data to and from StringBuilder. + */ + private UTF8String toTitleCaseAscii() { byte[] bytes = new byte[numBytes]; - prev = ' '; + byte prev = ' ', curr; for (int i = 0; i < numBytes; i++) { curr = getByte(i); if (prev == ' ') { @@ -472,6 +754,11 @@ public UTF8String toTitleCase() { return fromBytes(bytes); } + /* + * Slow path to return the title case of this string, according to JVM case mapping rules. + * This is considered the "old" behaviour for UTF8_BINARY collation, and is not recommended. + * To use this, set the spark.sql.ICU_CASE_MAPPINGS_ENABLED configuration to `false`. + */ private UTF8String toTitleCaseSlow() { StringBuilder sb = new StringBuilder(); String s = toString(); @@ -485,6 +772,24 @@ private UTF8String toTitleCaseSlow() { return fromString(sb.toString()); } + /* + * Slow path to return the title case of this string, according to ICU case mapping rules. + * This is considered the "new" behaviour for UTF8_BINARY collation, and is recommended. + * This is used by default, since spark.sql.ICU_CASE_MAPPINGS_ENABLED is set to `true`. + */ + private UTF8String toTitleCaseSlowICU() { + StringBuilder sb = new StringBuilder(); + String s = toString(); + sb.append(s); + sb.setCharAt(0, (char) UCharacter.toTitleCase(sb.charAt(0))); + for (int i = 1; i < s.length(); i++) { + if (sb.charAt(i - 1) == ' ') { + sb.setCharAt(i, (char) UCharacter.toTitleCase(sb.charAt(i))); + } + } + return fromString(sb.toString()); + } + /* * Returns the index of the string `match` in this String. This string has to be a comma separated * list. If `match` contains a comma 0 will be returned. If the `match` isn't part of this String, @@ -521,7 +826,7 @@ public int findInSet(UTF8String match) { * @param end the end position of the current UTF8String in bytes. * @return a new UTF8String in the position of [start, end] of current UTF8String bytes. */ - private UTF8String copyUTF8String(int start, int end) { + public UTF8String copyUTF8String(int start, int end) { int len = end - start + 1; byte[] newBytes = new byte[len]; copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len); @@ -766,6 +1071,17 @@ public UTF8String repeat(int times) { return UTF8String.fromBytes(newBytes); } + /** + * Returns the (default) position of the first occurrence of an empty substr in the current + * string from the specified position (0-based index). 
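+ * For example, indexOf(EMPTY_UTF8, 2) on "abc" delegates here and currently returns 0 rather than 2 (see SPARK-48284).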
+ * + * @param start the start position of the current string for searching + * @return the position of the first occurrence of the empty substr (now, always 0) + */ + public int indexOfEmpty(int start) { + return 0; // TODO: Fix this behaviour (SPARK-48284) + } + /** * Returns the position of the first occurrence of substr in * current string from the specified position (0-based index). @@ -776,7 +1092,7 @@ public UTF8String repeat(int times) { */ public int indexOf(UTF8String v, int start) { if (v.numBytes() == 0) { - return 0; + return indexOfEmpty(start); } // locate to the start position. @@ -801,10 +1117,34 @@ public int indexOf(UTF8String v, int start) { return -1; } + public int charPosToByte(int charPos) { + if (charPos < 0) { + return -1; + } + + int i = 0; + int c = 0; + while (i < numBytes && c < charPos) { + i += numBytesForFirstByte(getByte(i)); + c += 1; + } + return i; + } + + public int bytePosToChar(int bytePos) { + int i = 0; + int c = 0; + while (i < numBytes && i < bytePos) { + i += numBytesForFirstByte(getByte(i)); + c += 1; + } + return c; + } + /** * Find the `str` from left to right. */ - private int find(UTF8String str, int start) { + public int find(UTF8String str, int start) { assert (str.numBytes > 0); while (start <= numBytes - str.numBytes) { if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) { @@ -818,7 +1158,7 @@ private int find(UTF8String str, int start) { /** * Find the `str` from right to left. */ - private int rfind(UTF8String str, int start) { + public int rfind(UTF8String str, int start) { assert (str.numBytes > 0); while (start >= 0) { if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) { @@ -1718,4 +2058,21 @@ public void read(Kryo kryo, Input in) { in.read((byte[]) base); } + /** + * Convert a long value to its binary format stripping leading zeros. + */ + public static UTF8String toBinaryString(long val) { + int zeros = Long.numberOfLeadingZeros(val); + if (zeros == Long.SIZE) { + return UTF8String.ZERO_UTF8; + } else { + int length = Long.SIZE - zeros; + byte[] bytes = new byte[length]; + do { + bytes[--length] = (byte) ((val & 0x1) == 1 ? '1': '0'); + val >>>= 1; + } while (length > 0); + return fromBytes(bytes); + } + } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java index 0a1ee279316f1..aacc4507861ad 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.spark.unsafe.types; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 099a13a025e7e..d084ef098248f 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -17,15 +17,179 @@ package org.apache.spark.unsafe.types; import org.apache.spark.SparkException; +import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.sql.catalyst.util.CollationSupport; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * A list containing some of the supported collations in Spark. Use this list to iterate over + * all the important collation groups (binary, lowercase, icu) for complete unit test coverage. + * Note: this list may come in handy when the Spark function result is the same regardless of + * the specified collations (as often seen in some pass-through Spark expressions). + */ + private final String[] testSupportedCollations = + {"UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI"}; + + /** + * Collation-aware UTF8String comparison. 
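+ * For example, "a" and "A" compare as equal under UTF8_LCASE and UNICODE_CI, but not under UTF8_BINARY or UNICODE.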
+ */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { + UTF8String l = UTF8String.fromString(s1); + UTF8String r = UTF8String.fromString(s2); + int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); + assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { + for (String collationName: testSupportedCollations) { + // Edge cases + assertStringCompare("", "", collationName, 0); + assertStringCompare("a", "", collationName, 1); + assertStringCompare("", "a", collationName, -1); + // Basic tests + assertStringCompare("a", "a", collationName, 0); + assertStringCompare("a", "b", collationName, -1); + assertStringCompare("b", "a", collationName, 1); + assertStringCompare("A", "A", collationName, 0); + assertStringCompare("A", "B", collationName, -1); + assertStringCompare("B", "A", collationName, 1); + assertStringCompare("aa", "a", collationName, 1); + assertStringCompare("b", "bb", collationName, -1); + assertStringCompare("abc", "a", collationName, 1); + assertStringCompare("abc", "b", collationName, -1); + assertStringCompare("abc", "ab", collationName, 1); + assertStringCompare("abc", "abc", collationName, 0); + // ASCII strings + assertStringCompare("aaaa", "aaa", collationName, 1); + assertStringCompare("hello", "world", collationName, -1); + assertStringCompare("Spark", "Spark", collationName, 0); + // Non-ASCII strings + assertStringCompare("ü", "ü", collationName, 0); + assertStringCompare("ü", "", collationName, 1); + assertStringCompare("", "ü", collationName, -1); + assertStringCompare("äü", "äü", collationName, 0); + assertStringCompare("äxx", "äx", collationName, 1); + assertStringCompare("a", "ä", collationName, -1); + } + // Non-ASCII strings + assertStringCompare("äü", "bü", "UTF8_BINARY", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY", -1); + assertStringCompare("äü", "bü", "UTF8_LCASE", 1); + assertStringCompare("bxx", "bü", "UTF8_LCASE", -1); + assertStringCompare("äü", "bü", "UNICODE", -1); + assertStringCompare("bxx", "bü", "UNICODE", 1); + assertStringCompare("äü", "bü", "UNICODE_CI", -1); + assertStringCompare("bxx", "bü", "UNICODE_CI", 1); + // Case variation + assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); + assertStringCompare("ABCD", "abcd", "UTF8_LCASE", 0); + assertStringCompare("AbcD", "aBCd", "UNICODE", 1); + assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); + // Accent variation + assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); + assertStringCompare("AbCδ", "ABCΔ", "UTF8_LCASE", 0); + assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); + assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + // Case-variable character length + assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); + assertStringCompare("i\u0307", "İ", "UTF8_LCASE", 0); + assertStringCompare("İ", "i\u0307", "UTF8_LCASE", 0); + assertStringCompare("i\u0307", "İ", "UNICODE", -1); + assertStringCompare("İ", "i\u0307", "UNICODE", 1); + assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); + assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_LCASE", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_LCASE", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_LCASE", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UTF8_LCASE", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", 
"UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + // Conditional case mapping + assertStringCompare("ς", "σ", "UTF8_BINARY", -1); + assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); + assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); + assertStringCompare("ς", "σ", "UTF8_LCASE", 0); + assertStringCompare("ς", "Σ", "UTF8_LCASE", 0); + assertStringCompare("σ", "Σ", "UTF8_LCASE", 0); + assertStringCompare("ς", "σ", "UNICODE", 1); + assertStringCompare("ς", "Σ", "UNICODE", 1); + assertStringCompare("σ", "Σ", "UNICODE", -1); + assertStringCompare("ς", "σ", "UNICODE_CI", 0); + assertStringCompare("ς", "Σ", "UNICODE_CI", 0); + assertStringCompare("σ", "Σ", "UNICODE_CI", 0); + // Maximum code point. + int maxCodePoint = Character.MAX_CODE_POINT; + String maxCodePointStr = new String(Character.toChars(maxCodePoint)); + for (int i = 0; i < maxCodePoint && Character.isValidCodePoint(i); ++i) { + assertStringCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_BINARY", -1); + assertStringCompare(new String(Character.toChars(i)), maxCodePointStr, "UTF8_LCASE", -1); + } + // Minimum code point. + int minCodePoint = Character.MIN_CODE_POINT; + String minCodePointStr = new String(Character.toChars(minCodePoint)); + for (int i = minCodePoint + 1; i <= maxCodePoint && Character.isValidCodePoint(i); ++i) { + assertStringCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_BINARY", 1); + assertStringCompare(new String(Character.toChars(i)), minCodePointStr, "UTF8_LCASE", 1); + } + } + + private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, + Boolean useCodePoints) { + if (useCodePoints) { + assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); + } else { + assertEquals(expected, target.toLowerCase()); + } + } + + @Test + public void testLowerCaseCodePoints() { + // Edge cases + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false); + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true); + // Basic tests + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true); + assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true); + // Accent variation + assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), UTF8String.fromString("abcδ"), true); + // Case-variable character length + assertLowerCaseCodePoints( + UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true); + assertLowerCaseCodePoints( + UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true); + // Conditional case mapping + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true); + // Surrogate 
pairs are treated as invalid UTF8 sequences + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\uFFFD\uFFFD"), false); + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\uFFFD\uFFFD"), true); + } + /** * Collation-aware string expressions. */ @@ -47,9 +211,9 @@ public void testContains() throws SparkException { assertContains("", "", "UNICODE", true); assertContains("c", "", "UNICODE", true); assertContains("", "c", "UNICODE", false); - assertContains("", "", "UTF8_BINARY_LCASE", true); - assertContains("c", "", "UTF8_BINARY_LCASE", true); - assertContains("", "c", "UTF8_BINARY_LCASE", false); + assertContains("", "", "UTF8_LCASE", true); + assertContains("c", "", "UTF8_LCASE", true); + assertContains("", "c", "UTF8_LCASE", false); assertContains("", "", "UNICODE_CI", true); assertContains("c", "", "UNICODE_CI", true); assertContains("", "c", "UNICODE_CI", false); @@ -60,9 +224,9 @@ public void testContains() throws SparkException { assertContains("abcde", "abcde", "UNICODE", true); assertContains("abcde", "aBcDe", "UNICODE", false); assertContains("abcde", "fghij", "UNICODE", false); - assertContains("abcde", "C", "UTF8_BINARY_LCASE", true); - assertContains("abcde", "AbCdE", "UTF8_BINARY_LCASE", true); - assertContains("abcde", "X", "UTF8_BINARY_LCASE", false); + assertContains("abcde", "C", "UTF8_LCASE", true); + assertContains("abcde", "AbCdE", "UTF8_LCASE", true); + assertContains("abcde", "X", "UTF8_LCASE", false); assertContains("abcde", "c", "UNICODE_CI", true); assertContains("abcde", "bCD", "UNICODE_CI", true); assertContains("abcde", "123", "UNICODE_CI", false); @@ -71,8 +235,8 @@ public void testContains() throws SparkException { assertContains("aBcDe", "BcD", "UTF8_BINARY", true); assertContains("aBcDe", "abcde", "UNICODE", false); assertContains("aBcDe", "aBcDe", "UNICODE", true); - assertContains("aBcDe", "bcd", "UTF8_BINARY_LCASE", true); - assertContains("aBcDe", "BCD", "UTF8_BINARY_LCASE", true); + assertContains("aBcDe", "bcd", "UTF8_LCASE", true); + assertContains("aBcDe", "BCD", "UTF8_LCASE", true); assertContains("aBcDe", "abcde", "UNICODE_CI", true); assertContains("aBcDe", "AbCdE", "UNICODE_CI", true); // Accent variation @@ -80,8 +244,8 @@ public void testContains() throws SparkException { assertContains("aBcDe", "BćD", "UTF8_BINARY", false); assertContains("aBcDe", "abćde", "UNICODE", false); assertContains("aBcDe", "aBćDe", "UNICODE", false); - assertContains("aBcDe", "bćd", "UTF8_BINARY_LCASE", false); - assertContains("aBcDe", "BĆD", "UTF8_BINARY_LCASE", false); + assertContains("aBcDe", "bćd", "UTF8_LCASE", false); + assertContains("aBcDe", "BĆD", "UTF8_LCASE", false); assertContains("aBcDe", "abćde", "UNICODE_CI", false); assertContains("aBcDe", "AbĆdE", "UNICODE_CI", false); // Variable byte length characters @@ -93,14 +257,48 @@ public void testContains() throws SparkException { assertContains("ab世De", "AB世dE", "UNICODE", false); assertContains("äbćδe", "äbćδe", "UNICODE", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE", false); - assertContains("ab世De", "b世D", "UTF8_BINARY_LCASE", true); - assertContains("ab世De", "B世d", "UTF8_BINARY_LCASE", true); - assertContains("äbćδe", "bćδ", "UTF8_BINARY_LCASE", true); - assertContains("äbćδe", "BcΔ", "UTF8_BINARY_LCASE", false); + assertContains("ab世De", "b世D", "UTF8_LCASE", true); + 
assertContains("ab世De", "B世d", "UTF8_LCASE", true); + assertContains("äbćδe", "bćδ", "UTF8_LCASE", true); + assertContains("äbćδe", "BcΔ", "UTF8_LCASE", false); assertContains("ab世De", "ab世De", "UNICODE_CI", true); assertContains("ab世De", "AB世dE", "UNICODE_CI", true); assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); + // Characters with the same binary lowercase representation + assertContains("The Kelvin.", "Kelvin", "UTF8_LCASE", true); + assertContains("The Kelvin.", "Kelvin", "UTF8_LCASE", true); + assertContains("The KKelvin.", "KKelvin", "UTF8_LCASE", true); + assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); + assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); + assertContains("The KKelvin.", "KKelvin,", "UTF8_LCASE", false); + // Case-variable character length + assertContains("i̇", "i", "UNICODE_CI", false); + assertContains("i̇", "\u0307", "UNICODE_CI", false); + assertContains("i̇", "İ", "UNICODE_CI", true); + assertContains("İ", "i", "UNICODE_CI", false); + assertContains("adi̇os", "io", "UNICODE_CI", false); + assertContains("adi̇os", "Io", "UNICODE_CI", false); + assertContains("adi̇os", "i̇o", "UNICODE_CI", true); + assertContains("adi̇os", "İo", "UNICODE_CI", true); + assertContains("adİos", "io", "UNICODE_CI", false); + assertContains("adİos", "Io", "UNICODE_CI", false); + assertContains("adİos", "i̇o", "UNICODE_CI", true); + assertContains("adİos", "İo", "UNICODE_CI", true); + assertContains("i̇", "i", "UTF8_LCASE", true); // != UNICODE_CI + assertContains("İ", "\u0307", "UTF8_LCASE", false); + assertContains("İ", "i", "UTF8_LCASE", false); + assertContains("i̇", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI + assertContains("i̇", "İ", "UTF8_LCASE", true); + assertContains("İ", "i", "UTF8_LCASE", false); + assertContains("adi̇os", "io", "UTF8_LCASE", false); + assertContains("adi̇os", "Io", "UTF8_LCASE", false); + assertContains("adi̇os", "i̇o", "UTF8_LCASE", true); + assertContains("adi̇os", "İo", "UTF8_LCASE", true); + assertContains("adİos", "io", "UTF8_LCASE", false); + assertContains("adİos", "Io", "UTF8_LCASE", false); + assertContains("adİos", "i̇o", "UTF8_LCASE", true); + assertContains("adİos", "İo", "UTF8_LCASE", true); } private void assertStartsWith( @@ -121,9 +319,9 @@ public void testStartsWith() throws SparkException { assertStartsWith("", "", "UNICODE", true); assertStartsWith("c", "", "UNICODE", true); assertStartsWith("", "c", "UNICODE", false); - assertStartsWith("", "", "UTF8_BINARY_LCASE", true); - assertStartsWith("c", "", "UTF8_BINARY_LCASE", true); - assertStartsWith("", "c", "UTF8_BINARY_LCASE", false); + assertStartsWith("", "", "UTF8_LCASE", true); + assertStartsWith("c", "", "UTF8_LCASE", true); + assertStartsWith("", "c", "UTF8_LCASE", false); assertStartsWith("", "", "UNICODE_CI", true); assertStartsWith("c", "", "UNICODE_CI", true); assertStartsWith("", "c", "UNICODE_CI", false); @@ -134,19 +332,20 @@ public void testStartsWith() throws SparkException { assertStartsWith("abcde", "abcde", "UNICODE", true); assertStartsWith("abcde", "aBcDe", "UNICODE", false); assertStartsWith("abcde", "fghij", "UNICODE", false); - assertStartsWith("abcde", "A", "UTF8_BINARY_LCASE", true); - assertStartsWith("abcde", "AbCdE", "UTF8_BINARY_LCASE", true); - assertStartsWith("abcde", "X", "UTF8_BINARY_LCASE", false); + assertStartsWith("abcde", "A", "UTF8_LCASE", true); + assertStartsWith("abcde", "AbCdE", "UTF8_LCASE", true); + assertStartsWith("abcde", "X", "UTF8_LCASE", false); 
assertStartsWith("abcde", "a", "UNICODE_CI", true); assertStartsWith("abcde", "aBC", "UNICODE_CI", true); + assertStartsWith("abcde", "bcd", "UNICODE_CI", false); assertStartsWith("abcde", "123", "UNICODE_CI", false); // Case variation assertStartsWith("aBcDe", "abc", "UTF8_BINARY", false); assertStartsWith("aBcDe", "aBc", "UTF8_BINARY", true); assertStartsWith("aBcDe", "abcde", "UNICODE", false); assertStartsWith("aBcDe", "aBcDe", "UNICODE", true); - assertStartsWith("aBcDe", "abc", "UTF8_BINARY_LCASE", true); - assertStartsWith("aBcDe", "ABC", "UTF8_BINARY_LCASE", true); + assertStartsWith("aBcDe", "abc", "UTF8_LCASE", true); + assertStartsWith("aBcDe", "ABC", "UTF8_LCASE", true); assertStartsWith("aBcDe", "abcde", "UNICODE_CI", true); assertStartsWith("aBcDe", "AbCdE", "UNICODE_CI", true); // Accent variation @@ -154,8 +353,8 @@ public void testStartsWith() throws SparkException { assertStartsWith("aBcDe", "aBć", "UTF8_BINARY", false); assertStartsWith("aBcDe", "abćde", "UNICODE", false); assertStartsWith("aBcDe", "aBćDe", "UNICODE", false); - assertStartsWith("aBcDe", "abć", "UTF8_BINARY_LCASE", false); - assertStartsWith("aBcDe", "ABĆ", "UTF8_BINARY_LCASE", false); + assertStartsWith("aBcDe", "abć", "UTF8_LCASE", false); + assertStartsWith("aBcDe", "ABĆ", "UTF8_LCASE", false); assertStartsWith("aBcDe", "abćde", "UNICODE_CI", false); assertStartsWith("aBcDe", "AbĆdE", "UNICODE_CI", false); // Variable byte length characters @@ -167,14 +366,52 @@ public void testStartsWith() throws SparkException { assertStartsWith("ab世De", "AB世dE", "UNICODE", false); assertStartsWith("äbćδe", "äbćδe", "UNICODE", true); assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE", false); - assertStartsWith("ab世De", "ab世", "UTF8_BINARY_LCASE", true); - assertStartsWith("ab世De", "aB世", "UTF8_BINARY_LCASE", true); - assertStartsWith("äbćδe", "äbć", "UTF8_BINARY_LCASE", true); - assertStartsWith("äbćδe", "äBc", "UTF8_BINARY_LCASE", false); + assertStartsWith("ab世De", "ab世", "UTF8_LCASE", true); + assertStartsWith("ab世De", "aB世", "UTF8_LCASE", true); + assertStartsWith("äbćδe", "äbć", "UTF8_LCASE", true); + assertStartsWith("äbćδe", "äBc", "UTF8_LCASE", false); assertStartsWith("ab世De", "ab世De", "UNICODE_CI", true); assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true); assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); + // Characters with the same binary lowercase representation + assertStartsWith("Kelvin.", "Kelvin", "UTF8_LCASE", true); + assertStartsWith("Kelvin.", "Kelvin", "UTF8_LCASE", true); + assertStartsWith("KKelvin.", "KKelvin", "UTF8_LCASE", true); + assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); + assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true); + assertStartsWith("KKelvin.", "KKelvin,", "UTF8_LCASE", false); + // Case-variable character length + assertStartsWith("i̇", "i", "UNICODE_CI", false); + assertStartsWith("i̇", "İ", "UNICODE_CI", true); + assertStartsWith("İ", "i", "UNICODE_CI", false); + assertStartsWith("İİİ", "i̇i̇", "UNICODE_CI", true); + assertStartsWith("İİİ", "i̇i", "UNICODE_CI", false); + assertStartsWith("İi̇İ", "i̇İ", "UNICODE_CI", true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UNICODE_CI", false); + assertStartsWith("i̇onic", "io", "UNICODE_CI", false); + assertStartsWith("i̇onic", "Io", "UNICODE_CI", false); + assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true); + assertStartsWith("i̇onic", "İo", "UNICODE_CI", true); + assertStartsWith("İonic", "io", "UNICODE_CI", false); + 
assertStartsWith("İonic", "Io", "UNICODE_CI", false); + assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); + assertStartsWith("İonic", "İo", "UNICODE_CI", true); + assertStartsWith("i̇", "i", "UTF8_LCASE", true); // != UNICODE_CI + assertStartsWith("i̇", "İ", "UTF8_LCASE", true); + assertStartsWith("İ", "i", "UTF8_LCASE", false); + assertStartsWith("İİİ", "i̇i̇", "UTF8_LCASE", true); + assertStartsWith("İİİ", "i̇i", "UTF8_LCASE", false); + assertStartsWith("İi̇İ", "i̇İ", "UTF8_LCASE", true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_LCASE", true); // != UNICODE_CI + assertStartsWith("i̇onic", "io", "UTF8_LCASE", false); + assertStartsWith("i̇onic", "Io", "UTF8_LCASE", false); + assertStartsWith("i̇onic", "i̇o", "UTF8_LCASE", true); + assertStartsWith("i̇onic", "İo", "UTF8_LCASE", true); + assertStartsWith("İonic", "io", "UTF8_LCASE", false); + assertStartsWith("İonic", "Io", "UTF8_LCASE", false); + assertStartsWith("İonic", "i̇o", "UTF8_LCASE", true); + assertStartsWith("İonic", "İo", "UTF8_LCASE", true); } private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected) @@ -194,9 +431,9 @@ public void testEndsWith() throws SparkException { assertEndsWith("", "", "UNICODE", true); assertEndsWith("c", "", "UNICODE", true); assertEndsWith("", "c", "UNICODE", false); - assertEndsWith("", "", "UTF8_BINARY_LCASE", true); - assertEndsWith("c", "", "UTF8_BINARY_LCASE", true); - assertEndsWith("", "c", "UTF8_BINARY_LCASE", false); + assertEndsWith("", "", "UTF8_LCASE", true); + assertEndsWith("c", "", "UTF8_LCASE", true); + assertEndsWith("", "c", "UTF8_LCASE", false); assertEndsWith("", "", "UNICODE_CI", true); assertEndsWith("c", "", "UNICODE_CI", true); assertEndsWith("", "c", "UNICODE_CI", false); @@ -207,19 +444,20 @@ public void testEndsWith() throws SparkException { assertEndsWith("abcde", "abcde", "UNICODE", true); assertEndsWith("abcde", "aBcDe", "UNICODE", false); assertEndsWith("abcde", "fghij", "UNICODE", false); - assertEndsWith("abcde", "E", "UTF8_BINARY_LCASE", true); - assertEndsWith("abcde", "AbCdE", "UTF8_BINARY_LCASE", true); - assertEndsWith("abcde", "X", "UTF8_BINARY_LCASE", false); + assertEndsWith("abcde", "E", "UTF8_LCASE", true); + assertEndsWith("abcde", "AbCdE", "UTF8_LCASE", true); + assertEndsWith("abcde", "X", "UTF8_LCASE", false); assertEndsWith("abcde", "e", "UNICODE_CI", true); assertEndsWith("abcde", "CDe", "UNICODE_CI", true); + assertEndsWith("abcde", "bcd", "UNICODE_CI", false); assertEndsWith("abcde", "123", "UNICODE_CI", false); // Case variation assertEndsWith("aBcDe", "cde", "UTF8_BINARY", false); assertEndsWith("aBcDe", "cDe", "UTF8_BINARY", true); assertEndsWith("aBcDe", "abcde", "UNICODE", false); assertEndsWith("aBcDe", "aBcDe", "UNICODE", true); - assertEndsWith("aBcDe", "cde", "UTF8_BINARY_LCASE", true); - assertEndsWith("aBcDe", "CDE", "UTF8_BINARY_LCASE", true); + assertEndsWith("aBcDe", "cde", "UTF8_LCASE", true); + assertEndsWith("aBcDe", "CDE", "UTF8_LCASE", true); assertEndsWith("aBcDe", "abcde", "UNICODE_CI", true); assertEndsWith("aBcDe", "AbCdE", "UNICODE_CI", true); // Accent variation @@ -227,8 +465,8 @@ public void testEndsWith() throws SparkException { assertEndsWith("aBcDe", "ćDe", "UTF8_BINARY", false); assertEndsWith("aBcDe", "abćde", "UNICODE", false); assertEndsWith("aBcDe", "aBćDe", "UNICODE", false); - assertEndsWith("aBcDe", "ćde", "UTF8_BINARY_LCASE", false); - assertEndsWith("aBcDe", "ĆDE", "UTF8_BINARY_LCASE", false); + assertEndsWith("aBcDe", "ćde", "UTF8_LCASE", false); + 
assertEndsWith("aBcDe", "ĆDE", "UTF8_LCASE", false); assertEndsWith("aBcDe", "abćde", "UNICODE_CI", false); assertEndsWith("aBcDe", "AbĆdE", "UNICODE_CI", false); // Variable byte length characters @@ -240,14 +478,904 @@ public void testEndsWith() throws SparkException { assertEndsWith("ab世De", "AB世dE", "UNICODE", false); assertEndsWith("äbćδe", "äbćδe", "UNICODE", true); assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE", false); - assertEndsWith("ab世De", "世De", "UTF8_BINARY_LCASE", true); - assertEndsWith("ab世De", "世dE", "UTF8_BINARY_LCASE", true); - assertEndsWith("äbćδe", "ćδe", "UTF8_BINARY_LCASE", true); - assertEndsWith("äbćδe", "cδE", "UTF8_BINARY_LCASE", false); + assertEndsWith("ab世De", "世De", "UTF8_LCASE", true); + assertEndsWith("ab世De", "世dE", "UTF8_LCASE", true); + assertEndsWith("äbćδe", "ćδe", "UTF8_LCASE", true); + assertEndsWith("äbćδe", "cδE", "UTF8_LCASE", false); assertEndsWith("ab世De", "ab世De", "UNICODE_CI", true); assertEndsWith("ab世De", "AB世dE", "UNICODE_CI", true); assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); + // Characters with the same binary lowercase representation + assertEndsWith("The Kelvin", "Kelvin", "UTF8_LCASE", true); + assertEndsWith("The Kelvin", "Kelvin", "UTF8_LCASE", true); + assertEndsWith("The KKelvin", "KKelvin", "UTF8_LCASE", true); + assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true); + assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true); + assertEndsWith("The KKelvin", "KKelvin,", "UTF8_LCASE", false); + // Case-variable character length + assertEndsWith("i̇", "\u0307", "UNICODE_CI", false); + assertEndsWith("i̇", "İ", "UNICODE_CI", true); + assertEndsWith("İ", "i", "UNICODE_CI", false); + assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true); + assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false); + assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UNICODE_CI", false); + assertEndsWith("the i̇o", "io", "UNICODE_CI", false); + assertEndsWith("the i̇o", "Io", "UNICODE_CI", false); + assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true); + assertEndsWith("the i̇o", "İo", "UNICODE_CI", true); + assertEndsWith("the İo", "io", "UNICODE_CI", false); + assertEndsWith("the İo", "Io", "UNICODE_CI", false); + assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); + assertEndsWith("the İo", "İo", "UNICODE_CI", true); + assertEndsWith("i̇", "\u0307", "UTF8_LCASE", true); // != UNICODE_CI + assertEndsWith("i̇", "İ", "UTF8_LCASE", true); + assertEndsWith("İ", "\u0307", "UTF8_LCASE", false); + assertEndsWith("İİİ", "i̇i̇", "UTF8_LCASE", true); + assertEndsWith("İİİ", "ii̇", "UTF8_LCASE", false); + assertEndsWith("İi̇İ", "İi̇", "UTF8_LCASE", true); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_LCASE", true); // != UNICODE_CI + assertEndsWith("i̇İi̇i̇", "\u0307İİ", "UTF8_LCASE", false); + assertEndsWith("the i̇o", "io", "UTF8_LCASE", false); + assertEndsWith("the i̇o", "Io", "UTF8_LCASE", false); + assertEndsWith("the i̇o", "i̇o", "UTF8_LCASE", true); + assertEndsWith("the i̇o", "İo", "UTF8_LCASE", true); + assertEndsWith("the İo", "io", "UTF8_LCASE", false); + assertEndsWith("the İo", "Io", "UTF8_LCASE", false); + assertEndsWith("the İo", "i̇o", "UTF8_LCASE", true); + assertEndsWith("the İo", "İo", "UTF8_LCASE", true); + } + + private void assertStringSplitSQL(String str, String delimiter, String collationName, + UTF8String[] expected) throws SparkException { + UTF8String s = UTF8String.fromString(str); + UTF8String d = 
UTF8String.fromString(delimiter); + int collationId = CollationFactory.collationNameToId(collationName); + assertArrayEquals(expected, CollationSupport.StringSplitSQL.exec(s, d, collationId)); + } + + @Test + public void testStringSplitSQL() throws SparkException { + // Possible splits + var empty_match = new UTF8String[] { UTF8String.fromString("") }; + var array_abc = new UTF8String[] { UTF8String.fromString("abc") }; + var array_1a2 = new UTF8String[] { UTF8String.fromString("1a2") }; + var array_AaXbB = new UTF8String[] { UTF8String.fromString("AaXbB") }; + var array_aBcDe = new UTF8String[] { UTF8String.fromString("aBcDe") }; + var array_special = new UTF8String[] { UTF8String.fromString("äb世De") }; + var array_abcde = new UTF8String[] { UTF8String.fromString("äbćδe") }; + var full_match = new UTF8String[] { UTF8String.fromString(""), UTF8String.fromString("") }; + var array_1_2 = new UTF8String[] { UTF8String.fromString("1"), UTF8String.fromString("2") }; + var array_A_B = new UTF8String[] { UTF8String.fromString("A"), UTF8String.fromString("B") }; + var array_a_e = new UTF8String[] { UTF8String.fromString("ä"), UTF8String.fromString("e") }; + var array_Aa_bB = new UTF8String[] { UTF8String.fromString("Aa"), UTF8String.fromString("bB") }; + // Edge cases + assertStringSplitSQL("", "", "UTF8_BINARY", empty_match); + assertStringSplitSQL("abc", "", "UTF8_BINARY", array_abc); + assertStringSplitSQL("", "abc", "UTF8_BINARY", empty_match); + assertStringSplitSQL("", "", "UNICODE", empty_match); + assertStringSplitSQL("abc", "", "UNICODE", array_abc); + assertStringSplitSQL("", "abc", "UNICODE", empty_match); + assertStringSplitSQL("", "", "UTF8_LCASE", empty_match); + assertStringSplitSQL("abc", "", "UTF8_LCASE", array_abc); + assertStringSplitSQL("", "abc", "UTF8_LCASE", empty_match); + assertStringSplitSQL("", "", "UNICODE_CI", empty_match); + assertStringSplitSQL("abc", "", "UNICODE_CI", array_abc); + assertStringSplitSQL("", "abc", "UNICODE_CI", empty_match); + // Basic tests + assertStringSplitSQL("1a2", "a", "UTF8_BINARY", array_1_2); + assertStringSplitSQL("1a2", "A", "UTF8_BINARY", array_1a2); + assertStringSplitSQL("1a2", "b", "UTF8_BINARY", array_1a2); + assertStringSplitSQL("1a2", "1a2", "UNICODE", full_match); + assertStringSplitSQL("1a2", "1A2", "UNICODE", array_1a2); + assertStringSplitSQL("1a2", "3b4", "UNICODE", array_1a2); + assertStringSplitSQL("1a2", "A", "UTF8_LCASE", array_1_2); + assertStringSplitSQL("1a2", "1A2", "UTF8_LCASE", full_match); + assertStringSplitSQL("1a2", "X", "UTF8_LCASE", array_1a2); + assertStringSplitSQL("1a2", "a", "UNICODE_CI", array_1_2); + assertStringSplitSQL("1a2", "A", "UNICODE_CI", array_1_2); + assertStringSplitSQL("1a2", "1A2", "UNICODE_CI", full_match); + assertStringSplitSQL("1a2", "123", "UNICODE_CI", array_1a2); + // Case variation + assertStringSplitSQL("AaXbB", "x", "UTF8_BINARY", array_AaXbB); + assertStringSplitSQL("AaXbB", "X", "UTF8_BINARY", array_Aa_bB); + assertStringSplitSQL("AaXbB", "axb", "UNICODE", array_AaXbB); + assertStringSplitSQL("AaXbB", "aXb", "UNICODE", array_A_B); + assertStringSplitSQL("AaXbB", "axb", "UTF8_LCASE", array_A_B); + assertStringSplitSQL("AaXbB", "AXB", "UTF8_LCASE", array_A_B); + assertStringSplitSQL("AaXbB", "axb", "UNICODE_CI", array_A_B); + assertStringSplitSQL("AaXbB", "AxB", "UNICODE_CI", array_A_B); + // Accent variation + assertStringSplitSQL("aBcDe", "bćd", "UTF8_BINARY", array_aBcDe); + assertStringSplitSQL("aBcDe", "BćD", "UTF8_BINARY", array_aBcDe); + assertStringSplitSQL("aBcDe", "abćde", 
"UNICODE", array_aBcDe); + assertStringSplitSQL("aBcDe", "aBćDe", "UNICODE", array_aBcDe); + assertStringSplitSQL("aBcDe", "bćd", "UTF8_LCASE", array_aBcDe); + assertStringSplitSQL("aBcDe", "BĆD", "UTF8_LCASE", array_aBcDe); + assertStringSplitSQL("aBcDe", "abćde", "UNICODE_CI", array_aBcDe); + assertStringSplitSQL("aBcDe", "AbĆdE", "UNICODE_CI", array_aBcDe); + // Variable byte length characters + assertStringSplitSQL("äb世De", "b世D", "UTF8_BINARY", array_a_e); + assertStringSplitSQL("äb世De", "B世d", "UTF8_BINARY", array_special); + assertStringSplitSQL("äbćδe", "bćδ", "UTF8_BINARY", array_a_e); + assertStringSplitSQL("äbćδe", "BcΔ", "UTF8_BINARY", array_abcde); + assertStringSplitSQL("äb世De", "äb世De", "UNICODE", full_match); + assertStringSplitSQL("äb世De", "äB世de", "UNICODE", array_special); + assertStringSplitSQL("äbćδe", "äbćδe", "UNICODE", full_match); + assertStringSplitSQL("äbćδe", "ÄBcΔÉ", "UNICODE", array_abcde); + assertStringSplitSQL("äb世De", "b世D", "UTF8_LCASE", array_a_e); + assertStringSplitSQL("äb世De", "B世d", "UTF8_LCASE", array_a_e); + assertStringSplitSQL("äbćδe", "bćδ", "UTF8_LCASE", array_a_e); + assertStringSplitSQL("äbćδe", "BcΔ", "UTF8_LCASE", array_abcde); + assertStringSplitSQL("äb世De", "ab世De", "UNICODE_CI", array_special); + assertStringSplitSQL("äb世De", "AB世dE", "UNICODE_CI", array_special); + assertStringSplitSQL("äbćδe", "ÄbćδE", "UNICODE_CI", full_match); + assertStringSplitSQL("äbćδe", "ÄBcΔÉ", "UNICODE_CI", array_abcde); + } + + private void assertUpper(String target, String collationName, String expected) + throws SparkException { + UTF8String target_utf8 = UTF8String.fromString(target); + UTF8String expected_utf8 = UTF8String.fromString(expected); + int collationId = CollationFactory.collationNameToId(collationName); + // Testing the new ICU-based implementation of the Upper function. + assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, true)); + // Testing the old JVM-based implementation of the Upper function. + assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, false)); + // Note: results should be the same in these tests for both ICU and JVM-based implementations. 
+ } + + @Test + public void testUpper() throws SparkException { + // Edge cases + assertUpper("", "UTF8_BINARY", ""); + assertUpper("", "UTF8_LCASE", ""); + assertUpper("", "UNICODE", ""); + assertUpper("", "UNICODE_CI", ""); + // Basic tests + assertUpper("abcde", "UTF8_BINARY", "ABCDE"); + assertUpper("abcde", "UTF8_LCASE", "ABCDE"); + assertUpper("abcde", "UNICODE", "ABCDE"); + assertUpper("abcde", "UNICODE_CI", "ABCDE"); + // Uppercase present + assertUpper("AbCdE", "UTF8_BINARY", "ABCDE"); + assertUpper("aBcDe", "UTF8_BINARY", "ABCDE"); + assertUpper("AbCdE", "UTF8_LCASE", "ABCDE"); + assertUpper("aBcDe", "UTF8_LCASE", "ABCDE"); + assertUpper("AbCdE", "UNICODE", "ABCDE"); + assertUpper("aBcDe", "UNICODE", "ABCDE"); + assertUpper("AbCdE", "UNICODE_CI", "ABCDE"); + assertUpper("aBcDe", "UNICODE_CI", "ABCDE"); + // Accent letters + assertUpper("aBćDe","UTF8_BINARY", "ABĆDE"); + assertUpper("aBćDe","UTF8_LCASE", "ABĆDE"); + assertUpper("aBćDe","UNICODE", "ABĆDE"); + assertUpper("aBćDe","UNICODE_CI", "ABĆDE"); + // Variable byte length characters + assertUpper("ab世De", "UTF8_BINARY", "AB世DE"); + assertUpper("äbćδe", "UTF8_BINARY", "ÄBĆΔE"); + assertUpper("ab世De", "UTF8_LCASE", "AB世DE"); + assertUpper("äbćδe", "UTF8_LCASE", "ÄBĆΔE"); + assertUpper("ab世De", "UNICODE", "AB世DE"); + assertUpper("äbćδe", "UNICODE", "ÄBĆΔE"); + assertUpper("ab世De", "UNICODE_CI", "AB世DE"); + assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE"); + // Case-variable character length + assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O"); + assertUpper("i\u0307o", "UTF8_LCASE","I\u0307O"); + assertUpper("i\u0307o", "UNICODE","I\u0307O"); + assertUpper("i\u0307o", "UNICODE_CI","I\u0307O"); + assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342"); + assertUpper("ß fi ffi ff st ῗ", "UTF8_LCASE","SS FI FFI FF ST \u0399\u0308\u0342"); + assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342"); + assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342"); + } + + private void assertLower(String target, String collationName, String expected) + throws SparkException { + UTF8String target_utf8 = UTF8String.fromString(target); + UTF8String expected_utf8 = UTF8String.fromString(expected); + int collationId = CollationFactory.collationNameToId(collationName); + // Testing the new ICU-based implementation of the Lower function. + assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, true)); + // Testing the old JVM-based implementation of the Lower function. + assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, false)); + // Note: results should be the same in these tests for both ICU and JVM-based implementations. 
+ } + + @Test + public void testLower() throws SparkException { + // Edge cases + assertLower("", "UTF8_BINARY", ""); + assertLower("", "UTF8_LCASE", ""); + assertLower("", "UNICODE", ""); + assertLower("", "UNICODE_CI", ""); + // Basic tests + assertLower("ABCDE", "UTF8_BINARY", "abcde"); + assertLower("ABCDE", "UTF8_LCASE", "abcde"); + assertLower("ABCDE", "UNICODE", "abcde"); + assertLower("ABCDE", "UNICODE_CI", "abcde"); + // Uppercase present + assertLower("AbCdE", "UTF8_BINARY", "abcde"); + assertLower("aBcDe", "UTF8_BINARY", "abcde"); + assertLower("AbCdE", "UTF8_LCASE", "abcde"); + assertLower("aBcDe", "UTF8_LCASE", "abcde"); + assertLower("AbCdE", "UNICODE", "abcde"); + assertLower("aBcDe", "UNICODE", "abcde"); + assertLower("AbCdE", "UNICODE_CI", "abcde"); + assertLower("aBcDe", "UNICODE_CI", "abcde"); + // Accent letters + assertLower("AbĆdE","UTF8_BINARY", "abćde"); + assertLower("AbĆdE","UTF8_LCASE", "abćde"); + assertLower("AbĆdE","UNICODE", "abćde"); + assertLower("AbĆdE","UNICODE_CI", "abćde"); + // Variable byte length characters + assertLower("aB世De", "UTF8_BINARY", "ab世de"); + assertLower("ÄBĆΔE", "UTF8_BINARY", "äbćδe"); + assertLower("aB世De", "UTF8_LCASE", "ab世de"); + assertLower("ÄBĆΔE", "UTF8_LCASE", "äbćδe"); + assertLower("aB世De", "UNICODE", "ab世de"); + assertLower("ÄBĆΔE", "UNICODE", "äbćδe"); + assertLower("aB世De", "UNICODE_CI", "ab世de"); + assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe"); + // Case-variable character length + assertLower("İo", "UTF8_BINARY","i\u0307o"); + assertLower("İo", "UTF8_LCASE","i\u0307o"); + assertLower("İo", "UNICODE","i\u0307o"); + assertLower("İo", "UNICODE_CI","i\u0307o"); + } + + private void assertInitCap(String target, String collationName, String expected) + throws SparkException { + UTF8String target_utf8 = UTF8String.fromString(target); + UTF8String expected_utf8 = UTF8String.fromString(expected); + int collationId = CollationFactory.collationNameToId(collationName); + // Testing the new ICU-based implementation of the InitCap function. + assertEquals(expected_utf8, CollationSupport.InitCap.exec(target_utf8, collationId, true)); + // Testing the old JVM-based implementation of the InitCap function. + assertEquals(expected_utf8, CollationSupport.InitCap.exec(target_utf8, collationId, false)); + // Note: results should be the same in these tests for both ICU and JVM-based implementations. 
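+ // (As with Upper and Lower, the note above is scoped to the inputs used in this suite; it is not a general guarantee that the two code paths always agree.)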
+ } + + @Test + public void testInitCap() throws SparkException { + // Edge cases + assertInitCap("", "UTF8_BINARY", ""); + assertInitCap("", "UTF8_LCASE", ""); + assertInitCap("", "UNICODE", ""); + assertInitCap("", "UNICODE_CI", ""); + // Basic tests + assertInitCap("ABCDE", "UTF8_BINARY", "Abcde"); + assertInitCap("ABCDE", "UTF8_LCASE", "Abcde"); + assertInitCap("ABCDE", "UNICODE", "Abcde"); + assertInitCap("ABCDE", "UNICODE_CI", "Abcde"); + // Uppercase present + assertInitCap("AbCdE", "UTF8_BINARY", "Abcde"); + assertInitCap("aBcDe", "UTF8_BINARY", "Abcde"); + assertInitCap("AbCdE", "UTF8_LCASE", "Abcde"); + assertInitCap("aBcDe", "UTF8_LCASE", "Abcde"); + assertInitCap("AbCdE", "UNICODE", "Abcde"); + assertInitCap("aBcDe", "UNICODE", "Abcde"); + assertInitCap("AbCdE", "UNICODE_CI", "Abcde"); + assertInitCap("aBcDe", "UNICODE_CI", "Abcde"); + // Accent letters + assertInitCap("AbĆdE", "UTF8_BINARY", "Abćde"); + assertInitCap("AbĆdE", "UTF8_LCASE", "Abćde"); + assertInitCap("AbĆdE", "UNICODE", "Abćde"); + assertInitCap("AbĆdE", "UNICODE_CI", "Abćde"); + // Variable byte length characters + assertInitCap("aB 世 De", "UTF8_BINARY", "Ab 世 De"); + assertInitCap("ÄBĆΔE", "UTF8_BINARY", "Äbćδe"); + assertInitCap("aB 世 De", "UTF8_LCASE", "Ab 世 De"); + assertInitCap("ÄBĆΔE", "UTF8_LCASE", "Äbćδe"); + assertInitCap("aB 世 De", "UNICODE", "Ab 世 De"); + assertInitCap("ÄBĆΔE", "UNICODE", "Äbćδe"); + assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); + assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); + // Case-variable character length + assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); + assertInitCap("İo", "UTF8_LCASE", "İo"); + assertInitCap("İo", "UNICODE", "İo"); + assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o"); + assertInitCap("i\u0307o", "UTF8_LCASE", "I\u0307o"); + assertInitCap("i\u0307o", "UNICODE", "I\u0307o"); + assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o"); + // Different possible word boundaries + assertInitCap("a b c", "UTF8_BINARY", "A B C"); + assertInitCap("a b c", "UNICODE", "A B C"); + assertInitCap("a b c", "UTF8_LCASE", "A B C"); + assertInitCap("a b c", "UNICODE_CI", "A B C"); + assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c"); + assertInitCap("a.b,c", "UNICODE", "A.b,C"); + assertInitCap("a.b,c", "UTF8_LCASE", "A.b,C"); + assertInitCap("a.b,c", "UNICODE_CI", "A.b,C"); + assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c"); + assertInitCap("a. b-c", "UNICODE", "A. B-C"); + assertInitCap("a. b-c", "UTF8_LCASE", "A. B-C"); + assertInitCap("a. b-c", "UNICODE_CI", "A. 
B-C"); + assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c"); + assertInitCap("a?b世c", "UNICODE", "A?B世C"); + assertInitCap("a?b世c", "UTF8_LCASE", "A?B世C"); + assertInitCap("a?b世c", "UNICODE_CI", "A?B世C"); + // Titlecase characters that are different from uppercase characters + assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz"); + assertInitCap("dzDZDz", "UNICODE", "Dzdzdz"); + assertInitCap("dzDZDz", "UTF8_LCASE", "Dzdzdz"); + assertInitCap("dzDZDz", "UNICODE_CI", "Dzdzdz"); + assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UTF8_LCASE", "Džaba Ljubav Njegova"); + assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY", + "ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_LCASE", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI", + "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); + } + + private void assertStringInstr(String string, String substring, String collationName, + Integer expected) throws SparkException { + UTF8String str = UTF8String.fromString(string); + UTF8String substr = UTF8String.fromString(substring); + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationSupport.StringInstr.exec(str, substr, collationId) + 1); + } + + @Test + public void testStringInstr() throws SparkException { + assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); + assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); + assertStringInstr("aaads", "ds", "UTF8_BINARY", 4); + assertStringInstr("xxxx", "", "UTF8_BINARY", 1); + assertStringInstr("", "xxxx", "UTF8_BINARY", 0); + assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); + assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); + assertStringInstr("aaads", "Aa", "UTF8_LCASE", 1); + assertStringInstr("aaaDs", "de", "UTF8_LCASE", 0); + assertStringInstr("aaaDs", "ds", "UTF8_LCASE", 4); + assertStringInstr("xxxx", "", "UTF8_LCASE", 1); + assertStringInstr("", "xxxx", "UTF8_LCASE", 0); + assertStringInstr("test大千世界X大千世界", "大千", "UTF8_LCASE", 5); + assertStringInstr("test大千世界X大千世界", "界x", "UTF8_LCASE", 8); + assertStringInstr("aaads", "Aa", "UNICODE", 0); + assertStringInstr("aaads", "aa", "UNICODE", 1); + assertStringInstr("aaads", "de", "UNICODE", 0); + assertStringInstr("xxxx", "", "UNICODE", 1); + assertStringInstr("", "xxxx", "UNICODE", 0); + assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); + assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); + assertStringInstr("xxxx", "", "UNICODE_CI", 1); + assertStringInstr("", "xxxx", "UNICODE_CI", 0); + assertStringInstr("aaads", "AD", "UNICODE_CI", 3); + assertStringInstr("aaads", "dS", "UNICODE_CI", 4); + assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); + assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); + assertStringInstr("i̇", "i", "UNICODE_CI", 0); + assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0); + assertStringInstr("i̇", "İ", "UNICODE_CI", 1); + assertStringInstr("İ", "i", "UNICODE_CI", 0); + assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); + assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); + assertStringInstr("abİoi̇o", "i̇o", 
"UNICODE_CI", 3); + assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); + assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); + assertStringInstr("i̇", "i", "UTF8_LCASE", 1); // != UNICODE_CI + assertStringInstr("i̇", "\u0307", "UTF8_LCASE", 2); // != UNICODE_CI + assertStringInstr("i̇", "İ", "UTF8_LCASE", 1); + assertStringInstr("İ", "i", "UTF8_LCASE", 0); + assertStringInstr("İoi̇o12", "i̇o", "UTF8_LCASE", 1); + assertStringInstr("i̇oİo12", "İo", "UTF8_LCASE", 1); + assertStringInstr("abİoi̇o", "i̇o", "UTF8_LCASE", 3); + assertStringInstr("abi̇oİo", "İo", "UTF8_LCASE", 3); + assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_LCASE", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UTF8_LCASE", 5); + assertStringInstr("abİoi̇o", "\u0307o", "UTF8_LCASE", 6); + assertStringInstr("aİoi̇oxx", "XX", "UTF8_LCASE", 7); + } + + private void assertFindInSet(String word, UTF8String set, String collationName, + Integer expected) throws SparkException { + UTF8String w = UTF8String.fromString(word); + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationSupport.FindInSet.exec(w, set, collationId)); + } + + @Test + public void testFindInSet() throws SparkException { + assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4); + assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3); + assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_LCASE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_LCASE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 4); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UNICODE", 0); + assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1); + assertFindInSet("界x", 
UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4); + assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5); + assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0); + assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4); + assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5); + assertFindInSet("i̇", UTF8String.fromString("İ"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("i̇"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("İ,"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("i̇,"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0); + assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2); + assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2); + assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 0); + assertFindInSet("i̇o", 
UTF8String.fromString("ab,İo,12"), "UTF8_LCASE", 2); + assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_LCASE", 2); + // Invalid UTF8 strings + assertFindInSet("C", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UTF8_BINARY", 3); + assertFindInSet("c", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UTF8_LCASE", 2); + assertFindInSet("C", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UNICODE", 2); + assertFindInSet("c", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UNICODE_CI", 2); + } + + private void assertReplace(String source, String search, String replace, String collationName, + String expected) throws SparkException { + UTF8String src = UTF8String.fromString(source); + UTF8String sear = UTF8String.fromString(search); + UTF8String repl = UTF8String.fromString(replace); + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationSupport.StringReplace + .exec(src, sear, repl, collationId).toString()); + } + + @Test + public void testReplace() throws SparkException { + assertReplace("r世eplace", "pl", "123", "UTF8_BINARY", "r世e123ace"); + assertReplace("replace", "pl", "", "UTF8_BINARY", "reace"); + assertReplace("repl世ace", "Pl", "", "UTF8_BINARY", "repl世ace"); + assertReplace("replace", "", "123", "UTF8_BINARY", "replace"); + assertReplace("abcabc", "b", "12", "UTF8_BINARY", "a12ca12c"); + assertReplace("abcdabcd", "bc", "", "UTF8_BINARY", "adad"); + assertReplace("r世eplace", "pl", "xx", "UTF8_LCASE", "r世exxace"); + assertReplace("repl世ace", "PL", "AB", "UTF8_LCASE", "reAB世ace"); + assertReplace("Replace", "", "123", "UTF8_LCASE", "Replace"); + assertReplace("re世place", "世", "x", "UTF8_LCASE", "rexplace"); + assertReplace("abcaBc", "B", "12", "UTF8_LCASE", "a12ca12c"); + assertReplace("AbcdabCd", "Bc", "", "UTF8_LCASE", "Adad"); + assertReplace("re世place", "plx", "123", "UNICODE", "re世place"); + assertReplace("世Replace", "re", "", "UNICODE", "世Replace"); + assertReplace("replace世", "", "123", "UNICODE", "replace世"); + assertReplace("aBc世abc", "b", "12", "UNICODE", "aBc世a12c"); + assertReplace("abcdabcd", "bc", "", "UNICODE", "adad"); + assertReplace("replace", "plx", "123", "UNICODE_CI", "replace"); + assertReplace("Replace", "re", "", "UNICODE_CI", "place"); + assertReplace("replace", "", "123", "UNICODE_CI", "replace"); + assertReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c"); + assertReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad"); + assertReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12"); + assertReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12"); + assertReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12"); + assertReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12"); + assertReplace("İi̇İi̇İi̇", "i̇", "x", "UNICODE_CI", "xxxxxx"); + assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇"); + assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx"); + assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"); + assertReplace("abi̇12", "i", "X", "UTF8_LCASE", "abX\u030712"); // != UNICODE_CI + assertReplace("abi̇12", "\u0307", "X", "UTF8_LCASE", "abiX12"); // != UNICODE_CI + assertReplace("abi̇12", "İ", "X", "UTF8_LCASE", "abX12"); + assertReplace("abİ12", "i", "X", "UTF8_LCASE", "abİ12"); + assertReplace("İi̇İi̇İi̇", "i̇", "x", 
"UTF8_LCASE", "xxxxxx"); + assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_LCASE", + "İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI + assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_LCASE", "abxx12xx"); + assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_LCASE", "abyy12yy"); + } + + private void assertLocate(String substring, String string, Integer start, String collationName, + Integer expected) throws SparkException { + UTF8String substr = UTF8String.fromString(substring); + UTF8String str = UTF8String.fromString(string); + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationSupport.StringLocate.exec(str, substr, + start - 1, collationId) + 1); + } + + @Test + public void testLocate() throws SparkException { + // If you add tests with start < 1 be careful to understand the behavior of the indexOf method + // and usage of indexOf in the StringLocate class. + assertLocate("aa", "aaads", 1, "UTF8_BINARY", 1); + assertLocate("aa", "aaads", 2, "UTF8_BINARY", 2); + assertLocate("aa", "aaads", 3, "UTF8_BINARY", 0); + assertLocate("Aa", "aaads", 1, "UTF8_BINARY", 0); + assertLocate("Aa", "aAads", 1, "UTF8_BINARY", 2); + assertLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0); + assertLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8); + assertLocate("界", "test大千世界X大千世界", 13, "UTF8_BINARY", 13); + assertLocate("AA", "aaads", 1, "UTF8_LCASE", 1); + assertLocate("aa", "aAads", 2, "UTF8_LCASE", 2); + assertLocate("aa", "aaAds", 3, "UTF8_LCASE", 0); + assertLocate("abC", "abcabc", 1, "UTF8_LCASE", 1); + assertLocate("abC", "abCabc", 2, "UTF8_LCASE", 4); + assertLocate("abc", "abcabc", 4, "UTF8_LCASE", 4); + assertLocate("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8); + assertLocate("界X", "test大千世界Xtest大千世界", 1, "UTF8_LCASE", 8); + assertLocate("界", "test大千世界X大千世界", 13, "UTF8_LCASE", 13); + assertLocate("大千", "test大千世界大千世界", 1, "UTF8_LCASE", 5); + assertLocate("大千", "test大千世界大千世界", 9, "UTF8_LCASE", 9); + assertLocate("大千", "大千世界大千世界", 1, "UTF8_LCASE", 1); + assertLocate("aa", "Aaads", 1, "UNICODE", 2); + assertLocate("AA", "aaads", 1, "UNICODE", 0); + assertLocate("aa", "aAads", 2, "UNICODE", 0); + assertLocate("aa", "aaAds", 3, "UNICODE", 0); + assertLocate("abC", "abcabc", 1, "UNICODE", 0); + assertLocate("abC", "abCabc", 2, "UNICODE", 0); + assertLocate("abC", "abCabC", 2, "UNICODE", 4); + assertLocate("abc", "abcabc", 1, "UNICODE", 1); + assertLocate("abc", "abcabc", 3, "UNICODE", 4); + assertLocate("界x", "test大千世界X大千世界", 1, "UNICODE", 0); + assertLocate("界X", "test大千世界X大千世界", 1, "UNICODE", 8); + assertLocate("界", "test大千世界X大千世界", 13, "UNICODE", 13); + assertLocate("AA", "aaads", 1, "UNICODE_CI", 1); + assertLocate("aa", "aAads", 2, "UNICODE_CI", 2); + assertLocate("aa", "aaAds", 3, "UNICODE_CI", 0); + assertLocate("abC", "abcabc", 1, "UNICODE_CI", 1); + assertLocate("abC", "abCabc", 2, "UNICODE_CI", 4); + assertLocate("abc", "abcabc", 4, "UNICODE_CI", 4); + assertLocate("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8); + assertLocate("界", "test大千世界X大千世界", 13, "UNICODE_CI", 13); + assertLocate("大千", "test大千世界大千世界", 1, "UNICODE_CI", 5); + assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); + assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); + // Case-variable character length + assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2); + assertLocate("\u0307", "İ", 1, "UTF8_LCASE", 0); // != UTF8_BINARY + assertLocate("i", "i̇", 1, "UNICODE_CI", 0); + assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0); + assertLocate("i̇", "i", 1, "UNICODE_CI", 0); + assertLocate("İ", 
"i̇", 1, "UNICODE_CI", 1); + assertLocate("İ", "i", 1, "UNICODE_CI", 0); + assertLocate("i", "i̇", 1, "UTF8_LCASE", 1); // != UNICODE_CI + assertLocate("\u0307", "i̇", 1, "UTF8_LCASE", 2); // != UNICODE_CI + assertLocate("i̇", "i", 1, "UTF8_LCASE", 0); + assertLocate("İ", "i̇", 1, "UTF8_LCASE", 1); + assertLocate("İ", "i", 1, "UTF8_LCASE", 0); + assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1); + assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); + assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); + assertLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1); + assertLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3); + assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); + } + + private void assertSubstringIndex(String string, String delimiter, Integer count, + String collationName, String expected) throws SparkException { + UTF8String str = UTF8String.fromString(string); + UTF8String delim = UTF8String.fromString(delimiter); + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, + CollationSupport.SubstringIndex.exec(str, delim, count, collationId).toString()); + } + + @Test + public void testSubstringIndex() throws SparkException { + assertSubstringIndex("wwwgapachegorg", "g", -3, "UTF8_BINARY", "apachegorg"); + assertSubstringIndex("www||apache||org", "||", 2, "UTF8_BINARY", "www||apache"); + assertSubstringIndex("aaaaaaaaaa", "aa", 2, "UTF8_BINARY", "a"); + assertSubstringIndex("AaAaAaAaAa", "aa", 2, "UTF8_LCASE", "A"); + assertSubstringIndex("www.apache.org", ".", 3, "UTF8_LCASE", "www.apache.org"); + assertSubstringIndex("wwwXapacheXorg", "x", 2, "UTF8_LCASE", "wwwXapache"); + assertSubstringIndex("wwwxapachexorg", "X", 1, "UTF8_LCASE", "www"); + assertSubstringIndex("www.apache.org", ".", 0, "UTF8_LCASE", ""); + assertSubstringIndex("www.apache.ORG", ".", -3, "UTF8_LCASE", "www.apache.ORG"); + assertSubstringIndex("wwwGapacheGorg", "g", 1, "UTF8_LCASE", "www"); + assertSubstringIndex("wwwGapacheGorg", "g", 3, "UTF8_LCASE", "wwwGapacheGor"); + assertSubstringIndex("gwwwGapacheGorg", "g", 3, "UTF8_LCASE", "gwwwGapache"); + assertSubstringIndex("wwwGapacheGorg", "g", -3, "UTF8_LCASE", "apacheGorg"); + assertSubstringIndex("wwwmapacheMorg", "M", -2, "UTF8_LCASE", "apacheMorg"); + assertSubstringIndex("www.apache.org", ".", -1, "UTF8_LCASE", "org"); + assertSubstringIndex("www.apache.org.", ".", -1, "UTF8_LCASE", ""); + assertSubstringIndex("", ".", -2, "UTF8_LCASE", ""); + assertSubstringIndex("test大千世界X大千世界", "x", -1, "UTF8_LCASE", "大千世界"); + assertSubstringIndex("test大千世界X大千世界", "X", 1, "UTF8_LCASE", "test大千世界"); + assertSubstringIndex("test大千世界大千世界", "千", 2, "UTF8_LCASE", "test大千世界大"); + assertSubstringIndex("www||APACHE||org", "||", 2, "UTF8_LCASE", "www||APACHE"); + assertSubstringIndex("www||APACHE||org", "||", -1, "UTF8_LCASE", "org"); + assertSubstringIndex("AaAaAaAaAa", "Aa", 2, "UNICODE", "Aa"); + assertSubstringIndex("wwwYapacheyorg", "y", 3, "UNICODE", "wwwYapacheyorg"); + assertSubstringIndex("www.apache.org", ".", 2, "UNICODE", "www.apache"); + assertSubstringIndex("wwwYapacheYorg", "Y", 1, "UNICODE", "www"); + assertSubstringIndex("wwwYapacheYorg", "y", 1, "UNICODE", "wwwYapacheYorg"); + assertSubstringIndex("wwwGapacheGorg", "g", 1, "UNICODE", "wwwGapacheGor"); + assertSubstringIndex("GwwwGapacheGorG", "G", 3, "UNICODE", "GwwwGapache"); + assertSubstringIndex("wwwGapacheGorG", "G", -3, "UNICODE", "apacheGorG"); + assertSubstringIndex("www.apache.org", ".", 0, "UNICODE", ""); + assertSubstringIndex("www.apache.org", ".", 
-3, "UNICODE", "www.apache.org"); + assertSubstringIndex("www.apache.org", ".", -2, "UNICODE", "apache.org"); + assertSubstringIndex("www.apache.org", ".", -1, "UNICODE", "org"); + assertSubstringIndex("", ".", -2, "UNICODE", ""); + assertSubstringIndex("test大千世界X大千世界", "X", -1, "UNICODE", "大千世界"); + assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE", "test大千世界"); + assertSubstringIndex("大x千世界大千世x界", "x", 1, "UNICODE", "大"); + assertSubstringIndex("大x千世界大千世x界", "x", -1, "UNICODE", "界"); + assertSubstringIndex("大x千世界大千世x界", "x", -2, "UNICODE", "千世界大千世x界"); + assertSubstringIndex("大千世界大千世界", "千", 2, "UNICODE", "大千世界大"); + assertSubstringIndex("www||apache||org", "||", 2, "UNICODE", "www||apache"); + assertSubstringIndex("AaAaAaAaAa", "aa", 2, "UNICODE_CI", "A"); + assertSubstringIndex("www.apache.org", ".", 3, "UNICODE_CI", "www.apache.org"); + assertSubstringIndex("wwwXapacheXorg", "x", 2, "UNICODE_CI", "wwwXapache"); + assertSubstringIndex("wwwxapacheXorg", "X", 1, "UNICODE_CI", "www"); + assertSubstringIndex("www.apache.org", ".", 0, "UNICODE_CI", ""); + assertSubstringIndex("wwwGapacheGorg", "G", 3, "UNICODE_CI", "wwwGapacheGor"); + assertSubstringIndex("gwwwGapacheGorg", "g", 3, "UNICODE_CI", "gwwwGapache"); + assertSubstringIndex("gwwwGapacheGorg", "g", -3, "UNICODE_CI", "apacheGorg"); + assertSubstringIndex("www.apache.ORG", ".", -3, "UNICODE_CI", "www.apache.ORG"); + assertSubstringIndex("wwwmapacheMorg", "M", -2, "UNICODE_CI", "apacheMorg"); + assertSubstringIndex("www.apache.org", ".", -1, "UNICODE_CI", "org"); + assertSubstringIndex("", ".", -2, "UNICODE_CI", ""); + assertSubstringIndex("test大千世界X大千世界", "X", -1, "UNICODE_CI", "大千世界"); + assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE_CI", "test大千世界"); + assertSubstringIndex("test大千世界大千世界", "千", 2, "UNICODE_CI", "test大千世界大"); + assertSubstringIndex("www||APACHE||org", "||", 2, "UNICODE_CI", "www||APACHE"); + assertSubstringIndex("abİo12", "i̇o", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abİo12", "i̇o", -1, "UNICODE_CI", "12"); + assertSubstringIndex("abi̇o12", "İo", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abi̇o12", "İo", -1, "UNICODE_CI", "12"); + assertSubstringIndex("ai̇bi̇o12", "İo", 1, "UNICODE_CI", "ai̇b"); + assertSubstringIndex("ai̇bi̇o12i̇o", "İo", 2, "UNICODE_CI", "ai̇bi̇o12"); + assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -1, "UNICODE_CI", ""); + assertSubstringIndex("ai̇bi̇o12i̇o", "İo", -2, "UNICODE_CI", "12i̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + 
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("abi̇12", "i", 1, "UTF8_LCASE", "ab"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_LCASE", "abi"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "İ", 1, "UTF8_LCASE", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UTF8_LCASE", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_LCASE", "bİoi̇o12i̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_LCASE", "bİoi̇o12i̇o"); + } + + private void assertStringTrim( + String collation, + String sourceString, + String trimString, + String expectedResultString) throws SparkException { + int collationId = CollationFactory.collationNameToId(collation); + String result; + + if (trimString == null) { + result = CollationSupport.StringTrim.exec( + UTF8String.fromString(sourceString), collationId).toString(); + } else { + result = CollationSupport.StringTrim.exec( + UTF8String + .fromString(sourceString), UTF8String.fromString(trimString), collationId) + .toString(); + } + + assertEquals(expectedResultString, result); + } + + private void assertStringTrimLeft( + String collation, + String sourceString, + String trimString, + String expectedResultString) throws SparkException { + int collationId = CollationFactory.collationNameToId(collation); + String result; + + if (trimString == null) { + result = CollationSupport.StringTrimLeft.exec( + UTF8String.fromString(sourceString), collationId).toString(); + } else { + result = CollationSupport.StringTrimLeft.exec( + UTF8String + .fromString(sourceString), UTF8String.fromString(trimString), collationId) + .toString(); + } + + assertEquals(expectedResultString, result); + } + + private void assertStringTrimRight( + String collation, + String sourceString, + String trimString, + String expectedResultString) throws SparkException { + int collationId = CollationFactory.collationNameToId(collation); + String result; + + if (trimString == null) { + result = CollationSupport.StringTrimRight.exec( + UTF8String.fromString(sourceString), collationId).toString(); + } else { + result = CollationSupport.StringTrimRight.exec( + UTF8String + .fromString(sourceString), UTF8String.fromString(trimString), collationId) + .toString(); + } + + assertEquals(expectedResultString, result); + } + + @Test + public void testStringTrim() throws SparkException { + assertStringTrim("UTF8_BINARY", "asd", null, "asd"); + assertStringTrim("UTF8_BINARY", " asd ", null, "asd"); + assertStringTrim("UTF8_BINARY", " a世a ", null, "a世a"); + assertStringTrim("UTF8_BINARY", "asd", "x", "asd"); + assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd"); + assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a"); + + assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd"); + 
assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd "); + assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a "); + assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd"); + assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx"); + assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax"); + + assertStringTrimRight("UTF8_BINARY", "asd", null, "asd"); + assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd"); + assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a"); + assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd"); + assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd"); + assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a"); + + assertStringTrim("UTF8_LCASE", "asd", null, "asd"); + assertStringTrim("UTF8_LCASE", " asd ", null, "asd"); + assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a"); + assertStringTrim("UTF8_LCASE", "asd", "x", "asd"); + assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd"); + assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a"); + + assertStringTrimLeft("UTF8_LCASE", "asd", null, "asd"); + assertStringTrimLeft("UTF8_LCASE", " asd ", null, "asd "); + assertStringTrimLeft("UTF8_LCASE", " a世a ", null, "a世a "); + assertStringTrimLeft("UTF8_LCASE", "asd", "x", "asd"); + assertStringTrimLeft("UTF8_LCASE", "xxasdxx", "x", "asdxx"); + assertStringTrimLeft("UTF8_LCASE", "xa世ax", "x", "a世ax"); + + assertStringTrimRight("UTF8_LCASE", "asd", null, "asd"); + assertStringTrimRight("UTF8_LCASE", " asd ", null, " asd"); + assertStringTrimRight("UTF8_LCASE", " a世a ", null, " a世a"); + assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd"); + assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd"); + assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a"); + + assertStringTrim("UTF8_LCASE", "asd", null, "asd"); + assertStringTrim("UTF8_LCASE", " asd ", null, "asd"); + assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a"); + assertStringTrim("UTF8_LCASE", "asd", "x", "asd"); + assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd"); + assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a"); + + // Test cases where trimString has more than one character + assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX"); + assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa"); + assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX"); + + assertStringTrim("UTF8_LCASE", "ddsXXXaa", "asd", "XXX"); + assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "asd", "XXXaa"); + assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "asd", "ddsXXX"); + + // Test cases specific to collation type + // uppercase trim, lowercase src + assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); + assertStringTrim("UTF8_LCASE", "asd", "A", "sd"); + + // lowercase trim, uppercase src + assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); + assertStringTrim("UTF8_LCASE", "ASD", "a", "SD"); + + // uppercase and lowercase chars of different byte-length (utf8) + assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); + + assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa"); + assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); + + assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); + + assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa"); + 
assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß"); + assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa"); + + // different byte-length (utf8) chars trimmed + assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + + assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); } // TODO: Test more collation-aware string expressions. @@ -265,3 +1393,4 @@ public void testEndsWith() throws SparkException { // TODO: Test other collation-aware expressions. } +// checkstyle.on: AvoidEscapedUnicodeCharacters diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 934b93c9345b9..d690da53c7c66 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.spark.unsafe.types; @@ -28,6 +28,7 @@ import org.apache.spark.unsafe.Platform; import org.junit.jupiter.api.Test; +import static org.apache.spark.unsafe.types.UTF8String.fromString; import static org.junit.jupiter.api.Assertions.*; import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; @@ -107,29 +108,6 @@ public void binaryCompareTo() { assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0); } - @Test - public void lowercaseComparison() { - // SPARK-47693: Test optimized lowercase comparison of UTF8String instances - // ASCII - assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0); - assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0); - assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0); - assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0); - assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0); - assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0); - assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0); - assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0); - assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0); - // non-ASCII - assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0); - assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0); - assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0); - assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0); - assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0); - assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0); - assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) < 0); - } - protected static void testUpperandLower(String upper, String lower) { UTF8String us = fromString(upper); UTF8String ls = fromString(lower); @@ -902,4 +880,481 @@ public void skipWrongFirstByte() { assertEquals(1, fromBytes(c).numChars()); } } + + private void testMakeValid(String input, String expected) { + ByteArrayOutputStream exp = new ByteArrayOutputStream(); + for (String hex : expected.split(" ")) exp.write(Integer.parseInt(hex.substring(2), 16)); + ByteArrayOutputStream inp = new ByteArrayOutputStream(); + for (String hex : input.split(" ")) inp.write(Integer.parseInt(hex.substring(2), 16)); + assertEquals(fromBytes(exp.toByteArray()), fromBytes(inp.toByteArray()).makeValid()); + } + @Test + public void makeValid() { + // Basic tests + assertEquals(EMPTY_UTF8, EMPTY_UTF8.makeValid()); + assertEquals(fromString(""), fromString("").makeValid()); + assertEquals(fromString("abc"), fromString("abc").makeValid()); + assertEquals(fromString("hello"), fromString("hello").makeValid()); + assertEquals(fromString("大千世界"), fromString("大千世界").makeValid()); + assertEquals(fromBytes(new byte[] {}), fromBytes(new byte[] {}).makeValid()); + assertEquals(fromBytes(new byte[] {0x61}), fromBytes(new byte[] {0x61}).makeValid()); + assertEquals(fromBytes(new byte[] {0x7F}), fromBytes(new byte[] {0x7F}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD}), + fromBytes(new byte[] {(byte) 0xFF}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xC2, (byte) 0x80}), + fromBytes(new byte[] {(byte) 0xC2, (byte) 0x80}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xDF, (byte) 0xBF}), + fromBytes(new byte[] {(byte) 0xDF, (byte) 0xBF}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0xC0, (byte) 
0x80}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0xC1, (byte) 0xBF}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0xDF, (byte) 0xC0}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0xDF, (byte) 0xFF}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xE0, (byte) 0xA0, (byte) 0x80}), + fromBytes(new byte[] {(byte) 0xE0, (byte) 0xA0, (byte) 0x80}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}), + fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}), + fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}).makeValid()); + assertEquals(fromString("���"), + fromBytes(new byte[] {(byte) 0xE0, (byte) 0x9F, (byte) 0x80}).makeValid()); + assertEquals(fromString("���"), + fromBytes(new byte[] {(byte) 0xE0, (byte) 0xC0, (byte) 0x80}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xC0}).makeValid()); + assertEquals(fromString("���"), + fromBytes(new byte[] {(byte) 0x80, (byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xF0, (byte) 0x90, (byte) 0x80, (byte) 0x80}), + fromBytes(new byte[] {(byte) 0xF0, (byte) 0x90, (byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF}), + fromBytes(new byte[] {(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF}).makeValid()); + assertEquals(fromString("����"), + fromBytes(new byte[] {(byte) 0xF0, (byte) 0x8F, (byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0x7F}), + fromBytes(new byte[] {(byte) 0xF0, (byte) 0x90, (byte) 0x80, (byte) 0x7F}).makeValid()); + assertEquals(fromString("����"), + fromBytes(new byte[] {(byte) 0xF4, (byte) 0x90, (byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromString("��"), + fromBytes(new byte[] {(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xC0}).makeValid()); + assertEquals(fromString("�����"), fromBytes(new byte[] + {(byte) 0xF8, (byte) 0x8F, (byte) 0x80, (byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromString("�����"), fromBytes(new byte[] + {(byte) 0xF8, (byte) 0x90, (byte) 0x80, (byte) 0x80, (byte) 0x80}).makeValid()); + assertEquals(fromString("�����"), fromBytes(new byte[] + {(byte) 0xFB, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF, (byte) 0xBF}).makeValid()); + assertEquals(fromString("�����"), fromBytes(new byte[] + {(byte) 0xFB, (byte) 0x90, (byte) 0x80, (byte) 0x80, (byte) 0x80}).makeValid()); + // More tests + testMakeValid("0x42", "0x42"); + testMakeValid("0x80", "0xEF 0xBF 0xBD"); + testMakeValid("0xF4 0x92 0x12", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x12"); + testMakeValid("0x84 0xA5 0x63 0x64", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x63 0x64"); + testMakeValid("0xF4 0x9B", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0xE2 0x0C", "0xEF 0xBF 0xBD 0x0C"); + testMakeValid("0xE8 0x3E", "0xEF 0xBF 0xBD 0x3E"); + testMakeValid("0x17 0x1B 0xC2", "0x17 0x1B 0xEF 0xBF 0xBD"); + testMakeValid("0x29 0xB2 0x7C 0xA0", "0x29 0xEF 0xBF 0xBD 0x7C 0xEF 0xBF 0xBD"); + testMakeValid("0x13 0x68 0x28 0x2A 0x83", "0x13 0x68 0x28 0x2A 0xEF 0xBF 0xBD"); + testMakeValid("0x3B 0xFE 0xC2 0xB7 0x9E", "0x3B 0xEF 0xBF 0xBD 0xC2 0xB7 0xEF 0xBF 0xBD"); + 
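Editor's note: each testMakeValid case encodes its input as space-separated hex bytes and expects invalid sequences to be replaced with U+FFFD (0xEF 0xBF 0xBD in UTF-8). The JDK performs a comparable repair when decoding with replacement, as sketched below; this is not UTF8String.makeValid itself, and the exact number of replacement characters emitted per malformed sequence can differ between implementations.

```java
import java.nio.charset.StandardCharsets;

// Sketch of the same repair idea using the JDK decoder, which substitutes U+FFFD
// for malformed UTF-8 input when constructing a String.
public final class MakeValidSketch {
  public static void main(String[] args) {
    byte[] invalid = {0x41, (byte) 0x80, 0x42};             // 'A', stray continuation byte, 'B'
    String repaired = new String(invalid, StandardCharsets.UTF_8);
    System.out.println(repaired);                           // A�B
    System.out.println(repaired.codePointAt(1) == 0xFFFD);  // true
  }
}
```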
testMakeValid("0xFC", "0xEF 0xBF 0xBD"); + testMakeValid("0x8E 0xBD", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0x6A 0x8B 0x5C 0x5A", "0x6A 0xEF 0xBF 0xBD 0x5C 0x5A"); + testMakeValid("0xB4 0xC0", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0xB0 0xD3", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0xAA", "0xEF 0xBF 0xBD"); + testMakeValid("0x1A 0x66", "0x1A 0x66"); + testMakeValid("0x32 0xB5 0x5F", "0x32 0xEF 0xBF 0xBD 0x5F"); + testMakeValid("0x70 0xB6 0x69 0xBC", "0x70 0xEF 0xBF 0xBD 0x69 0xEF 0xBF 0xBD"); + testMakeValid("0xDC 0x15 0x82", "0xEF 0xBF 0xBD 0x15 0xEF 0xBF 0xBD"); + testMakeValid("0x7F 0x3F 0x72 0xBB", "0x7F 0x3F 0x72 0xEF 0xBF 0xBD"); + testMakeValid("0x3C 0x3D 0x1F 0x6C 0x75", "0x3C 0x3D 0x1F 0x6C 0x75"); + testMakeValid("0xBF 0x10 0xF4", "0xEF 0xBF 0xBD 0x10 0xEF 0xBF 0xBD"); + testMakeValid("0xBF 0xDD 0x89", "0xEF 0xBF 0xBD 0xDD 0x89"); + testMakeValid("0x85 0xD4 0x90 0x5E 0x6C", "0xEF 0xBF 0xBD 0xD4 0x90 0x5E 0x6C"); + testMakeValid("0x0A 0x92 0x3C 0x17 0x7D", "0x0A 0xEF 0xBF 0xBD 0x3C 0x17 0x7D"); + testMakeValid("0x49 0x4E 0x2C 0x7B 0x1C", "0x49 0x4E 0x2C 0x7B 0x1C"); + testMakeValid("0x2E 0x0C 0x3E 0x02 0x9D", "0x2E 0x0C 0x3E 0x02 0xEF 0xBF 0xBD"); + testMakeValid("0xF9 0xF2 0x11", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x11"); + testMakeValid("0xD1", "0xEF 0xBF 0xBD"); + testMakeValid("0x74 0x35 0x56 0xC4", "0x74 0x35 0x56 0xEF 0xBF 0xBD"); + testMakeValid("0xBE 0x45 0x7F", "0xEF 0xBF 0xBD 0x45 0x7F"); + testMakeValid("0x2E", "0x2E"); + testMakeValid("0x3B 0xC3", "0x3B 0xEF 0xBF 0xBD"); + testMakeValid("0xF9 0x58 0xE0 0x84", "0xEF 0xBF 0xBD 0x58 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0x72 0xF9 0x4F", "0x72 0xEF 0xBF 0xBD 0x4F"); + testMakeValid("0xF2 0x2A 0x38", "0xEF 0xBF 0xBD 0x2A 0x38"); + testMakeValid("0x66 0xFC 0x8D 0x5F", "0x66 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x5F"); + testMakeValid("0x6D", "0x6D"); + testMakeValid("0x6E 0xDF 0xDD", "0x6E 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0x50 0xE8 0x45 0xDB", "0x50 0xEF 0xBF 0xBD 0x45 0xEF 0xBF 0xBD"); + testMakeValid("0x71 0x06 0x1F 0x87", "0x71 0x06 0x1F 0xEF 0xBF 0xBD"); + testMakeValid("0x02", "0x02"); + testMakeValid("0x20 0x2B 0x38 0xCB 0xDF", "0x20 0x2B 0x38 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0x10", "0x10"); + testMakeValid("0x15", "0x15"); + testMakeValid("0xC5 0xEA 0x2A", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x2A"); + testMakeValid("0x72 0x2D 0xBB 0x06", "0x72 0x2D 0xEF 0xBF 0xBD 0x06"); + testMakeValid("0xB7", "0xEF 0xBF 0xBD"); + testMakeValid("0xB8 0xB5", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testMakeValid("0x9C 0x76 0x17", "0xEF 0xBF 0xBD 0x76 0x17"); + } + + private void testIsValid(String input, String expected) { + ByteArrayOutputStream inp = new ByteArrayOutputStream(); + for (String hex : input.split(" ")) inp.write(Integer.parseInt(hex.substring(2), 16)); + assertEquals(input.equals(expected), fromBytes(inp.toByteArray()).isValid()); + } + @Test + public void isValid() { + // Basic tests + assertTrue(EMPTY_UTF8.isValid()); + assertTrue(fromString("").isValid()); + assertTrue(fromString("abc").isValid()); + assertTrue(fromString("hello").isValid()); + assertTrue(fromString("大千世界").isValid()); + assertTrue(fromString("数据砖头").isValid()); + assertTrue(fromBytes(new byte[] {}).isValid()); + assertTrue(fromBytes(new byte[] {0x61}).isValid()); + assertTrue(fromBytes(new byte[] {0x7F}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xFF}).isValid()); + assertTrue(fromBytes(new byte[] {(byte) 0xC2, (byte) 0x80}).isValid()); + assertTrue(fromBytes(new byte[] 
{(byte) 0xDF, (byte) 0xBF}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xC0, (byte) 0x80}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xC1, (byte) 0xBF}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xDF, (byte) 0xC0}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xDF, (byte) 0xFF}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0x80, (byte) 0x80}).isValid()); + assertTrue(fromBytes(new byte[] {(byte) 0xE0, (byte) 0xA0, (byte) 0x80}).isValid()); + assertTrue(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}).isValid()); + assertTrue(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xE0, (byte) 0x9F, (byte) 0x80}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xE0, (byte) 0xC0, (byte) 0x80}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0xEF, (byte) 0xBF, (byte) 0xC0}).isValid()); + assertFalse(fromBytes(new byte[] {(byte) 0x80, (byte) 0x80, (byte) 0x80}).isValid()); + assertTrue(fromBytes( + new byte[] {(byte) 0xF0, (byte) 0x90, (byte) 0x80, (byte) 0x80}).isValid()); + assertTrue(fromBytes( + new byte[] {(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xF0, (byte) 0x8F, (byte) 0x80, (byte) 0x80}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xF0, (byte) 0x90, (byte) 0x80, (byte) 0x7F}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xF4, (byte) 0x90, (byte) 0x80, (byte) 0x80}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xC0}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xF8, (byte) 0x8F, (byte) 0x80, (byte) 0x80, (byte) 0x80}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xF8, (byte) 0x90, (byte) 0x80, (byte) 0x80, (byte) 0x80}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xFB, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF, (byte) 0xBF}).isValid()); + assertFalse(fromBytes( + new byte[] {(byte) 0xFB, (byte) 0x90, (byte) 0x80, (byte) 0x80, (byte) 0x80}).isValid()); + // More tests + testIsValid("0x42", "0x42"); + testIsValid("0x80", "0xEF 0xBF 0xBD"); + testIsValid("0xF4 0x92 0x12", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x12"); + testIsValid("0x84 0xA5 0x63 0x64", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x63 0x64"); + testIsValid("0xF4 0x9B", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0xE2 0x0C", "0xEF 0xBF 0xBD 0x0C"); + testIsValid("0xE8 0x3E", "0xEF 0xBF 0xBD 0x3E"); + testIsValid("0x17 0x1B 0xC2", "0x17 0x1B 0xEF 0xBF 0xBD"); + testIsValid("0x29 0xB2 0x7C 0xA0", "0x29 0xEF 0xBF 0xBD 0x7C 0xEF 0xBF 0xBD"); + testIsValid("0x13 0x68 0x28 0x2A 0x83", "0x13 0x68 0x28 0x2A 0xEF 0xBF 0xBD"); + testIsValid("0x3B 0xFE 0xC2 0xB7 0x9E", "0x3B 0xEF 0xBF 0xBD 0xC2 0xB7 0xEF 0xBF 0xBD"); + testIsValid("0xFC", "0xEF 0xBF 0xBD"); + testIsValid("0x8E 0xBD", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0x6A 0x8B 0x5C 0x5A", "0x6A 0xEF 0xBF 0xBD 0x5C 0x5A"); + testIsValid("0xB4 0xC0", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0xB0 0xD3", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0xAA", "0xEF 0xBF 0xBD"); + testIsValid("0x1A 0x66", "0x1A 0x66"); + testIsValid("0x32 0xB5 0x5F", "0x32 0xEF 0xBF 0xBD 0x5F"); + testIsValid("0x70 0xB6 0x69 0xBC", "0x70 0xEF 0xBF 0xBD 0x69 0xEF 0xBF 0xBD"); + testIsValid("0xDC 0x15 0x82", "0xEF 0xBF 0xBD 0x15 0xEF 0xBF 0xBD"); + testIsValid("0x7F 0x3F 0x72 0xBB", "0x7F 0x3F 0x72 0xEF 0xBF 0xBD"); + testIsValid("0x3C 0x3D 0x1F 0x6C 0x75", "0x3C 0x3D 0x1F 
0x6C 0x75"); + testIsValid("0xBF 0x10 0xF4", "0xEF 0xBF 0xBD 0x10 0xEF 0xBF 0xBD"); + testIsValid("0xBF 0xDD 0x89", "0xEF 0xBF 0xBD 0xDD 0x89"); + testIsValid("0x85 0xD4 0x90 0x5E 0x6C", "0xEF 0xBF 0xBD 0xD4 0x90 0x5E 0x6C"); + testIsValid("0x0A 0x92 0x3C 0x17 0x7D", "0x0A 0xEF 0xBF 0xBD 0x3C 0x17 0x7D"); + testIsValid("0x49 0x4E 0x2C 0x7B 0x1C", "0x49 0x4E 0x2C 0x7B 0x1C"); + testIsValid("0x2E 0x0C 0x3E 0x02 0x9D", "0x2E 0x0C 0x3E 0x02 0xEF 0xBF 0xBD"); + testIsValid("0xF9 0xF2 0x11", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x11"); + testIsValid("0xD1", "0xEF 0xBF 0xBD"); + testIsValid("0x74 0x35 0x56 0xC4", "0x74 0x35 0x56 0xEF 0xBF 0xBD"); + testIsValid("0xBE 0x45 0x7F", "0xEF 0xBF 0xBD 0x45 0x7F"); + testIsValid("0x2E", "0x2E"); + testIsValid("0x3B 0xC3", "0x3B 0xEF 0xBF 0xBD"); + testIsValid("0xF9 0x58 0xE0 0x84", "0xEF 0xBF 0xBD 0x58 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0x72 0xF9 0x4F", "0x72 0xEF 0xBF 0xBD 0x4F"); + testIsValid("0xF2 0x2A 0x38", "0xEF 0xBF 0xBD 0x2A 0x38"); + testIsValid("0x66 0xFC 0x8D 0x5F", "0x66 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x5F"); + testIsValid("0x6D", "0x6D"); + testIsValid("0x6E 0xDF 0xDD", "0x6E 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0x50 0xE8 0x45 0xDB", "0x50 0xEF 0xBF 0xBD 0x45 0xEF 0xBF 0xBD"); + testIsValid("0x71 0x06 0x1F 0x87", "0x71 0x06 0x1F 0xEF 0xBF 0xBD"); + testIsValid("0x02", "0x02"); + testIsValid("0x20 0x2B 0x38 0xCB 0xDF", "0x20 0x2B 0x38 0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0x10", "0x10"); + testIsValid("0x15", "0x15"); + testIsValid("0xC5 0xEA 0x2A", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD 0x2A"); + testIsValid("0x72 0x2D 0xBB 0x06", "0x72 0x2D 0xEF 0xBF 0xBD 0x06"); + testIsValid("0xB7", "0xEF 0xBF 0xBD"); + testIsValid("0xB8 0xB5", "0xEF 0xBF 0xBD 0xEF 0xBF 0xBD"); + testIsValid("0x9C 0x76 0x17", "0xEF 0xBF 0xBD 0x76 0x17"); + } + + @Test + public void testGetByte() { + // Valid UTF-8 string + String validString = "abcde"; + UTF8String validUTF8String = fromString(validString); + // Valid byte index handling + for (int i = 0; i < validString.length(); ++i) { + assertEquals(validString.charAt(i), validUTF8String.getByte(i)); + } + // Invalid byte index handling + assertEquals(0, validUTF8String.getByte(-1)); + assertEquals(0, validUTF8String.getByte(validString.length())); + assertEquals(0, validUTF8String.getByte(validString.length() + 1)); + + // Invalid UTF-8 string + byte[] invalidString = new byte[] {(byte) 0x41, (byte) 0x42, (byte) 0x80}; + UTF8String invalidUTF8String = fromBytes(invalidString); + // Valid byte index handling + for (int i = 0; i < invalidString.length; ++i) { + assertEquals(invalidString[i], invalidUTF8String.getByte(i)); + } + // Invalid byte index handling + assertEquals(0, invalidUTF8String.getByte(-1)); + assertEquals(0, invalidUTF8String.getByte(invalidString.length)); + assertEquals(0, invalidUTF8String.getByte(invalidString.length + 1)); + } + + @Test + public void testGetChar() { + // Valid UTF-8 string + String str = "abcde"; + UTF8String s = fromString(str); + // Valid character index handling + for (int i = 0; i < str.length(); ++i) { + assertEquals(str.charAt(i), s.getChar(i)); + } + // Invalid character index handling + assertThrows(IndexOutOfBoundsException.class, () -> s.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s.getChar(str.length())); + assertThrows(IndexOutOfBoundsException.class, () -> s.getChar(str.length() + 1)); + + // Invalid UTF-8 string + byte[] invalidString = new byte[] {(byte) 0x41, (byte) 0x42, (byte) 0x80}; + UTF8String invalidUTF8String = 
fromBytes(invalidString); + // Valid byte index handling + for (int i = 0; i < invalidString.length; ++i) { + if (Character.isValidCodePoint(invalidString[i])) { + assertEquals(invalidString[i], invalidUTF8String.getChar(i)); + } else { + assertEquals(0, invalidUTF8String.getChar(i)); + } + } + // Invalid byte index handling + assertThrows(IndexOutOfBoundsException.class, () -> s.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s.getChar(str.length())); + assertThrows(IndexOutOfBoundsException.class, () -> s.getChar(str.length() + 1)); + } + + @Test + public void testCodePointFrom() { + // Valid UTF-8 string + String str = "abcde"; + UTF8String s = fromString(str); + // Valid character index handling + for (int i = 0; i < str.length(); ++i) { + assertEquals(str.charAt(i), s.codePointFrom(i)); + } + // Invalid character index handling + assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length())); + assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length() + 1)); + + // Invalid UTF-8 string + byte[] invalidString = new byte[] {(byte) 0x41, (byte) 0x42, (byte) 0x80}; + UTF8String invalidUTF8String = fromBytes(invalidString); + // Valid byte index handling + for (int i = 0; i < invalidString.length; ++i) { + if (Character.isValidCodePoint(invalidString[i])) { + assertEquals(invalidString[i], invalidUTF8String.codePointFrom(i)); + } else { + assertEquals(0, invalidUTF8String.codePointFrom(i)); + } + } + // Invalid byte index handling + assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length())); + assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length() + 1)); + } + + @Test + public void utf8StringCodePoints() { + String s = "aéह 日å!"; + UTF8String s0 = fromString(s); + for (int i = 0; i < s.length(); ++i) { + assertEquals(s.codePointAt(i), s0.getChar(i)); + } + + UTF8String s1 = fromBytes(new byte[] {0x41, (byte) 0xC3, (byte) 0xB1, (byte) 0xE2, + (byte) 0x82, (byte) 0xAC, (byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88}); + // numBytesForFirstByte + assertEquals(1, UTF8String.numBytesForFirstByte(s1.getByte(0))); + assertEquals(2, UTF8String.numBytesForFirstByte(s1.getByte(1))); + assertEquals(3, UTF8String.numBytesForFirstByte(s1.getByte(3))); + assertEquals(4, UTF8String.numBytesForFirstByte(s1.getByte(6))); + // getByte + assertEquals((byte) 0x41, s1.getByte(0)); + assertEquals((byte) 0xC3, s1.getByte(1)); + assertEquals((byte) 0xE2, s1.getByte(3)); + assertEquals((byte) 0xF0, s1.getByte(6)); + // codePointFrom + assertEquals(0x41, s1.codePointFrom(0)); + assertEquals(0xF1, s1.codePointFrom(1)); + assertEquals(0x20AC, s1.codePointFrom(3)); + assertEquals(0x10348, s1.codePointFrom(6)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.codePointFrom(99)); + // getChar + assertEquals(0x41, s1.getChar(0)); + assertEquals(0xF1, s1.getChar(1)); + assertEquals(0x20AC, s1.getChar(2)); + assertEquals(0x10348, s1.getChar(3)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.getChar(99)); + + UTF8String s2 = fromString("Añ€𐍈"); + // numBytesForFirstByte + assertEquals(1, UTF8String.numBytesForFirstByte(s2.getByte(0))); + assertEquals(2, 
UTF8String.numBytesForFirstByte(s2.getByte(1))); + assertEquals(3, UTF8String.numBytesForFirstByte(s2.getByte(3))); + assertEquals(4, UTF8String.numBytesForFirstByte(s2.getByte(6))); + // getByte + assertEquals((byte) 0x41, s2.getByte(0)); + assertEquals((byte) 0xC3, s2.getByte(1)); + assertEquals((byte) 0xE2, s2.getByte(3)); + assertEquals((byte) 0xF0, s2.getByte(6)); + // codePointFrom + assertEquals(0x41, s2.codePointFrom(0)); + assertEquals(0xF1, s2.codePointFrom(1)); + assertEquals(0x20AC, s2.codePointFrom(3)); + assertEquals(0x10348, s2.codePointFrom(6)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.codePointFrom(99)); + // getChar + assertEquals(0x41, s2.getChar(0)); + assertEquals(0xF1, s2.getChar(1)); + assertEquals(0x20AC, s2.getChar(2)); + assertEquals(0x10348, s2.getChar(3)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.getChar(99)); + + UTF8String s3 = EMPTY_UTF8; + // codePointFrom + assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(0)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(99)); + // getChar + assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(0)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(99)); + } + + private void testCodePointIterator(UTF8String utf8String) { + CodePointIteratorType iteratorMode = utf8String.isValid() ? + CodePointIteratorType.CODE_POINT_ITERATOR_ASSUME_VALID : + CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID; + Iterator iterator = utf8String.codePointIterator(iteratorMode); + for (int i = 0; i < utf8String.numChars(); ++i) { + assertTrue(iterator.hasNext()); + int codePoint = (utf8String.isValid() ? utf8String : utf8String.makeValid()).getChar(i); + assertEquals(codePoint, (int) iterator.next()); + } + assertFalse(iterator.hasNext()); + } + @Test + public void codePointIterator() { + // Valid UTF8 strings. + testCodePointIterator(fromString("")); + testCodePointIterator(fromString("abc")); + testCodePointIterator(fromString("a!2&^R")); + testCodePointIterator(fromString("aéह 日å!")); + testCodePointIterator(fromBytes(new byte[] {(byte) 0x41})); + testCodePointIterator(fromBytes(new byte[] {(byte) 0xC2, (byte) 0xA3})); + testCodePointIterator(fromBytes(new byte[] {(byte) 0xE2, (byte) 0x82, (byte) 0xAC})); + // Invalid UTF8 strings. + testCodePointIterator(fromBytes(new byte[] {(byte) 0xFF})); + testCodePointIterator(fromBytes(new byte[] {(byte) 0x80})); + testCodePointIterator(fromBytes(new byte[] {(byte) 0xC2, (byte) 0x80})); + testCodePointIterator(fromBytes(new byte[] {(byte) 0xE2, (byte) 0x82, (byte) 0x80})); + testCodePointIterator(fromBytes(new byte[] {(byte) 0x41, (byte) 0x80, (byte) 0x42})); + testCodePointIterator(fromBytes(new byte[] { + (byte) 0x41, (byte) 0xC2, (byte) 0x80, (byte) 0x42})); + testCodePointIterator(fromBytes(new byte[] { + (byte) 0x41, (byte) 0xE2, (byte) 0x82, (byte) 0x80, (byte) 0x42})); + } + + private void testReverseCodePointIterator(UTF8String utf8String) { + CodePointIteratorType iteratorMode = utf8String.isValid() ? 
+ CodePointIteratorType.CODE_POINT_ITERATOR_ASSUME_VALID : + CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID; + Iterator iterator = utf8String.codePointIterator(iteratorMode); + for (int i = 0; i < utf8String.numChars(); ++i) { + assertTrue(iterator.hasNext()); + int codePoint = (utf8String.isValid() ? utf8String : utf8String.makeValid()).getChar(i); + assertEquals(codePoint, (int) iterator.next()); + } + assertFalse(iterator.hasNext()); + } + @Test + public void reverseCodePointIterator() { + // Valid UTF8 strings + testReverseCodePointIterator(fromString("")); + testReverseCodePointIterator(fromString("abc")); + testReverseCodePointIterator(fromString("a!2&^R")); + testReverseCodePointIterator(fromString("aéह 日å!")); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0x41})); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0xC2, (byte) 0xA3})); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0xE2, (byte) 0x82, (byte) 0xAC})); + // Invalid UTF8 strings + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0xFF})); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0x80})); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0xC2, (byte) 0x80})); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0xE2, (byte) 0x82, (byte) 0x80})); + testReverseCodePointIterator(fromBytes(new byte[] {(byte) 0x41, (byte) 0x80, (byte) 0x42})); + testReverseCodePointIterator(fromBytes(new byte[] { + (byte) 0x41, (byte) 0xC2, (byte) 0x80, (byte) 0x42})); + testReverseCodePointIterator(fromBytes(new byte[] { + (byte) 0x41, (byte) 0xE2, (byte) 0x82, (byte) 0x80, (byte) 0x42})); + } + + @Test + public void toBinaryString() { + assertEquals(ZERO_UTF8, UTF8String.toBinaryString(0)); + assertEquals(UTF8String.fromString("1"), UTF8String.toBinaryString(1)); + assertEquals(UTF8String.fromString("10"), UTF8String.toBinaryString(2)); + assertEquals(UTF8String.fromString("100"), UTF8String.toBinaryString(4)); + assertEquals(UTF8String.fromString("111"), UTF8String.toBinaryString(7)); + assertEquals( + UTF8String.fromString("1111111111111111111111111111111111111111111111111111111111110011"), + UTF8String.toBinaryString(-13)); + assertEquals( + UTF8String.fromString("1000000000000000000000000000000000000000000000000000000000000000"), + UTF8String.toBinaryString(Long.MIN_VALUE)); + assertEquals( + UTF8String.fromString("111111111111111111111111111111111111111111111111111111111111111"), + UTF8String.toBinaryString(Long.MAX_VALUE)); + } } diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index 768d26bf0e11e..3c29daeff168f 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -20,7 +20,10 @@ package org.apache.spark.unsafe.types import scala.collection.parallel.immutable.ParSeq import scala.jdk.CollectionConverters.MapHasAsScala +import com.ibm.icu.util.ULocale + import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.util.CollationFactory.fetchCollation // scalastyle:off import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.must.Matchers @@ -30,31 +33,93 @@ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8} class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite test("collationId 
stability") { - val utf8Binary = fetchCollation(0) + assert(INDETERMINATE_COLLATION_ID == -1) + + assert(UTF8_BINARY_COLLATION_ID == 0) + val utf8Binary = fetchCollation(UTF8_BINARY_COLLATION_ID) assert(utf8Binary.collationName == "UTF8_BINARY") assert(utf8Binary.supportsBinaryEquality) - val utf8BinaryLcase = fetchCollation(1) - assert(utf8BinaryLcase.collationName == "UTF8_BINARY_LCASE") + assert(UTF8_LCASE_COLLATION_ID == 1) + val utf8BinaryLcase = fetchCollation(UTF8_LCASE_COLLATION_ID) + assert(utf8BinaryLcase.collationName == "UTF8_LCASE") assert(!utf8BinaryLcase.supportsBinaryEquality) - val unicode = fetchCollation(2) + assert(UNICODE_COLLATION_ID == (1 << 29)) + val unicode = fetchCollation(UNICODE_COLLATION_ID) assert(unicode.collationName == "UNICODE") - assert(unicode.supportsBinaryEquality); + assert(!unicode.supportsBinaryEquality) - val unicodeCi = fetchCollation(3) + assert(UNICODE_CI_COLLATION_ID == ((1 << 29) | (1 << 17))) + val unicodeCi = fetchCollation(UNICODE_CI_COLLATION_ID) assert(unicodeCi.collationName == "UNICODE_CI") assert(!unicodeCi.supportsBinaryEquality) } - test("fetch invalid collation name") { - val error = intercept[SparkException] { - fetchCollation("UTF8_BS") + test("UTF8_BINARY and ICU root locale collation names") { + // Collation name already normalized. + Seq( + "UTF8_BINARY", + "UTF8_LCASE", + "UNICODE", + "UNICODE_CI", + "UNICODE_AI", + "UNICODE_CI_AI" + ).foreach(collationName => { + val col = fetchCollation(collationName) + assert(col.collationName == collationName) + }) + // Collation name normalization. + Seq( + // ICU root locale. + ("UNICODE_CS", "UNICODE"), + ("UNICODE_CS_AS", "UNICODE"), + ("UNICODE_CI_AS", "UNICODE_CI"), + ("UNICODE_AI_CS", "UNICODE_AI"), + ("UNICODE_AI_CI", "UNICODE_CI_AI"), + // Randomized case collation names. 
+ ("utf8_binary", "UTF8_BINARY"), + ("UtF8_LcasE", "UTF8_LCASE"), + ("unicode", "UNICODE"), + ("UnICoDe_cs_aI", "UNICODE_AI") + ).foreach{ + case (name, normalized) => + val col = fetchCollation(name) + assert(col.collationName == normalized) } + } - assert(error.getErrorClass === "COLLATION_INVALID_NAME") - assert(error.getMessageParameters.asScala === - Map("proposal" -> "UTF8_BINARY", "collationName" -> "UTF8_BS")) + test("fetch invalid UTF8_BINARY and ICU root locale collation names") { + Seq( + ("UTF8_BINARY_CS", "UTF8_BINARY"), + ("UTF8_BINARY_AS", "UTF8_BINARY"), // this should be UNICODE_AS + ("UTF8_BINARY_CS_AS","UTF8_BINARY"), // this should be UNICODE_CS_AS + ("UTF8_BINARY_AS_CS","UTF8_BINARY"), + ("UTF8_BINARY_CI","UTF8_BINARY"), + ("UTF8_BINARY_AI","UTF8_BINARY"), + ("UTF8_BINARY_CI_AI","UTF8_BINARY"), + ("UTF8_BINARY_AI_CI","UTF8_BINARY"), + ("UTF8_BS","UTF8_LCASE"), + ("BINARY_UTF8","ar_SAU"), + ("UTF8_BINARY_A","UTF8_BINARY"), + ("UNICODE_X","UNICODE"), + ("UNICODE_CI_X","UNICODE"), + ("UNICODE_LCASE_X","UNICODE"), + ("UTF8_UNICODE","UTF8_LCASE"), + ("UTF8_BINARY_UNICODE","UTF8_BINARY"), + ("CI_UNICODE", "UNICODE"), + ("LCASE_UNICODE", "UNICODE"), + ("UNICODE_UNSPECIFIED", "UNICODE"), + ("UNICODE_CI_UNSPECIFIED", "UNICODE"), + ("UNICODE_UNSPECIFIED_CI_UNSPECIFIED", "UNICODE"), + ("UNICODE_INDETERMINATE", "UNICODE"), + ("UNICODE_CI_INDETERMINATE", "UNICODE") + ).foreach{case (collationName, proposals) => + val error = intercept[SparkException] { fetchCollation(collationName) } + assert(error.getErrorClass === "COLLATION_INVALID_NAME") + assert(error.getMessageParameters.asScala === Map( + "collationName" -> collationName, "proposals" -> proposals)) + } } case class CollationTestCase[R](collationName: String, s1: String, s2: String, expectedResult: R) @@ -64,18 +129,24 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig CollationTestCase("UTF8_BINARY", "aaa", "aaa", true), CollationTestCase("UTF8_BINARY", "aaa", "AAA", false), CollationTestCase("UTF8_BINARY", "aaa", "bbb", false), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aaa", true), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AAA", true), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", true), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", true), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aa", false), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "bbb", false), + CollationTestCase("UTF8_BINARY", "å", "a\u030A", false), + CollationTestCase("UTF8_LCASE", "aaa", "aaa", true), + CollationTestCase("UTF8_LCASE", "aaa", "AAA", true), + CollationTestCase("UTF8_LCASE", "aaa", "AaA", true), + CollationTestCase("UTF8_LCASE", "aaa", "AaA", true), + CollationTestCase("UTF8_LCASE", "aaa", "aa", false), + CollationTestCase("UTF8_LCASE", "aaa", "bbb", false), + CollationTestCase("UTF8_LCASE", "å", "a\u030A", false), CollationTestCase("UNICODE", "aaa", "aaa", true), CollationTestCase("UNICODE", "aaa", "AAA", false), CollationTestCase("UNICODE", "aaa", "bbb", false), + CollationTestCase("UNICODE", "å", "a\u030A", true), CollationTestCase("UNICODE_CI", "aaa", "aaa", true), CollationTestCase("UNICODE_CI", "aaa", "AAA", true), - CollationTestCase("UNICODE_CI", "aaa", "bbb", false)) + CollationTestCase("UNICODE_CI", "aaa", "bbb", false), + CollationTestCase("UNICODE_CI", "å", "a\u030A", true), + CollationTestCase("UNICODE_CI", "Å", "a\u030A", true) + ) checks.foreach(testCase => { val collation = fetchCollation(testCase.collationName) @@ -94,12 +165,12 @@ class 
CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig CollationTestCase("UTF8_BINARY", "aaa", "AAA", 1), CollationTestCase("UTF8_BINARY", "aaa", "bbb", -1), CollationTestCase("UTF8_BINARY", "aaa", "BBB", 1), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aaa", 0), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AAA", 0), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", 0), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", 0), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aa", 1), - CollationTestCase("UTF8_BINARY_LCASE", "aaa", "bbb", -1), + CollationTestCase("UTF8_LCASE", "aaa", "aaa", 0), + CollationTestCase("UTF8_LCASE", "aaa", "AAA", 0), + CollationTestCase("UTF8_LCASE", "aaa", "AaA", 0), + CollationTestCase("UTF8_LCASE", "aaa", "AaA", 0), + CollationTestCase("UTF8_LCASE", "aaa", "aa", 1), + CollationTestCase("UTF8_LCASE", "aaa", "bbb", -1), CollationTestCase("UNICODE", "aaa", "aaa", 0), CollationTestCase("UNICODE", "aaa", "AAA", -1), CollationTestCase("UNICODE", "aaa", "bbb", -1), @@ -152,4 +223,246 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig } }) } + + test("test collation caching") { + Seq( + "UTF8_BINARY", + "UTF8_LCASE", + "UNICODE", + "UNICODE_CI", + "UNICODE_AI", + "UNICODE_CI_AI", + "UNICODE_AI_CI" + ).foreach(collationId => { + val col1 = fetchCollation(collationId) + val col2 = fetchCollation(collationId) + assert(col1 eq col2) // Check for reference equality. + }) + } + + test("collations with ICU non-root localization") { + Seq( + // Language only. + "en", + "en_CS", + "en_CI", + "en_AS", + "en_AI", + // Language + 3-letter country code. + "en_USA", + "en_USA_CS", + "en_USA_CI", + "en_USA_AS", + "en_USA_AI", + // Language + script code. + "sr_Cyrl", + "sr_Cyrl_CS", + "sr_Cyrl_CI", + "sr_Cyrl_AS", + "sr_Cyrl_AI", + // Language + script code + 3-letter country code. + "sr_Cyrl_SRB", + "sr_Cyrl_SRB_CS", + "sr_Cyrl_SRB_CI", + "sr_Cyrl_SRB_AS", + "sr_Cyrl_SRB_AI" + ).foreach(collationICU => { + val col = fetchCollation(collationICU) + assert(col.collator.getLocale(ULocale.VALID_LOCALE) != ULocale.ROOT) + }) + } + + test("invalid names of collations with ICU non-root localization") { + Seq( + ("en_US", "en_USA"), // Must use 3-letter country code + ("eN_US", "en_USA"), // verify that proper casing is captured in error. + ("enn", "en, nn, bn"), + ("en_AAA", "en_USA"), + ("en_Something", "UNICODE"), + ("en_Something_USA", "en_USA"), + ("en_LCASE", "en_USA"), + ("en_UCASE", "en_USA"), + ("en_CI_LCASE", "UNICODE"), + ("en_CI_UCASE", "en_USA"), + ("en_CI_UNSPECIFIED", "en_USA"), + ("en_USA_UNSPECIFIED", "en_USA"), + ("en_USA_UNSPECIFIED_CI", "en_USA_CI"), + ("en_INDETERMINATE", "en_USA"), + ("en_USA_INDETERMINATE", "en_USA"), + ("en_Latn_USA", "en_USA"), + ("en_Cyrl_USA", "en_USA"), + ("en_USA_AAA", "en_USA"), + ("sr_Cyrl_SRB_AAA", "sr_Cyrl_SRB"), + // Invalid ordering of language, script and country code. + ("USA_en", "en"), + ("sr_SRB_Cyrl", "sr_Cyrl"), + ("SRB_sr", "ar_SAU"), + ("SRB_sr_Cyrl", "bs_Cyrl"), + ("SRB_Cyrl_sr", "sr_Cyrl_SRB"), + ("Cyrl_sr", "sr_Cyrl_SRB"), + ("Cyrl_sr_SRB", "sr_Cyrl_SRB"), + ("Cyrl_SRB_sr", "sr_Cyrl_SRB"), + // Collation specifiers in the middle of locale. 
+ ("CI_en", "ceb"), + ("USA_CI_en", "UNICODE"), + ("en_CI_USA", "en_USA"), + ("CI_sr_Cyrl_SRB", "sr_Cyrl_SRB"), + ("sr_CI_Cyrl_SRB", "sr_Cyrl_SRB"), + ("sr_Cyrl_CI_SRB", "sr_Cyrl_SRB"), + ("CI_Cyrl_sr", "sr_Cyrl_SRB"), + ("Cyrl_CI_sr", "he_ISR"), + ("Cyrl_CI_sr_SRB", "sr_Cyrl_SRB"), + ("Cyrl_sr_CI_SRB", "sr_Cyrl_SRB"), + // no locale specified + ("_CI_AI", "af_CI_AI, am_CI_AI, ar_CI_AI"), + ("", "af, am, ar") + ).foreach { case (collationName, proposals) => { + val error = intercept[SparkException] { fetchCollation(collationName) } + assert(error.getErrorClass === "COLLATION_INVALID_NAME") + + assert(error.getMessageParameters.asScala === Map( + "collationName" -> collationName, "proposals" -> proposals)) + }} + } + + test("collations name normalization for ICU non-root localization") { + Seq( + ("en_USA", "en_USA"), + ("en_CS", "en"), + ("en_AS", "en"), + ("en_CS_AS", "en"), + ("en_AS_CS", "en"), + ("en_CI", "en_CI"), + ("en_AI", "en_AI"), + ("en_AI_CI", "en_CI_AI"), + ("en_CI_AI", "en_CI_AI"), + ("en_CS_AI", "en_AI"), + ("en_AI_CS", "en_AI"), + ("en_CI_AS", "en_CI"), + ("en_AS_CI", "en_CI"), + ("en_USA_AI_CI", "en_USA_CI_AI"), + // Randomized case. + ("EN_USA", "en_USA"), + ("SR_CYRL", "sr_Cyrl"), + ("sr_cyrl_srb", "sr_Cyrl_SRB"), + ("sR_cYRl_sRb", "sr_Cyrl_SRB") + ).foreach { + case (name, normalized) => + val col = fetchCollation(name) + assert(col.collationName == normalized) + } + } + + test("invalid collationId") { + val badCollationIds = Seq( + INDETERMINATE_COLLATION_ID, // Indeterminate collation. + 1 << 30, // User-defined collation range. + (1 << 30) | 1, // User-defined collation range. + (1 << 30) | (1 << 29), // User-defined collation range. + 1 << 1, // UTF8_BINARY mandatory zero bit 1 breach. + 1 << 2, // UTF8_BINARY mandatory zero bit 2 breach. + 1 << 3, // UTF8_BINARY mandatory zero bit 3 breach. + 1 << 4, // UTF8_BINARY mandatory zero bit 4 breach. + 1 << 5, // UTF8_BINARY mandatory zero bit 5 breach. + 1 << 6, // UTF8_BINARY mandatory zero bit 6 breach. + 1 << 7, // UTF8_BINARY mandatory zero bit 7 breach. + 1 << 8, // UTF8_BINARY mandatory zero bit 8 breach. + 1 << 9, // UTF8_BINARY mandatory zero bit 9 breach. + 1 << 10, // UTF8_BINARY mandatory zero bit 10 breach. + 1 << 11, // UTF8_BINARY mandatory zero bit 11 breach. + 1 << 12, // UTF8_BINARY mandatory zero bit 12 breach. + 1 << 13, // UTF8_BINARY mandatory zero bit 13 breach. + 1 << 14, // UTF8_BINARY mandatory zero bit 14 breach. + 1 << 15, // UTF8_BINARY mandatory zero bit 15 breach. + 1 << 16, // UTF8_BINARY mandatory zero bit 16 breach. + 1 << 17, // UTF8_BINARY mandatory zero bit 17 breach. + 1 << 18, // UTF8_BINARY mandatory zero bit 18 breach. + 1 << 19, // UTF8_BINARY mandatory zero bit 19 breach. + 1 << 20, // UTF8_BINARY mandatory zero bit 20 breach. + 1 << 23, // UTF8_BINARY mandatory zero bit 23 breach. + 1 << 24, // UTF8_BINARY mandatory zero bit 24 breach. + 1 << 25, // UTF8_BINARY mandatory zero bit 25 breach. + 1 << 26, // UTF8_BINARY mandatory zero bit 26 breach. + 1 << 27, // UTF8_BINARY mandatory zero bit 27 breach. + 1 << 28, // UTF8_BINARY mandatory zero bit 28 breach. + (1 << 29) | (1 << 12), // ICU mandatory zero bit 12 breach. + (1 << 29) | (1 << 13), // ICU mandatory zero bit 13 breach. + (1 << 29) | (1 << 14), // ICU mandatory zero bit 14 breach. + (1 << 29) | (1 << 15), // ICU mandatory zero bit 15 breach. + (1 << 29) | (1 << 18), // ICU mandatory zero bit 18 breach. + (1 << 29) | (1 << 19), // ICU mandatory zero bit 19 breach. 
+ (1 << 29) | (1 << 20), // ICU mandatory zero bit 20 breach. + (1 << 29) | (1 << 21), // ICU mandatory zero bit 21 breach. + (1 << 29) | (1 << 22), // ICU mandatory zero bit 22 breach. + (1 << 29) | (1 << 23), // ICU mandatory zero bit 23 breach. + (1 << 29) | (1 << 24), // ICU mandatory zero bit 24 breach. + (1 << 29) | (1 << 25), // ICU mandatory zero bit 25 breach. + (1 << 29) | (1 << 26), // ICU mandatory zero bit 26 breach. + (1 << 29) | (1 << 27), // ICU mandatory zero bit 27 breach. + (1 << 29) | (1 << 28), // ICU mandatory zero bit 28 breach. + (1 << 29) | 0xFFFF // ICU with invalid locale id. + ) + badCollationIds.foreach(collationId => { + // Assumptions about collation id will break and assert statement will fail. + intercept[AssertionError](fetchCollation(collationId)) + }) + } + + test("repeated and/or incompatible and/or misplaced specifiers in collation name") { + Seq( + ("UTF8_LCASE_LCASE", "UTF8_LCASE"), + ("UNICODE_CS_CS", "UNICODE_CS"), + ("UNICODE_CI_CI", "UNICODE_CI"), + ("UNICODE_CI_CS", "UNICODE_CS"), + ("UNICODE_CS_CI", "UNICODE_CS"), + ("UNICODE_AS_AS", "UNICODE_AS"), + ("UNICODE_AI_AI", "UNICODE_AI"), + ("UNICODE_AS_AI", "UNICODE_AS"), + ("UNICODE_AI_AS", "UNICODE_AS"), + ("UNICODE_AS_CS_AI", "UNICODE_AS_CS"), + ("UNICODE_CS_AI_CI", "UNICODE_CS_AI"), + ("UNICODE_CS_AS_CI_AI", "UNICODE_CS_AS"), + ("UNICODE__CS__AS", "UNICODE_AS"), + ("UNICODE-CS-AS", "UNICODE"), + ("UNICODECSAS", "UNICODE"), + ("_CS_AS_UNICODE", "UNICODE") + ).foreach { case (collationName, proposals) => + val error = intercept[SparkException] { + fetchCollation(collationName) + } + + assert(error.getErrorClass === "COLLATION_INVALID_NAME") + assert(error.getMessageParameters.asScala === Map( + "collationName" -> collationName, "proposals" -> proposals)) + } + } + + test("basic ICU collator checks") { + Seq( + CollationTestCase("UNICODE_CI", "a", "A", true), + CollationTestCase("UNICODE_CI", "a", "å", false), + CollationTestCase("UNICODE_CI", "a", "Å", false), + CollationTestCase("UNICODE_AI", "a", "A", false), + CollationTestCase("UNICODE_AI", "a", "å", true), + CollationTestCase("UNICODE_AI", "a", "Å", false), + CollationTestCase("UNICODE_CI_AI", "a", "A", true), + CollationTestCase("UNICODE_CI_AI", "a", "å", true), + CollationTestCase("UNICODE_CI_AI", "a", "Å", true) + ).foreach(testCase => { + val collation = fetchCollation(testCase.collationName) + assert(collation.equalsFunction(toUTF8(testCase.s1), toUTF8(testCase.s2)) == + testCase.expectedResult) + }) + Seq( + CollationTestCase("en", "a", "A", -1), + CollationTestCase("en_CI", "a", "A", 0), + CollationTestCase("en_AI", "a", "å", 0), + CollationTestCase("sv", "Kypper", "Köpfe", -1), + CollationTestCase("de", "Kypper", "Köpfe", 1) + ).foreach(testCase => { + val collation = fetchCollation(testCase.collationName) + val result = collation.comparator.compare(toUTF8(testCase.s1), toUTF8(testCase.s2)) + assert(Integer.signum(result) == testCase.expectedResult) + }) + } } diff --git a/common/utils/src/main/java/org/apache/spark/internal/SparkLogger.java b/common/utils/src/main/java/org/apache/spark/internal/SparkLogger.java new file mode 100644 index 0000000000000..8c210a4fab3c3 --- /dev/null +++ b/common/utils/src/main/java/org/apache/spark/internal/SparkLogger.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal; + +import java.util.HashMap; +import java.util.Map; +import java.util.function.Consumer; + +import org.apache.logging.log4j.CloseableThreadContext; +import org.apache.logging.log4j.message.MessageFactory; +import org.apache.logging.log4j.message.ParameterizedMessageFactory; +// checkstyle.off: RegexpSinglelineJava +import org.slf4j.Logger; +// checkstyle.on: RegexpSinglelineJava + +// checkstyle.off: RegexpSinglelineJava +/** + * Guidelines for the Structured Logging Framework - Java Logging + *
+ * + * Use the `org.apache.spark.internal.SparkLoggerFactory` to get the logger instance in Java code: + * Getting Logger Instance: + * Instead of using `org.slf4j.LoggerFactory`, use `org.apache.spark.internal.SparkLoggerFactory` + * to ensure structured logging. + *
+ * + * import org.apache.spark.internal.SparkLogger; + * import org.apache.spark.internal.SparkLoggerFactory; + * private static final SparkLogger logger = SparkLoggerFactory.getLogger(JavaUtils.class); + *
+ * + * Logging Messages with Variables: + * When logging messages with variables, wrap all the variables with `MDC`s and they will be + * automatically added to the Mapped Diagnostic Context (MDC). + *
+ * + * import org.apache.spark.internal.LogKeys; + * import org.apache.spark.internal.MDC; + * logger.error("Unable to delete file for partition {}", MDC.of(LogKeys.PARTITION_ID$.MODULE$, i)); + *
+ * + * Constant String Messages: + * For logging constant string messages, use the standard logging methods. + *
+ * + * logger.error("Failed to abort the writer after failing to write map output.", e); + *
+ * + * If you want to output logs in `java code` through the structured log framework, + * you can define `custom LogKey` and use it in `java` code as follows: + *
+ * + * // To add a `custom LogKey`, implement `LogKey` + * public static class CUSTOM_LOG_KEY implements LogKey { } + * import org.apache.spark.internal.MDC; + * logger.error("Unable to delete key {} for cache", MDC.of(CUSTOM_LOG_KEY, "key")); + */ +// checkstyle.on: RegexpSinglelineJava +public class SparkLogger { + + private static final MessageFactory MESSAGE_FACTORY = ParameterizedMessageFactory.INSTANCE; + private final Logger slf4jLogger; + + SparkLogger(Logger slf4jLogger) { + this.slf4jLogger = slf4jLogger; + } + + public boolean isErrorEnabled() { + return slf4jLogger.isErrorEnabled(); + } + + public void error(String msg) { + slf4jLogger.error(msg); + } + + public void error(String msg, Throwable throwable) { + slf4jLogger.error(msg, throwable); + } + + public void error(String msg, MDC... mdcs) { + if (mdcs == null || mdcs.length == 0) { + slf4jLogger.error(msg); + } else if (slf4jLogger.isErrorEnabled()) { + withLogContext(msg, mdcs, null, mt -> slf4jLogger.error(mt.message)); + } + } + + public void error(String msg, Throwable throwable, MDC... mdcs) { + if (mdcs == null || mdcs.length == 0) { + slf4jLogger.error(msg, throwable); + } else if (slf4jLogger.isErrorEnabled()) { + withLogContext(msg, mdcs, throwable, mt -> slf4jLogger.error(mt.message, mt.throwable)); + } + } + + public boolean isWarnEnabled() { + return slf4jLogger.isWarnEnabled(); + } + + public void warn(String msg) { + slf4jLogger.warn(msg); + } + + public void warn(String msg, Throwable throwable) { + slf4jLogger.warn(msg, throwable); + } + + public void warn(String msg, MDC... mdcs) { + if (mdcs == null || mdcs.length == 0) { + slf4jLogger.warn(msg); + } else if (slf4jLogger.isWarnEnabled()) { + withLogContext(msg, mdcs, null, mt -> slf4jLogger.warn(mt.message)); + } + } + + public void warn(String msg, Throwable throwable, MDC... mdcs) { + if (mdcs == null || mdcs.length == 0) { + slf4jLogger.warn(msg, throwable); + } else if (slf4jLogger.isWarnEnabled()) { + withLogContext(msg, mdcs, throwable, mt -> slf4jLogger.warn(mt.message, mt.throwable)); + } + } + + public boolean isInfoEnabled() { + return slf4jLogger.isInfoEnabled(); + } + + public void info(String msg) { + slf4jLogger.info(msg); + } + + public void info(String msg, Throwable throwable) { + slf4jLogger.info(msg, throwable); + } + + public void info(String msg, MDC... mdcs) { + if (mdcs == null || mdcs.length == 0) { + slf4jLogger.info(msg); + } else if (slf4jLogger.isInfoEnabled()) { + withLogContext(msg, mdcs, null, mt -> slf4jLogger.info(mt.message)); + } + } + + public void info(String msg, Throwable throwable, MDC... mdcs) { + if (mdcs == null || mdcs.length == 0) { + slf4jLogger.info(msg, throwable); + } else if (slf4jLogger.isInfoEnabled()) { + withLogContext(msg, mdcs, throwable, mt -> slf4jLogger.info(mt.message, mt.throwable)); + } + } + + public boolean isDebugEnabled() { + return slf4jLogger.isDebugEnabled(); + } + + public void debug(String msg) { + slf4jLogger.debug(msg); + } + + public void debug(String format, Object arg) { + slf4jLogger.debug(format, arg); + } + + public void debug(String format, Object arg1, Object arg2) { + slf4jLogger.debug(format, arg1, arg2); + } + + public void debug(String format, Object... 
arguments) { + slf4jLogger.debug(format, arguments); + } + + public void debug(String msg, Throwable throwable) { + slf4jLogger.debug(msg, throwable); + } + + public boolean isTraceEnabled() { + return slf4jLogger.isTraceEnabled(); + } + + public void trace(String msg) { + slf4jLogger.trace(msg); + } + + public void trace(String format, Object arg) { + slf4jLogger.trace(format, arg); + } + + public void trace(String format, Object arg1, Object arg2) { + slf4jLogger.trace(format, arg1, arg2); + } + + public void trace(String format, Object... arguments) { + slf4jLogger.trace(format, arguments); + } + + public void trace(String msg, Throwable throwable) { + slf4jLogger.trace(msg, throwable); + } + + private void withLogContext( + String pattern, + MDC[] mdcs, + Throwable throwable, + Consumer func) { + Map context = new HashMap<>(); + Object[] args = new Object[mdcs.length]; + for (int index = 0; index < mdcs.length; index++) { + MDC mdc = mdcs[index]; + String value = (mdc.value() != null) ? mdc.value().toString() : null; + if (Logging$.MODULE$.isStructuredLoggingEnabled()) { + context.put(mdc.key().name(), value); + } + args[index] = value; + } + MessageThrowable messageThrowable = MessageThrowable.of( + MESSAGE_FACTORY.newMessage(pattern, args).getFormattedMessage(), throwable); + try (CloseableThreadContext.Instance ignored = CloseableThreadContext.putAll(context)) { + func.accept(messageThrowable); + } + } + + private record MessageThrowable(String message, Throwable throwable) { + static MessageThrowable of(String message, Throwable throwable) { + return new MessageThrowable(message, throwable); + } + } + + public Logger getSlf4jLogger() { + return slf4jLogger; + } +} diff --git a/common/utils/src/main/java/org/apache/spark/internal/SparkLoggerFactory.java b/common/utils/src/main/java/org/apache/spark/internal/SparkLoggerFactory.java new file mode 100644 index 0000000000000..a59c007362419 --- /dev/null +++ b/common/utils/src/main/java/org/apache/spark/internal/SparkLoggerFactory.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.internal; + +// checkstyle.off: RegexpSinglelineJava +import org.slf4j.LoggerFactory; +// checkstyle.on: RegexpSinglelineJava + +public class SparkLoggerFactory { + + public static SparkLogger getLogger(String name) { + return new SparkLogger(LoggerFactory.getLogger(name)); + } + + public static SparkLogger getLogger(Class clazz) { + return new SparkLogger(LoggerFactory.getLogger(clazz)); + } +} diff --git a/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java b/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java index 8e1cc470e0ccf..90dddc2cb08c1 100644 --- a/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java +++ b/common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java @@ -29,15 +29,18 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.SystemUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * General utilities available in the network package. Many of these are sourced from Spark's * own Utils, just accessible within this package. */ public class JavaUtils { - private static final Logger logger = LoggerFactory.getLogger(JavaUtils.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(JavaUtils.class); /** * Define a default value for driver memory here since this value is referenced across the code @@ -112,7 +115,7 @@ public static void deleteRecursively(File file, FilenameFilter filter) throws IO return; } catch (IOException e) { logger.warn("Attempt to delete using native Unix OS command failed for path = {}. " + - "Falling back to Java IO way", file.getAbsolutePath(), e); + "Falling back to Java IO way", e, MDC.of(LogKeys.PATH$.MODULE$, file.getAbsolutePath())); } } @@ -228,6 +231,8 @@ private static boolean isSymlink(File file) throws IOException { Map.entry("pb", ByteUnit.PiB)); } + private static final Pattern TIME_STRING_PATTERN = Pattern.compile("(-?[0-9]+)([a-z]+)?"); + /** * Convert a passed time string (e.g. 50s, 100ms, or 250us) to a time count in the given unit. * The unit is also considered the default if the given string does not specify a unit. @@ -236,7 +241,7 @@ public static long timeStringAs(String str, TimeUnit unit) { String lower = str.toLowerCase(Locale.ROOT).trim(); try { - Matcher m = Pattern.compile("(-?[0-9]+)([a-z]+)?").matcher(lower); + Matcher m = TIME_STRING_PATTERN.matcher(lower); if (!m.matches()) { throw new NumberFormatException("Failed to parse time string: " + str); } @@ -276,6 +281,11 @@ public static long timeStringAsSec(String str) { return timeStringAs(str, TimeUnit.SECONDS); } + private static final Pattern BYTE_STRING_PATTERN = + Pattern.compile("([0-9]+)([a-z]+)?"); + private static final Pattern BYTE_STRING_FRACTION_PATTERN = + Pattern.compile("([0-9]+\\.[0-9]+)([a-z]+)?"); + /** * Convert a passed byte string (e.g. 50b, 100kb, or 250mb) to the given. If no suffix is * provided, a direct conversion to the provided unit is attempted. 
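Editor's note: the JavaUtils hunks above hoist the time- and byte-string regexes into static final Pattern fields so each call reuses the compiled pattern instead of invoking Pattern.compile on every parse. Below is a simplified, standalone sketch of the same refactor; the regex string matches the diff, but the suffix handling is reduced to a few units and is not the full JavaUtils logic.

```java
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Simplified sketch of the precompiled-pattern refactor: the regex is compiled once,
// stored in a static final field, and reused by every call.
public final class TimeStringSketch {
  private static final Pattern TIME_STRING_PATTERN = Pattern.compile("(-?[0-9]+)([a-z]+)?");

  static long timeStringAsSeconds(String str) {
    String lower = str.toLowerCase(Locale.ROOT).trim();
    Matcher m = TIME_STRING_PATTERN.matcher(lower);
    if (!m.matches()) {
      throw new NumberFormatException("Failed to parse time string: " + str);
    }
    long value = Long.parseLong(m.group(1));
    String suffix = m.group(2);
    // Reduced suffix table for illustration; the real method supports more units.
    TimeUnit unit;
    if (suffix == null || suffix.equals("s")) {
      unit = TimeUnit.SECONDS;
    } else if (suffix.equals("ms")) {
      unit = TimeUnit.MILLISECONDS;
    } else if (suffix.equals("min")) {
      unit = TimeUnit.MINUTES;
    } else {
      throw new NumberFormatException("Unrecognized time suffix in sketch: " + suffix);
    }
    return TimeUnit.SECONDS.convert(value, unit);
  }

  public static void main(String[] args) {
    System.out.println(timeStringAsSeconds("50s"));     // 50
    System.out.println(timeStringAsSeconds("2500ms"));  // 2
  }
}
```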
@@ -284,8 +294,8 @@ public static long byteStringAs(String str, ByteUnit unit) { String lower = str.toLowerCase(Locale.ROOT).trim(); try { - Matcher m = Pattern.compile("([0-9]+)([a-z]+)?").matcher(lower); - Matcher fractionMatcher = Pattern.compile("([0-9]+\\.[0-9]+)([a-z]+)?").matcher(lower); + Matcher m = BYTE_STRING_PATTERN.matcher(lower); + Matcher fractionMatcher = BYTE_STRING_FRACTION_PATTERN.matcher(lower); if (m.matches()) { long val = Long.parseLong(m.group(1)); @@ -396,7 +406,7 @@ public static File createDirectory(String root, String namePrefix) throws IOExce dir = new File(root, namePrefix + "-" + UUID.randomUUID()); Files.createDirectories(dir.toPath()); } catch (IOException | SecurityException e) { - logger.error("Failed to create directory " + dir, e); + logger.error("Failed to create directory {}", e, MDC.of(LogKeys.PATH$.MODULE$, dir)); dir = null; } } diff --git a/common/utils/src/main/resources/error/README.md b/common/utils/src/main/resources/error/README.md index e2f68a1af9f4a..575e2ebad35a3 100644 --- a/common/utils/src/main/resources/error/README.md +++ b/common/utils/src/main/resources/error/README.md @@ -16,9 +16,9 @@ The error state / SQLSTATE itself is comprised of two parts: 2. Error sub-class Acceptable values for these various error parts are defined in the following files: -* `error-classes.json` -* `error-states.json` -* `error-conditions.json` +* [`error-classes.json`](error-classes.json) +* [`error-states.json`](error-states.json) +* [`error-conditions.json`](error-conditions.json) The terms error class, state, and condition come from the SQL standard. @@ -34,6 +34,7 @@ The terms error class, state, and condition come from the SQL standard. * Error condition: `AS_OF_JOIN` * Error sub-condition: `TOLERANCE_IS_NON_NEGATIVE` * Error sub-condition: `TOLERANCE_IS_UNFOLDABLE` + * Error sub-condition: `UNSUPPORTED_DIRECTION` ### Inconsistent Use of the Term "Error Class" @@ -41,7 +42,7 @@ Unfortunately, we have historically used the term "error class" inconsistently t Fixing this will require renaming `SparkException.errorClass` to `SparkException.errorCondition` and making similar changes to `ErrorClassesJsonReader` and other parts of the codebase. We will address this in [SPARK-47429]. Until that is complete, we will have to live with the fact that a string like `DATATYPE_MISSING_SIZE` is called an "error condition" in our user-facing documentation but an "error class" in the code. -For more details, please see [SPARK-46810][SPARK-46810]. +For more details, please see [SPARK-46810]. [SPARK-46810]: https://issues.apache.org/jira/browse/SPARK-46810 [SPARK-47429]: https://issues.apache.org/jira/browse/SPARK-47429 @@ -51,9 +52,9 @@ For more details, please see [SPARK-46810][SPARK-46810]. 1. Check if the error is an internal error. Internal errors are bugs in the code that we do not expect users to encounter; this does not include unsupported operations. If true, use the error condition `INTERNAL_ERROR` and skip to step 4. -2. Check if an appropriate error condition already exists in `error-conditions.json`. +2. Check if an appropriate error condition already exists in [`error-conditions.json`](error-conditions.json). If true, use the error condition and skip to step 4. -3. Add a new condition to `error-conditions.json`. If the new condition requires a new error state, add the new error state to `error-states.json`. +3. Add a new condition to [`error-conditions.json`](error-conditions.json). 
If the new condition requires a new error state, add the new error state to [`error-states.json`](error-states.json). 4. Check if the exception type already extends `SparkThrowable`. If true, skip to step 6. 5. Mix `SparkThrowable` into the exception. @@ -165,7 +166,7 @@ For example: The existing `XXKD0` is used for an internal analyzer error. #### ANSI/ISO standard -The SQLSTATEs in `error-states.json` are collated from: +The SQLSTATEs in [`error-states.json`](error-states.json) are collated from: - SQL2016 - DB2 zOS/LUW - PostgreSQL 15 diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e1c8c881f98f3..9a3011635daa3 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -90,6 +90,11 @@ "message" : [ "The input argument `tolerance` must be a constant." ] + }, + "UNSUPPORTED_DIRECTION" : { + "message" : [ + "Unsupported as-of join direction ''. Supported as-of join direction include: ." + ] } }, "sqlState" : "42604" @@ -101,6 +106,13 @@ ], "sqlState" : "22KD3" }, + "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE" : { + "message" : [ + "Cannot call the SQL function because the Avro data source is not loaded.", + "Please restart your job or session with the 'spark-avro' package loaded, such as by using the --packages argument on the command line, and then retry your query or command again." + ], + "sqlState" : "22KD3" + }, "BATCH_METADATA_NOT_FOUND" : { "message" : [ "Unable to find batch ." @@ -119,12 +131,24 @@ ], "sqlState" : "42KDE" }, + "CANNOT_ALTER_COLLATION_BUCKET_COLUMN" : { + "message" : [ + "ALTER TABLE (ALTER|CHANGE) COLUMN cannot change collation of type/subtypes of bucket columns, but found the bucket column in the table ." + ], + "sqlState" : "428FR" + }, "CANNOT_ALTER_PARTITION_COLUMN" : { "message" : [ "ALTER TABLE (ALTER|CHANGE) COLUMN is not supported for partition columns, but found the partition column in the table ." ], "sqlState" : "428FR" }, + "CANNOT_ASSIGN_EVENT_TIME_COLUMN_WITHOUT_WATERMARK" : { + "message" : [ + "Watermark needs to be defined to reassign event time column. Failed to find watermark definition in the streaming query." + ], + "sqlState" : "42611" + }, "CANNOT_CAST_DATATYPE" : { "message" : [ "Cannot cast to ." @@ -212,6 +236,11 @@ "Error reading delta file of : does not exist." ] }, + "CANNOT_READ_MISSING_SNAPSHOT_FILE" : { + "message" : [ + "Error reading snapshot file of : does not exist." + ] + }, "CANNOT_READ_SNAPSHOT_FILE_KEY_SIZE" : { "message" : [ "Error reading snapshot file of : key size cannot be ." @@ -227,6 +256,11 @@ "Error reading streaming state file of does not exist. If the stream job is restarted with a new or updated state operation, please create a new checkpoint location or clear the existing checkpoint location." ] }, + "SNAPSHOT_PARTITION_ID_NOT_FOUND" : { + "message" : [ + "Partition id not found for state of operator at ." + ] + }, "UNCATEGORIZED" : { "message" : [ "" @@ -463,7 +497,13 @@ }, "COLLATION_INVALID_NAME" : { "message" : [ - "The value does not represent a correct collation name. Suggested valid collation name: []." + "The value does not represent a correct collation name. Suggested valid collation names: []." + ], + "sqlState" : "42704" + }, + "COLLATION_INVALID_PROVIDER" : { + "message" : [ + "The value does not represent a correct collation provider. Supported providers are: []." 
], "sqlState" : "42704" }, @@ -736,6 +776,11 @@ "Input to the function cannot contain elements of the \"MAP\" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"." ] }, + "HASH_VARIANT_TYPE" : { + "message" : [ + "Input to the function cannot contain elements of the \"VARIANT\" type yet." + ] + }, "INPUT_SIZE_NOT_ONE" : { "message" : [ "Length of should be 1." @@ -753,7 +798,7 @@ }, "INVALID_JSON_SCHEMA" : { "message" : [ - "Input schema must be a struct, an array or a map." + "Input schema must be a struct, an array, a map or a variant." ] }, "INVALID_MAP_KEY_TYPE" : { @@ -1036,7 +1081,7 @@ }, "DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT" : { "message" : [ - "Call to function is invalid because it includes multiple argument assignments to the same parameter name ." + "Call to routine is invalid because it includes multiple argument assignments to the same parameter name ." ], "subClass" : { "BOTH_POSITIONAL_AND_NAMED" : { @@ -1052,6 +1097,14 @@ }, "sqlState" : "4274K" }, + "EMITTING_ROWS_OLDER_THAN_WATERMARK_NOT_ALLOWED" : { + "message" : [ + "Previous node emitted a row with eventTime= which is older than current_watermark_value=", + "This can lead to correctness issues in the stateful operators downstream in the execution pipeline.", + "Please correct the operator logic to emit rows after current global watermark value." + ], + "sqlState" : "42815" + }, "EMPTY_JSON_FIELD_VALUE" : { "message" : [ "Failed to parse an empty string for data type ." @@ -1224,6 +1277,11 @@ "List namespaces." ] }, + "LOAD_TABLE" : { + "message" : [ + "Load the table ." + ] + }, "NAMESPACE_EXISTS" : { "message" : [ "Check that the namespace exists." @@ -1313,7 +1371,20 @@ ], "sqlState" : "2203G" }, - "FIELDS_ALREADY_EXISTS" : { + "FAILED_TO_PARSE_TOO_COMPLEX" : { + "message" : [ + "The statement, including potential SQL functions and referenced views, was too complex to parse.", + "To mitigate this error divide the statement into multiple, less complex chunks." + ], + "sqlState" : "54001" + }, + "FEATURE_NOT_ENABLED" : { + "message" : [ + "The feature is not enabled. Consider setting the config to to enable this capability." + ], + "sqlState" : "56038" + }, + "FIELD_ALREADY_EXISTS" : { "message" : [ "Cannot column, because already exists in ." ], @@ -1883,7 +1954,7 @@ "subClass" : { "DEFAULT_COLLATION" : { "message" : [ - "Cannot resolve the given default collation. Did you mean ''?" + "Cannot resolve the given default collation. Suggested valid collation names: ['']?" ] }, "TIME_ZONE" : { @@ -1983,6 +2054,11 @@ "Delimiter cannot be empty string." ] }, + "NULL_VALUE" : { + "message" : [ + "Delimiter cannot be null." + ] + }, "SINGLE_BACKSLASH" : { "message" : [ "Single backslash is prohibited. It has special meaning as beginning of an escape sequence. To get the backslash character, pass a string with two backslashes as the delimiter." @@ -2304,12 +2380,24 @@ }, "sqlState" : "42K0K" }, + "INVALID_JOIN_TYPE_FOR_JOINWITH" : { + "message" : [ + "Invalid join type in joinWith: ." + ], + "sqlState" : "42613" + }, "INVALID_JSON_DATA_TYPE" : { "message" : [ "Failed to convert the JSON string '' to a data type. Please enter a valid data type." ], "sqlState" : "2203G" }, + "INVALID_JSON_DATA_TYPE_FOR_COLLATIONS" : { + "message" : [ + "Collations can only be applied to string types, but the JSON data type is ." 
+ ], + "sqlState" : "2203G" + }, "INVALID_JSON_ROOT_FIELD" : { "message" : [ "Cannot convert JSON root field to target Spark type." @@ -2621,6 +2709,12 @@ ], "sqlState" : "42000" }, + "INVALID_SINGLE_VARIANT_COLUMN" : { + "message" : [ + "The `singleVariantColumn` option cannot be used if there is also a user specified schema." + ], + "sqlState" : "42613" + }, "INVALID_SQL_ARG" : { "message" : [ "The argument of `sql()` is invalid. Consider to replace it either by a SQL literal or by collection constructor functions such as `map()`, `array()`, `struct()`." @@ -2637,9 +2731,9 @@ "ANALYZE TABLE(S) ... COMPUTE STATISTICS ... must be either NOSCAN or empty." ] }, - "CREATE_FUNC_WITH_IF_NOT_EXISTS_AND_REPLACE" : { + "CREATE_ROUTINE_WITH_IF_NOT_EXISTS_AND_REPLACE" : { "message" : [ - "CREATE FUNCTION with both IF NOT EXISTS and REPLACE is not allowed." + "Cannot create a routine with both IF NOT EXISTS and REPLACE specified." ] }, "CREATE_TEMP_FUNC_WITH_DATABASE" : { @@ -2825,6 +2919,12 @@ ], "sqlState" : "42000" }, + "INVALID_UTF8_STRING" : { + "message" : [ + "Invalid UTF8 byte sequence found in string: ." + ], + "sqlState" : "22029" + }, "INVALID_VARIABLE_TYPE_FOR_QUERY_EXECUTE_IMMEDIATE" : { "message" : [ "Variable type must be string type but got ." @@ -2922,6 +3022,12 @@ ], "sqlState" : "42710" }, + "MALFORMED_CHARACTER_CODING" : { + "message" : [ + "Invalid value found when performing with " + ], + "sqlState" : "22000" + }, "MALFORMED_CSV_RECORD" : { "message" : [ "Malformed CSV record: " @@ -3154,6 +3260,12 @@ ], "sqlState" : "42809" }, + "NOT_NULL_ASSERT_VIOLATION" : { + "message" : [ + "NULL value appeared in non-nullable field: If the schema is inferred from a Scala tuple/case class, or a Java bean, please try to use scala.Option[_] or other nullable types (such as java.lang.Integer instead of int/scala.Int)." + ], + "sqlState" : "42000" + }, "NOT_NULL_CONSTRAINT_VIOLATION" : { "message" : [ "Assigning a NULL is not allowed here." @@ -3256,6 +3368,12 @@ ], "sqlState" : "42000" }, + "NULL_DATA_SOURCE_OPTION" : { + "message" : [ + "Data source read/write option

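For context on the conditions added above: a condition defined in `error-conditions.json` is raised from Scala through `SparkThrowable` implementations such as `SparkException`. A hedged sketch, assuming the `SparkException(errorClass, messageParameters, cause)` constructor; the condition and parameter names below are hypothetical placeholders, not entries from this patch:

```scala
import org.apache.spark.SparkException

// Hypothetical condition and parameter names, for illustration only.
def failOnUnsupportedDirection(direction: String): Nothing = {
  throw new SparkException(
    errorClass = "MY_FEATURE.UNSUPPORTED_DIRECTION", // would be defined in error-conditions.json
    messageParameters = Map("direction" -> direction),
    cause = null)
}
```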
+ * + * `LogKey`s serve as identifiers for mapped diagnostic contexts (MDC) within logs. + * Follow these guidelines when adding a new LogKey:
+ * <ul>
+ *   <li> Define all structured logging keys in `LogKey.scala`, and sort them alphabetically for ease of search. </li>
+ *   <li> Use `UPPER_SNAKE_CASE` for key names. </li>
+ *   <li> Key names should be both simple and broad, yet include specific identifiers like `STAGE_ID`, `TASK_ID`, and `JOB_ID` when needed for clarity. For instance, use `MAX_ATTEMPTS` as a general key instead of creating separate keys for each scenario such as `EXECUTOR_STATE_SYNC_MAX_ATTEMPTS` and `MAX_TASK_FAILURES`. This balances simplicity with the detail needed for effective logging. </li>
+ *   <li> Use abbreviations in names if they are widely understood, such as `APP_ID` for APPLICATION_ID, and `K8S` for KUBERNETES. </li>
+ *   <li> For time-related keys, use milliseconds as the unit of time. </li>
+ * </ul>
*/ -object LogKey extends Enumeration { - val ACCUMULATOR_ID = Value - val ANALYSIS_ERROR = Value - val APP_DESC = Value - val APP_ID = Value - val APP_STATE = Value - val BATCH_ID = Value - val BLOCK_ID = Value - val BLOCK_MANAGER_ID = Value - val BROADCAST_ID = Value - val BUCKET = Value - val BYTECODE_SIZE = Value - val CACHE_AUTO_REMOVED_SIZE = Value - val CACHE_UNTIL_HIGHEST_CONSUMED_SIZE = Value - val CACHE_UNTIL_LAST_PRODUCED_SIZE = Value - val CATEGORICAL_FEATURES = Value - val CLASS_LOADER = Value - val CLASS_NAME = Value - val CLUSTER_ID = Value - val CODEC_LEVEL = Value - val CODEC_NAME = Value - val COLUMN_DATA_TYPE_SOURCE = Value - val COLUMN_DATA_TYPE_TARGET = Value - val COLUMN_DEFAULT_VALUE = Value - val COLUMN_NAME = Value - val COMMAND = Value - val COMMAND_OUTPUT = Value - val COMPONENT = Value - val CONFIG = Value - val CONFIG2 = Value - val CONFIG3 = Value - val CONFIG4 = Value - val CONFIG5 = Value - val CONSUMER = Value - val CONTAINER = Value - val CONTAINER_ID = Value - val COUNT = Value - val CSV_HEADER_COLUMN_NAME = Value - val CSV_HEADER_COLUMN_NAMES = Value - val CSV_HEADER_LENGTH = Value - val CSV_SCHEMA_FIELD_NAME = Value - val CSV_SCHEMA_FIELD_NAMES = Value - val CSV_SOURCE = Value - val DATA = Value - val DATABASE_NAME = Value - val DATAFRAME_CACHE_ENTRY = Value - val DATAFRAME_ID = Value - val DESCRIPTION = Value - val DRIVER_ID = Value - val DROPPED_PARTITIONS = Value - val DURATION = Value - val END_POINT = Value - val ENGINE = Value - val ERROR = Value - val EVENT_LOOP = Value - val EVENT_QUEUE = Value - val EXECUTE_INFO = Value - val EXECUTE_KEY = Value - val EXECUTOR_ENV_REGEX = Value - val EXECUTOR_ID = Value - val EXECUTOR_IDS = Value - val EXECUTOR_STATE = Value - val EXIT_CODE = Value - val EXPRESSION_TERMS = Value - val FAILURES = Value - val FALLBACK_VERSION = Value - val FIELD_NAME = Value - val FILE_FORMAT = Value - val FILE_FORMAT2 = Value - val FROM_OFFSET = Value - val FUNCTION_NAME = Value - val FUNCTION_PARAMETER = Value - val GROUP_ID = Value - val HADOOP_VERSION = Value - val HISTORY_DIR = Value - val HIVE_OPERATION_STATE = Value - val HIVE_OPERATION_TYPE = Value - val HOST = Value - val HOST_PORT = Value - val INDEX = Value - val INFERENCE_MODE = Value - val INITIAL_CAPACITY = Value - val INTERVAL = Value - val JOB_ID = Value - val JOIN_CONDITION = Value - val JOIN_CONDITION_SUB_EXPRESSION = Value - val KAFKA_PULLS_COUNT = Value - val KAFKA_RECORDS_PULLED_COUNT = Value - val KEY = Value - val LAST_ACCESS_TIME = Value - val LEARNING_RATE = Value - val LINE = Value - val LINE_NUM = Value - val LISTENER = Value - val LOAD_FACTOR = Value - val LOG_TYPE = Value - val MASTER_URL = Value - val MAX_ATTEMPTS = Value - val MAX_CACHE_UNTIL_HIGHEST_CONSUMED_SIZE = Value - val MAX_CACHE_UNTIL_LAST_PRODUCED_SIZE = Value - val MAX_CAPACITY = Value - val MAX_CATEGORIES = Value - val MAX_EXECUTOR_FAILURES = Value - val MAX_SIZE = Value - val MERGE_DIR_NAME = Value - val MESSAGE = Value - val METHOD_NAME = Value - val MIN_SIZE = Value - val NEW_VALUE = Value - val NUM_COLUMNS = Value - val NUM_ITERATIONS = Value - val OBJECT_ID = Value - val OFFSET = Value - val OFFSETS = Value - val OLD_BLOCK_MANAGER_ID = Value - val OLD_VALUE = Value - val OPTIMIZER_CLASS_NAME = Value - val OP_ID = Value - val OP_TYPE = Value - val PARSE_MODE = Value - val PARTITION_ID = Value - val PARTITION_SPECIFICATION = Value - val PARTITION_SPECS = Value - val PATH = Value - val PATHS = Value - val POD_ID = Value - val POD_NAME = Value - val POD_NAMESPACE = Value - val POD_PHASE 
= Value - val POLICY = Value - val PORT = Value - val PRODUCER_ID = Value - val QUERY_CACHE_VALUE = Value - val QUERY_HINT = Value - val QUERY_ID = Value - val QUERY_PLAN = Value - val QUERY_PLAN_COMPARISON = Value - val QUERY_PLAN_LENGTH_ACTUAL = Value - val QUERY_PLAN_LENGTH_MAX = Value - val RANGE = Value - val RDD_ID = Value - val REASON = Value - val REATTACHABLE = Value - val RECEIVED_BLOCK_INFO = Value - val REDUCE_ID = Value - val RELATION_NAME = Value - val REMAINING_PARTITIONS = Value - val RESOURCE_NAME = Value - val RETRY_COUNT = Value - val RETRY_INTERVAL = Value - val RULE_BATCH_NAME = Value - val RULE_NAME = Value - val RULE_NUMBER_OF_RUNS = Value - val RUN_ID = Value - val SCHEMA = Value - val SCHEMA2 = Value - val SERVICE_NAME = Value - val SESSION_HOLD_INFO = Value - val SESSION_ID = Value - val SESSION_KEY = Value - val SHARD_ID = Value - val SHUFFLE_BLOCK_INFO = Value - val SHUFFLE_ID = Value - val SHUFFLE_MERGE_ID = Value - val SIZE = Value - val SLEEP_TIME = Value - val SQL_TEXT = Value - val STAGE_ID = Value - val STATEMENT_ID = Value - val STATUS = Value - val STREAM_ID = Value - val STREAM_NAME = Value - val SUBMISSION_ID = Value - val SUBSAMPLING_RATE = Value - val TABLE_NAME = Value - val TASK_ATTEMPT_ID = Value - val TASK_ID = Value - val TASK_NAME = Value - val TASK_SET_NAME = Value - val TASK_STATE = Value - val THREAD = Value - val THREAD_NAME = Value - val TID = Value - val TIME = Value - val TIMEOUT = Value - val TIME_UNITS = Value - val TIP = Value - val TOPIC = Value - val TOPIC_PARTITION = Value - val TOPIC_PARTITIONS = Value - val TOPIC_PARTITION_OFFSET = Value - val TOPIC_PARTITION_OFFSET_RANGE = Value - val TOTAL_EFFECTIVE_TIME = Value - val TOTAL_RECORDS_READ = Value - val TOTAL_SIZE = Value - val TOTAL_TIME = Value - val TOTAL_TIME_READ = Value - val UNSUPPORTED_EXPRESSION = Value - val UNSUPPORTED_HINT_REASON = Value - val UNTIL_OFFSET = Value - val URI = Value - val USER_ID = Value - val USER_NAME = Value - val WAIT_RESULT_TIME = Value - val WAIT_SEND_TIME = Value - val WAIT_TIME = Value - val WATERMARK_CONSTRAINT = Value - val WORKER_URL = Value - val XSD_PATH = Value +trait LogKey { + private lazy val _name: String = getClass.getSimpleName.stripSuffix("$").toLowerCase(Locale.ROOT) + def name: String = _name +} - type LogKey = Value +/** + * Various keys used for mapped diagnostic contexts(MDC) in logging. All structured logging keys + * should be defined here for standardization. 
+ */ +private[spark] object LogKeys { + case object ACCUMULATOR_ID extends LogKey + case object ACL_ENABLED extends LogKey + case object ACTUAL_NUM_FILES extends LogKey + case object ACTUAL_PARTITION_COLUMN extends LogKey + case object ADDED_JARS extends LogKey + case object ADMIN_ACLS extends LogKey + case object ADMIN_ACL_GROUPS extends LogKey + case object ADVISORY_TARGET_SIZE extends LogKey + case object AGGREGATE_FUNCTIONS extends LogKey + case object ALIGNED_FROM_TIME extends LogKey + case object ALIGNED_TO_TIME extends LogKey + case object ALPHA extends LogKey + case object ANALYSIS_ERROR extends LogKey + case object APP_ATTEMPT_ID extends LogKey + case object APP_ATTEMPT_SHUFFLE_MERGE_ID extends LogKey + case object APP_DESC extends LogKey + case object APP_EXECUTOR_ID extends LogKey + case object APP_ID extends LogKey + case object APP_NAME extends LogKey + case object APP_STATE extends LogKey + case object ARCHIVE_NAME extends LogKey + case object ARGS extends LogKey + case object ARTIFACTS extends LogKey + case object ARTIFACT_ID extends LogKey + case object ATTRIBUTE_MAP extends LogKey + case object AUTH_ENABLED extends LogKey + case object AVG_BATCH_PROC_TIME extends LogKey + case object BACKUP_FILE extends LogKey + case object BARRIER_EPOCH extends LogKey + case object BARRIER_ID extends LogKey + case object BATCH_ID extends LogKey + case object BATCH_NAME extends LogKey + case object BATCH_TIMESTAMP extends LogKey + case object BATCH_WRITE extends LogKey + case object BIND_ADDRESS extends LogKey + case object BLOCK_ID extends LogKey + case object BLOCK_IDS extends LogKey + case object BLOCK_MANAGER_ID extends LogKey + case object BLOCK_MANAGER_IDS extends LogKey + case object BLOCK_TYPE extends LogKey + case object BOOT extends LogKey + case object BOOTSTRAP_TIME extends LogKey + case object BROADCAST extends LogKey + case object BROADCAST_ID extends LogKey + case object BROADCAST_OUTPUT_STATUS_SIZE extends LogKey + case object BUCKET extends LogKey + case object BYTECODE_SIZE extends LogKey + case object BYTE_BUFFER extends LogKey + case object BYTE_SIZE extends LogKey + case object CACHED_TABLE_PARTITION_METADATA_SIZE extends LogKey + case object CACHE_AUTO_REMOVED_SIZE extends LogKey + case object CACHE_UNTIL_HIGHEST_CONSUMED_SIZE extends LogKey + case object CACHE_UNTIL_LAST_PRODUCED_SIZE extends LogKey + case object CALL_SITE_LONG_FORM extends LogKey + case object CALL_SITE_SHORT_FORM extends LogKey + case object CANCEL_FUTURE_JOBS extends LogKey + case object CATALOG_NAME extends LogKey + case object CATEGORICAL_FEATURES extends LogKey + case object CHECKPOINT_FILE extends LogKey + case object CHECKPOINT_INTERVAL extends LogKey + case object CHECKPOINT_LOCATION extends LogKey + case object CHECKPOINT_PATH extends LogKey + case object CHECKPOINT_ROOT extends LogKey + case object CHECKPOINT_TIME extends LogKey + case object CHOSEN_WATERMARK extends LogKey + case object CLASSIFIER extends LogKey + case object CLASS_LOADER extends LogKey + case object CLASS_NAME extends LogKey + case object CLASS_PATH extends LogKey + case object CLASS_PATHS extends LogKey + case object CLAUSES extends LogKey + case object CLEANUP_LOCAL_DIRS extends LogKey + case object CLUSTER_CENTROIDS extends LogKey + case object CLUSTER_ID extends LogKey + case object CLUSTER_LABEL extends LogKey + case object CLUSTER_LEVEL extends LogKey + case object CLUSTER_WEIGHT extends LogKey + case object CODEC_LEVEL extends LogKey + case object CODEC_NAME extends LogKey + case object CODEGEN_STAGE_ID extends 
LogKey + case object COLUMN_DATA_TYPE_SOURCE extends LogKey + case object COLUMN_DATA_TYPE_TARGET extends LogKey + case object COLUMN_DEFAULT_VALUE extends LogKey + case object COLUMN_NAME extends LogKey + case object COMMAND extends LogKey + case object COMMAND_OUTPUT extends LogKey + case object COMMITTED_VERSION extends LogKey + case object COMPACT_INTERVAL extends LogKey + case object COMPONENT extends LogKey + case object COMPUTE extends LogKey + case object CONFIG extends LogKey + case object CONFIG2 extends LogKey + case object CONFIG3 extends LogKey + case object CONFIG4 extends LogKey + case object CONFIG5 extends LogKey + case object CONFIG_DEPRECATION_MESSAGE extends LogKey + case object CONFIG_KEY_UPDATED extends LogKey + case object CONFIG_VERSION extends LogKey + case object CONSUMER extends LogKey + case object CONTAINER extends LogKey + case object CONTAINER_ID extends LogKey + case object CONTAINER_STATE extends LogKey + case object CONTEXT extends LogKey + case object COST extends LogKey + case object COUNT extends LogKey + case object CREATED_POOL_NAME extends LogKey + case object CREATION_SITE extends LogKey + case object CREDENTIALS_RENEWAL_INTERVAL_RATIO extends LogKey + case object CROSS_VALIDATION_METRIC extends LogKey + case object CROSS_VALIDATION_METRICS extends LogKey + case object CSV_HEADER_COLUMN_NAME extends LogKey + case object CSV_HEADER_COLUMN_NAMES extends LogKey + case object CSV_HEADER_LENGTH extends LogKey + case object CSV_SCHEMA_FIELD_NAME extends LogKey + case object CSV_SCHEMA_FIELD_NAMES extends LogKey + case object CSV_SOURCE extends LogKey + case object CURRENT_BATCH_ID extends LogKey + case object CURRENT_DISK_SIZE extends LogKey + case object CURRENT_FILE extends LogKey + case object CURRENT_MEMORY_SIZE extends LogKey + case object CURRENT_PATH extends LogKey + case object CURRENT_TIME extends LogKey + case object DATA extends LogKey + case object DATABASE_NAME extends LogKey + case object DATAFRAME_CACHE_ENTRY extends LogKey + case object DATAFRAME_ID extends LogKey + case object DATA_FILE extends LogKey + case object DATA_SOURCE extends LogKey + case object DATA_SOURCES extends LogKey + case object DEFAULT_COMPACT_INTERVAL extends LogKey + case object DEFAULT_ISOLATION_LEVEL extends LogKey + case object DEFAULT_NAME extends LogKey + case object DEFAULT_VALUE extends LogKey + case object DELAY extends LogKey + case object DELEGATE extends LogKey + case object DELTA extends LogKey + case object DEPRECATED_KEY extends LogKey + case object DERIVATIVE extends LogKey + case object DESCRIPTION extends LogKey + case object DESIRED_NUM_PARTITIONS extends LogKey + case object DESIRED_TREE_DEPTH extends LogKey + case object DESTINATION_PATH extends LogKey + case object DFS_FILE extends LogKey + case object DIFF_DELTA extends LogKey + case object DIVISIBLE_CLUSTER_INDICES_SIZE extends LogKey + case object DRIVER_ID extends LogKey + case object DRIVER_MEMORY_SIZE extends LogKey + case object DRIVER_STATE extends LogKey + case object DROPPED_PARTITIONS extends LogKey + case object DSTREAM extends LogKey + case object DURATION extends LogKey + case object EARLIEST_LOADED_VERSION extends LogKey + case object EFFECTIVE_STORAGE_LEVEL extends LogKey + case object ELAPSED_TIME extends LogKey + case object ENCODING extends LogKey + case object END_INDEX extends LogKey + case object END_POINT extends LogKey + case object END_VERSION extends LogKey + case object ENGINE extends LogKey + case object EPOCH extends LogKey + case object ERROR extends LogKey + case 
object ESTIMATOR_PARAM_MAP extends LogKey + case object EVALUATED_FILTERS extends LogKey + case object EVENT extends LogKey + case object EVENT_LOG_DESTINATION extends LogKey + case object EVENT_LOOP extends LogKey + case object EVENT_NAME extends LogKey + case object EVENT_QUEUE extends LogKey + case object EXCEPTION extends LogKey + case object EXECUTE_INFO extends LogKey + case object EXECUTE_KEY extends LogKey + case object EXECUTION_MEMORY_SIZE extends LogKey + case object EXECUTION_PLAN_LEAVES extends LogKey + case object EXECUTOR_BACKEND extends LogKey + case object EXECUTOR_ENVS extends LogKey + case object EXECUTOR_ENV_REGEX extends LogKey + case object EXECUTOR_ID extends LogKey + case object EXECUTOR_IDS extends LogKey + case object EXECUTOR_LAUNCH_COMMANDS extends LogKey + case object EXECUTOR_MEMORY_SIZE extends LogKey + case object EXECUTOR_RESOURCES extends LogKey + case object EXECUTOR_SHUFFLE_INFO extends LogKey + case object EXECUTOR_STATE extends LogKey + case object EXECUTOR_TIMEOUT extends LogKey + case object EXECUTOR_USER_CLASS_PATH_FIRST extends LogKey + case object EXEC_AMOUNT extends LogKey + case object EXISTING_FILE extends LogKey + case object EXISTING_PATH extends LogKey + case object EXIT_CODE extends LogKey + case object EXPECTED_NUM_FILES extends LogKey + case object EXPECTED_PARTITION_COLUMN extends LogKey + case object EXPIRY_TIMESTAMP extends LogKey + case object EXPR extends LogKey + case object EXPR_TERMS extends LogKey + case object EXTENDED_EXPLAIN_GENERATOR extends LogKey + case object FAILED_STAGE extends LogKey + case object FAILED_STAGE_NAME extends LogKey + case object FAILURES extends LogKey + case object FALLBACK_VERSION extends LogKey + case object FEATURE_COLUMN extends LogKey + case object FEATURE_DIMENSION extends LogKey + case object FEATURE_NAME extends LogKey + case object FETCH_SIZE extends LogKey + case object FIELD_NAME extends LogKey + case object FILES extends LogKey + case object FILE_ABSOLUTE_PATH extends LogKey + case object FILE_END_OFFSET extends LogKey + case object FILE_FORMAT extends LogKey + case object FILE_FORMAT2 extends LogKey + case object FILE_LENGTH_XATTR extends LogKey + case object FILE_MODIFICATION_TIME extends LogKey + case object FILE_NAME extends LogKey + case object FILE_NAME2 extends LogKey + case object FILE_NAME3 extends LogKey + case object FILE_START_OFFSET extends LogKey + case object FILE_SYSTEM extends LogKey + case object FILE_VERSION extends LogKey + case object FILTER extends LogKey + case object FINAL_CONTEXT extends LogKey + case object FINAL_OUTPUT_PATH extends LogKey + case object FINAL_PATH extends LogKey + case object FINISH_TRIGGER_DURATION extends LogKey + case object FREE_MEMORY_SIZE extends LogKey + case object FROM_OFFSET extends LogKey + case object FROM_TIME extends LogKey + case object FS_DATA_OUTPUT_STREAM extends LogKey + case object FUNCTION_NAME extends LogKey + case object FUNCTION_PARAM extends LogKey + case object GLOBAL_INIT_FILE extends LogKey + case object GLOBAL_WATERMARK extends LogKey + case object GROUP_BY_EXPRS extends LogKey + case object GROUP_ID extends LogKey + case object HADOOP_VERSION extends LogKey + case object HASH_JOIN_KEYS extends LogKey + case object HASH_MAP_SIZE extends LogKey + case object HEARTBEAT extends LogKey + case object HEARTBEAT_INTERVAL extends LogKey + case object HISTORY_DIR extends LogKey + case object HIVE_CLIENT_VERSION extends LogKey + case object HIVE_METASTORE_VERSION extends LogKey + case object HIVE_OPERATION_STATE extends LogKey + 
case object HIVE_OPERATION_TYPE extends LogKey + case object HOST extends LogKey + case object HOSTS extends LogKey + case object HOST_LOCAL_BLOCKS_SIZE extends LogKey + case object HOST_PORT extends LogKey + case object HOST_PORT2 extends LogKey + case object HUGE_METHOD_LIMIT extends LogKey + case object HYBRID_STORE_DISK_BACKEND extends LogKey + case object IDENTIFIER extends LogKey + case object INCOMPATIBLE_TYPES extends LogKey + case object INDEX extends LogKey + case object INDEX_FILE extends LogKey + case object INDEX_NAME extends LogKey + case object INFERENCE_MODE extends LogKey + case object INIT extends LogKey + case object INITIAL_CAPACITY extends LogKey + case object INITIAL_HEARTBEAT_INTERVAL extends LogKey + case object INIT_MODE extends LogKey + case object INPUT extends LogKey + case object INPUT_SPLIT extends LogKey + case object INTEGRAL extends LogKey + case object INTERVAL extends LogKey + case object ISOLATION_LEVEL extends LogKey + case object ISSUE_DATE extends LogKey + case object IS_NETWORK_REQUEST_DONE extends LogKey + case object JAR_ENTRY extends LogKey + case object JAR_MESSAGE extends LogKey + case object JAR_URL extends LogKey + case object JAVA_VERSION extends LogKey + case object JAVA_VM_NAME extends LogKey + case object JOB_ID extends LogKey + case object JOIN_CONDITION extends LogKey + case object JOIN_CONDITION_SUB_EXPR extends LogKey + case object JOIN_TYPE extends LogKey + case object K8S_CONTEXT extends LogKey + case object KEY extends LogKey + case object KEY2 extends LogKey + case object KEYTAB extends LogKey + case object KEYTAB_FILE extends LogKey + case object KILL_EXECUTORS extends LogKey + case object LABEL_COLUMN extends LogKey + case object LARGEST_CLUSTER_INDEX extends LogKey + case object LAST_ACCESS_TIME extends LogKey + case object LAST_VALID_TIME extends LogKey + case object LATEST_BATCH_ID extends LogKey + case object LATEST_COMMITTED_BATCH_ID extends LogKey + case object LATEST_SHUFFLE_MERGE_ID extends LogKey + case object LEARNING_RATE extends LogKey + case object LEFT_EXPR extends LogKey + case object LEFT_LOGICAL_PLAN_STATS_SIZE_IN_BYTES extends LogKey + case object LINE extends LogKey + case object LINE_NUM extends LogKey + case object LISTENER extends LogKey + case object LOADED_VERSION extends LogKey + case object LOAD_FACTOR extends LogKey + case object LOAD_TIME extends LogKey + case object LOCALE extends LogKey + case object LOCAL_BLOCKS_SIZE extends LogKey + case object LOCAL_SCRATCH_DIR extends LogKey + case object LOCATION extends LogKey + case object LOGICAL_PLAN extends LogKey + case object LOGICAL_PLAN_COLUMNS extends LogKey + case object LOGICAL_PLAN_LEAVES extends LogKey + case object LOG_ID extends LogKey + case object LOG_LEVEL extends LogKey + case object LOG_OFFSET extends LogKey + case object LOG_TYPE extends LogKey + case object LOWER_BOUND extends LogKey + case object MALFORMATTED_STRING extends LogKey + case object MAP_ID extends LogKey + case object MASTER_URL extends LogKey + case object MAX_ATTEMPTS extends LogKey + case object MAX_CACHE_UNTIL_HIGHEST_CONSUMED_SIZE extends LogKey + case object MAX_CACHE_UNTIL_LAST_PRODUCED_SIZE extends LogKey + case object MAX_CAPACITY extends LogKey + case object MAX_CATEGORIES extends LogKey + case object MAX_EXECUTOR_FAILURES extends LogKey + case object MAX_FILE_VERSION extends LogKey + case object MAX_JVM_METHOD_PARAMS_LENGTH extends LogKey + case object MAX_MEMORY_SIZE extends LogKey + case object MAX_METHOD_CODE_SIZE extends LogKey + case object MAX_NUM_BINS extends 
LogKey + case object MAX_NUM_CHUNKS extends LogKey + case object MAX_NUM_FILES extends LogKey + case object MAX_NUM_LOG_POLICY extends LogKey + case object MAX_NUM_PARTITIONS extends LogKey + case object MAX_NUM_POSSIBLE_BINS extends LogKey + case object MAX_NUM_ROWS_IN_MEMORY_BUFFER extends LogKey + case object MAX_SERVICE_NAME_LENGTH extends LogKey + case object MAX_SIZE extends LogKey + case object MAX_SLOTS extends LogKey + case object MAX_SPLIT_BYTES extends LogKey + case object MAX_TABLE_PARTITION_METADATA_SIZE extends LogKey + case object MEMORY_CONSUMER extends LogKey + case object MEMORY_POOL_NAME extends LogKey + case object MEMORY_SIZE extends LogKey + case object MEMORY_THRESHOLD_SIZE extends LogKey + case object MERGE_DIR_NAME extends LogKey + case object MESSAGE extends LogKey + case object METADATA extends LogKey + case object METADATA_DIRECTORY extends LogKey + case object METADATA_JSON extends LogKey + case object META_FILE extends LogKey + case object METHOD_NAME extends LogKey + case object METHOD_PARAM_TYPES extends LogKey + case object METRICS_JSON extends LogKey + case object METRIC_NAME extends LogKey + case object MINI_BATCH_FRACTION extends LogKey + case object MIN_COMPACTION_BATCH_ID extends LogKey + case object MIN_NUM_FREQUENT_PATTERN extends LogKey + case object MIN_POINT_PER_CLUSTER extends LogKey + case object MIN_RATE extends LogKey + case object MIN_SHARE extends LogKey + case object MIN_SIZE extends LogKey + case object MIN_TIME extends LogKey + case object MIN_VERSION_NUM extends LogKey + case object MISSING_PARENT_STAGES extends LogKey + case object MODEL_WEIGHTS extends LogKey + case object MODULE_NAME extends LogKey + case object NAMESPACE extends LogKey + case object NETWORK_IF extends LogKey + case object NEW_FEATURE_COLUMN_NAME extends LogKey + case object NEW_LABEL_COLUMN_NAME extends LogKey + case object NEW_PATH extends LogKey + case object NEW_RDD_ID extends LogKey + case object NEW_STATE extends LogKey + case object NEW_VALUE extends LogKey + case object NEXT_RENEWAL_TIME extends LogKey + case object NODES extends LogKey + case object NODE_LOCATION extends LogKey + case object NON_BUILT_IN_CONNECTORS extends LogKey + case object NORM extends LogKey + case object NUM_ADDED_PARTITIONS extends LogKey + case object NUM_APPS extends LogKey + case object NUM_ATTEMPT extends LogKey + case object NUM_BIN extends LogKey + case object NUM_BLOCKS extends LogKey + case object NUM_BROADCAST_BLOCK extends LogKey + case object NUM_BYTES extends LogKey + case object NUM_BYTES_CURRENT extends LogKey + case object NUM_BYTES_EVICTED extends LogKey + case object NUM_BYTES_MAX extends LogKey + case object NUM_BYTES_TO_FREE extends LogKey + case object NUM_BYTES_TO_WARN extends LogKey + case object NUM_BYTES_USED extends LogKey + case object NUM_CATEGORIES extends LogKey + case object NUM_CHECKSUM_FILE extends LogKey + case object NUM_CHUNKS extends LogKey + case object NUM_CLASSES extends LogKey + case object NUM_COEFFICIENTS extends LogKey + case object NUM_COLUMNS extends LogKey + case object NUM_CONCURRENT_WRITER extends LogKey + case object NUM_CORES extends LogKey + case object NUM_DATA_FILE extends LogKey + case object NUM_DATA_FILES extends LogKey + case object NUM_DECOMMISSIONED extends LogKey + case object NUM_DRIVERS extends LogKey + case object NUM_DROPPED_PARTITIONS extends LogKey + case object NUM_EFFECTIVE_RULE_OF_RUNS extends LogKey + case object NUM_ELEMENTS_SPILL_THRESHOLD extends LogKey + case object NUM_EVENTS extends LogKey + case object 
NUM_EXAMPLES extends LogKey + case object NUM_EXECUTORS extends LogKey + case object NUM_EXECUTORS_EXITED extends LogKey + case object NUM_EXECUTORS_KILLED extends LogKey + case object NUM_EXECUTOR_CORES extends LogKey + case object NUM_EXECUTOR_CORES_REMAINING extends LogKey + case object NUM_EXECUTOR_CORES_TOTAL extends LogKey + case object NUM_EXECUTOR_DESIRED extends LogKey + case object NUM_EXECUTOR_LAUNCH extends LogKey + case object NUM_EXECUTOR_TARGET extends LogKey + case object NUM_FAILURES extends LogKey + case object NUM_FEATURES extends LogKey + case object NUM_FILES extends LogKey + case object NUM_FILES_COPIED extends LogKey + case object NUM_FILES_FAILED_TO_DELETE extends LogKey + case object NUM_FILES_REUSED extends LogKey + case object NUM_FREQUENT_ITEMS extends LogKey + case object NUM_HOST_LOCAL_BLOCKS extends LogKey + case object NUM_INDEX_FILE extends LogKey + case object NUM_INDEX_FILES extends LogKey + case object NUM_ITERATIONS extends LogKey + case object NUM_KAFKA_PULLS extends LogKey + case object NUM_KAFKA_RECORDS_PULLED extends LogKey + case object NUM_LEADING_SINGULAR_VALUES extends LogKey + case object NUM_LEFT_PARTITION_VALUES extends LogKey + case object NUM_LOADED_ENTRIES extends LogKey + case object NUM_LOCAL_BLOCKS extends LogKey + case object NUM_LOCAL_DIRS extends LogKey + case object NUM_LOCAL_FREQUENT_PATTERN extends LogKey + case object NUM_MERGERS extends LogKey + case object NUM_MERGER_LOCATIONS extends LogKey + case object NUM_META_FILES extends LogKey + case object NUM_NODES extends LogKey + case object NUM_PARTITIONS extends LogKey + case object NUM_PARTITIONS2 extends LogKey + case object NUM_PATHS extends LogKey + case object NUM_PEERS extends LogKey + case object NUM_PEERS_REPLICATED_TO extends LogKey + case object NUM_PEERS_TO_REPLICATE_TO extends LogKey + case object NUM_PENDING_LAUNCH_TASKS extends LogKey + case object NUM_POD extends LogKey + case object NUM_POD_SHARED_SLOT extends LogKey + case object NUM_POD_TARGET extends LogKey + case object NUM_POINT extends LogKey + case object NUM_PREFIXES extends LogKey + case object NUM_PRUNED extends LogKey + case object NUM_PUSH_MERGED_LOCAL_BLOCKS extends LogKey + case object NUM_RECEIVERS extends LogKey + case object NUM_RECORDS_READ extends LogKey + case object NUM_RELEASED_LOCKS extends LogKey + case object NUM_REMAINED extends LogKey + case object NUM_REMOTE_BLOCKS extends LogKey + case object NUM_REMOVED_WORKERS extends LogKey + case object NUM_REPLICAS extends LogKey + case object NUM_REQUESTS extends LogKey + case object NUM_REQUEST_SYNC_TASK extends LogKey + case object NUM_RESOURCE_SLOTS extends LogKey + case object NUM_RETRIES extends LogKey + case object NUM_RETRY extends LogKey + case object NUM_RIGHT_PARTITION_VALUES extends LogKey + case object NUM_ROWS extends LogKey + case object NUM_RULE_OF_RUNS extends LogKey + case object NUM_SEQUENCES extends LogKey + case object NUM_SLOTS extends LogKey + case object NUM_SPILL_INFOS extends LogKey + case object NUM_SPILL_WRITERS extends LogKey + case object NUM_SUB_DIRS extends LogKey + case object NUM_SUCCESSFUL_TASKS extends LogKey + case object NUM_TASKS extends LogKey + case object NUM_TASK_CPUS extends LogKey + case object NUM_TRAIN_WORD extends LogKey + case object NUM_UNFINISHED_DECOMMISSIONED extends LogKey + case object NUM_VERSIONS_RETAIN extends LogKey + case object NUM_WEIGHTED_EXAMPLES extends LogKey + case object NUM_WORKERS extends LogKey + case object OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD extends LogKey + case object 
OBJECT_ID extends LogKey + case object OFFSET extends LogKey + case object OFFSETS extends LogKey + case object OFFSET_SEQUENCE_METADATA extends LogKey + case object OLD_BLOCK_MANAGER_ID extends LogKey + case object OLD_GENERATION_GC extends LogKey + case object OLD_VALUE extends LogKey + case object OPEN_COST_IN_BYTES extends LogKey + case object OPERATION_HANDLE extends LogKey + case object OPERATION_HANDLE_ID extends LogKey + case object OPERATION_ID extends LogKey + case object OPTIMIZED_PLAN_COLUMNS extends LogKey + case object OPTIMIZER_CLASS_NAME extends LogKey + case object OPTIONS extends LogKey + case object OP_ID extends LogKey + case object OP_TYPE extends LogKey + case object ORIGINAL_DISK_SIZE extends LogKey + case object ORIGINAL_MEMORY_SIZE extends LogKey + case object OS_ARCH extends LogKey + case object OS_NAME extends LogKey + case object OS_VERSION extends LogKey + case object OUTPUT extends LogKey + case object OVERHEAD_MEMORY_SIZE extends LogKey + case object PAGE_SIZE extends LogKey + case object PARENT_STAGES extends LogKey + case object PARSE_MODE extends LogKey + case object PARTITIONED_FILE_READER extends LogKey + case object PARTITIONER extends LogKey + case object PARTITION_ID extends LogKey + case object PARTITION_IDS extends LogKey + case object PARTITION_SIZE extends LogKey + case object PARTITION_SPECIFICATION extends LogKey + case object PARTITION_SPECS extends LogKey + case object PATH extends LogKey + case object PATHS extends LogKey + case object PEER extends LogKey + case object PERCENT extends LogKey + case object PIPELINE_STAGE_UID extends LogKey + case object PLUGIN_NAME extends LogKey + case object POD_ID extends LogKey + case object POD_NAME extends LogKey + case object POD_NAMESPACE extends LogKey + case object POD_PHASE extends LogKey + case object POD_STATE extends LogKey + case object POINT_OF_CENTER extends LogKey + case object POLICY extends LogKey + case object POOL_NAME extends LogKey + case object PORT extends LogKey + case object PORT2 extends LogKey + case object POST_SCAN_FILTERS extends LogKey + case object PREDICATE extends LogKey + case object PREDICATES extends LogKey + case object PREFERRED_SERVICE_NAME extends LogKey + case object PREFIX extends LogKey + case object PRETTY_ID_STRING extends LogKey + case object PRINCIPAL extends LogKey + case object PROCESS extends LogKey + case object PROCESSING_TIME extends LogKey + case object PRODUCER_ID extends LogKey + case object PROPERTY_NAME extends LogKey + case object PROPORTIONAL extends LogKey + case object PROTOCOL_VERSION extends LogKey + case object PROVIDER extends LogKey + case object PUSHED_FILTERS extends LogKey + case object PUSH_MERGED_LOCAL_BLOCKS_SIZE extends LogKey + case object PVC_METADATA_NAME extends LogKey + case object PYTHON_EXEC extends LogKey + case object PYTHON_PACKAGES extends LogKey + case object PYTHON_VERSION extends LogKey + case object PYTHON_WORKER_MODULE extends LogKey + case object PYTHON_WORKER_RESPONSE extends LogKey + case object QUANTILES extends LogKey + case object QUERY_CACHE_VALUE extends LogKey + case object QUERY_HINT extends LogKey + case object QUERY_ID extends LogKey + case object QUERY_PLAN extends LogKey + case object QUERY_PLAN_COMPARISON extends LogKey + case object QUERY_PLAN_LENGTH_ACTUAL extends LogKey + case object QUERY_PLAN_LENGTH_MAX extends LogKey + case object QUERY_RUN_ID extends LogKey + case object RANGE extends LogKey + case object RATE_LIMIT extends LogKey + case object RATIO extends LogKey + case object RDD extends 
LogKey + case object RDD_CHECKPOINT_DIR extends LogKey + case object RDD_DEBUG_STRING extends LogKey + case object RDD_DESCRIPTION extends LogKey + case object RDD_ID extends LogKey + case object READ_LIMIT extends LogKey + case object REASON extends LogKey + case object REATTACHABLE extends LogKey + case object RECEIVED_BLOCK_INFO extends LogKey + case object RECEIVED_BLOCK_TRACKER_LOG_EVENT extends LogKey + case object RECEIVER_ID extends LogKey + case object RECEIVER_IDS extends LogKey + case object RECORDS extends LogKey + case object RECOVERY_STATE extends LogKey + case object REDACTED_STATEMENT extends LogKey + case object REDUCE_ID extends LogKey + case object REGEX extends LogKey + case object REGISTERED_EXECUTOR_FILE extends LogKey + case object REGISTER_MERGE_RESULTS extends LogKey + case object RELATION_NAME extends LogKey + case object RELATION_OUTPUT extends LogKey + case object RELATIVE_TOLERANCE extends LogKey + case object RELEASED_LOCKS extends LogKey + case object REMAINING_PARTITIONS extends LogKey + case object REMOTE_ADDRESS extends LogKey + case object REMOTE_BLOCKS_SIZE extends LogKey + case object REMOVE_FROM_MASTER extends LogKey + case object REPORT_DETAILS extends LogKey + case object REQUESTER_SIZE extends LogKey + case object REQUEST_EXECUTORS extends LogKey + case object REQUEST_ID extends LogKey + case object RESOURCE extends LogKey + case object RESOURCE_NAME extends LogKey + case object RESOURCE_PROFILE_ID extends LogKey + case object RESOURCE_PROFILE_IDS extends LogKey + case object RESOURCE_PROFILE_TO_TOTAL_EXECS extends LogKey + case object RESPONSE_BODY_SIZE extends LogKey + case object RESULT extends LogKey + case object RESULT_SIZE_BYTES extends LogKey + case object RESULT_SIZE_BYTES_MAX extends LogKey + case object RETRY_INTERVAL extends LogKey + case object RETRY_WAIT_TIME extends LogKey + case object RIGHT_EXPR extends LogKey + case object RIGHT_LOGICAL_PLAN_STATS_SIZE_IN_BYTES extends LogKey + case object RMSE extends LogKey + case object ROCKS_DB_LOG_LEVEL extends LogKey + case object ROCKS_DB_LOG_MESSAGE extends LogKey + case object RPC_ADDRESS extends LogKey + case object RPC_ENDPOINT_REF extends LogKey + case object RPC_MESSAGE_CAPACITY extends LogKey + case object RULE_NAME extends LogKey + case object RUN_ID extends LogKey + case object SCALA_VERSION extends LogKey + case object SCALING_DOWN_RATIO extends LogKey + case object SCALING_UP_RATIO extends LogKey + case object SCHEDULER_POOL_NAME extends LogKey + case object SCHEDULING_MODE extends LogKey + case object SCHEMA extends LogKey + case object SCHEMA2 extends LogKey + case object SERVER_NAME extends LogKey + case object SERVICE_NAME extends LogKey + case object SERVLET_CONTEXT_HANDLER_PATH extends LogKey + case object SESSION_HANDLE extends LogKey + case object SESSION_HOLD_INFO extends LogKey + case object SESSION_ID extends LogKey + case object SESSION_KEY extends LogKey + case object SET_CLIENT_INFO_REQUEST extends LogKey + case object SHARD_ID extends LogKey + case object SHORTER_SERVICE_NAME extends LogKey + case object SHORT_USER_NAME extends LogKey + case object SHUFFLE_BLOCK_INFO extends LogKey + case object SHUFFLE_DB_BACKEND_KEY extends LogKey + case object SHUFFLE_DB_BACKEND_NAME extends LogKey + case object SHUFFLE_ID extends LogKey + case object SHUFFLE_MERGE_ID extends LogKey + case object SHUFFLE_MERGE_RECOVERY_FILE extends LogKey + case object SHUFFLE_SERVICE_CONF_OVERLAY_URL extends LogKey + case object SHUFFLE_SERVICE_METRICS_NAMESPACE extends LogKey + case object 
SHUFFLE_SERVICE_NAME extends LogKey + case object SIGMAS_LENGTH extends LogKey + case object SIGNAL extends LogKey + case object SINK extends LogKey + case object SIZE extends LogKey + case object SLEEP_TIME extends LogKey + case object SLIDE_DURATION extends LogKey + case object SMALLEST_CLUSTER_INDEX extends LogKey + case object SNAPSHOT_VERSION extends LogKey + case object SOCKET_ADDRESS extends LogKey + case object SOURCE extends LogKey + case object SOURCE_PATH extends LogKey + case object SPARK_BRANCH extends LogKey + case object SPARK_BUILD_DATE extends LogKey + case object SPARK_BUILD_USER extends LogKey + case object SPARK_DATA_STREAM extends LogKey + case object SPARK_PLAN_ID extends LogKey + case object SPARK_REPO_URL extends LogKey + case object SPARK_REVISION extends LogKey + case object SPARK_VERSION extends LogKey + case object SPILL_TIMES extends LogKey + case object SQL_TEXT extends LogKey + case object SRC_PATH extends LogKey + case object STAGE extends LogKey + case object STAGES extends LogKey + case object STAGE_ATTEMPT extends LogKey + case object STAGE_ID extends LogKey + case object STAGE_NAME extends LogKey + case object START_INDEX extends LogKey + case object STATEMENT_ID extends LogKey + case object STATE_STORE_ID extends LogKey + case object STATE_STORE_PROVIDER extends LogKey + case object STATE_STORE_VERSION extends LogKey + case object STATS extends LogKey + case object STATUS extends LogKey + case object STDERR extends LogKey + case object STOP_SITE_SHORT_FORM extends LogKey + case object STORAGE_LEVEL extends LogKey + case object STORAGE_LEVEL_DESERIALIZED extends LogKey + case object STORAGE_LEVEL_REPLICATION extends LogKey + case object STORAGE_MEMORY_SIZE extends LogKey + case object STORE_ID extends LogKey + case object STREAMING_CONTEXT extends LogKey + case object STREAMING_DATA_SOURCE_DESCRIPTION extends LogKey + case object STREAMING_DATA_SOURCE_NAME extends LogKey + case object STREAMING_OFFSETS_END extends LogKey + case object STREAMING_OFFSETS_START extends LogKey + case object STREAMING_QUERY_PROGRESS extends LogKey + case object STREAMING_SOURCE extends LogKey + case object STREAMING_TABLE extends LogKey + case object STREAMING_WRITE extends LogKey + case object STREAM_CHUNK_ID extends LogKey + case object STREAM_ID extends LogKey + case object STREAM_NAME extends LogKey + case object SUBMISSION_ID extends LogKey + case object SUBSAMPLING_RATE extends LogKey + case object SUB_QUERY extends LogKey + case object TABLE_NAME extends LogKey + case object TABLE_TYPE extends LogKey + case object TABLE_TYPES extends LogKey + case object TAG extends LogKey + case object TARGET_NUM_EXECUTOR extends LogKey + case object TARGET_NUM_EXECUTOR_DELTA extends LogKey + case object TARGET_PATH extends LogKey + case object TARGET_SIZE extends LogKey + case object TASK_ATTEMPT_ID extends LogKey + case object TASK_ID extends LogKey + case object TASK_INDEX extends LogKey + case object TASK_LOCALITY extends LogKey + case object TASK_NAME extends LogKey + case object TASK_REQUIREMENTS extends LogKey + case object TASK_RESOURCES extends LogKey + case object TASK_RESOURCE_ASSIGNMENTS extends LogKey + case object TASK_SET_MANAGER extends LogKey + case object TASK_SET_NAME extends LogKey + case object TASK_STATE extends LogKey + case object TEMP_FILE extends LogKey + case object TEMP_OUTPUT_PATH extends LogKey + case object TEMP_PATH extends LogKey + case object TEST_SIZE extends LogKey + case object THREAD extends LogKey + case object THREAD_ID extends LogKey + case 
object THREAD_NAME extends LogKey + case object THREAD_POOL_KEEPALIVE_TIME extends LogKey + case object THREAD_POOL_SIZE extends LogKey + case object THREAD_POOL_WAIT_QUEUE_SIZE extends LogKey + case object THRESHOLD extends LogKey + case object THRESH_TIME extends LogKey + case object TIME extends LogKey + case object TIMEOUT extends LogKey + case object TIMER extends LogKey + case object TIMESTAMP extends LogKey + case object TIME_UNITS extends LogKey + case object TIP extends LogKey + case object TOKEN extends LogKey + case object TOKEN_KIND extends LogKey + case object TOKEN_REGEX extends LogKey + case object TOKEN_RENEWER extends LogKey + case object TOPIC extends LogKey + case object TOPIC_PARTITION extends LogKey + case object TOPIC_PARTITIONS extends LogKey + case object TOPIC_PARTITION_OFFSET extends LogKey + case object TOPIC_PARTITION_OFFSET_RANGE extends LogKey + case object TOTAL extends LogKey + case object TOTAL_EFFECTIVE_TIME extends LogKey + case object TOTAL_SIZE extends LogKey + case object TOTAL_TIME extends LogKey + case object TOTAL_TIME_READ extends LogKey + case object TO_TIME extends LogKey + case object TRAINING_SIZE extends LogKey + case object TRAIN_VALIDATION_SPLIT_METRIC extends LogKey + case object TRAIN_VALIDATION_SPLIT_METRICS extends LogKey + case object TRANSFER_TYPE extends LogKey + case object TREE_NODE extends LogKey + case object TRIGGER_INTERVAL extends LogKey + case object UI_FILTER extends LogKey + case object UI_FILTER_PARAMS extends LogKey + case object UI_PROXY_BASE extends LogKey + case object UNKNOWN_PARAM extends LogKey + case object UNSUPPORTED_EXPR extends LogKey + case object UNSUPPORTED_HINT_REASON extends LogKey + case object UNTIL_OFFSET extends LogKey + case object UPPER_BOUND extends LogKey + case object URI extends LogKey + case object URIS extends LogKey + case object URL extends LogKey + case object URL2 extends LogKey + case object URLS extends LogKey + case object USER_ID extends LogKey + case object USER_NAME extends LogKey + case object UUID extends LogKey + case object VALUE extends LogKey + case object VERSION_NUM extends LogKey + case object VIRTUAL_CORES extends LogKey + case object VOCAB_SIZE extends LogKey + case object WAIT_RESULT_TIME extends LogKey + case object WAIT_SEND_TIME extends LogKey + case object WATERMARK_CONSTRAINT extends LogKey + case object WEB_URL extends LogKey + case object WEIGHT extends LogKey + case object WORKER extends LogKey + case object WORKER_HOST extends LogKey + case object WORKER_ID extends LogKey + case object WORKER_PORT extends LogKey + case object WORKER_URL extends LogKey + case object WRITE_AHEAD_LOG_INFO extends LogKey + case object WRITE_AHEAD_LOG_RECORD_HANDLE extends LogKey + case object WRITE_JOB_UUID extends LogKey + case object XML_SCHEDULING_MODE extends LogKey + case object XSD_PATH extends LogKey + case object YOUNG_GENERATION_GC extends LogKey + case object ZERO_TIME extends LogKey } diff --git a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala index 607f3637e6418..8eea9b44da26d 100644 --- a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala @@ -17,8 +17,6 @@ package org.apache.spark.internal -import java.util.Locale - import scala.jdk.CollectionConverters._ import org.apache.logging.log4j.{CloseableThreadContext, Level, LogManager} @@ -29,9 +27,46 @@ import 
org.apache.logging.log4j.core.filter.AbstractFilter import org.slf4j.{Logger, LoggerFactory} import org.apache.spark.internal.Logging.SparkShellLoggingFilter -import org.apache.spark.internal.LogKey.LogKey import org.apache.spark.util.SparkClassUtils +/** + * Guidelines for the Structured Logging Framework - Scala Logging + *
+ * + * Use the `org.apache.spark.internal.Logging` trait for logging in Scala code: + * Logging Messages with Variables: + * When logging a message with variables, wrap all the variables with `MDC`s and they will be + * automatically added to the Mapped Diagnostic Context (MDC). + * This allows for structured logging and better log analysis. + *
+ * + * logInfo(log"Trying to recover app: ${MDC(LogKeys.APP_ID, app.id)}") + *
+ * + * Constant String Messages: + * If you are logging a constant string message, use the log methods that accept a constant + * string. + *
+ * + * logInfo("StateStore stopped") + *
+ * + * Exceptions: + * To ensure logs are compatible with Spark SQL and log analysis tools, avoid + * `Exception.printStackTrace()`. Use `logError`, `logWarning`, and `logInfo` methods from + * the `Logging` trait to log exceptions, maintaining structured and parsable logs. + *
+ * + * If you want to output logs in `scala code` through the structured log framework, + * you can define `custom LogKey` and use it in `scala` code as follows: + *
+ * + * // To add a `custom LogKey`, implement `LogKey` + * case object CUSTOM_LOG_KEY extends LogKey + * import org.apache.spark.internal.MDC; + * logInfo(log"${MDC(CUSTOM_LOG_KEY, "key")}") + */ + /** * Mapped Diagnostic Context (MDC) that will be used in log messages. * The values of the MDC will be inline in the log message, while the key-value pairs will be @@ -42,6 +77,10 @@ case class MDC(key: LogKey, value: Any) { "the class of value cannot be MessageWithContext") } +object MDC { + def of(key: LogKey, value: Any): MDC = MDC(key, value) +} + /** * Wrapper class for log messages that include a logging context. * This is used as the return type of the string interpolator `LogStringContext`. @@ -60,9 +99,11 @@ case class MessageWithContext(message: String, context: java.util.HashMap[String * Companion class for lazy evaluation of the MessageWithContext instance. */ class LogEntry(messageWithContext: => MessageWithContext) { - def message: String = messageWithContext.message + private lazy val cachedMessageWithContext: MessageWithContext = messageWithContext + + def message: String = cachedMessageWithContext.message - def context: java.util.HashMap[String, String] = messageWithContext.context + def context: java.util.HashMap[String, String] = cachedMessageWithContext.context } /** @@ -104,18 +145,18 @@ trait Logging { implicit class LogStringContext(val sc: StringContext) { def log(args: MDC*): MessageWithContext = { val processedParts = sc.parts.iterator - val sb = new StringBuilder(processedParts.next()) + val sb = new StringBuilder(StringContext.processEscapes(processedParts.next())) val context = new java.util.HashMap[String, String]() args.foreach { mdc => val value = if (mdc.value != null) mdc.value.toString else null sb.append(value) if (Logging.isStructuredLoggingEnabled) { - context.put(mdc.key.toString.toLowerCase(Locale.ROOT), value) + context.put(mdc.key.name, value) } if (processedParts.hasNext) { - sb.append(processedParts.next()) + sb.append(StringContext.processEscapes(processedParts.next())) } } diff --git a/common/utils/src/main/scala/org/apache/spark/internal/README.md b/common/utils/src/main/scala/org/apache/spark/internal/README.md deleted file mode 100644 index 81c542fd3d9c6..0000000000000 --- a/common/utils/src/main/scala/org/apache/spark/internal/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Guidelines for the Structured Logging Framework - -## LogKey - -LogKeys serve as identifiers for mapped diagnostic contexts (MDC) within logs. Follow these guidelines when adding new LogKeys: -* Define all structured logging keys in `LogKey.scala`, and sort them alphabetically for ease of search. -* Use `UPPER_SNAKE_CASE` for key names. -* Key names should be both simple and broad, yet include specific identifiers like `STAGE_ID`, `TASK_ID`, and `JOB_ID` when needed for clarity. For instance, use `MAX_ATTEMPTS` as a general key instead of creating separate keys for each scenario such as `EXECUTOR_STATE_SYNC_MAX_ATTEMPTS` and `MAX_TASK_FAILURES`. This balances simplicity with the detail needed for effective logging. -* Use abbreviations in names if they are widely understood, such as `APP_ID` for APPLICATION_ID, and `K8S` for KUBERNETES. -* For time-related keys, use milliseconds as the unit of time. - -## Exceptions - -To ensure logs are compatible with Spark SQL and log analysis tools, avoid `Exception.printStackTrace()`. Use `logError`, `logWarning`, and `logInfo` methods from the `Logging` trait to log exceptions, maintaining structured and parsable logs. 
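To make the Scaladoc above concrete, here is a hedged sketch of the Scala-side pattern this patch migrates to: wrap variables in `MDC` with a predefined `LogKeys` entry, declare a custom `LogKey` where needed, and pass throwables to the log method rather than printing stack traces. The class and custom key are hypothetical, and the sketch assumes code living inside Spark that can use the internal `Logging` trait; the same pattern appears in the `SparkErrorUtils` and `SparkFileUtils` changes below.

```scala
import org.apache.spark.internal.{Logging, LogKey, LogKeys, MDC}

// Hypothetical custom key; any object extending LogKey gets a lowercase MDC name.
case object RECOVERY_REASON extends LogKey

// Hypothetical component, for illustration only.
class RecoveryService extends Logging {
  def recover(appId: String): Unit = {
    // Variables wrapped in MDC are added to the structured `context` map.
    logInfo(log"Trying to recover app: ${MDC(LogKeys.APP_ID, appId)}")
    try {
      // ... recovery work ...
    } catch {
      case e: Exception =>
        // Pass the throwable to the log method instead of calling e.printStackTrace().
        logWarning(log"Recovery failed: ${MDC(RECOVERY_REASON, e.getMessage)}", e)
    }
  }
}
```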
diff --git a/common/utils/src/main/scala/org/apache/spark/util/LogUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/LogUtils.scala new file mode 100644 index 0000000000000..5a798ffad3a92 --- /dev/null +++ b/common/utils/src/main/scala/org/apache/spark/util/LogUtils.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +import org.apache.spark.annotation.DeveloperApi + +/** + * :: : DeveloperApi :: + * Utils for querying Spark logs with Spark SQL. + * + * @since 4.0.0 + */ +@DeveloperApi +object LogUtils { + /** + * Schema for structured Spark logs. + * Example usage: + * val logDf = spark.read.schema(LOG_SCHEMA).json("path/to/logs") + */ + val LOG_SCHEMA: String = """ + |ts TIMESTAMP, + |level STRING, + |msg STRING, + |context map, + |exception STRUCT< + | class STRING, + | msg STRING, + | stacktrace ARRAY> + |>, + |logger STRING""".stripMargin +} diff --git a/common/utils/src/main/scala/org/apache/spark/util/MavenUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/MavenUtils.scala index 08291859a32cc..546981c8b5435 100644 --- a/common/utils/src/main/scala/org/apache/spark/util/MavenUtils.scala +++ b/common/utils/src/main/scala/org/apache/spark/util/MavenUtils.scala @@ -36,7 +36,7 @@ import org.apache.ivy.plugins.repository.file.FileRepository import org.apache.ivy.plugins.resolver.{ChainResolver, FileSystemResolver, IBiblioResolver} import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.util.ArrayImplicits._ /** Provides utility functions to be used inside SparkSubmit. */ @@ -215,7 +215,7 @@ private[spark] object MavenUtils extends Logging { if (artifactInfo.getExt == "jar") { true } else { - logInfo(s"Skipping non-jar dependency ${artifactInfo.getId}") + logInfo(log"Skipping non-jar dependency ${MDC(LogKeys.ARTIFACT_ID, artifactInfo.getId)}") false } } @@ -462,14 +462,13 @@ private[spark] object MavenUtils extends Logging { val sysOut = System.out // Default configuration name for ivy val ivyConfName = "default" - - // A Module descriptor must be specified. Entries are dummy strings - val md = getModuleDescriptor - - md.setDefaultConf(ivyConfName) + var md: DefaultModuleDescriptor = null try { // To prevent ivy from logging to system out System.setOut(printStream) + // A Module descriptor must be specified. 
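Circling back to the `LogUtils.LOG_SCHEMA` constant added above: once the structured framework emits JSON logs, they can be loaded back with Spark SQL. A hedged sketch; the path and the filter are illustrative:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.util.LogUtils

// Hypothetical driver object, for illustration only.
object StructuredLogQuery {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("structured-log-query").getOrCreate()

    // Point this at the JSON log files produced by the structured logging framework.
    val logDf = spark.read.schema(LogUtils.LOG_SCHEMA).json("path/to/structured/logs")

    // MDC key/value pairs land in the `context` map column, keyed by the lowercase LogKey name.
    logDf.filter("level = 'ERROR'")
      .select("ts", "msg", "context")
      .show(truncate = false)

    spark.stop()
  }
}
```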
Entries are dummy strings + md = getModuleDescriptor + md.setDefaultConf(ivyConfName) val artifacts = extractMavenCoordinates(coordinates) // Directories for caching downloads through ivy and storing the jars when maven coordinates // are supplied to spark-submit @@ -516,8 +515,9 @@ private[spark] object MavenUtils extends Logging { val failedReports = rr.getArtifactsReports(DownloadStatus.FAILED, true) if (failedReports.nonEmpty && noCacheIvySettings.isDefined) { val failedArtifacts = failedReports.map(r => r.getArtifact) - logInfo(s"Download failed: ${failedArtifacts.mkString("[", ", ", "]")}, " + - s"attempt to retry while skipping local-m2-cache.") + logInfo(log"Download failed: " + + log"${MDC(LogKeys.ARTIFACTS, failedArtifacts.mkString("[", ", ", "]"))}, " + + log"attempt to retry while skipping local-m2-cache.") failedArtifacts.foreach(artifact => { clearInvalidIvyCacheFiles(artifact.getModuleRevisionId, ivySettings.getDefaultCache) }) @@ -548,7 +548,9 @@ private[spark] object MavenUtils extends Logging { } } finally { System.setOut(sysOut) - clearIvyResolutionFiles(md.getModuleRevisionId, ivySettings.getDefaultCache, ivyConfName) + if (md != null) { + clearIvyResolutionFiles(md.getModuleRevisionId, ivySettings.getDefaultCache, ivyConfName) + } } } } diff --git a/common/utils/src/main/scala/org/apache/spark/util/SparkErrorUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/SparkErrorUtils.scala index 8194d1e424173..9f604e4bf47f2 100644 --- a/common/utils/src/main/scala/org/apache/spark/util/SparkErrorUtils.scala +++ b/common/utils/src/main/scala/org/apache/spark/util/SparkErrorUtils.scala @@ -21,7 +21,7 @@ import java.nio.charset.StandardCharsets.UTF_8 import scala.util.control.NonFatal -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} private[spark] trait SparkErrorUtils extends Logging { /** @@ -74,7 +74,8 @@ private[spark] trait SparkErrorUtils extends Logging { } catch { case t: Throwable if (originalThrowable != null && originalThrowable != t) => originalThrowable.addSuppressed(t) - logWarning(s"Suppressing exception in finally: ${t.getMessage}", t) + logWarning( + log"Suppressing exception in finally: ${MDC(LogKeys.MESSAGE, t.getMessage)}", t) throw originalThrowable } } diff --git a/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala index e12f8acdadd3c..22f03df1b2697 100644 --- a/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala +++ b/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala @@ -20,7 +20,7 @@ import java.io.File import java.net.{URI, URISyntaxException} import java.nio.file.Files -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.network.util.JavaUtils private[spark] trait SparkFileUtils extends Logging { @@ -77,12 +77,12 @@ private[spark] trait SparkFileUtils extends Logging { // remove the check when we're sure that Files.createDirectories() would never fail silently. 
Files.createDirectories(dir.toPath) if ( !dir.exists() || !dir.isDirectory) { - logError(s"Failed to create directory " + dir) + logError(log"Failed to create directory ${MDC(LogKeys.PATH, dir)}") } dir.isDirectory } catch { case e: Exception => - logError(s"Failed to create directory " + dir, e) + logError(log"Failed to create directory ${MDC(LogKeys.PATH, dir)}", e) false } } diff --git a/common/utils/src/test/java/org/apache/spark/util/PatternSparkLoggerSuite.java b/common/utils/src/test/java/org/apache/spark/util/PatternSparkLoggerSuite.java new file mode 100644 index 0000000000000..6bfe595def1d4 --- /dev/null +++ b/common/utils/src/test/java/org/apache/spark/util/PatternSparkLoggerSuite.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util; + +import org.apache.logging.log4j.Level; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; + +public class PatternSparkLoggerSuite extends SparkLoggerSuiteBase { + + private static final SparkLogger LOGGER = + SparkLoggerFactory.getLogger(PatternSparkLoggerSuite.class); + + private String toRegexPattern(Level level, String msg) { + return msg + .replace("", level.toString()) + .replace("", className()); + } + + @Override + SparkLogger logger() { + return LOGGER; + } + + @Override + String className() { + return PatternSparkLoggerSuite.class.getSimpleName(); + } + + @Override + String logFilePath() { + return "target/pattern.log"; + } + + @Override + String expectedPatternForBasicMsg(Level level) { + return toRegexPattern(level, ".* : This is a log message\n"); + } + + @Override + String expectedPatternForBasicMsgWithEscapeChar(Level level) { + return toRegexPattern(level, + ".* : This is a log message\\nThis is a new line \\t other msg\\n"); + } + + @Override + String expectedPatternForBasicMsgWithException(Level level) { + return toRegexPattern(level, """ + .* : This is a log message + [\\s\\S]*"""); + } + + @Override + String expectedPatternForMsgWithMDC(Level level) { + return toRegexPattern(level, ".* : Lost executor 1.\n"); + } + + @Override + String expectedPatternForMsgWithMDCs(Level level) { + return toRegexPattern(level, + ".* : Lost executor 1, reason: the shuffle data is too large\n"); + } + + @Override + String expectedPatternForMsgWithMDCsAndException(Level level) { + return toRegexPattern(level,""" + .* : Lost executor 1, reason: the shuffle data is too large + [\\s\\S]*"""); + } + + @Override + String expectedPatternForMsgWithMDCValueIsNull(Level level) { + return toRegexPattern(level, ".* : Lost executor null.\n"); + } + + @Override + String expectedPatternForScalaCustomLogKey(Level level) { + return toRegexPattern(level, ".* : Scala custom log message.\n"); + } + + @Override + 
String expectedPatternForJavaCustomLogKey(Level level) { + return toRegexPattern(level, ".* : Java custom log message.\n"); + } +} diff --git a/common/utils/src/test/java/org/apache/spark/util/SparkLoggerSuiteBase.java b/common/utils/src/test/java/org/apache/spark/util/SparkLoggerSuiteBase.java new file mode 100644 index 0000000000000..186088ede1d0b --- /dev/null +++ b/common/utils/src/test/java/org/apache/spark/util/SparkLoggerSuiteBase.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.Level; +import org.junit.jupiter.api.Test; + +import org.apache.spark.internal.LogKey; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; +import org.apache.spark.internal.SparkLogger; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public abstract class SparkLoggerSuiteBase { + + abstract SparkLogger logger(); + abstract String className(); + abstract String logFilePath(); + + private File logFile() throws IOException { + String pwd = new File(".").getCanonicalPath(); + return new File(pwd + File.separator + logFilePath()); + } + + // Return the newly added log contents in the log file after executing the function `f` + private String captureLogOutput(Runnable func) throws IOException { + String content = ""; + if (logFile().exists()) { + content = Files.readString(logFile().toPath()); + } + func.run(); + String newContent = Files.readString(logFile().toPath()); + return newContent.substring(content.length()); + } + + @FunctionalInterface + private interface ExpectedResult { + String apply(Level level) throws IOException; + } + + private void checkLogOutput(Level level, Runnable func, ExpectedResult result) { + try { + assertTrue(captureLogOutput(func).matches(result.apply(level))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private final String basicMsg = "This is a log message"; + + private final String basicMsgWithEscapeChar = + "This is a log message\nThis is a new line \t other msg"; + + private final MDC executorIDMDC = MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, "1"); + private final String msgWithMDC = "Lost executor {}."; + + private final MDC[] mdcs = new MDC[] { + MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, "1"), + MDC.of(LogKeys.REASON$.MODULE$, "the shuffle data is too large")}; + private final String msgWithMDCs = "Lost executor {}, reason: {}"; + + private final MDC[] emptyMDCs = new MDC[0]; + + private final MDC executorIDMDCValueIsNull = MDC.of(LogKeys.EXECUTOR_ID$.MODULE$, null); + + private final MDC scalaCustomLogMDC = + 
MDC.of(CustomLogKeys.CUSTOM_LOG_KEY$.MODULE$, "Scala custom log message."); + + private final MDC javaCustomLogMDC = + MDC.of(JavaCustomLogKeys.CUSTOM_LOG_KEY, "Java custom log message."); + + // test for basic message (without any mdc) + abstract String expectedPatternForBasicMsg(Level level); + + // test for basic message (with escape char) + abstract String expectedPatternForBasicMsgWithEscapeChar(Level level); + + // test for basic message and exception + abstract String expectedPatternForBasicMsgWithException(Level level); + + // test for message (with mdc) + abstract String expectedPatternForMsgWithMDC(Level level); + + // test for message (with mdcs) + abstract String expectedPatternForMsgWithMDCs(Level level); + + // test for message (with mdcs and exception) + abstract String expectedPatternForMsgWithMDCsAndException(Level level); + + // test for message (with empty mdcs and exception) + String expectedPatternForMsgWithEmptyMDCsAndException(Level level) { + return expectedPatternForBasicMsgWithException(level); + } + + // test for message (with mdc - the value is null) + abstract String expectedPatternForMsgWithMDCValueIsNull(Level level); + + // test for scala custom LogKey + abstract String expectedPatternForScalaCustomLogKey(Level level); + + // test for java custom LogKey + abstract String expectedPatternForJavaCustomLogKey(Level level); + + @Test + public void testBasicMsg() { + Runnable errorFn = () -> logger().error(basicMsg); + Runnable warnFn = () -> logger().warn(basicMsg); + Runnable infoFn = () -> logger().info(basicMsg); + Runnable debugFn = () -> logger().debug(basicMsg); + Runnable traceFn = () -> logger().trace(basicMsg); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn), + Pair.of(Level.DEBUG, debugFn), + Pair.of(Level.TRACE, traceFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), this::expectedPatternForBasicMsg)); + } + + @Test + public void testBasicMsgWithEscapeChar() { + Runnable errorFn = () -> logger().error(basicMsgWithEscapeChar); + Runnable warnFn = () -> logger().warn(basicMsgWithEscapeChar); + Runnable infoFn = () -> logger().info(basicMsgWithEscapeChar); + Runnable debugFn = () -> logger().debug(basicMsgWithEscapeChar); + Runnable traceFn = () -> logger().trace(basicMsgWithEscapeChar); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn), + Pair.of(Level.DEBUG, debugFn), + Pair.of(Level.TRACE, traceFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), + this::expectedPatternForBasicMsgWithEscapeChar)); + } + + @Test + public void testBasicLoggerWithException() { + Throwable exception = new RuntimeException("OOM"); + Runnable errorFn = () -> logger().error(basicMsg, exception); + Runnable warnFn = () -> logger().warn(basicMsg, exception); + Runnable infoFn = () -> logger().info(basicMsg, exception); + Runnable debugFn = () -> logger().debug(basicMsg, exception); + Runnable traceFn = () -> logger().trace(basicMsg, exception); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn), + Pair.of(Level.DEBUG, debugFn), + Pair.of(Level.TRACE, traceFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), + this::expectedPatternForBasicMsgWithException)); + } + + @Test + public void testLoggerWithMDC() { + Runnable errorFn = () -> logger().error(msgWithMDC, executorIDMDC); + Runnable warnFn = () -> logger().warn(msgWithMDC, executorIDMDC); + Runnable infoFn 
= () -> logger().info(msgWithMDC, executorIDMDC); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), this::expectedPatternForMsgWithMDC)); + } + + @Test + public void testLoggerWithMDCs() { + Runnable errorFn = () -> logger().error(msgWithMDCs, mdcs); + Runnable warnFn = () -> logger().warn(msgWithMDCs, mdcs); + Runnable infoFn = () -> logger().info(msgWithMDCs, mdcs); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), this::expectedPatternForMsgWithMDCs)); + } + + @Test + public void testLoggerWithEmptyMDCsAndException() { + Throwable exception = new RuntimeException("OOM"); + Runnable errorFn = () -> logger().error(basicMsg, exception, emptyMDCs); + Runnable warnFn = () -> logger().warn(basicMsg, exception, emptyMDCs); + Runnable infoFn = () -> logger().info(basicMsg, exception, emptyMDCs); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), + this::expectedPatternForMsgWithEmptyMDCsAndException)); + } + + @Test + public void testLoggerWithMDCsAndException() { + Throwable exception = new RuntimeException("OOM"); + Runnable errorFn = () -> logger().error(msgWithMDCs, exception, mdcs); + Runnable warnFn = () -> logger().warn(msgWithMDCs, exception, mdcs); + Runnable infoFn = () -> logger().info(msgWithMDCs, exception, mdcs); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), + this::expectedPatternForMsgWithMDCsAndException) + ); + } + + @Test + public void testLoggerWithMDCValueIsNull() { + Runnable errorFn = () -> logger().error(msgWithMDC, executorIDMDCValueIsNull); + Runnable warnFn = () -> logger().warn(msgWithMDC, executorIDMDCValueIsNull); + Runnable infoFn = () -> logger().info(msgWithMDC, executorIDMDCValueIsNull); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), + this::expectedPatternForMsgWithMDCValueIsNull)); + } + + @Test + public void testLoggerWithScalaCustomLogKey() { + Runnable errorFn = () -> logger().error("{}", scalaCustomLogMDC); + Runnable warnFn = () -> logger().warn("{}", scalaCustomLogMDC); + Runnable infoFn = () -> logger().info("{}", scalaCustomLogMDC); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), this::expectedPatternForScalaCustomLogKey)); + } + + @Test + public void testLoggerWithJavaCustomLogKey() { + Runnable errorFn = () -> logger().error("{}", javaCustomLogMDC); + Runnable warnFn = () -> logger().warn("{}", javaCustomLogMDC); + Runnable infoFn = () -> logger().info("{}", javaCustomLogMDC); + List.of( + Pair.of(Level.ERROR, errorFn), + Pair.of(Level.WARN, warnFn), + Pair.of(Level.INFO, infoFn)).forEach(pair -> + checkLogOutput(pair.getLeft(), pair.getRight(), this::expectedPatternForJavaCustomLogKey)); + } +} + +class JavaCustomLogKeys { + // Custom `LogKey` must be `implements LogKey` + public static class CUSTOM_LOG_KEY implements LogKey { } + + // Singleton + public static final CUSTOM_LOG_KEY CUSTOM_LOG_KEY = new CUSTOM_LOG_KEY(); +} 
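Taken together with `LogUtils.LOG_SCHEMA` introduced earlier in this diff, the structured output these suites write (for example `target/structured.log`) can be loaded back into Spark SQL. A hedged sketch, assuming a local SparkSession and that a JSON log file already exists at that path:

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.LogUtils

val spark = SparkSession.builder().master("local[*]").appName("log-analysis").getOrCreate()

// Parse the JSON logs with the documented schema, then query them like any other table.
val logDf = spark.read.schema(LogUtils.LOG_SCHEMA).json("target/structured.log")
logDf.where("level = 'ERROR'")
  .selectExpr("ts", "msg", "context['executor_id'] AS executor_id", "exception.msg AS error")
  .show(truncate = false)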
diff --git a/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java b/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java new file mode 100644 index 0000000000000..6959fe11820ff --- /dev/null +++ b/common/utils/src/test/java/org/apache/spark/util/StructuredSparkLoggerSuite.java @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.logging.log4j.Level; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; + +public class StructuredSparkLoggerSuite extends SparkLoggerSuiteBase { + + private static final SparkLogger LOGGER = + SparkLoggerFactory.getLogger(StructuredSparkLoggerSuite.class); + + private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); + private String compactAndToRegexPattern(Level level, String json) { + try { + return JSON_MAPPER.readTree(json).toString() + .replace("", level.toString()) + .replace("", className()) + .replace("", "[^\"]+") + .replace("\"\"", ".*") + .replace("{", "\\{") + "\n"; + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + @Override + SparkLogger logger() { + return LOGGER; + } + + @Override + String className() { + return StructuredSparkLoggerSuite.class.getSimpleName(); + } + + @Override + String logFilePath() { + return "target/structured.log"; + } + + @Override + String expectedPatternForBasicMsg(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "This is a log message", + "logger": "" + }"""); + } + + @Override + String expectedPatternForBasicMsgWithEscapeChar(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "This is a log message\\\\nThis is a new line \\\\t other msg", + "logger": "" + }"""); + } + + @Override + String expectedPatternForBasicMsgWithException(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "This is a log message", + "exception": { + "class": "java.lang.RuntimeException", + "msg": "OOM", + "stacktrace": "" + }, + "logger": "" + }"""); + } + + @Override + String expectedPatternForMsgWithMDC(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "Lost executor 1.", + "context": { + "executor_id": "1" + }, + "logger": "" + }"""); + } + + @Override + String expectedPatternForMsgWithMDCs(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "Lost executor 1, reason: the shuffle data is too large", + "context": { + 
"executor_id": "1", + "reason": "the shuffle data is too large" + }, + "logger": "" + }"""); + } + + @Override + String expectedPatternForMsgWithMDCsAndException(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "Lost executor 1, reason: the shuffle data is too large", + "context": { + "executor_id": "1", + "reason": "the shuffle data is too large" + }, + "exception": { + "class": "java.lang.RuntimeException", + "msg": "OOM", + "stacktrace": "" + }, + "logger": "" + }"""); + } + + @Override + String expectedPatternForMsgWithMDCValueIsNull(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "Lost executor null.", + "context": { + "executor_id": null + }, + "logger": "" + }"""); + } + + @Override + String expectedPatternForScalaCustomLogKey(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "Scala custom log message.", + "context": { + "custom_log_key": "Scala custom log message." + }, + "logger": "" + }"""); + } + + @Override + String expectedPatternForJavaCustomLogKey(Level level) { + return compactAndToRegexPattern(level, """ + { + "ts": "", + "level": "", + "msg": "Java custom log message.", + "context": { + "custom_log_key": "Java custom log message." + }, + "logger": "" + }"""); + } +} + diff --git a/common/utils/src/test/resources/log4j2.properties b/common/utils/src/test/resources/log4j2.properties index e3bd8689993d6..cb38f5b55a0ba 100644 --- a/common/utils/src/test/resources/log4j2.properties +++ b/common/utils/src/test/resources/log4j2.properties @@ -39,12 +39,22 @@ appender.pattern.layout.type = PatternLayout appender.pattern.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex # Custom loggers -logger.structured.name = org.apache.spark.util.StructuredLoggingSuite -logger.structured.level = trace -logger.structured.appenderRefs = structured -logger.structured.appenderRef.structured.ref = structured - -logger.pattern.name = org.apache.spark.util.PatternLoggingSuite -logger.pattern.level = trace -logger.pattern.appenderRefs = pattern -logger.pattern.appenderRef.pattern.ref = pattern +logger.structured_logging.name = org.apache.spark.util.StructuredLoggingSuite +logger.structured_logging.level = trace +logger.structured_logging.appenderRefs = structured +logger.structured_logging.appenderRef.structured.ref = structured + +logger.pattern_logging.name = org.apache.spark.util.PatternLoggingSuite +logger.pattern_logging.level = trace +logger.pattern_logging.appenderRefs = pattern +logger.pattern_logging.appenderRef.pattern.ref = pattern + +logger.structured_logger.name = org.apache.spark.util.StructuredSparkLoggerSuite +logger.structured_logger.level = trace +logger.structured_logger.appenderRefs = structured +logger.structured_logger.appenderRef.structured.ref = structured + +logger.pattern_logger.name = org.apache.spark.util.PatternSparkLoggerSuite +logger.pattern_logger.level = trace +logger.pattern_logger.appenderRefs = pattern +logger.pattern_logger.appenderRef.pattern.ref = pattern diff --git a/common/utils/src/test/scala/org/apache/spark/util/LogKeySuite.scala b/common/utils/src/test/scala/org/apache/spark/util/LogKeySuite.scala index 24a24538ad72b..17e360f510a24 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/LogKeySuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/LogKeySuite.scala @@ -22,16 +22,16 @@ import java.nio.file.{Files, Path} import java.util.{ArrayList => JList} import 
scala.jdk.CollectionConverters._ +import scala.reflect.runtime.universe._ import org.apache.commons.io.FileUtils import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite -import org.apache.spark.internal.{Logging, LogKey} -import org.apache.spark.internal.LogKey.LogKey +import org.apache.spark.internal.{Logging, LogKeys} // scalastyle:off line.size.limit /** - * To re-generate the LogKey class file, run: + * To re-generate the file `LogKey.scala`, run: * {{{ * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "common-utils/testOnly org.apache.spark.util.LogKeySuite" * }}} @@ -57,20 +57,20 @@ class LogKeySuite private val logKeyFilePath = getWorkspaceFilePath("common", "utils", "src", "main", "scala", "org", "apache", "spark", "internal", "LogKey.scala") - // regenerate the file `LogKey.scala` with its enumeration fields sorted alphabetically + // regenerate the file `LogKey.scala` with its members sorted alphabetically private def regenerateLogKeyFile( - originalKeys: Seq[LogKey], sortedKeys: Seq[LogKey]): Unit = { + originalKeys: Seq[String], sortedKeys: Seq[String]): Unit = { if (originalKeys != sortedKeys) { val logKeyFile = logKeyFilePath.toFile - logInfo(s"Regenerating LogKey file $logKeyFile") + logInfo(s"Regenerating the file $logKeyFile") val originalContents = FileUtils.readLines(logKeyFile, StandardCharsets.UTF_8) val sortedContents = new JList[String]() var firstMatch = false originalContents.asScala.foreach { line => - if (line.trim.startsWith("val ") && line.trim.endsWith(" = Value")) { + if (line.trim.startsWith("case object ") && line.trim.endsWith(" extends LogKey")) { if (!firstMatch) { - sortedKeys.foreach { logKey => - sortedContents.add(s" val ${logKey.toString} = Value") + sortedKeys.foreach { key => + sortedContents.add(s" case object $key extends LogKey") } firstMatch = true } @@ -83,14 +83,21 @@ class LogKeySuite } } - test("LogKey enumeration fields are correctly sorted") { - val originalKeys = LogKey.values.toSeq - val sortedKeys = originalKeys.sortBy(_.toString) + test("The members of LogKeys are correctly sorted") { + val originalKeys = getAllLogKeys.reverse + val sortedKeys = originalKeys.sorted if (regenerateGoldenFiles) { regenerateLogKeyFile(originalKeys, sortedKeys) } else { assert(originalKeys === sortedKeys, - "LogKey enumeration fields must be sorted alphabetically") + "The members of LogKeys must be sorted alphabetically") } } + + private def getAllLogKeys: Seq[String] = { + val logKeysType = typeOf[LogKeys.type] + val classSymbol = logKeysType.typeSymbol.asClass + val members = classSymbol.typeSignature.members + members.filter(m => m.isTerm && !m.isMethod).map(_.name.toString).toSeq + } } diff --git a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala index 1983f185e8c87..7631c25662219 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala @@ -22,7 +22,7 @@ import scala.jdk.CollectionConverters._ import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{EXIT_CODE, OFFSET, RANGE} +import org.apache.spark.internal.LogKeys.{EXIT_CODE, OFFSET, RANGE} class MDCSuite extends AnyFunSuite // scalastyle:ignore funsuite diff --git a/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala 
b/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala index d06405ab6d990..2ba2b15c49f33 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/PatternLoggingSuite.scala @@ -34,6 +34,19 @@ class PatternLoggingSuite extends LoggingSuiteBase with BeforeAndAfterAll { s""".*$level $className: This is a log message\n""" } + override def expectedPatternForBasicMsgWithEscapeChar(level: Level): String = { + s""".*$level $className: This is a log message\nThis is a new line \t other msg\n""" + } + + override def expectedPatternForBasicMsgWithEscapeCharMDC(level: Level): String = { + s""".*$level $className: This is a log message\nThis is a new line \t other msg\n""" + } + + override def expectedPatternForMsgWithMDCAndEscapeChar(level: Level): String = { + s""".*$level $className: The first message\nthe first new line\tthe first other msg\n""" + + s"""[\\s\\S]*The second message\nthe second new line\tthe second other msg\n""" + } + override def expectedPatternForBasicMsgWithException(level: Level): String = { s""".*$level $className: This is a log message\n[\\s\\S]*""" } @@ -47,6 +60,10 @@ class PatternLoggingSuite extends LoggingSuiteBase with BeforeAndAfterAll { override def expectedPatternForMsgWithMDCAndException(level: Level): String = s""".*$level $className: Error in executor 1.\njava.lang.RuntimeException: OOM\n[\\s\\S]*""" + override def expectedPatternForCustomLogKey(level: Level): String = { + s""".*$level $className: Custom log message.\n""" + } + override def verifyMsgWithConcat(level: Level, logOutput: String): Unit = { val pattern = s""".*$level $className: Min Size: 2, Max Size: 4. Please double check.\n""" diff --git a/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala index fe03c190fa85e..b3e103f46337c 100644 --- a/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala +++ b/common/utils/src/test/scala/org/apache/spark/util/StructuredLoggingSuite.scala @@ -25,8 +25,7 @@ import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.logging.log4j.Level import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite -import org.apache.spark.internal.{LogEntry, Logging, MDC} -import org.apache.spark.internal.LogKey.{EXECUTOR_ID, MAX_SIZE, MIN_SIZE} +import org.apache.spark.internal.{LogEntry, Logging, LogKey, LogKeys, MDC, MessageWithContext} trait LoggingSuiteBase extends AnyFunSuite // scalastyle:ignore funsuite @@ -54,19 +53,38 @@ trait LoggingSuiteBase def basicMsg: String = "This is a log message" - def msgWithMDC: LogEntry = log"Lost executor ${MDC(EXECUTOR_ID, "1")}." + def basicMsgWithEscapeChar: String = "This is a log message\nThis is a new line \t other msg" - def msgWithMDCValueIsNull: LogEntry = log"Lost executor ${MDC(EXECUTOR_ID, null)}." + def basicMsgWithEscapeCharMDC: LogEntry = + log"This is a log message\nThis is a new line \t other msg" - def msgWithMDCAndException: LogEntry = log"Error in executor ${MDC(EXECUTOR_ID, "1")}." 
+ // scalastyle:off line.size.limit + def msgWithMDCAndEscapeChar: LogEntry = + log"The first message\nthe first new line\tthe first other msg\n${MDC(LogKeys.PATHS, "C:\\Users\\run-all_1.R\nC:\\Users\\run-all_2.R")}\nThe second message\nthe second new line\tthe second other msg" + // scalastyle:on line.size.limit - def msgWithConcat: LogEntry = log"Min Size: ${MDC(MIN_SIZE, "2")}, " + - log"Max Size: ${MDC(MAX_SIZE, "4")}. " + + def msgWithMDC: LogEntry = log"Lost executor ${MDC(LogKeys.EXECUTOR_ID, "1")}." + + def msgWithMDCValueIsNull: LogEntry = log"Lost executor ${MDC(LogKeys.EXECUTOR_ID, null)}." + + def msgWithMDCAndException: LogEntry = log"Error in executor ${MDC(LogKeys.EXECUTOR_ID, "1")}." + + def msgWithConcat: LogEntry = log"Min Size: ${MDC(LogKeys.MIN_SIZE, "2")}, " + + log"Max Size: ${MDC(LogKeys.MAX_SIZE, "4")}. " + log"Please double check." // test for basic message (without any mdc) def expectedPatternForBasicMsg(level: Level): String + // test for basic message (with escape char) + def expectedPatternForBasicMsgWithEscapeChar(level: Level): String + + // test for basic message (with escape char mdc) + def expectedPatternForBasicMsgWithEscapeCharMDC(level: Level): String + + // test for message (with mdc and escape char) + def expectedPatternForMsgWithMDCAndEscapeChar(level: Level): String + // test for basic message and exception def expectedPatternForBasicMsgWithException(level: Level): String @@ -79,6 +97,9 @@ trait LoggingSuiteBase // test for message and exception def expectedPatternForMsgWithMDCAndException(level: Level): String + // test for custom LogKey + def expectedPatternForCustomLogKey(level: Level): String + def verifyMsgWithConcat(level: Level, logOutput: String): Unit test("Basic logging") { @@ -93,6 +114,43 @@ trait LoggingSuiteBase } } + test("Basic logging with escape char") { + Seq( + (Level.ERROR, () => logError(basicMsgWithEscapeChar)), + (Level.WARN, () => logWarning(basicMsgWithEscapeChar)), + (Level.INFO, () => logInfo(basicMsgWithEscapeChar)), + (Level.DEBUG, () => logDebug(basicMsgWithEscapeChar)), + (Level.TRACE, () => logTrace(basicMsgWithEscapeChar))).foreach { case (level, logFunc) => + val logOutput = captureLogOutput(logFunc) + assert(expectedPatternForBasicMsgWithEscapeChar(level).r.matches(logOutput)) + } + } + + test("Basic logging with escape char MDC") { + Seq( + (Level.ERROR, () => logError(basicMsgWithEscapeCharMDC)), + (Level.WARN, () => logWarning(basicMsgWithEscapeCharMDC)), + (Level.INFO, () => logInfo(basicMsgWithEscapeCharMDC)), + (Level.DEBUG, () => logDebug(basicMsgWithEscapeCharMDC)), + (Level.TRACE, () => logTrace(basicMsgWithEscapeCharMDC))).foreach { case (level, logFunc) => + val logOutput = captureLogOutput(logFunc) + assert(expectedPatternForBasicMsgWithEscapeCharMDC(level).r.matches(logOutput)) + } + } + + test("Logging with MDC and escape char") { + Seq( + (Level.ERROR, () => logError(msgWithMDCAndEscapeChar)), + (Level.WARN, () => logWarning(msgWithMDCAndEscapeChar)), + (Level.INFO, () => logInfo(msgWithMDCAndEscapeChar)), + (Level.DEBUG, () => logDebug(msgWithMDCAndEscapeChar)), + (Level.TRACE, () => logTrace(msgWithMDCAndEscapeChar)) + ).foreach { case (level, logFunc) => + val logOutput = captureLogOutput(logFunc) + assert(expectedPatternForMsgWithMDCAndEscapeChar(level).r.matches(logOutput)) + } + } + test("Basic logging with Exception") { val exception = new RuntimeException("OOM") Seq( @@ -144,6 +202,20 @@ trait LoggingSuiteBase } } + private val customLog = log"${MDC(CustomLogKeys.CUSTOM_LOG_KEY, "Custom log 
message.")}" + test("Logging with custom LogKey") { + Seq( + (Level.ERROR, () => logError(customLog)), + (Level.WARN, () => logWarning(customLog)), + (Level.INFO, () => logInfo(customLog)), + (Level.DEBUG, () => logDebug(customLog)), + (Level.TRACE, () => logTrace(customLog))).foreach { + case (level, logFunc) => + val logOutput = captureLogOutput(logFunc) + assert(expectedPatternForCustomLogKey(level).r.matches(logOutput)) + } + } + test("Logging with concat") { Seq( (Level.ERROR, () => logError(msgWithConcat)), @@ -156,10 +228,41 @@ trait LoggingSuiteBase verifyMsgWithConcat(level, logOutput) } } + + test("LogEntry should construct MessageWithContext only once") { + var constructionCount = 0 + + def constructMessageWithContext(): MessageWithContext = { + constructionCount += 1 + log"Lost executor ${MDC(LogKeys.EXECUTOR_ID, "1")}." + } + logInfo(constructMessageWithContext()) + assert(constructionCount === 1) + } + + test("LogEntry should construct MessageWithContext only once II") { + var constructionCount = 0 + var constructionCount2 = 0 + + def executorId(): String = { + constructionCount += 1 + "1" + } + + def workerId(): String = { + constructionCount2 += 1 + "2" + } + + logInfo(log"Lost executor ${MDC(LogKeys.EXECUTOR_ID, executorId())}." + + log"worker id ${MDC(LogKeys.WORKER_ID, workerId())}") + assert(constructionCount === 1) + assert(constructionCount2 === 1) + } } class StructuredLoggingSuite extends LoggingSuiteBase { - override def className: String = classOf[StructuredLoggingSuite].getName + override def className: String = classOf[StructuredLoggingSuite].getSimpleName override def logFilePath: String = "target/structured.log" private val jsonMapper = new ObjectMapper().registerModule(DefaultScalaModule) @@ -167,6 +270,7 @@ class StructuredLoggingSuite extends LoggingSuiteBase { jsonMapper.readTree(json).toString. replace("", """[^"]+"""). replace("""""""", """.*"""). + replace("", """.*"""). replace("{", """\{""") + "\n" } @@ -181,6 +285,44 @@ class StructuredLoggingSuite extends LoggingSuiteBase { }""") } + override def expectedPatternForBasicMsgWithEscapeChar(level: Level): String = { + compactAndToRegexPattern( + s""" + { + "ts": "", + "level": "$level", + "msg": "This is a log message\\\\nThis is a new line \\\\t other msg", + "logger": "$className" + }""") + } + + override def expectedPatternForBasicMsgWithEscapeCharMDC(level: Level): String = { + compactAndToRegexPattern( + s""" + { + "ts": "", + "level": "$level", + "msg": "This is a log message\\\\nThis is a new line \\\\t other msg", + "logger": "$className" + }""") + } + + override def expectedPatternForMsgWithMDCAndEscapeChar(level: Level): String = { + // scalastyle:off line.size.limit + compactAndToRegexPattern( + s""" + { + "ts": "", + "level": "$level", + "msg": "The first message\\\\nthe first new line\\\\tthe first other msg\\\\n\\\\nThe second message\\\\nthe second new line\\\\tthe second other msg", + "context": { + "paths": "" + }, + "logger": "$className" + }""") + // scalastyle:on line.size.limit + } + override def expectedPatternForBasicMsgWithException(level: Level): String = { compactAndToRegexPattern( s""" @@ -244,6 +386,21 @@ class StructuredLoggingSuite extends LoggingSuiteBase { }""") } + override def expectedPatternForCustomLogKey(level: Level): String = { + compactAndToRegexPattern( + s""" + { + "ts": "", + "level": "$level", + "msg": "Custom log message.", + "context": { + "custom_log_key": "Custom log message." 
+ }, + "logger": "$className" + }""" + ) + } + override def verifyMsgWithConcat(level: Level, logOutput: String): Unit = { val pattern1 = compactAndToRegexPattern( s""" @@ -272,4 +429,18 @@ class StructuredLoggingSuite extends LoggingSuiteBase { }""") assert(pattern1.r.matches(logOutput) || pattern2.r.matches(logOutput)) } + + test("process escape sequences") { + assert(log"\n".message == "\n") + assert(log"\t".message == "\t") + assert(log"\b".message == "\b") + assert(log"\r".message == "\r") + assert((log"\r" + log"\n" + log"\t" + log"\b").message == "\r\n\t\b") + assert((log"\r${MDC(LogKeys.EXECUTOR_ID, 1)}\n".message == "\r1\n")) + } +} + +object CustomLogKeys { + // Custom `LogKey` must be `extends LogKey` + case object CUSTOM_LOG_KEY extends LogKey } diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java index ea7a7674baf57..2afba81d192e9 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantBuilder.java @@ -223,7 +223,7 @@ public void appendFloat(float f) { public void appendBinary(byte[] binary) { checkCapacity(1 + U32_SIZE + binary.length); - writeBuffer[writePos++] = primitiveHeader(LONG_STR); + writeBuffer[writePos++] = primitiveHeader(BINARY); writeLong(writeBuffer, writePos, binary.length, U32_SIZE); writePos += U32_SIZE; System.arraycopy(binary, 0, writeBuffer, writePos, binary.length); diff --git a/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java b/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java index e4e9cc8b4cfac..84e3a45e4b0ee 100644 --- a/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java +++ b/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java @@ -392,6 +392,13 @@ public static double getDouble(byte[] value, int pos) { return Double.longBitsToDouble(readLong(value, pos + 1, 8)); } + // Check whether the precision and scale of the decimal are within the limit. + private static void checkDecimal(BigDecimal d, int maxPrecision) { + if (d.precision() > maxPrecision || d.scale() > maxPrecision) { + throw malformedVariant(); + } + } + // Get a decimal value from variant value `value[pos...]`. // Throw `MALFORMED_VARIANT` if the variant is malformed. public static BigDecimal getDecimal(byte[] value, int pos) { @@ -399,14 +406,18 @@ public static BigDecimal getDecimal(byte[] value, int pos) { int basicType = value[pos] & BASIC_TYPE_MASK; int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL); - int scale = value[pos + 1]; + // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be + // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. 
+ int scale = value[pos + 1] & 0xFF; BigDecimal result; switch (typeInfo) { case DECIMAL4: result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale); + checkDecimal(result, MAX_DECIMAL4_PRECISION); break; case DECIMAL8: result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale); + checkDecimal(result, MAX_DECIMAL8_PRECISION); break; case DECIMAL16: checkIndex(pos + 17, value.length); @@ -417,6 +428,7 @@ public static BigDecimal getDecimal(byte[] value, int pos) { bytes[i] = value[pos + 17 - i]; } result = new BigDecimal(new BigInteger(bytes), scale); + checkDecimal(result, MAX_DECIMAL16_PRECISION); break; default: throw unexpectedType(Type.DECIMAL); diff --git a/conf/log4j2.properties.pattern-layout-template b/conf/log4j2.properties.pattern-layout-template new file mode 100644 index 0000000000000..ab96e03baed20 --- /dev/null +++ b/conf/log4j2.properties.pattern-layout-template @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. 
+logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/conf/log4j2.properties.template b/conf/log4j2.properties.template index ab96e03baed20..8767245314449 100644 --- a/conf/log4j2.properties.template +++ b/conf/log4j2.properties.template @@ -19,17 +19,11 @@ rootLogger.level = info rootLogger.appenderRef.stdout.ref = console -# In the pattern layout configuration below, we specify an explicit `%ex` conversion -# pattern for logging Throwables. If this was omitted, then (by default) Log4J would -# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional -# class packaging information. That extra information can sometimes add a substantial -# performance overhead, so we disable it in our default logging config. -# For more information, see SPARK-39361. appender.console.type = Console appender.console.name = console appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex +appender.console.layout.type = JsonTemplateLayout +appender.console.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json # Set the default spark-shell/spark-sql log level to WARN. 
When running the # spark-shell/spark-sql, the log level for these classes is used to overwrite diff --git a/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt b/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt index 360f94dfd1e07..e0d9f9b90121f 100644 --- a/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt +++ b/connector/avro/benchmarks/AvroReadBenchmark-jdk21-results.txt @@ -2,140 +2,140 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1709 1724 21 9.2 108.7 1.0X +Sum 2124 2129 8 7.4 135.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1690 1699 13 9.3 107.4 1.0X +Sum 2124 2129 7 7.4 135.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1673 1678 6 9.4 106.4 1.0X +Sum 2082 2096 19 7.6 132.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1913 1917 6 8.2 121.6 1.0X +Sum 2079 2091 17 7.6 132.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1779 1780 2 8.8 113.1 1.0X +Sum 2070 2078 11 7.6 131.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1810 1823 19 8.7 115.0 1.0X +Sum 2005 2023 25 7.8 127.5 1.0X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 
21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of columns 3263 3268 8 3.2 311.2 1.0X +Sum of columns 3598 3606 12 2.9 343.1 1.0X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column 2226 2231 7 7.1 141.5 1.0X -Partition column 1842 1855 18 8.5 117.1 1.2X -Both columns 2182 2187 7 7.2 138.7 1.0X +Data column 2081 2094 19 7.6 132.3 1.0X +Partition column 1913 1917 5 8.2 121.7 1.1X +Both columns 2141 2171 43 7.3 136.1 1.0X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2093 2103 15 5.0 199.6 1.0X +Sum of string length 2299 2337 53 4.6 219.3 1.0X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 3044 3076 45 3.4 290.3 1.0X +Sum of string length 3094 3118 33 3.4 295.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2320 2331 15 4.5 221.2 1.0X +Sum of string length 2162 2213 72 4.9 206.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 1292 1310 24 8.1 123.3 1.0X +Sum of string length 1205 1210 7 8.7 
114.9 1.0X ================================================================================================ Select All From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Wide Column Scan from 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select of all columns 19999 20110 158 0.0 39997.0 1.0X +Select of all columns 17970 18066 135 0.0 35940.5 1.0X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 2703 2707 5 0.4 2578.2 1.0X +Sum of single column 3222 3242 29 0.3 3072.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 5225 5235 14 0.2 4983.2 1.0X +Sum of single column 6336 6343 9 0.2 6043.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 7775 7778 4 0.1 7414.5 1.0X +Sum of single column 9410 9463 75 0.1 8974.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 5486 5506 19 0.2 5485.8 1.0X -pushdown disabled 5371 5400 25 0.2 5371.5 1.0X -w/ filters 2237 2254 24 0.4 2236.6 2.5X +w/o filters 5535 5544 10 0.2 5535.0 1.0X +pushdown disabled 5450 5479 29 0.2 5450.1 1.0X +w/ filters 2335 2340 9 0.4 2334.5 2.4X diff --git a/connector/avro/benchmarks/AvroReadBenchmark-results.txt b/connector/avro/benchmarks/AvroReadBenchmark-results.txt index 633a0dfcf3bd8..f1065f98b81a2 100644 --- a/connector/avro/benchmarks/AvroReadBenchmark-results.txt +++ b/connector/avro/benchmarks/AvroReadBenchmark-results.txt @@ -2,140 +2,140 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on 
Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1696 1700 6 9.3 107.8 1.0X +Sum 1986 2030 63 7.9 126.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1641 1663 32 9.6 104.3 1.0X +Sum 1983 2021 54 7.9 126.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1588 1588 1 9.9 100.9 1.0X +Sum 1955 1977 30 8.0 124.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1845 1847 4 8.5 117.3 1.0X +Sum 1939 1958 26 8.1 123.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1649 1650 1 9.5 104.8 1.0X +Sum 1961 1963 3 8.0 124.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum 1720 1722 3 9.1 109.4 1.0X +Sum 1944 1946 3 8.1 123.6 1.0X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of columns 3223 3237 19 3.3 307.4 1.0X +Sum of columns 3345 3376 44 3.1 319.0 1.0X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 
64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column 1907 1924 24 8.2 121.2 1.0X -Partition column 1709 1724 21 9.2 108.7 1.1X -Both columns 2040 2046 8 7.7 129.7 0.9X +Data column 2006 2022 22 7.8 127.5 1.0X +Partition column 1761 1765 5 8.9 112.0 1.1X +Both columns 2054 2068 20 7.7 130.6 1.0X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 1993 2012 28 5.3 190.1 1.0X +Sum of string length 2002 2024 31 5.2 191.0 1.0X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 3143 3184 58 3.3 299.7 1.0X +Sum of string length 3103 3141 54 3.4 295.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 2223 2224 1 4.7 212.0 1.0X +Sum of string length 2056 2064 11 5.1 196.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of string length 1216 1221 7 8.6 116.0 1.0X +Sum of string length 1084 1086 3 9.7 103.3 1.0X ================================================================================================ Select All From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Wide Column Scan from 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Select of all columns 19275 19347 102 0.0 38549.7 1.0X +Select of all columns 19331 19457 177 0.0 38662.8 1.0X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 2529 2532 4 0.4 2412.0 1.0X +Sum of single column 3178 3191 18 0.3 3030.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 4969 4970 2 0.2 4739.1 1.0X +Sum of single column 6288 6406 167 0.2 5996.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Sum of single column 7413 7503 127 0.1 7069.6 1.0X +Sum of single column 9478 9487 12 0.1 9039.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 5399 5410 11 0.2 5399.0 1.0X -pushdown disabled 5302 5320 16 0.2 5301.9 1.0X -w/ filters 2108 2178 96 0.5 2107.7 2.6X +w/o filters 5488 5511 22 0.2 5488.5 1.0X +pushdown disabled 5495 5510 15 0.2 5494.9 1.0X +w/ filters 2218 2232 12 0.5 2218.1 2.5X diff --git a/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt b/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt index cdeabe1275140..f49e7db17093e 100644 --- a/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt +++ b/connector/avro/benchmarks/AvroWriteBenchmark-jdk21-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1473 1503 43 10.7 93.6 1.0X -Output Single Double Column 1491 1514 32 10.5 94.8 1.0X -Output Int and String Column 3118 3125 9 5.0 198.3 0.5X -Output Partitions 2901 2953 74 5.4 184.4 0.5X -Output Buckets 3624 3634 14 4.3 230.4 0.4X +Output 
Single Int Column 1589 1616 39 9.9 101.0 1.0X +Output Single Double Column 1522 1536 20 10.3 96.8 1.0X +Output Int and String Column 3264 3266 2 4.8 207.5 0.5X +Output Partitions 3054 3094 57 5.1 194.2 0.5X +Output Buckets 4024 4078 76 3.9 255.9 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro compression with different codec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -BZIP2: 114516 115179 937 0.0 1145165.0 1.0X -DEFLATE: 6419 6424 7 0.0 64190.4 17.8X -UNCOMPRESSED: 5241 5276 50 0.0 52408.0 21.9X -SNAPPY: 4659 4674 20 0.0 46592.4 24.6X -XZ: 56023 58323 3252 0.0 560230.7 2.0X -ZSTANDARD: 5025 5075 70 0.0 50251.0 22.8X +BZIP2: 115682 115702 29 0.0 1156821.8 1.0X +DEFLATE: 6294 6309 20 0.0 62944.7 18.4X +UNCOMPRESSED: 5130 5148 26 0.0 51301.8 22.5X +SNAPPY: 4611 4643 45 0.0 46106.1 25.1X +XZ: 64308 64406 139 0.0 643084.7 1.8X +ZSTANDARD: 4651 4687 51 0.0 46509.7 24.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro deflate with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -DEFLATE: deflate.level=1 4886 4908 31 0.0 48858.7 1.0X -DEFLATE: deflate.level=3 4884 4902 26 0.0 48842.0 1.0X -DEFLATE: deflate.level=5 6412 6472 85 0.0 64120.5 0.8X -DEFLATE: deflate.level=7 6498 6515 24 0.0 64982.4 0.8X -DEFLATE: deflate.level=9 6749 6761 17 0.0 67490.9 0.7X +DEFLATE: deflate.level=1 4666 4669 5 0.0 46656.5 1.0X +DEFLATE: deflate.level=3 4646 4648 3 0.0 46463.0 1.0X +DEFLATE: deflate.level=5 6223 6230 11 0.0 62226.1 0.7X +DEFLATE: deflate.level=7 6272 6282 15 0.0 62715.3 0.7X +DEFLATE: deflate.level=9 6628 6635 9 0.0 66283.6 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro xz with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XZ: xz.level=1 12318 12319 1 0.0 123181.8 1.0X -XZ: xz.level=3 22329 22469 197 0.0 223294.2 0.6X -XZ: xz.level=5 46825 47393 804 0.0 468247.0 0.3X -XZ: xz.level=7 68116 68235 168 0.0 681159.0 0.2X -XZ: xz.level=9 146500 146710 297 0.0 1464999.7 0.1X +XZ: xz.level=1 12512 12545 46 0.0 125121.6 1.0X +XZ: xz.level=3 23744 23832 124 0.0 237441.6 0.5X +XZ: xz.level=5 48209 50241 2874 0.0 482091.5 0.3X +XZ: xz.level=7 69424 69655 327 0.0 694240.2 0.2X +XZ: xz.level=9 142278 142354 108 0.0 1422778.3 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro zstandard with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ZSTANDARD: zstandard.level=1 4767 4801 48 0.0 47673.3 1.0X -ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4825 4855 42 0.0 48252.8 1.0X 
-ZSTANDARD: zstandard.level=3 4872 4919 65 0.0 48724.3 1.0X -ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4763 4825 89 0.0 47628.2 1.0X -ZSTANDARD: zstandard.level=5 5169 5178 13 0.0 51687.2 0.9X -ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 5005 5018 18 0.0 50046.5 1.0X -ZSTANDARD: zstandard.level=7 5502 5507 7 0.0 55020.7 0.9X -ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5327 5364 53 0.0 53270.3 0.9X -ZSTANDARD: zstandard.level=9 6089 6093 5 0.0 60890.2 0.8X -ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 6136 6174 53 0.0 61365.0 0.8X +ZSTANDARD: zstandard.level=1 4669 4670 2 0.0 46688.6 1.0X +ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4689 4701 16 0.0 46893.1 1.0X +ZSTANDARD: zstandard.level=3 4805 4819 20 0.0 48048.3 1.0X +ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4667 4670 4 0.0 46666.8 1.0X +ZSTANDARD: zstandard.level=5 4985 5014 41 0.0 49852.2 0.9X +ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 4950 4991 59 0.0 49499.4 0.9X +ZSTANDARD: zstandard.level=7 5282 5291 13 0.0 52820.2 0.9X +ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5221 5260 55 0.0 52208.0 0.9X +ZSTANDARD: zstandard.level=9 5997 6034 52 0.0 59974.4 0.8X +ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 5888 5949 85 0.0 58885.0 0.8X diff --git a/connector/avro/benchmarks/AvroWriteBenchmark-results.txt b/connector/avro/benchmarks/AvroWriteBenchmark-results.txt index c817dc6337b53..658b9ad7851d4 100644 --- a/connector/avro/benchmarks/AvroWriteBenchmark-results.txt +++ b/connector/avro/benchmarks/AvroWriteBenchmark-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1440 1454 20 10.9 91.5 1.0X -Output Single Double Column 1631 1646 21 9.6 103.7 0.9X -Output Int and String Column 3281 3290 13 4.8 208.6 0.4X -Output Partitions 2854 2866 18 5.5 181.5 0.5X -Output Buckets 3590 3599 12 4.4 228.3 0.4X +Output Single Int Column 1566 1615 69 10.0 99.6 1.0X +Output Single Double Column 1718 1720 3 9.2 109.2 0.9X +Output Int and String Column 3250 3250 0 4.8 206.6 0.5X +Output Partitions 2869 2870 0 5.5 182.4 0.5X +Output Buckets 3655 3660 7 4.3 232.4 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro compression with different codec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -BZIP2: 132631 133654 1446 0.0 1326312.3 1.0X -DEFLATE: 6609 6653 63 0.0 66086.0 20.1X -UNCOMPRESSED: 5556 5556 1 0.0 55555.7 23.9X -SNAPPY: 4880 4880 0 0.0 48799.4 27.2X -XZ: 55326 55531 290 0.0 553260.0 2.4X -ZSTANDARD: 5044 5079 50 0.0 50437.1 26.3X +BZIP2: 131005 132600 2255 0.0 1310049.5 1.0X +DEFLATE: 6673 6696 34 0.0 66725.6 19.6X +UNCOMPRESSED: 5469 5506 51 0.0 54692.2 24.0X +SNAPPY: 4970 5003 47 0.0 49696.0 26.4X +XZ: 55374 55620 347 0.0 553743.6 2.4X +ZSTANDARD: 4998 5044 64 0.0 49984.1 26.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro deflate with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -DEFLATE: deflate.level=1 5017 5074 81 0.0 50166.3 1.0X -DEFLATE: deflate.level=3 5002 5011 13 0.0 50021.7 1.0X -DEFLATE: deflate.level=5 6549 6579 43 0.0 65487.6 0.8X -DEFLATE: deflate.level=7 6699 6725 37 0.0 66987.3 0.7X -DEFLATE: deflate.level=9 6939 6948 13 0.0 69392.2 0.7X +DEFLATE: deflate.level=1 4996 5017 30 0.0 49961.8 1.0X +DEFLATE: deflate.level=3 5013 5026 18 0.0 50129.7 1.0X +DEFLATE: deflate.level=5 6557 6574 23 0.0 65574.0 0.8X +DEFLATE: deflate.level=7 6593 6624 44 0.0 65929.1 0.8X +DEFLATE: deflate.level=9 6973 6983 14 0.0 69725.4 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro xz with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XZ: xz.level=1 12308 12335 39 0.0 123075.6 1.0X -XZ: xz.level=3 22817 22991 247 0.0 228166.4 0.5X -XZ: xz.level=5 48180 48301 172 0.0 481795.0 0.3X -XZ: xz.level=7 70991 72142 1628 0.0 709910.8 0.2X -XZ: xz.level=9 159363 160356 1405 0.0 1593630.7 0.1X +XZ: xz.level=1 12335 12414 113 0.0 123346.1 1.0X +XZ: xz.level=3 22830 22901 101 0.0 228298.8 0.5X +XZ: xz.level=5 47861 48099 336 0.0 478610.6 0.3X +XZ: xz.level=7 71299 71967 944 0.0 712993.0 0.2X +XZ: xz.level=9 159311 159585 388 0.0 1593106.7 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Avro zstandard with different levels: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ZSTANDARD: zstandard.level=1 4915 4931 23 0.0 49148.9 1.0X -ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4893 4925 46 0.0 48931.9 1.0X -ZSTANDARD: zstandard.level=3 5049 5066 24 0.0 50491.9 1.0X -ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4962 4969 10 0.0 49621.0 1.0X -ZSTANDARD: zstandard.level=5 5295 5373 110 0.0 52946.1 0.9X -ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 5237 5238 1 0.0 52368.1 0.9X -ZSTANDARD: zstandard.level=7 5750 5789 54 0.0 57503.5 0.9X -ZSTANDARD: zstandard.level=7, zstandard.bufferPool.enabled=true 5658 5711 75 0.0 56577.4 0.9X -ZSTANDARD: zstandard.level=9 6719 6733 20 0.0 67194.8 0.7X -ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 6694 6711 25 0.0 66935.4 0.7X +ZSTANDARD: zstandard.level=1 4917 4951 48 0.0 49169.8 1.0X +ZSTANDARD: zstandard.level=1, zstandard.bufferPool.enabled=true 4885 4904 26 0.0 48848.3 1.0X +ZSTANDARD: zstandard.level=3 5045 5051 9 0.0 50448.8 1.0X +ZSTANDARD: zstandard.level=3, zstandard.bufferPool.enabled=true 4926 4931 7 0.0 49258.9 1.0X +ZSTANDARD: zstandard.level=5 5366 5437 101 0.0 53656.6 0.9X +ZSTANDARD: zstandard.level=5, zstandard.bufferPool.enabled=true 5261 5305 62 0.0 52610.6 0.9X +ZSTANDARD: zstandard.level=7 5673 5680 9 0.0 56731.6 0.9X +ZSTANDARD: zstandard.level=7, 
zstandard.bufferPool.enabled=true 5592 5615 33 0.0 55917.0 0.9X +ZSTANDARD: zstandard.level=9 6662 6663 2 0.0 66620.2 0.7X +ZSTANDARD: zstandard.level=9, zstandard.bufferPool.enabled=true 6759 6760 1 0.0 67591.8 0.7X diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala index 4bedd625e6091..7cbc30f1fb3dc 100644 --- a/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.spark.{SparkException, SparkIllegalArgumentException} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CODEC_LEVEL, CODEC_NAME, CONFIG, PATH} +import org.apache.spark.internal.LogKeys.{CODEC_LEVEL, CODEC_NAME, CONFIG, PATH} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.avro.AvroCompressionCodec._ import org.apache.spark.sql.avro.AvroOptions.IGNORE_EXTENSION diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala index 387526d40f68f..b2285aa966ddb 100644 --- a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -208,14 +208,12 @@ object SchemaConverters { // could be "a" and "A" and we need to distinguish them. In this case, we throw // an exception. // Stable id prefix can be empty so the name of the field can be just the type. - val tempFieldName = - s"${stableIdPrefixForUnionType}${s.getName.toLowerCase(Locale.ROOT)}" - if (fieldNameSet.contains(tempFieldName)) { + val tempFieldName = s"${stableIdPrefixForUnionType}${s.getName}" + if (!fieldNameSet.add(tempFieldName.toLowerCase(Locale.ROOT))) { throw new IncompatibleSchemaException( - "Cannot generate stable indentifier for Avro union type due to name " + + "Cannot generate stable identifier for Avro union type due to name " + s"conflict of type name ${s.getName}") } - fieldNameSet.add(tempFieldName) tempFieldName } else { s"member$i" diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala index d16ddb4973205..c807685db0f0c 100644 --- a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala @@ -26,7 +26,7 @@ import org.apache.avro.generic.{GenericDatumWriter, GenericRecord, GenericRecord import org.apache.avro.io.EncoderFactory import org.apache.spark.SparkException -import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.execution.LocalTableScanExec import org.apache.spark.sql.functions.{col, lit, struct} import org.apache.spark.sql.internal.SQLConf @@ -286,4 +286,85 @@ class AvroFunctionsSuite extends QueryTest with SharedSparkSession { assert(msg.contains("Invalid default for field id: null not a \"long\"")) } } + + test("SPARK-48545: from_avro and to_avro SQL functions") { + withTable("t") { + sql( + """ + |create table t as + | select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s + | from values (1, null), (null, 'a') tab(member0, member1) + |""".stripMargin) + val 
jsonFormatSchema = + """ + |{ + | "type": "record", + | "name": "struct", + | "fields": [{ + | "name": "u", + | "type": ["int","string"] + | }] + |} + |""".stripMargin + val toAvroSql = + s""" + |select to_avro(s, '$jsonFormatSchema') as result from t + |""".stripMargin + val avroResult = spark.sql(toAvroSql).collect() + assert(avroResult != null) + checkAnswer( + spark.sql(s"select from_avro(result, '$jsonFormatSchema', map()).u from ($toAvroSql)"), + Seq(Row(Row(1, null)), + Row(Row(null, "a")))) + + // Negative tests. + checkError( + exception = intercept[AnalysisException](sql( + s""" + |select to_avro(s, 42) as result from t + |""".stripMargin)), + errorClass = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + parameters = Map("sqlExpr" -> "\"toavro(s, 42)\"", + "msg" -> ("The second argument of the TO_AVRO SQL function must be a constant string " + + "containing the JSON representation of the schema to use for converting the value to " + + "AVRO format"), + "hint" -> ""), + queryContext = Array(ExpectedContext( + fragment = "to_avro(s, 42)", + start = 8, + stop = 21))) + checkError( + exception = intercept[AnalysisException](sql( + s""" + |select from_avro(s, 42, '') as result from t + |""".stripMargin)), + errorClass = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + parameters = Map("sqlExpr" -> "\"fromavro(s, 42, )\"", + "msg" -> ("The second argument of the FROM_AVRO SQL function must be a constant string " + + "containing the JSON representation of the schema to use for converting the value " + + "from AVRO format"), + "hint" -> ""), + queryContext = Array(ExpectedContext( + fragment = "from_avro(s, 42, '')", + start = 8, + stop = 27))) + checkError( + exception = intercept[AnalysisException](sql( + s""" + |select from_avro(s, '$jsonFormatSchema', 42) as result from t + |""".stripMargin)), + errorClass = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + parameters = Map( + "sqlExpr" -> + s"\"fromavro(s, $jsonFormatSchema, 42)\"".stripMargin, + "msg" -> ("The third argument of the FROM_AVRO SQL function must be a constant map of " + + "strings to strings containing the options to use for converting the value " + + "from AVRO format"), + "hint" -> ""), + queryContext = Array(ExpectedContext( + fragment = s"from_avro(s, '$jsonFormatSchema', 42)", + start = 8, + stop = 138))) + } + } } diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index 67a09812b3f2e..42c13f5e20873 100644 --- a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -377,7 +377,7 @@ abstract class AvroSuite "", Seq()) } - assert(e.getMessage.contains("Cannot generate stable indentifier")) + assert(e.getMessage.contains("Cannot generate stable identifier")) } { val e = intercept[Exception] { @@ -388,7 +388,7 @@ abstract class AvroSuite "", Seq()) } - assert(e.getMessage.contains("Cannot generate stable indentifier")) + assert(e.getMessage.contains("Cannot generate stable identifier")) } // Two array types or two map types are not allowed in union. 
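(Aside on the SPARK-48545 test above: a minimal sketch of how the new to_avro/from_avro SQL functions could be driven from Scala. It assumes an active SparkSession named `spark` and a table `t` with a struct column `s` like the one the test creates; the schema JSON mirrors the test, and the view name `encoded` is illustrative rather than part of the patch.)

  // Hypothetical driver for the SQL functions exercised by the test; not part of the patch.
  val jsonFormatSchema =
    """{"type":"record","name":"struct","fields":[{"name":"u","type":["int","string"]}]}"""
  // Encode the struct column to Avro binary with to_avro, then decode it back with from_avro;
  // the third argument of from_avro is an options map and may be left empty.
  spark.sql(s"select to_avro(s, '$jsonFormatSchema') as result from t")
    .createOrReplaceTempView("encoded")
  spark.sql(s"select from_avro(result, '$jsonFormatSchema', map()).u from encoded").show()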
{ @@ -441,6 +441,33 @@ abstract class AvroSuite } } + test("SPARK-47904: Test that field name case is preserved") { + checkUnionStableId( + List( + Schema.createEnum("myENUM", "", null, List[String]("E1", "e2").asJava), + Schema.createRecord("myRecord", "", null, false, + List[Schema.Field](new Schema.Field("f", Schema.createFixed("myField", "", null, 6))) + .asJava), + Schema.createRecord("myRecord2", "", null, false, + List[Schema.Field](new Schema.Field("F", Schema.create(Type.FLOAT))) + .asJava)), + "struct, " + + "member_myRecord2: struct>", + Seq()) + + { + val e = intercept[Exception] { + checkUnionStableId( + List( + Schema.createRecord("myRecord", "", null, false, List[Schema.Field]().asJava), + Schema.createRecord("myrecord", "", null, false, List[Schema.Field]().asJava)), + "", + Seq()) + } + assert(e.getMessage.contains("Cannot generate stable identifier")) + } + } + test("SPARK-46930: Use custom prefix for stable ids when converting Union type") { // Test default "member_" prefix. checkUnionStableId( @@ -1624,7 +1651,7 @@ abstract class AvroSuite errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`testType()`", - "columnType" -> "\"INTERVAL\"", + "columnType" -> "UDT(\"INTERVAL\")", "format" -> "Avro") ) } diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala b/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala index 80f0d6bc7b6eb..7b16a75d62164 100644 --- a/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala @@ -66,7 +66,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark { import spark.implicits._ spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") - prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + prepareTable(dir, spark.sql(s"SELECT mod(value, 255) id FROM t1")) benchmark.addCase("Sum") { _ => spark.sql("SELECT sum(id) FROM avroTable").noop() @@ -87,7 +87,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark { prepareTable( dir, - spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) + spark.sql(s"SELECT value % ${Int.MaxValue} AS c1, CAST(value as STRING) AS c2 FROM t1")) benchmark.addCase("Sum of columns") { _ => spark.sql("SELECT sum(c1), sum(length(c2)) FROM avroTable").noop() @@ -106,7 +106,8 @@ object AvroReadBenchmark extends SqlBasedBenchmark { import spark.implicits._ spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") - prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) + prepareTable(dir, + spark.sql(s"SELECT value % 2 AS p, value % ${Int.MaxValue} AS id FROM t1"), Some("p")) benchmark.addCase("Data column") { _ => spark.sql("SELECT sum(id) FROM avroTable").noop() @@ -176,7 +177,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark { withTempTable("t1", "avroTable") { import spark.implicits._ val middle = width / 2 - val selectExpr = (1 to width).map(i => s"value as c$i") + val selectExpr = (1 to width).map(i => s"value % ${Int.MaxValue} as c$i") spark.range(values).map(_ => Random.nextLong()).toDF() .selectExpr(selectExpr: _*).createOrReplaceTempView("t1") @@ -198,7 +199,6 @@ object AvroReadBenchmark extends SqlBasedBenchmark { withTempPath { dir => withTempTable("t1", "avroTable") { import spark.implicits._ - val middle = width / 
2 val selectExpr = (1 to width).map(i => s"value as c$i") spark.range(values).map(_ => Random.nextLong()).toDF() .selectExpr(selectExpr: _*) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala index c23d49440248c..3562675898224 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql import scala.jdk.CollectionConverters._ -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.connect.proto import org.apache.spark.connect.proto.Expression.SortOrder.NullOrdering import org.apache.spark.connect.proto.Expression.SortOrder.SortDirection @@ -52,7 +52,7 @@ import org.apache.spark.util.ArrayImplicits._ * * @since 3.4.0 */ -class Column private[sql] (@DeveloperApi val expr: proto.Expression) extends Logging { +class Column(@DeveloperApi val expr: proto.Expression) extends Logging { private[sql] def this(name: String, planId: Option[Long]) = this(Column.nameToExpression(name, planId)) @@ -1323,13 +1323,15 @@ class Column private[sql] (@DeveloperApi val expr: proto.Expression) extends Log def over(): Column = over(Window.spec) } -private[sql] object Column { +object Column { - def apply(name: String): Column = new Column(name) + private[sql] def apply(name: String): Column = new Column(name) - def apply(name: String, planId: Option[Long]): Column = new Column(name, planId) + private[sql] def apply(name: String, planId: Option[Long]): Column = new Column(name, planId) - def nameToExpression(name: String, planId: Option[Long] = None): proto.Expression = { + private[sql] def nameToExpression( + name: String, + planId: Option[Long] = None): proto.Expression = { val builder = proto.Expression.newBuilder() name match { case "*" => @@ -1344,23 +1346,14 @@ private[sql] object Column { builder.build() } - private[sql] def apply(f: proto.Expression.Builder => Unit): Column = { + @Since("4.0.0") + @DeveloperApi + def apply(f: proto.Expression.Builder => Unit): Column = { val builder = proto.Expression.newBuilder() f(builder) new Column(builder.build()) } - @DeveloperApi - @deprecated("Use forExtension(Array[Byte]) instead", "4.0.0") - def apply(extension: com.google.protobuf.Any): Column = { - apply(_.setExtension(extension)) - } - - @DeveloperApi - def forExtension(extension: Array[Byte]): Column = { - apply(_.setExtension(com.google.protobuf.Any.parseFrom(extension))) - } - private[sql] def fn(name: String, inputs: Column*): Column = { fn(name, isDistinct = false, inputs: _*) } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala index 9a42afebf8f2b..e831c264e632a 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders._ import org.apache.spark.sql.catalyst.expressions.OrderUtils import org.apache.spark.sql.connect.client.SparkResult import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, StorageLevelProtoConverter, UdfUtils} -import org.apache.spark.sql.expressions.ScalarUserDefinedFunction +import 
org.apache.spark.sql.expressions.ScalaUserDefinedFunction import org.apache.spark.sql.functions.{struct, to_json} import org.apache.spark.sql.streaming.DataStreamWriter import org.apache.spark.sql.types.{Metadata, StructType} @@ -1387,7 +1387,7 @@ class Dataset[T] private[sql] ( * @since 3.5.0 */ def reduce(func: (T, T) => T): T = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( function = func, inputEncoders = agnosticEncoder :: agnosticEncoder :: Nil, outputEncoder = agnosticEncoder) @@ -2705,7 +2705,7 @@ class Dataset[T] private[sql] ( * @since 3.5.0 */ def filter(func: T => Boolean): Dataset[T] = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( function = func, inputEncoders = agnosticEncoder :: Nil, outputEncoder = PrimitiveBooleanEncoder) @@ -2758,7 +2758,7 @@ class Dataset[T] private[sql] ( */ def mapPartitions[U: Encoder](func: Iterator[T] => Iterator[U]): Dataset[U] = { val outputEncoder = encoderFor[U] - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( function = func, inputEncoders = agnosticEncoder :: Nil, outputEncoder = outputEncoder) @@ -2830,7 +2830,7 @@ class Dataset[T] private[sql] ( */ @deprecated("use flatMap() or select() with functions.explode() instead", "3.5.0") def explode[A <: Product: TypeTag](input: Column*)(f: Row => IterableOnce[A]): DataFrame = { - val generator = ScalarUserDefinedFunction( + val generator = ScalaUserDefinedFunction( UdfUtils.iterableOnceToSeq(f), UnboundRowEncoder :: Nil, ScalaReflection.encoderFor[Seq[A]]) @@ -2862,7 +2862,7 @@ class Dataset[T] private[sql] ( @deprecated("use flatMap() or select() with functions.explode() instead", "3.5.0") def explode[A, B: TypeTag](inputColumn: String, outputColumn: String)( f: A => IterableOnce[B]): DataFrame = { - val generator = ScalarUserDefinedFunction( + val generator = ScalaUserDefinedFunction( UdfUtils.iterableOnceToSeq(f), Nil, ScalaReflection.encoderFor[Seq[B]]) @@ -3337,24 +3337,170 @@ class Dataset[T] private[sql] ( } } + /** + * Define (named) metrics to observe on the Dataset. This method returns an 'observed' Dataset + * that returns the same result as the input, with the following guarantees:

+ *   • It will compute the defined aggregates (metrics) on all the data that is flowing through the Dataset at that point.
+ *   • It will report the value of the defined aggregate columns as soon as we reach a completion point. A completion point is currently defined as the end of a query.
Please note that continuous execution is currently not supported. + * + * The metrics columns must either contain a literal (e.g. lit(42)), or should contain one or + * more aggregate functions (e.g. sum(a) or sum(a + b) + avg(c) - lit(1)). Expressions that + * contain references to the input Dataset's columns must always be wrapped in an aggregate + * function. + * + * A user can retrieve the metrics by calling + * `org.apache.spark.sql.Dataset.collectResult().getObservedMetrics`. + * + * {{{ + * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it + * val observed_ds = ds.observe("my_metrics", count(lit(1)).as("rows"), max($"id").as("maxid")) + * observed_ds.write.parquet("ds.parquet") + * val metrics = observed_ds.collectResult().getObservedMetrics + * }}} + * + * @group typedrel + * @since 4.0.0 + */ + @scala.annotation.varargs def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = { - throw new UnsupportedOperationException("observe is not implemented.") + sparkSession.newDataset(agnosticEncoder) { builder => + builder.getCollectMetricsBuilder + .setInput(plan.getRoot) + .setName(name) + .addAllMetrics((expr +: exprs).map(_.expr).asJava) + } } - def checkpoint(): Dataset[T] = { - throw new UnsupportedOperationException("checkpoint is not implemented.") + /** + * Observe (named) metrics through an `org.apache.spark.sql.Observation` instance. This is + * equivalent to calling `observe(String, Column, Column*)` but does not require to collect all + * results before returning the metrics - the metrics are filled during iterating the results, + * as soon as they are available. This method does not support streaming datasets. + * + * A user can retrieve the metrics by accessing `org.apache.spark.sql.Observation.get`. + * + * {{{ + * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it + * val observation = Observation("my_metrics") + * val observed_ds = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid")) + * observed_ds.write.parquet("ds.parquet") + * val metrics = observation.get + * }}} + * + * @throws IllegalArgumentException + * If this is a streaming Dataset (this.isStreaming == true) + * + * @group typedrel + * @since 4.0.0 + */ + @scala.annotation.varargs + def observe(observation: Observation, expr: Column, exprs: Column*): Dataset[T] = { + val df = observe(observation.name, expr, exprs: _*) + sparkSession.registerObservation(df.getPlanId.get, observation) + df } - def checkpoint(eager: Boolean): Dataset[T] = { - throw new UnsupportedOperationException("checkpoint is not implemented.") - } + /** + * Eagerly checkpoint a Dataset and return the new Dataset. Checkpointing can be used to + * truncate the logical plan of this Dataset, which is especially useful in iterative algorithms + * where the plan may grow exponentially. It will be saved to files inside the checkpoint + * directory set with `SparkContext#setCheckpointDir`. + * + * @group basic + * @since 4.0.0 + */ + def checkpoint(): Dataset[T] = checkpoint(eager = true, reliableCheckpoint = true) - def localCheckpoint(): Dataset[T] = { - throw new UnsupportedOperationException("localCheckpoint is not implemented.") - } + /** + * Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the + * logical plan of this Dataset, which is especially useful in iterative algorithms where the + * plan may grow exponentially. 
It will be saved to files inside the checkpoint directory set + * with `SparkContext#setCheckpointDir`. + * + * @param eager + * Whether to checkpoint this dataframe immediately + * + * @note + * When checkpoint is used with eager = false, the final data that is checkpointed after the + * first action may be different from the data that was used during the job due to + * non-determinism of the underlying operation and retries. If checkpoint is used to achieve + * saving a deterministic snapshot of the data, eager = true should be used. Otherwise, it is + * only deterministic after the first execution, after the checkpoint was finalized. + * + * @group basic + * @since 4.0.0 + */ + def checkpoint(eager: Boolean): Dataset[T] = + checkpoint(eager = eager, reliableCheckpoint = true) + + /** + * Eagerly locally checkpoints a Dataset and return the new Dataset. Checkpointing can be used + * to truncate the logical plan of this Dataset, which is especially useful in iterative + * algorithms where the plan may grow exponentially. Local checkpoints are written to executor + * storage and despite potentially faster they are unreliable and may compromise job completion. + * + * @group basic + * @since 4.0.0 + */ + def localCheckpoint(): Dataset[T] = checkpoint(eager = true, reliableCheckpoint = false) - def localCheckpoint(eager: Boolean): Dataset[T] = { - throw new UnsupportedOperationException("localCheckpoint is not implemented.") + /** + * Locally checkpoints a Dataset and return the new Dataset. Checkpointing can be used to + * truncate the logical plan of this Dataset, which is especially useful in iterative algorithms + * where the plan may grow exponentially. Local checkpoints are written to executor storage and + * despite potentially faster they are unreliable and may compromise job completion. + * + * @param eager + * Whether to checkpoint this dataframe immediately + * + * @note + * When checkpoint is used with eager = false, the final data that is checkpointed after the + * first action may be different from the data that was used during the job due to + * non-determinism of the underlying operation and retries. If checkpoint is used to achieve + * saving a deterministic snapshot of the data, eager = true should be used. Otherwise, it is + * only deterministic after the first execution, after the checkpoint was finalized. + * + * @group basic + * @since 4.0.0 + */ + def localCheckpoint(eager: Boolean): Dataset[T] = + checkpoint(eager = eager, reliableCheckpoint = false) + + /** + * Returns a checkpointed version of this Dataset. + * + * @param eager + * Whether to checkpoint this dataframe immediately + * @param reliableCheckpoint + * Whether to create a reliable checkpoint saved to files inside the checkpoint directory. 
If + * false creates a local checkpoint using the caching subsystem + */ + private def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T] = { + sparkSession.newDataset(agnosticEncoder) { builder => + val command = sparkSession.newCommand { builder => + builder.getCheckpointCommandBuilder + .setLocal(!reliableCheckpoint) + .setEager(eager) + .setRelation(this.plan.getRoot) + } + val responseIter = sparkSession.execute(command) + try { + val response = responseIter + .find(_.hasCheckpointCommandResult) + .getOrElse(throw new RuntimeException("CheckpointCommandResult must be present")) + + val cachedRemoteRelation = response.getCheckpointCommandResult.getRelation + sparkSession.cleaner.register(cachedRemoteRelation) + + // Update the builder with the values from the result. + builder.setCachedRemoteRelation(cachedRemoteRelation) + } finally { + // consume the rest of the iterator + responseIter.foreach(_ => ()) + } + } } /** diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index e38adb9b0b27e..953cf23afc330 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -27,7 +27,7 @@ import org.apache.spark.connect.proto import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.ProductEncoder import org.apache.spark.sql.connect.common.UdfUtils -import org.apache.spark.sql.expressions.ScalarUserDefinedFunction +import org.apache.spark.sql.expressions.ScalaUserDefinedFunction import org.apache.spark.sql.functions.col import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StatefulProcessor, StatefulProcessorWithInitialState, TimeMode} @@ -1031,7 +1031,7 @@ private class KeyValueGroupedDatasetImpl[K, V, IK, IV]( override def reduceGroups(f: (V, V) => V): Dataset[(K, V)] = { val inputEncoders = Seq(vEncoder, vEncoder) - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( function = f, inputEncoders = inputEncoders, outputEncoder = vEncoder) @@ -1091,7 +1091,7 @@ private class KeyValueGroupedDatasetImpl[K, V, IK, IV]( private def getUdf[U: Encoder](nf: AnyRef, outputEncoder: AgnosticEncoder[U])( inEncoders: AgnosticEncoder[_]*): proto.CommonInlineUserDefinedFunction = { val inputEncoders = kEncoder +: inEncoders // Apply keyAs changes by setting kEncoder - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( function = nf, inputEncoders = inputEncoders, outputEncoder = outputEncoder) @@ -1110,7 +1110,7 @@ private object KeyValueGroupedDatasetImpl { ds: Dataset[V], kEncoder: AgnosticEncoder[K], groupingFunc: V => K): KeyValueGroupedDatasetImpl[K, V, K, V] = { - val gf = ScalarUserDefinedFunction( + val gf = ScalaUserDefinedFunction( function = groupingFunc, inputEncoders = ds.agnosticEncoder :: Nil, // Using the original value and key encoders outputEncoder = kEncoder) @@ -1132,7 +1132,7 @@ private object KeyValueGroupedDatasetImpl { vEncoder: AgnosticEncoder[V], groupingExprs: Seq[Column]): KeyValueGroupedDatasetImpl[K, V, K, V] = { // Use a dummy udf to pass the K V encoders - val dummyGroupingFunc = ScalarUserDefinedFunction( + val dummyGroupingFunc = ScalaUserDefinedFunction( function = UdfUtils.noOp[V, K](), inputEncoders = vEncoder :: Nil, 
outputEncoder = kEncoder).apply(col("*")) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Observation.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Observation.scala new file mode 100644 index 0000000000000..75629b6000f91 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Observation.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.UUID + +class Observation(name: String) extends ObservationBase(name) { + + /** + * Create an Observation instance without providing a name. This generates a random name. + */ + def this() = this(UUID.randomUUID().toString) +} + +/** + * (Scala-specific) Create instances of Observation via Scala `apply`. + * @since 4.0.0 + */ +object Observation { + + /** + * Observation constructor for creating an anonymous observation. + */ + def apply(): Observation = new Observation() + + /** + * Observation constructor for creating a named observation. 
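(Aside on the new connect-side Observation class: a minimal usage sketch pairing it with Dataset.observe, following the scaladoc example earlier in this patch. `ds` stands for any Dataset, the metric names and output path are illustrative, and `spark.implicits._` is assumed in scope for the `$"id"` syntax.)

  import org.apache.spark.sql.functions.{count, lit, max}

  val observation = Observation("my_metrics")
  // Attach named metrics; they are filled in while the write below consumes the result.
  val observed = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid"))
  observed.write.parquet("/tmp/ds.parquet")
  // Blocks until the metrics for this execution are available.
  val metrics = observation.get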
+ */ + def apply(name: String): Observation = new Observation(name) + +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index 6c626fd716d5b..7799d395d5c6a 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -149,6 +149,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newIntSeqEncoder: Encoder[Seq[Int]] = newSeqEncoder(PrimitiveIntEncoder) /** @@ -156,6 +157,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newLongSeqEncoder: Encoder[Seq[Long]] = newSeqEncoder(PrimitiveLongEncoder) /** @@ -163,6 +165,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newDoubleSeqEncoder: Encoder[Seq[Double]] = newSeqEncoder(PrimitiveDoubleEncoder) /** @@ -170,6 +173,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newFloatSeqEncoder: Encoder[Seq[Float]] = newSeqEncoder(PrimitiveFloatEncoder) /** @@ -177,6 +181,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newByteSeqEncoder: Encoder[Seq[Byte]] = newSeqEncoder(PrimitiveByteEncoder) /** @@ -184,6 +189,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newShortSeqEncoder: Encoder[Seq[Short]] = newSeqEncoder(PrimitiveShortEncoder) /** @@ -191,6 +197,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newBooleanSeqEncoder: Encoder[Seq[Boolean]] = newSeqEncoder(PrimitiveBooleanEncoder) /** @@ -198,6 +205,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") val newStringSeqEncoder: Encoder[Seq[String]] = newSeqEncoder(StringEncoder) /** @@ -205,6 +213,7 @@ abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrio * @deprecated * use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newProductSeqEncoder[A <: Product: TypeTag]: Encoder[Seq[A]] = newSeqEncoder(ScalaReflection.encoderFor[A]) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala index 5a2d9bc44c9f7..80336fb1eaea4 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -18,6 +18,7 @@ package 
org.apache.spark.sql import java.io.Closeable import java.net.URI +import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.TimeUnit._ import java.util.concurrent.atomic.{AtomicLong, AtomicReference} @@ -28,7 +29,7 @@ import com.google.common.cache.{CacheBuilder, CacheLoader} import io.grpc.ClientInterceptor import org.apache.arrow.memory.RootAllocator -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.connect.proto import org.apache.spark.connect.proto.ExecutePlanResponse import org.apache.spark.internal.Logging @@ -36,11 +37,11 @@ import org.apache.spark.sql.catalog.Catalog import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BoxedLongEncoder, UnboundRowEncoder} -import org.apache.spark.sql.connect.client.{ClassFinder, SparkConnectClient, SparkResult} +import org.apache.spark.sql.connect.client.{ClassFinder, CloseableIterator, SparkConnectClient, SparkResult} import org.apache.spark.sql.connect.client.SparkConnectClient.Configuration import org.apache.spark.sql.connect.client.arrow.ArrowSerializer import org.apache.spark.sql.functions.lit -import org.apache.spark.sql.internal.{CatalogImpl, SqlApiConf} +import org.apache.spark.sql.internal.{CatalogImpl, SessionCleaner, SqlApiConf} import org.apache.spark.sql.streaming.DataStreamReader import org.apache.spark.sql.streaming.StreamingQueryManager import org.apache.spark.sql.types.StructType @@ -72,6 +73,7 @@ class SparkSession private[sql] ( with Logging { private[this] val allocator = new RootAllocator() + private[sql] lazy val cleaner = new SessionCleaner(this) // a unique session ID for this session from client. private[sql] def sessionId: String = client.sessionId @@ -80,6 +82,8 @@ class SparkSession private[sql] ( client.analyze(proto.AnalyzePlanRequest.AnalyzeCase.SPARK_VERSION).getSparkVersion.getVersion } + private[sql] val observationRegistry = new ConcurrentHashMap[Long, Observation]() + /** * Runtime configuration interface for Spark. 
* @@ -482,11 +486,15 @@ class SparkSession private[sql] ( } } - private[sql] def newDataFrame(f: proto.Relation.Builder => Unit): DataFrame = { + @Since("4.0.0") + @DeveloperApi + def newDataFrame(f: proto.Relation.Builder => Unit): DataFrame = { newDataset(UnboundRowEncoder)(f) } - private[sql] def newDataset[T](encoder: AgnosticEncoder[T])( + @Since("4.0.0") + @DeveloperApi + def newDataset[T](encoder: AgnosticEncoder[T])( f: proto.Relation.Builder => Unit): Dataset[T] = { val builder = proto.Relation.newBuilder() f(builder) @@ -495,30 +503,6 @@ class SparkSession private[sql] ( new Dataset[T](this, plan, encoder) } - @DeveloperApi - @deprecated("Use newDataFrame(Array[Byte]) instead", "4.0.0") - def newDataFrame(extension: com.google.protobuf.Any): DataFrame = { - newDataFrame(_.setExtension(extension)) - } - - @DeveloperApi - @deprecated("Use newDataFrame(Array[Byte], AgnosticEncoder[T]) instead", "4.0.0") - def newDataset[T]( - extension: com.google.protobuf.Any, - encoder: AgnosticEncoder[T]): Dataset[T] = { - newDataset(encoder)(_.setExtension(extension)) - } - - @DeveloperApi - def newDataFrame(extension: Array[Byte]): DataFrame = { - newDataFrame(_.setExtension(com.google.protobuf.Any.parseFrom(extension))) - } - - @DeveloperApi - def newDataset[T](extension: Array[Byte], encoder: AgnosticEncoder[T]): Dataset[T] = { - newDataset(encoder)(_.setExtension(com.google.protobuf.Any.parseFrom(extension))) - } - private[sql] def newCommand[T](f: proto.Command.Builder => Unit): proto.Command = { val builder = proto.Command.newBuilder() f(builder) @@ -552,8 +536,12 @@ class SparkSession private[sql] ( private[sql] def execute[T](plan: proto.Plan, encoder: AgnosticEncoder[T]): SparkResult[T] = { val value = client.execute(plan) - val result = new SparkResult(value, allocator, encoder, timeZoneId) - result + new SparkResult( + value, + allocator, + encoder, + timeZoneId, + Some(setMetricsAndUnregisterObservation)) } private[sql] def execute(f: proto.Relation.Builder => Unit): Unit = { @@ -565,34 +553,23 @@ class SparkSession private[sql] ( client.execute(plan).foreach(_ => ()) } - private[sql] def execute(command: proto.Command): Seq[ExecutePlanResponse] = { + @Since("4.0.0") + @DeveloperApi + def execute(command: proto.Command): Seq[ExecutePlanResponse] = { val plan = proto.Plan.newBuilder().setCommand(command).build() // .toSeq forces that the iterator is consumed and closed. On top, ignore all // progress messages. client.execute(plan).filter(!_.hasExecutionProgress).toSeq } + private[sql] def execute(plan: proto.Plan): CloseableIterator[ExecutePlanResponse] = + client.execute(plan) + private[sql] def registerUdf(udf: proto.CommonInlineUserDefinedFunction): Unit = { val command = proto.Command.newBuilder().setRegisterFunction(udf).build() execute(command) } - @DeveloperApi - @deprecated("Use execute(Array[Byte]) instead", "4.0.0") - def execute(extension: com.google.protobuf.Any): Unit = { - val command = proto.Command.newBuilder().setExtension(extension).build() - execute(command) - } - - @DeveloperApi - def execute(extension: Array[Byte]): Unit = { - val command = proto.Command - .newBuilder() - .setExtension(com.google.protobuf.Any.parseFrom(extension)) - .build() - execute(command) - } - /** * Add a single artifact to the client session. 
* @@ -813,6 +790,21 @@ class SparkSession private[sql] ( * Set to false to prevent client.releaseSession on close() (testing only) */ private[sql] var releaseSessionOnClose = true + + private[sql] def registerObservation(planId: Long, observation: Observation): Unit = { + if (observationRegistry.putIfAbsent(planId, observation) != null) { + throw new IllegalArgumentException("An Observation can be used with a Dataset only once") + } + } + + private[sql] def setMetricsAndUnregisterObservation( + planId: Long, + metrics: Map[String, Any]): Unit = { + val observationOrNull = observationRegistry.remove(planId) + if (observationOrNull != null) { + observationOrNull.setMetricsAndNotify(Some(metrics)) + } + } } // The minimal builder needed to create a spark session. @@ -837,10 +829,16 @@ object SparkSession extends Logging { /** * Set the (global) default [[SparkSession]], and (thread-local) active [[SparkSession]] when - * they are not set yet. + * they are not set yet or the associated [[SparkConnectClient]] is unusable. */ private def setDefaultAndActiveSession(session: SparkSession): Unit = { - defaultSession.compareAndSet(null, session) + val currentDefault = defaultSession.getAcquire + if (currentDefault == null || !currentDefault.client.isSessionValid) { + // Update `defaultSession` if it is null or the contained session is not valid. There is a + // chance that the following `compareAndSet` fails if a new default session has just been set, + // but that does not matter since that event has happened after this method was invoked. + defaultSession.compareAndSet(currentDefault, session) + } if (getActiveSession.isEmpty) { setActiveSession(session) } @@ -980,7 +978,7 @@ object SparkSession extends Logging { def appName(name: String): Builder = this private def tryCreateSessionFromClient(): Option[SparkSession] = { - if (client != null) { + if (client != null && client.isSessionValid) { Option(new SparkSession(client, planIdGenerator)) } else { None @@ -1032,7 +1030,16 @@ object SparkSession extends Logging { */ def getOrCreate(): SparkSession = { val session = tryCreateSessionFromClient() - .getOrElse(sessions.get(builder.configuration)) + .getOrElse({ + var existingSession = sessions.get(builder.configuration) + if (!existingSession.client.isSessionValid) { + // If the cached session has become invalid, e.g., due to a server restart, the cache + // entry is invalidated. + sessions.invalidate(builder.configuration) + existingSession = sessions.get(builder.configuration) + } + existingSession + }) setDefaultAndActiveSession(session) applyOptions(session) session @@ -1040,11 +1047,13 @@ object SparkSession extends Logging { } /** - * Returns the default SparkSession. + * Returns the default SparkSession. If the previously set default SparkSession becomes + * unusable, returns None. * * @since 3.5.0 */ - def getDefaultSession: Option[SparkSession] = Option(defaultSession.get()) + def getDefaultSession: Option[SparkSession] = + Option(defaultSession.get()).filter(_.client.isSessionValid) /** * Sets the default SparkSession. @@ -1065,11 +1074,13 @@ object SparkSession extends Logging { } /** - * Returns the active SparkSession for the current thread. + * Returns the active SparkSession for the current thread. If the previously set active + * SparkSession becomes unusable, returns None. 
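Together these validity checks keep a stale client, e.g. one whose session died with a server restart, from being returned by the builder cache or the default/active accessors. A rough illustration of the intended behaviour; the connection string and the restart are hypothetical:

{{{
val first = SparkSession.builder().remote("sc://localhost").getOrCreate()
// ... the server restarts, so the client session behind `first` becomes invalid ...

// getOrCreate() notices the invalid cached session, evicts the cache entry and builds a
// fresh one; getDefaultSession/getActiveSession likewise stop returning the stale session.
val second = SparkSession.builder().remote("sc://localhost").getOrCreate()
assert(second ne first)
assert(SparkSession.getDefaultSession.forall(_ ne first))
}}}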
* * @since 3.5.0 */ - def getActiveSession: Option[SparkSession] = Option(activeThreadSession.get()) + def getActiveSession: Option[SparkSession] = + Option(activeThreadSession.get()).filter(_.client.isSessionValid) /** * Changes the SparkSession that will be returned in this thread and its children when diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 2e8211a0966e7..5965a2b7a61de 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -22,7 +22,7 @@ import scala.reflect.runtime.universe.{typeTag, TypeTag} import org.apache.spark.internal.Logging import org.apache.spark.sql.api.java._ import org.apache.spark.sql.connect.common.UdfUtils -import org.apache.spark.sql.expressions.{ScalarUserDefinedFunction, UserDefinedFunction} +import org.apache.spark.sql.expressions.{ScalaUserDefinedFunction, UserDefinedFunction} import org.apache.spark.sql.types.DataType /** @@ -62,7 +62,7 @@ class UDFRegistration(session: SparkSession) extends Logging { */ def register(name: String, udf: UserDefinedFunction): UserDefinedFunction = { udf.withName(name) match { - case scalarUdf: ScalarUserDefinedFunction => + case scalarUdf: ScalaUserDefinedFunction => session.registerUdf(scalarUdf.toProto) scalarUdf case other => @@ -97,7 +97,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register[RT: TypeTag](name: String, func: () => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction(func, typeTag[RT]) + val udf = ScalaUserDefinedFunction(func, typeTag[RT]) register(name, udf) } @@ -108,7 +108,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register[RT: TypeTag, A1: TypeTag](name: String, func: (A1) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction(func, typeTag[RT], typeTag[A1]) + val udf = ScalaUserDefinedFunction(func, typeTag[RT], typeTag[A1]) register(name, udf) } @@ -121,7 +121,7 @@ class UDFRegistration(session: SparkSession) extends Logging { def register[RT: TypeTag, A1: TypeTag, A2: TypeTag]( name: String, func: (A1, A2) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction(func, typeTag[RT], typeTag[A1], typeTag[A2]) + val udf = ScalaUserDefinedFunction(func, typeTag[RT], typeTag[A1], typeTag[A2]) register(name, udf) } @@ -134,7 +134,7 @@ class UDFRegistration(session: SparkSession) extends Logging { def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag]( name: String, func: (A1, A2, A3) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction(func, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3]) + val udf = ScalaUserDefinedFunction(func, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3]) register(name, udf) } @@ -147,7 +147,7 @@ class UDFRegistration(session: SparkSession) extends Logging { def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag]( name: String, func: (A1, A2, A3, A4) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -166,7 +166,7 @@ class UDFRegistration(session: SparkSession) extends Logging { def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag]( name: String, func: (A1, A2, A3, A4, A5) => RT): 
UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -191,7 +191,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: (A1, A2, A3, A4, A5, A6) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -220,7 +220,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A7: TypeTag]( name: String, func: (A1, A2, A3, A4, A5, A6, A7) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -251,7 +251,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A8: TypeTag]( name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -284,7 +284,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A9: TypeTag]( name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -319,7 +319,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A10: TypeTag]( name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -356,7 +356,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A11: TypeTag]( name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -395,7 +395,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A12: TypeTag]( name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -437,7 +437,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13) => RT) : UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -481,7 +481,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14) => RT) : UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -527,7 +527,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15) => RT) : UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -575,7 +575,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16) => RT) : UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -625,7 +625,7 @@ class UDFRegistration(session: SparkSession) extends Logging { 
name: String, func: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17) => RT) : UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -694,7 +694,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A16, A17, A18) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -766,7 +766,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A17, A18, A19) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -841,7 +841,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A18, A19, A20) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -919,7 +919,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A19, A20, A21) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -1000,7 +1000,7 @@ class UDFRegistration(session: SparkSession) extends Logging { A20, A21, A22) => RT): UserDefinedFunction = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( func, typeTag[RT], typeTag[A1], @@ -1037,7 +1037,7 @@ class UDFRegistration(session: SparkSession) extends Logging { // | * @since $version // | */ // |def register(name: String, f: UDF$i[$extTypeArgs], returnType: DataType): Unit = { - // | val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + // | val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) // | register(name, udf) // |}""".stripMargin) // } @@ -1047,7 +1047,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF0[_], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1056,7 +1056,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1065,7 +1065,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1074,7 +1074,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1083,7 +1083,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1092,7 +1092,7 @@ class 
UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1101,7 +1101,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1110,7 +1110,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1119,7 +1119,7 @@ class UDFRegistration(session: SparkSession) extends Logging { * @since 3.5.0 */ def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1131,7 +1131,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1143,7 +1143,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1155,7 +1155,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1167,7 +1167,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1179,7 +1179,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1191,7 +1191,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1203,7 +1203,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: 
DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1215,7 +1215,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1227,7 +1227,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1239,7 +1239,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1251,7 +1251,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1263,7 +1263,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1275,7 +1275,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } @@ -1287,7 +1287,7 @@ class UDFRegistration(session: SparkSession) extends Logging { name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { - val udf = ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + val udf = ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) register(name, udf) } // scalastyle:on line.size.limit diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala index 0360a40578869..9fd3ae4368f4c 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala @@ -26,6 +26,7 @@ import ammonite.compiler.iface.CodeWrapper import ammonite.util.{Bind, Imports, Name, Util} import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connect.client.{SparkConnectClient, SparkConnectClientParser} @@ -55,6 +56,10 @@ object ConnectRepl { inputStream: 
InputStream = System.in, outputStream: OutputStream = System.out, errorStream: OutputStream = System.err): Unit = { + // For interpreters, structured logging is disabled by default to avoid generating mixed + // plain text and structured logs on the same console. + Logging.disableStructuredLogging() + // Build the client. val client = try { diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala new file mode 100644 index 0000000000000..91c8fb57c31bf --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.expressions + +import org.apache.spark.sql.{Encoder, TypedColumn} + +/** + * A base class for user-defined aggregations, which can be used in `Dataset` operations to take + * all of the elements of a group and reduce them to a single value. + * + * For example, the following aggregator extracts an `int` from a specific class and adds them up: + * {{{ + * case class Data(i: Int) + * + * val customSummer = new Aggregator[Data, Int, Int] { + * def zero: Int = 0 + * def reduce(b: Int, a: Data): Int = b + a.i + * def merge(b1: Int, b2: Int): Int = b1 + b2 + * def finish(r: Int): Int = r + * def bufferEncoder: Encoder[Int] = Encoders.scalaInt + * def outputEncoder: Encoder[Int] = Encoders.scalaInt + * } + * + * spark.udf.register("customSummer", udaf(customSummer)) + * val ds: Dataset[Data] = ... + * val aggregated = ds.selectExpr("customSummer(i)") + * }}} + * + * Based loosely on Aggregator from Algebird: https://github.com/twitter/algebird + * + * @tparam IN + * The input type for the aggregation. + * @tparam BUF + * The type of the intermediate value of the reduction. + * @tparam OUT + * The type of the final output result. + * @since 4.0.0 + */ +@SerialVersionUID(2093413866369130093L) +abstract class Aggregator[-IN, BUF, OUT] extends Serializable { + + /** + * A zero value for this aggregation. Should satisfy the property that any b + zero = b. + * @since 4.0.0 + */ + def zero: BUF + + /** + * Combine two values to produce a new value. For performance, the function may modify `b` and + * return it instead of constructing new object for b. + * @since 4.0.0 + */ + def reduce(b: BUF, a: IN): BUF + + /** + * Merge two intermediate values. + * @since 4.0.0 + */ + def merge(b1: BUF, b2: BUF): BUF + + /** + * Transform the output of the reduction. + * @since 4.0.0 + */ + def finish(reduction: BUF): OUT + + /** + * Specifies the `Encoder` for the intermediate value type. 
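The contract reads as a distributed fold: each partition is folded with `reduce` starting from `zero`, partial buffers are combined with `merge`, and `finish` maps the final buffer to the output. A purely local sketch, reusing `Data` and `customSummer` from the example above:

{{{
// Simulate two partitions being aggregated and then merged.
val partitions = Seq(Seq(Data(1), Data(2)), Seq(Data(3), Data(4)))
val buffers    = partitions.map(_.foldLeft(customSummer.zero)(customSummer.reduce))
val result     = customSummer.finish(buffers.reduce(customSummer.merge))   // 10
}}}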
+ * @since 4.0.0 + */ + def bufferEncoder: Encoder[BUF] + + /** + * Specifies the `Encoder` for the final output value type. + * @since 4.0.0 + */ + def outputEncoder: Encoder[OUT] + + /** + * Returns this `Aggregator` as a `TypedColumn` that can be used in `Dataset`. operations. + */ + def toColumn: TypedColumn[IN, OUT] = { + throw new UnsupportedOperationException("toColumn is not implemented.") + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index c4431e9a87f12..f4499858306a1 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -94,16 +94,17 @@ sealed abstract class UserDefinedFunction { } /** - * Holder class for a scalar user-defined function and it's input/output encoder(s). + * Holder class for a scala user-defined function and it's input/output encoder(s). */ -case class ScalarUserDefinedFunction private[sql] ( +case class ScalaUserDefinedFunction private[sql] ( // SPARK-43198: Eagerly serialize to prevent the UDF from containing a reference to this class. serializedUdfPacket: Array[Byte], inputTypes: Seq[proto.DataType], outputType: proto.DataType, name: Option[String], override val nullable: Boolean, - override val deterministic: Boolean) + override val deterministic: Boolean, + aggregate: Boolean) extends UserDefinedFunction { private[this] lazy val udf = { @@ -114,6 +115,7 @@ case class ScalarUserDefinedFunction private[sql] ( .addAllInputTypes(inputTypes.asJava) .setOutputType(outputType) .setNullable(nullable) + .setAggregate(aggregate) scalaUdfBuilder.build() } @@ -129,11 +131,11 @@ case class ScalarUserDefinedFunction private[sql] ( name.foreach(udfBuilder.setFunctionName) } - override def withName(name: String): ScalarUserDefinedFunction = copy(name = Option(name)) + override def withName(name: String): ScalaUserDefinedFunction = copy(name = Option(name)) - override def asNonNullable(): ScalarUserDefinedFunction = copy(nullable = false) + override def asNonNullable(): ScalaUserDefinedFunction = copy(nullable = false) - override def asNondeterministic(): ScalarUserDefinedFunction = copy(deterministic = false) + override def asNondeterministic(): ScalaUserDefinedFunction = copy(deterministic = false) def toProto: proto.CommonInlineUserDefinedFunction = { val builder = proto.CommonInlineUserDefinedFunction.newBuilder() @@ -146,7 +148,7 @@ case class ScalarUserDefinedFunction private[sql] ( } } -object ScalarUserDefinedFunction { +object ScalaUserDefinedFunction { private val LAMBDA_DESERIALIZATION_ERR_MSG: String = "cannot assign instance of java.lang.invoke.SerializedLambda to field" @@ -169,9 +171,9 @@ object ScalarUserDefinedFunction { private[sql] def apply( function: AnyRef, returnType: TypeTag[_], - parameterTypes: TypeTag[_]*): ScalarUserDefinedFunction = { + parameterTypes: TypeTag[_]*): ScalaUserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( function = function, // Input can be a row because the input data schema can be found from the plan. 
inputEncoders = @@ -183,22 +185,24 @@ object ScalarUserDefinedFunction { private[sql] def apply( function: AnyRef, inputEncoders: Seq[AgnosticEncoder[_]], - outputEncoder: AgnosticEncoder[_]): ScalarUserDefinedFunction = { + outputEncoder: AgnosticEncoder[_], + aggregate: Boolean = false): ScalaUserDefinedFunction = { SparkConnectClosureCleaner.clean(function) val udfPacketBytes = SparkSerDeUtils.serialize(UdfPacket(function, inputEncoders, outputEncoder)) checkDeserializable(udfPacketBytes) - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( serializedUdfPacket = udfPacketBytes, inputTypes = inputEncoders.map(_.dataType).map(DataTypeProtoConverter.toConnectProtoType), outputType = DataTypeProtoConverter.toConnectProtoType(outputEncoder.dataType), name = None, nullable = true, - deterministic = true) + deterministic = true, + aggregate = aggregate) } - private[sql] def apply(function: AnyRef, returnType: DataType): ScalarUserDefinedFunction = { - ScalarUserDefinedFunction( + private[sql] def apply(function: AnyRef, returnType: DataType): ScalaUserDefinedFunction = { + ScalaUserDefinedFunction( function = function, inputEncoders = Seq.empty[AgnosticEncoder[_]], outputEncoder = RowEncoder.encoderForDataType(returnType, lenient = false)) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index f267baf9854e9..eae239a25589c 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -23,11 +23,12 @@ import scala.reflect.runtime.universe.{typeTag, TypeTag} import org.apache.spark.connect.proto import org.apache.spark.sql.api.java._ +import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.PrimitiveLongEncoder import org.apache.spark.sql.connect.common.LiteralValueProtoConverter._ import org.apache.spark.sql.connect.common.UdfUtils import org.apache.spark.sql.errors.DataTypeErrors -import org.apache.spark.sql.expressions.{ScalarUserDefinedFunction, UserDefinedFunction} +import org.apache.spark.sql.expressions.{Aggregator, ScalaUserDefinedFunction, UserDefinedFunction} import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.types.DataType.parseTypeWithFallback import org.apache.spark.util.SparkClassUtils @@ -75,6 +76,7 @@ import org.apache.spark.util.SparkClassUtils * @groupname struct_funcs Struct functions * @groupname csv_funcs CSV functions * @groupname json_funcs JSON functions + * @groupname variant_funcs VARIANT functions * @groupname xml_funcs XML functions * @groupname url_funcs URL functions * @groupname partition_transforms Partition transform functions @@ -1931,6 +1933,14 @@ object functions { */ def try_divide(left: Column, right: Column): Column = Column.fn("try_divide", left, right) + /** + * Returns the remainder of `dividend``/``divisor`. Its result is always null if `divisor` is 0. + * + * @group math_funcs + * @since 4.0.0 + */ + def try_remainder(left: Column, right: Column): Column = Column.fn("try_remainder", left, right) + /** * Returns `left``*``right` and the result is null on overflow. The acceptable input types are * the same with the `*` operator. 
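A short usage sketch of the new null-safe remainder next to the existing `try_divide`; the column names are placeholders:

{{{
import org.apache.spark.sql.functions.{col, try_divide, try_remainder}

// Both return NULL instead of failing when the divisor is 0.
df.select(
  try_divide(col("amount"), col("parts")).as("per_part"),
  try_remainder(col("amount"), col("parts")).as("left_over"))
}}}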
@@ -4197,6 +4207,20 @@ object functions { */ def split(str: Column, pattern: String): Column = Column.fn("split", str, lit(pattern)) + /** + * Splits str around matches of the given pattern. + * + * @param str + * a string expression to split + * @param pattern + * a column of string representing a regular expression. The regex string should be a Java + * regular expression. + * + * @group string_funcs + * @since 4.0.0 + */ + def split(str: Column, pattern: Column): Column = Column.fn("split", str, pattern) + /** * Splits str around matches of the given pattern. * @@ -4218,6 +4242,27 @@ object functions { def split(str: Column, pattern: String, limit: Int): Column = Column.fn("split", str, lit(pattern), lit(limit)) + /** + * Splits str around matches of the given pattern. + * + * @param str + * a string expression to split + * @param pattern + * a column of string representing a regular expression. The regex string should be a Java + * regular expression. + * @param limit + * a column of integer expression which controls the number of times the regex is applied. + *
  • limit greater than 0: The resulting array's length will not be more than limit, + * and the resulting array's last entry will contain all input beyond the last matched + * regex.
  • limit less than or equal to 0: `regex` will be applied as many times as + * possible, and the resulting array can be of any size.
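Because both `pattern` and `limit` may now be column expressions, the regex and the split count can vary per row. A small sketch; column names are assumed:

{{{
import org.apache.spark.sql.functions.{col, lit, split}

df.select(
  split(col("line"), col("sep_regex")).as("parts"),          // per-row regex
  split(col("line"), lit(","), lit(2)).as("head_and_rest"))  // Column-typed limit
}}}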
+ * + * @group string_funcs + * @since 4.0.0 + */ + def split(str: Column, pattern: Column, limit: Column): Column = + Column.fn("split", str, pattern, limit) + /** * Substring starts at `pos` and is of length `len` when str is String type or returns the slice * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type @@ -4231,6 +4276,19 @@ object functions { def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) + /** + * Substring starts at `pos` and is of length `len` when str is String type or returns the slice + * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type + * + * @note + * The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Column, len: Column): Column = + Column.fn("substring", str, pos, len) + /** * Returns the substring from string str before count occurrences of the delimiter delim. If * count is positive, everything the left of the final delimiter (counting from left) is @@ -5909,6 +5967,25 @@ object functions { */ def timestamp_micros(e: Column): Column = Column.fn("timestamp_micros", e) + /** + * Gets the difference between the timestamps in the specified units by truncating the fraction + * part. + * + * @group datetime_funcs + * @since 4.0.0 + */ + def timestamp_diff(unit: String, start: Column, end: Column): Column = + Column.fn("timestampdiff", lit(unit), start, end) + + /** + * Adds the specified number of units to the given timestamp. + * + * @group datetime_funcs + * @since 4.0.0 + */ + def timestamp_add(unit: String, quantity: Column, ts: Column): Column = + Column.fn("timestampadd", lit(unit), quantity, ts) + /** * Parses the `timestamp` expression with the `format` expression to a timestamp without time * zone. Returns null with invalid input. @@ -6965,16 +7042,92 @@ object functions { } /** - * Parses a JSON string and constructs a Variant value. + * Parses a JSON string and constructs a Variant value. Returns null if the input string is not + * a valid JSON value. * * @param json * a string column that contains JSON data. * - * @group json_funcs + * @group variant_funcs + * @since 4.0.0 + */ + def try_parse_json(json: Column): Column = Column.fn("try_parse_json", json) + + /** + * Parses a JSON string and constructs a Variant value. + * + * @param json + * a string column that contains JSON data. + * @group variant_funcs * @since 4.0.0 */ def parse_json(json: Column): Column = Column.fn("parse_json", json) + /** + * Check if a variant value is a variant null. Returns true if and only if the input is a + * variant null and false otherwise (including in the case of SQL NULL). + * + * @param v + * a variant column. + * @group variant_funcs + * @since 4.0.0 + */ + def is_variant_null(v: Column): Column = Column.fn("is_variant_null", v) + + /** + * Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to + * `targetType`. Returns null if the path does not exist. Throws an exception if the cast fails. + * + * @param v + * a variant column. + * @param path + * the extraction path. A valid path should start with `$` and is followed by zero or more + * segments like `[123]`, `.name`, `['name']`, or `["name"]`. + * @param targetType + * the target data type to cast into, in a DDL-formatted string. 
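Taken together, the new VARIANT helpers cover parsing, extraction and schema inspection. A short sketch; the `payload` column and the paths are placeholders:

{{{
import org.apache.spark.sql.functions._

val parsed = df.select(try_parse_json(col("payload")).as("v"))   // NULL for invalid JSON
parsed.select(
  variant_get(col("v"), "$.user.id", "bigint").as("user_id"),
  try_variant_get(col("v"), "$.user.name", "string").as("user_name"),  // NULL if the cast fails
  is_variant_null(col("v")).as("is_variant_null"),
  schema_of_variant(col("v")).as("schema"))
}}}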
+ * @group variant_funcs + * @since 4.0.0 + */ + def variant_get(v: Column, path: String, targetType: String): Column = + Column.fn("variant_get", v, lit(path), lit(targetType)) + + /** + * Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to + * `targetType`. Returns null if the path does not exist or the cast fails.. + * + * @param v + * a variant column. + * @param path + * the extraction path. A valid path should start with `$` and is followed by zero or more + * segments like `[123]`, `.name`, `['name']`, or `["name"]`. + * @param targetType + * the target data type to cast into, in a DDL-formatted string. + * @group variant_funcs + * @since 4.0.0 + */ + def try_variant_get(v: Column, path: String, targetType: String): Column = + Column.fn("try_variant_get", v, lit(path), lit(targetType)) + + /** + * Returns schema in the SQL format of a variant. + * + * @param v + * a variant column. + * @group variant_funcs + * @since 4.0.0 + */ + def schema_of_variant(v: Column): Column = Column.fn("schema_of_variant", v) + + /** + * Returns the merged schema in the SQL format of a variant column. + * + * @param v + * a variant column. + * @group variant_funcs + * @since 4.0.0 + */ + def schema_of_variant_agg(v: Column): Column = Column.fn("schema_of_variant_agg", v) + /** * Parses a JSON string and infers its schema in DDL format. * @@ -7078,9 +7231,9 @@ object functions { /** * Returns length of array or map. * - * The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or - * spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. - * With the default settings, the function returns -1 for null input. + * This function returns -1 for null input only if spark.sql.ansi.enabled is false and + * spark.sql.legacy.sizeOfNull is true. Otherwise, it returns null for null input. With the + * default settings, the function returns null for null input. * * @group collection_funcs * @since 3.4.0 @@ -7556,9 +7709,9 @@ object functions { /** * Returns length of array or map. This is an alias of `size` function. * - * The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or - * spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. - * With the default settings, the function returns -1 for null input. + * This function returns -1 for null input only if spark.sql.ansi.enabled is false and + * spark.sql.legacy.sizeOfNull is true. Otherwise, it returns null for null input. With the + * default settings, the function returns null for null input. * * @group collection_funcs * @since 3.5.0 @@ -7974,6 +8127,87 @@ object functions { // scalastyle:off line.size.limit + /** + * Obtains a `UserDefinedFunction` that wraps the given `Aggregator` so that it may be used with + * untyped Data Frames. + * {{{ + * val agg = // Aggregator[IN, BUF, OUT] + * + * // declare a UDF based on agg + * val aggUDF = udaf(agg) + * val aggData = df.agg(aggUDF($"colname")) + * + * // register agg as a named function + * spark.udf.register("myAggName", udaf(agg)) + * }}} + * + * @tparam IN + * the aggregator input type + * @tparam BUF + * the aggregating buffer type + * @tparam OUT + * the finalized output type + * + * @param agg + * the typed Aggregator + * + * @return + * a UserDefinedFunction that can be used as an aggregating expression. + * + * @group udf_funcs + * @note + * The input encoder is inferred from the input type IN. 
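End to end, an `Aggregator` flows through `udaf` into both untyped DataFrame aggregation and SQL registration. A sketch reusing `customSummer` from the `Aggregator` scaladoc; `df` and the table name are placeholders:

{{{
import org.apache.spark.sql.functions.{col, udaf}

val summer = udaf(customSummer)            // input encoder inferred from IN = Data
df.agg(summer(col("i")))                   // untyped aggregation

spark.udf.register("customSummer", summer) // also callable from SQL
spark.sql("SELECT customSummer(i) FROM data_table")
}}}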
+ * @since 4.0.0 + */ + def udaf[IN: TypeTag, BUF, OUT](agg: Aggregator[IN, BUF, OUT]): UserDefinedFunction = { + udaf(agg, ScalaReflection.encoderFor[IN]) + } + + /** + * Obtains a `UserDefinedFunction` that wraps the given `Aggregator` so that it may be used with + * untyped Data Frames. + * {{{ + * Aggregator agg = // custom Aggregator + * Encoder enc = // input encoder + * + * // declare a UDF based on agg + * UserDefinedFunction aggUDF = udaf(agg, enc) + * DataFrame aggData = df.agg(aggUDF($"colname")) + * + * // register agg as a named function + * spark.udf.register("myAggName", udaf(agg, enc)) + * }}} + * + * @tparam IN + * the aggregator input type + * @tparam BUF + * the aggregating buffer type + * @tparam OUT + * the finalized output type + * + * @param agg + * the typed Aggregator + * @param inputEncoder + * a specific input encoder to use + * + * @return + * a UserDefinedFunction that can be used as an aggregating expression + * + * @group udf_funcs + * @note + * This overloading takes an explicit input encoder, to support UDAF declarations in Java. + * @since 4.0.0 + */ + def udaf[IN, BUF, OUT]( + agg: Aggregator[IN, BUF, OUT], + inputEncoder: Encoder[IN]): UserDefinedFunction = { + ScalaUserDefinedFunction( + agg, + Seq(encoderFor(inputEncoder)), + encoderFor(agg.outputEncoder), + aggregate = true) + } + /** * Defines a Scala closure of 0 arguments as user-defined function (UDF). The data types are * automatically inferred based on the Scala closure's signature. By default the returned UDF is @@ -7984,7 +8218,7 @@ object functions { * @since 3.4.0 */ def udf[RT: TypeTag](f: () => RT): UserDefinedFunction = { - ScalarUserDefinedFunction(f, typeTag[RT]) + ScalaUserDefinedFunction(f, typeTag[RT]) } /** @@ -7997,7 +8231,7 @@ object functions { * @since 3.4.0 */ def udf[RT: TypeTag, A1: TypeTag](f: A1 => RT): UserDefinedFunction = { - ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1]) + ScalaUserDefinedFunction(f, typeTag[RT], typeTag[A1]) } /** @@ -8010,7 +8244,7 @@ object functions { * @since 3.4.0 */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag](f: (A1, A2) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2]) + ScalaUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2]) } /** @@ -8024,7 +8258,7 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag]( f: (A1, A2, A3) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3]) + ScalaUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3]) } /** @@ -8038,7 +8272,7 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag]( f: (A1, A2, A3, A4) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3], typeTag[A4]) + ScalaUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3], typeTag[A4]) } /** @@ -8052,7 +8286,7 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag]( f: (A1, A2, A3, A4, A5) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( f, typeTag[RT], typeTag[A1], @@ -8079,7 +8313,7 @@ object functions { A4: TypeTag, A5: TypeTag, A6: TypeTag](f: (A1, A2, A3, A4, A5, A6) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( f, typeTag[RT], typeTag[A1], @@ -8108,7 +8342,7 @@ object functions { A5: TypeTag, A6: TypeTag, A7: 
TypeTag](f: (A1, A2, A3, A4, A5, A6, A7) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( f, typeTag[RT], typeTag[A1], @@ -8139,7 +8373,7 @@ object functions { A6: TypeTag, A7: TypeTag, A8: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7, A8) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( f, typeTag[RT], typeTag[A1], @@ -8172,7 +8406,7 @@ object functions { A7: TypeTag, A8: TypeTag, A9: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7, A8, A9) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( f, typeTag[RT], typeTag[A1], @@ -8207,7 +8441,7 @@ object functions { A8: TypeTag, A9: TypeTag, A10: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10) => RT): UserDefinedFunction = { - ScalarUserDefinedFunction( + ScalaUserDefinedFunction( f, typeTag[RT], typeTag[A1], @@ -8236,7 +8470,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF0[_], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8249,7 +8483,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF1[_, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8262,7 +8496,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF2[_, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8275,7 +8509,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF3[_, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8288,7 +8522,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF4[_, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8301,7 +8535,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF5[_, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8314,7 +8548,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF6[_, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8327,7 +8561,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8340,7 +8574,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8353,7 +8587,7 @@ object functions { * @since 3.5.0 */ def udf(f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } /** @@ -8368,7 +8602,7 @@ object functions { def udf( f: UDF10[_, _, _, _, _, 
_, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(UdfUtils.wrap(f), returnType) + ScalaUserDefinedFunction(UdfUtils.wrap(f), returnType) } // scalastyle:off line.size.limit @@ -8398,7 +8632,7 @@ object functions { "Please use Scala `udf` method without return type parameter.", "3.0.0") def udf(f: AnyRef, dataType: DataType): UserDefinedFunction = { - ScalarUserDefinedFunction(f, dataType) + ScalaUserDefinedFunction(f, dataType) } /** diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/SessionCleaner.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/SessionCleaner.scala new file mode 100644 index 0000000000000..21e4f4d141a89 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/internal/SessionCleaner.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.internal + +import java.lang.ref.Cleaner + +import org.apache.spark.connect.proto +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession + +private[sql] class SessionCleaner(session: SparkSession) extends Logging { + private val cleaner = Cleaner.create() + + /** Register a CachedRemoteRelation for cleanup when it is garbage collected. 
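`SessionCleaner` builds on `java.lang.ref.Cleaner`: the registered action runs after the tracked object becomes unreachable, and it must not capture that object (here only the relation id is captured). A generic sketch of the pattern, independent of Spark:

{{{
import java.lang.ref.Cleaner

val cleaner = Cleaner.create()

final class CachedHandle(val id: String)
var handle = new CachedHandle("df-123")

// Capture only the id; capturing `handle` itself would keep it reachable forever.
val id = handle.id
cleaner.register(handle, () => println(s"releasing server-side cache for $id"))

handle = null   // once unreachable and collected, the action runs on the cleaner thread
}}}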
*/ + def register(relation: proto.CachedRemoteRelation): Unit = { + val dfID = relation.getRelationId + cleaner.register(relation, () => doCleanupCachedRemoteRelation(dfID)) + } + + private[sql] def doCleanupCachedRemoteRelation(dfID: String): Unit = { + try { + if (!session.client.channel.isShutdown) { + session.execute { + session.newCommand { builder => + builder.getRemoveCachedRemoteRelationCommandBuilder + .setRelation(proto.CachedRemoteRelation.newBuilder().setRelationId(dfID).build()) + } + } + } + } catch { + case e: Throwable => logError("Error in cleaning thread", e) + } + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index f05d29c6f1ab4..fe68f3cb0b572 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.execution.streaming.AvailableNowTrigger import org.apache.spark.sql.execution.streaming.ContinuousTrigger import org.apache.spark.sql.execution.streaming.OneTimeTrigger import org.apache.spark.sql.execution.streaming.ProcessingTimeTrigger +import org.apache.spark.sql.streaming.StreamingQueryListener.QueryStartedEvent import org.apache.spark.sql.types.NullType import org.apache.spark.util.SparkSerDeUtils @@ -297,6 +298,11 @@ final class DataStreamWriter[T] private[sql] (ds: Dataset[T]) extends Logging { .build() val resp = ds.sparkSession.execute(startCmd).head + if (resp.getWriteStreamOperationStartResult.hasQueryStartedEventJson) { + val event = QueryStartedEvent.fromJson( + resp.getWriteStreamOperationStartResult.getQueryStartedEventJson) + ds.sparkSession.streams.streamingQueryListenerBus.postToAll(event) + } RemoteStreamingQuery.fromStartCommandResponse(ds.sparkSession, resp) } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala index 404bd1b078ba4..fcb4bdcb327bc 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.streaming import java.util.UUID -import org.json4s.{JObject, JString} -import org.json4s.JsonAST.JValue +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} +import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} +import org.json4s.{JObject, JString, JValue} import org.json4s.JsonDSL.{jobject2assoc, pair2Assoc} import org.json4s.jackson.JsonMethods.{compact, render} @@ -120,6 +121,21 @@ object StreamingQueryListener extends Serializable { } } + private[spark] object QueryStartedEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryStartedEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryStartedEvent = + mapper.readValue[QueryStartedEvent](json) + } + /** * Event representing any progress updates in a 
query. * @param progress @@ -136,6 +152,21 @@ object StreamingQueryListener extends Serializable { private def jsonValue: JValue = JObject("progress" -> progress.jsonValue) } + private[spark] object QueryProgressEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryProgressEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryProgressEvent = + mapper.readValue[QueryProgressEvent](json) + } + /** * Event representing that query is idle and waiting for new data to process. * @@ -161,6 +192,21 @@ object StreamingQueryListener extends Serializable { } } + private[spark] object QueryIdleEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryTerminatedEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryTerminatedEvent = + mapper.readValue[QueryTerminatedEvent](json) + } + /** * Event representing that termination of a query. * @@ -199,4 +245,19 @@ object StreamingQueryListener extends Serializable { ("errorClassOnException" -> JString(errorClassOnException.orNull)) } } + + private[spark] object QueryTerminatedEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryTerminatedEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryTerminatedEvent = + mapper.readValue[QueryTerminatedEvent](json) + } } diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListenerBus.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListenerBus.scala new file mode 100644 index 0000000000000..c2934bcfa7058 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListenerBus.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming + +import java.util.concurrent.CopyOnWriteArrayList + +import scala.jdk.CollectionConverters._ + +import org.apache.spark.connect.proto.{Command, ExecutePlanResponse, Plan, StreamingQueryEventType} +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connect.client.CloseableIterator +import org.apache.spark.sql.streaming.StreamingQueryListener.{Event, QueryIdleEvent, QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent} + +class StreamingQueryListenerBus(sparkSession: SparkSession) extends Logging { + private val listeners = new CopyOnWriteArrayList[StreamingQueryListener]() + private var executionThread: Option[Thread] = Option.empty + + val lock = new Object() + + def close(): Unit = { + listeners.forEach(remove(_)) + } + + def append(listener: StreamingQueryListener): Unit = lock.synchronized { + listeners.add(listener) + + if (listeners.size() == 1) { + var iter: Option[CloseableIterator[ExecutePlanResponse]] = Option.empty + try { + iter = Some(registerServerSideListener()) + } catch { + case e: Exception => + logWarning("Failed to add the listener, please add it again.", e) + listeners.remove(listener) + return + } + executionThread = Some(new Thread(new Runnable { + def run(): Unit = { + queryEventHandler(iter.get) + } + })) + // Start the thread + executionThread.get.start() + } + } + + def remove(listener: StreamingQueryListener): Unit = lock.synchronized { + if (listeners.size() == 1) { + val cmdBuilder = Command.newBuilder() + cmdBuilder.getStreamingQueryListenerBusCommandBuilder + .setRemoveListenerBusListener(true) + try { + sparkSession.execute(cmdBuilder.build()) + } catch { + case e: Exception => + logWarning("Failed to remove the listener, please remove it again.", e) + return + } + if (executionThread.isDefined) { + executionThread.get.interrupt() + executionThread = Option.empty + } + } + listeners.remove(listener) + } + + def list(): Array[StreamingQueryListener] = lock.synchronized { + listeners.asScala.toArray + } + + def registerServerSideListener(): CloseableIterator[ExecutePlanResponse] = { + val cmdBuilder = Command.newBuilder() + cmdBuilder.getStreamingQueryListenerBusCommandBuilder + .setAddListenerBusListener(true) + + val plan = Plan.newBuilder().setCommand(cmdBuilder.build()).build() + val iterator = sparkSession.client.execute(plan) + while (iterator.hasNext) { + val response = iterator.next() + if (response.getStreamingQueryListenerEventsResult.hasListenerBusListenerAdded && + response.getStreamingQueryListenerEventsResult.getListenerBusListenerAdded) { + return iterator + } + } + iterator + } + + def queryEventHandler(iter: CloseableIterator[ExecutePlanResponse]): Unit = { + try { + while (iter.hasNext) { + val response = iter.next() + val listenerEvents = response.getStreamingQueryListenerEventsResult.getEventsList + listenerEvents.forEach(event => { + event.getEventType match { + case StreamingQueryEventType.QUERY_PROGRESS_EVENT => + postToAll(QueryProgressEvent.fromJson(event.getEventJson)) + case StreamingQueryEventType.QUERY_IDLE_EVENT => + postToAll(QueryIdleEvent.fromJson(event.getEventJson)) + case StreamingQueryEventType.QUERY_TERMINATED_EVENT => + postToAll(QueryTerminatedEvent.fromJson(event.getEventJson)) + case _ => + logWarning(log"Unknown StreamingQueryListener event: ${MDC(LogKeys.EVENT, event)}") + } + }) + } + } catch { + case e: Exception => + logWarning( + "StreamingQueryListenerBus Handler thread received 
exception, all client" + + " side listeners are removed and handler thread is terminated.", + e) + lock.synchronized { + executionThread = Option.empty + listeners.forEach(remove(_)) + } + } + } + + def postToAll(event: Event): Unit = lock.synchronized { + listeners.forEach(listener => + try { + event match { + case t: QueryStartedEvent => + listener.onQueryStarted(t) + case t: QueryProgressEvent => + listener.onQueryProgress(t) + case t: QueryIdleEvent => + listener.onQueryIdle(t) + case t: QueryTerminatedEvent => + listener.onQueryTerminated(t) + case _ => + logWarning(log"Unknown StreamingQueryListener event: ${MDC(LogKeys.EVENT, event)}") + } + } catch { + case e: Exception => + logWarning(log"Listener ${MDC(LogKeys.LISTENER, listener)} threw an exception", e) + }) + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index fd33efd721932..7efced227d6d1 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -22,16 +22,13 @@ import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap} import scala.jdk.CollectionConverters._ -import com.google.protobuf.ByteString - import org.apache.spark.annotation.Evolving import org.apache.spark.connect.proto.Command import org.apache.spark.connect.proto.StreamingQueryManagerCommand import org.apache.spark.connect.proto.StreamingQueryManagerCommandResult import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.connect.common.{InvalidPlanInput, StreamingListenerPacket} -import org.apache.spark.util.SparkSerDeUtils +import org.apache.spark.sql.connect.common.InvalidPlanInput /** * A class to manage all the [[StreamingQuery]] active in a `SparkSession`. @@ -50,6 +47,12 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo private lazy val listenerCache: ConcurrentMap[String, StreamingQueryListener] = new ConcurrentHashMap() + private[spark] val streamingQueryListenerBus = new StreamingQueryListenerBus(sparkSession) + + private[spark] def close(): Unit = { + streamingQueryListenerBus.close() + } + /** * Returns a list of active queries associated with this SQLContext * @@ -153,17 +156,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo * @since 3.5.0 */ def addListener(listener: StreamingQueryListener): Unit = { - // TODO: [SPARK-44400] Improve the Listener to provide users a way to access the Spark session - // and perform arbitrary actions inside the Listener. Right now users can use - // `val spark = SparkSession.builder.getOrCreate()` to create a Spark session inside the - // Listener, but this is a legacy session instead of a connect remote session. 
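For orientation, this is roughly how the new bus is consumed from application code: listeners are ordinary `StreamingQueryListener` implementations, and the `StreamingQueryManager` changes around this point simply delegate `addListener`/`removeListener`/`listListeners` to `StreamingQueryListenerBus`. A hedged usage sketch, assuming an existing Connect `SparkSession` is passed in:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

object ListenerUsageSketch {
  def register(spark: SparkSession): StreamingQueryListener = {
    val listener = new StreamingQueryListener {
      override def onQueryStarted(event: QueryStartedEvent): Unit =
        println(s"started: ${event.id}")
      override def onQueryProgress(event: QueryProgressEvent): Unit =
        println(s"progress: batch ${event.progress.batchId}")
      override def onQueryIdle(event: QueryIdleEvent): Unit = ()
      override def onQueryTerminated(event: QueryTerminatedEvent): Unit =
        println(s"terminated: ${event.id}")
    }
    // The first listener added opens the long-running server-side event stream;
    // later adds and removes only touch the client-side list until the last
    // listener is removed, which tears the server-side listener down.
    spark.streams.addListener(listener)
    listener
  }
}
```

Removing the listener later with `spark.streams.removeListener(listener)` mirrors the `remove` path above.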
- val id = UUID.randomUUID.toString - cacheListenerById(id, listener) - executeManagerCmd( - _.getAddListenerBuilder - .setListenerPayload(ByteString.copyFrom(SparkSerDeUtils - .serialize(StreamingListenerPacket(id, listener)))) - .setId(id)) + streamingQueryListenerBus.append(listener) } /** @@ -172,11 +165,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo * @since 3.5.0 */ def removeListener(listener: StreamingQueryListener): Unit = { - val id = getIdByListener(listener) - executeManagerCmd( - _.getRemoveListenerBuilder - .setId(id)) - removeCachedListener(id) + streamingQueryListenerBus.remove(listener) } /** @@ -185,10 +174,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo * @since 3.5.0 */ def listListeners(): Array[StreamingQueryListener] = { - executeManagerCmd(_.setListListeners(true)).getListListeners.getListenerIdsList.asScala - .filter(listenerCache.containsKey(_)) - .map(listenerCache.get(_)) - .toArray + streamingQueryListenerBus.list() } private def executeManagerCmd( diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/progress.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/progress.scala index a0c124f810e92..ebd13bc248f97 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/progress.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/progress.scala @@ -322,7 +322,7 @@ private object SafeJsonSerializer { /** Convert map to JValue while handling empty maps. Also, this sorts the keys. */ def safeMapToJValue[T](map: ju.Map[String, T], valueToJValue: T => JValue): JValue = { - if (map.isEmpty) return JNothing + if (map == null || map.isEmpty) return JNothing val keys = map.asScala.keySet.toSeq.sorted keys.map { k => k -> valueToJValue(map.get(k)): JObject }.reduce(_ ~ _) } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CatalogSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CatalogSuite.scala index d646fad00c075..0e3a683d2701d 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CatalogSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CatalogSuite.scala @@ -22,11 +22,11 @@ import java.io.{File, FilenameFilter} import org.apache.commons.io.FileUtils import org.apache.spark.SparkException -import org.apache.spark.sql.test.{RemoteSparkSession, SQLHelper} +import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession, SQLHelper} import org.apache.spark.sql.types.{DoubleType, LongType, StructType} import org.apache.spark.storage.StorageLevel -class CatalogSuite extends RemoteSparkSession with SQLHelper { +class CatalogSuite extends ConnectFunSuite with RemoteSparkSession with SQLHelper { test("Database APIs") { val currentDb = spark.catalog.currentDatabase diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CheckpointSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CheckpointSuite.scala new file mode 100644 index 0000000000000..e57b051890f56 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/CheckpointSuite.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.io.{ByteArrayOutputStream, PrintStream} + +import scala.concurrent.duration.DurationInt + +import org.apache.commons.io.output.TeeOutputStream +import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} +import org.scalatest.exceptions.TestFailedDueToTimeoutException + +import org.apache.spark.SparkException +import org.apache.spark.connect.proto +import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession, SQLHelper} + +class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHelper { + + private def captureStdOut(block: => Unit): String = { + val currentOut = Console.out + val capturedOut = new ByteArrayOutputStream() + val newOut = new PrintStream(new TeeOutputStream(currentOut, capturedOut)) + Console.withOut(newOut) { + block + } + capturedOut.toString + } + + private def checkFragments(result: String, fragmentsToCheck: Seq[String]): Unit = { + fragmentsToCheck.foreach { fragment => + assert(result.contains(fragment)) + } + } + + private def testCapturedStdOut(block: => Unit, fragmentsToCheck: String*): Unit = { + checkFragments(captureStdOut(block), fragmentsToCheck) + } + + test("checkpoint") { + val df = spark.range(100).localCheckpoint() + testCapturedStdOut(df.explain(), "ExistingRDD") + } + + test("checkpoint gc") { + val df = spark.range(100).localCheckpoint(eager = true) + val encoder = df.agnosticEncoder + val dfId = df.plan.getRoot.getCachedRemoteRelation.getRelationId + spark.cleaner.doCleanupCachedRemoteRelation(dfId) + + val ex = intercept[SparkException] { + spark + .newDataset(encoder) { builder => + builder.setCachedRemoteRelation( + proto.CachedRemoteRelation + .newBuilder() + .setRelationId(dfId) + .build()) + } + .collect() + } + assert(ex.getMessage.contains(s"No DataFrame with id $dfId is found")) + } + + // This test is flaky because cannot guarantee GC + // You can locally run this to verify the behavior. + ignore("checkpoint gc derived DataFrame") { + var df1 = spark.range(100).localCheckpoint(eager = true) + var derived = df1.repartition(10) + val encoder = df1.agnosticEncoder + val dfId = df1.plan.getRoot.getCachedRemoteRelation.getRelationId + + df1 = null + System.gc() + Thread.sleep(3000L) + + def condition(): Unit = { + val ex = intercept[SparkException] { + spark + .newDataset(encoder) { builder => + builder.setCachedRemoteRelation( + proto.CachedRemoteRelation + .newBuilder() + .setRelationId(dfId) + .build()) + } + .collect() + } + assert(ex.getMessage.contains(s"No DataFrame with id $dfId is found")) + } + + intercept[TestFailedDueToTimeoutException] { + eventually(timeout(5.seconds), interval(1.second))(condition()) + } + + // GC triggers remove the cached remote relation + derived = null + System.gc() + Thread.sleep(3000L) + + // Check the state was removed up on garbage-collection. 
+ eventually(timeout(60.seconds), interval(1.second))(condition()) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDataFrameStatSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDataFrameStatSuite.scala index 299ff7ff4fe3a..88281352f2479 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDataFrameStatSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDataFrameStatSuite.scala @@ -22,9 +22,9 @@ import java.util.Random import org.scalatest.matchers.must.Matchers._ import org.apache.spark.SparkIllegalArgumentException -import org.apache.spark.sql.test.RemoteSparkSession +import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession} -class ClientDataFrameStatSuite extends RemoteSparkSession { +class ClientDataFrameStatSuite extends ConnectFunSuite with RemoteSparkSession { private def toLetter(i: Int): String = (i + 97).toChar.toString test("approxQuantile") { diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDatasetSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDatasetSuite.scala index 4a32b8460bce1..9d6f07cf603aa 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDatasetSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDatasetSuite.scala @@ -162,30 +162,6 @@ class ClientDatasetSuite extends ConnectFunSuite with BeforeAndAfterEach { } } - test("command extension deprecated") { - val extension = proto.ExamplePluginCommand.newBuilder().setCustomField("abc").build() - val command = proto.Command - .newBuilder() - .setExtension(com.google.protobuf.Any.pack(extension)) - .build() - val expectedPlan = proto.Plan.newBuilder().setCommand(command).build() - ss.execute(com.google.protobuf.Any.pack(extension)) - val actualPlan = service.getAndClearLatestInputPlan() - assert(actualPlan.equals(expectedPlan)) - } - - test("command extension") { - val extension = proto.ExamplePluginCommand.newBuilder().setCustomField("abc").build() - val command = proto.Command - .newBuilder() - .setExtension(com.google.protobuf.Any.pack(extension)) - .build() - val expectedPlan = proto.Plan.newBuilder().setCommand(command).build() - ss.execute(com.google.protobuf.Any.pack(extension).toByteArray) - val actualPlan = service.getAndClearLatestInputPlan() - assert(actualPlan.equals(expectedPlan)) - } - test("serialize as null") { val session = newSparkSession() val ds = session.range(10) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index a0729adb89609..255dd76697987 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -22,6 +22,8 @@ import java.time.DateTimeException import java.util.Properties import scala.collection.mutable +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration.DurationInt import scala.jdk.CollectionConverters._ import org.apache.commons.io.FileUtils @@ -38,11 +40,16 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.connect.client.{SparkConnectClient, SparkResult} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SqlApiConf 
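The suite above exercises the new client-side `localCheckpoint` support together with the session cleaner. A minimal sketch of the user-facing flow (assuming a Connect `SparkSession`; the cleanup comment describes expected behaviour rather than an API the caller invokes directly):

```scala
import org.apache.spark.sql.SparkSession

object LocalCheckpointSketch {
  def run(spark: SparkSession): Unit = {
    // Eagerly materialize the plan on the server; the returned Dataset is backed
    // by a cached remote relation instead of the original logical plan.
    val df = spark.range(100).localCheckpoint(eager = true)
    df.explain()        // the printed plan is rooted at the checkpointed (ExistingRDD) data
    println(df.count()) // 100, served from the checkpointed result

    // Once `df` becomes unreachable on the client, the session cleaner is expected
    // to send a RemoveCachedRemoteRelationCommand so the server can drop the state.
  }
}
```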
-import org.apache.spark.sql.test.{IntegrationTestUtils, RemoteSparkSession, SQLHelper} +import org.apache.spark.sql.test.{ConnectFunSuite, IntegrationTestUtils, RemoteSparkSession, SQLHelper} import org.apache.spark.sql.test.SparkConnectServerUtils.port import org.apache.spark.sql.types._ +import org.apache.spark.util.SparkThreadUtils -class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateMethodTester { +class ClientE2ETestSuite + extends ConnectFunSuite + with RemoteSparkSession + with SQLHelper + with PrivateMethodTester { test("throw SparkException with null filename in stack trace elements") { withSQLConf("spark.sql.connect.enrichError.enabled" -> "true") { @@ -1511,6 +1518,46 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM (0 until 5).foreach(i => assert(row.get(i * 2) === row.get(i * 2 + 1))) } } + + test("Observable metrics") { + val df = spark.range(99).withColumn("extra", col("id") - 1) + val ob1 = new Observation("ob1") + val observedDf = df.observe(ob1, min("id"), avg("id"), max("id")) + val observedObservedDf = observedDf.observe("ob2", min("extra"), avg("extra"), max("extra")) + + val ob1Schema = new StructType() + .add("min(id)", LongType) + .add("avg(id)", DoubleType) + .add("max(id)", LongType) + val ob2Schema = new StructType() + .add("min(extra)", LongType) + .add("avg(extra)", DoubleType) + .add("max(extra)", LongType) + val ob1Metrics = Map("ob1" -> new GenericRowWithSchema(Array(0, 49, 98), ob1Schema)) + val ob2Metrics = Map("ob2" -> new GenericRowWithSchema(Array(-1, 48, 97), ob2Schema)) + + assert(df.collectResult().getObservedMetrics === Map.empty) + assert(observedDf.collectResult().getObservedMetrics === ob1Metrics) + assert(observedObservedDf.collectResult().getObservedMetrics === ob1Metrics ++ ob2Metrics) + } + + test("Observation.get is blocked until the query is finished") { + val df = spark.range(99).withColumn("extra", col("id") - 1) + val observation = new Observation("ob1") + val observedDf = df.observe(observation, min("id"), avg("id"), max("id")) + + // Start a new thread to get the observation + val future = Future(observation.get)(ExecutionContext.global) + // make sure the thread is blocked right now + val e = intercept[java.util.concurrent.TimeoutException] { + SparkThreadUtils.awaitResult(future, 2.seconds) + } + assert(e.getMessage.contains("Future timed out")) + observedDf.collect() + // make sure the thread is unblocked after the query is finished + val metrics = SparkThreadUtils.awaitResult(future, 2.seconds) + assert(metrics === Map("min(id)" -> 0, "avg(id)" -> 49, "max(id)" -> 98)) + } } private[sql] case class ClassData(a: String, b: Int) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala index b77e92995624f..8a783d880560e 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala @@ -20,10 +20,10 @@ package org.apache.spark.sql import scala.jdk.CollectionConverters._ import org.apache.spark.sql.internal.SqlApiConf -import org.apache.spark.sql.test.{QueryTest, SQLHelper} +import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} import org.apache.spark.sql.types.{StringType, StructType} -class DataFrameNaFunctionSuite extends QueryTest with SQLHelper { +class 
DataFrameNaFunctionSuite extends QueryTest with RemoteSparkSession { private def createDF(): DataFrame = { val sparkSession = spark import sparkSession.implicits._ diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala index 91516b0069b25..988774d5eec94 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/KeyValueGroupedDatasetE2ETestSuite.scala @@ -22,7 +22,7 @@ import java.util.Arrays import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.Append import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout} -import org.apache.spark.sql.test.{QueryTest, SQLHelper} +import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} import org.apache.spark.sql.types._ import org.apache.spark.util.SparkSerDeUtils @@ -33,7 +33,7 @@ case class ClickState(id: String, count: Int) /** * All tests in this class requires client UDF artifacts synced with the server. */ -class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with SQLHelper { +class KeyValueGroupedDatasetE2ETestSuite extends QueryTest with RemoteSparkSession { lazy val session: SparkSession = spark import session.implicits._ diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 5844df8a4889c..77be7c5de04af 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.{functions => fn} import org.apache.spark.sql.avro.{functions => avroFn} import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.StringEncoder +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.connect.client.SparkConnectClient import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.lit @@ -699,7 +700,8 @@ class PlanGenerationTestSuite } test("select collated string") { - val schema = StructType(StructField("s", StringType(1)) :: Nil) + val schema = + StructType(StructField("s", StringType(CollationFactory.UTF8_LCASE_COLLATION_ID)) :: Nil) createLocalRelation(schema.catalogString).select("s") } @@ -1762,14 +1764,26 @@ class PlanGenerationTestSuite fn.split(fn.col("g"), ";") } + functionTest("split using columns") { + fn.split(fn.col("g"), fn.col("g")) + } + functionTest("split with limit") { fn.split(fn.col("g"), ";", 10) } + functionTest("split with limit using columns") { + fn.split(fn.col("g"), lit(";"), fn.col("a")) + } + functionTest("substring") { fn.substring(fn.col("g"), 4, 5) } + functionTest("substring using columns") { + fn.substring(fn.col("g"), fn.col("a"), fn.col("b")) + } + functionTest("substring_index") { fn.substring_index(fn.col("g"), ";", 5) } @@ -2297,6 +2311,14 @@ class PlanGenerationTestSuite fn.timestamp_micros(fn.col("x")) } + temporalFunctionTest("timestamp_diff") { + fn.timestamp_diff("year", fn.col("t"), fn.col("t")) + } + + temporalFunctionTest("timestamp_add") { + fn.timestamp_add("week", fn.col("x"), 
fn.col("t")) + } + // Array of Long // Array of Long // Array of Array of Long @@ -2481,10 +2503,38 @@ class PlanGenerationTestSuite Collections.singletonMap("allowNumericLeadingZeros", "true")) } + functionTest("try_parse_json") { + fn.try_parse_json(fn.col("g")) + } + functionTest("to_json") { fn.to_json(fn.col("d"), Map(("timestampFormat", "dd/MM/yyyy"))) } + functionTest("parse_json") { + fn.parse_json(fn.col("g")) + } + + functionTest("is_variant_null") { + fn.is_variant_null(fn.parse_json(fn.col("g"))) + } + + functionTest("variant_get") { + fn.variant_get(fn.parse_json(fn.col("g")), "$", "int") + } + + functionTest("try_variant_get") { + fn.try_variant_get(fn.parse_json(fn.col("g")), "$", "int") + } + + functionTest("schema_of_variant") { + fn.schema_of_variant(fn.parse_json(fn.col("g"))) + } + + functionTest("schema_of_variant_agg") { + fn.schema_of_variant_agg(fn.parse_json(fn.col("g"))) + } + functionTest("size") { fn.size(fn.col("f")) } @@ -3191,34 +3241,12 @@ class PlanGenerationTestSuite } /* Extensions */ - test("relation extension deprecated") { - val input = proto.ExamplePluginRelation - .newBuilder() - .setInput(simple.plan.getRoot) - .build() - session.newDataFrame(com.google.protobuf.Any.pack(input)) - } - - test("expression extension deprecated") { - val extension = proto.ExamplePluginExpression - .newBuilder() - .setChild( - proto.Expression - .newBuilder() - .setUnresolvedAttribute(proto.Expression.UnresolvedAttribute - .newBuilder() - .setUnparsedIdentifier("id"))) - .setCustomField("abc") - .build() - simple.select(Column(com.google.protobuf.Any.pack(extension))) - } - test("relation extension") { val input = proto.ExamplePluginRelation .newBuilder() .setInput(simple.plan.getRoot) .build() - session.newDataFrame(com.google.protobuf.Any.pack(input).toByteArray) + session.newDataFrame(_.setExtension(com.google.protobuf.Any.pack(input))) } test("expression extension") { @@ -3232,7 +3260,7 @@ class PlanGenerationTestSuite .setUnparsedIdentifier("id"))) .setCustomField("abc") .build() - simple.select(Column.forExtension(com.google.protobuf.Any.pack(extension).toByteArray)) + simple.select(Column(_.setExtension(com.google.protobuf.Any.pack(extension)))) } test("crosstab") { diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala index 3e4704b6ab8e0..57342e12fcb51 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala @@ -85,6 +85,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(booleans) testImplicit(booleans.toSeq) testImplicit(booleans.toSeq)(newBooleanSeqEncoder) + testImplicit(booleans.toSeq)(newSequenceEncoder) testImplicit(booleans.toImmutableArraySeq) val bytes = Array(76.toByte, 59.toByte, 121.toByte) @@ -93,6 +94,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(bytes) testImplicit(bytes.toSeq) testImplicit(bytes.toSeq)(newByteSeqEncoder) + testImplicit(bytes.toSeq)(newSequenceEncoder) testImplicit(bytes.toImmutableArraySeq) val shorts = Array(21.toShort, (-213).toShort, 14876.toShort) @@ -101,6 +103,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(shorts) testImplicit(shorts.toSeq) testImplicit(shorts.toSeq)(newShortSeqEncoder) + 
testImplicit(shorts.toSeq)(newSequenceEncoder) testImplicit(shorts.toImmutableArraySeq) val ints = Array(4, 6, 5) @@ -109,6 +112,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(ints) testImplicit(ints.toSeq) testImplicit(ints.toSeq)(newIntSeqEncoder) + testImplicit(ints.toSeq)(newSequenceEncoder) testImplicit(ints.toImmutableArraySeq) val longs = Array(System.nanoTime(), System.currentTimeMillis()) @@ -117,6 +121,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(longs) testImplicit(longs.toSeq) testImplicit(longs.toSeq)(newLongSeqEncoder) + testImplicit(longs.toSeq)(newSequenceEncoder) testImplicit(longs.toImmutableArraySeq) val floats = Array(3f, 10.9f) @@ -125,6 +130,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(floats) testImplicit(floats.toSeq) testImplicit(floats.toSeq)(newFloatSeqEncoder) + testImplicit(floats.toSeq)(newSequenceEncoder) testImplicit(floats.toImmutableArraySeq) val doubles = Array(23.78d, -329.6d) @@ -133,6 +139,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(doubles) testImplicit(doubles.toSeq) testImplicit(doubles.toSeq)(newDoubleSeqEncoder) + testImplicit(doubles.toSeq)(newSequenceEncoder) testImplicit(doubles.toImmutableArraySeq) val strings = Array("foo", "baz", "bar") @@ -140,6 +147,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(strings) testImplicit(strings.toSeq) testImplicit(strings.toSeq)(newStringSeqEncoder) + testImplicit(strings.toSeq)(newSequenceEncoder) testImplicit(strings.toImmutableArraySeq) val myTypes = Array(MyType(12L, Math.E, Math.PI), MyType(0, 0, 0)) @@ -147,6 +155,7 @@ class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { testImplicit(myTypes) testImplicit(myTypes.toSeq) testImplicit(myTypes.toSeq)(newProductSeqEncoder[MyType]) + testImplicit(myTypes.toSeq)(newSequenceEncoder) testImplicit(myTypes.toImmutableArraySeq) // Others. diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SparkSessionE2ESuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SparkSessionE2ESuite.scala index b967245d90c26..b28aa905c7a29 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SparkSessionE2ESuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SparkSessionE2ESuite.scala @@ -26,7 +26,7 @@ import scala.util.{Failure, Success} import org.scalatest.concurrent.Eventually._ import org.apache.spark.SparkException -import org.apache.spark.sql.test.RemoteSparkSession +import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession} import org.apache.spark.util.SparkThreadUtils.awaitResult /** @@ -34,7 +34,7 @@ import org.apache.spark.util.SparkThreadUtils.awaitResult * class, whether explicit or implicit, as it will trigger a UDF deserialization error during * Maven build/test. 
*/ -class SparkSessionE2ESuite extends RemoteSparkSession { +class SparkSessionE2ESuite extends ConnectFunSuite with RemoteSparkSession { test("interrupt all - background queries, foreground interrupt") { val session = spark @@ -108,7 +108,37 @@ class SparkSessionE2ESuite extends RemoteSparkSession { assert(interrupted.length == 2, s"Interrupted operations: $interrupted.") } - test("interrupt tag") { + test("interrupt all - streaming queries") { + val q1 = spark.readStream + .format("rate") + .option("rowsPerSecond", 1) + .load() + .writeStream + .format("console") + .start() + + val q2 = spark.readStream + .format("rate") + .option("rowsPerSecond", 1) + .load() + .writeStream + .format("console") + .start() + + assert(q1.isActive) + assert(q2.isActive) + + val interrupted = spark.interruptAll() + + q1.awaitTermination(timeoutMs = 20 * 1000) + q2.awaitTermination(timeoutMs = 20 * 1000) + assert(!q1.isActive) + assert(!q2.isActive) + assert(interrupted.length == 2, s"Interrupted operations: $interrupted.") + } + + // TODO(SPARK-48139): Re-enable `SparkSessionE2ESuite.interrupt tag` + ignore("interrupt tag") { val session = spark import session.implicits._ @@ -196,7 +226,7 @@ class SparkSessionE2ESuite extends RemoteSparkSession { // q2 and q3 should be cancelled interrupted.clear() - eventually(timeout(30.seconds), interval(1.seconds)) { + eventually(timeout(1.minute), interval(1.seconds)) { val ids = spark.interruptTag("two") interrupted ++= ids assert(interrupted.length == 2, s"Interrupted operations: $interrupted.") @@ -213,7 +243,7 @@ class SparkSessionE2ESuite extends RemoteSparkSession { // q1 and q4 should be cancelled interrupted.clear() - eventually(timeout(30.seconds), interval(1.seconds)) { + eventually(timeout(1.minute), interval(1.seconds)) { val ids = spark.interruptTag("one") interrupted ++= ids assert(interrupted.length == 2, s"Interrupted operations: $interrupted.") @@ -229,6 +259,53 @@ class SparkSessionE2ESuite extends RemoteSparkSession { assert(interrupted.length == 2, s"Interrupted operations: $interrupted.") } + test("interrupt tag - streaming query") { + spark.addTag("foo") + val q1 = spark.readStream + .format("rate") + .option("rowsPerSecond", 1) + .load() + .writeStream + .format("console") + .start() + assert(spark.getTags() == Set("foo")) + + spark.addTag("bar") + val q2 = spark.readStream + .format("rate") + .option("rowsPerSecond", 1) + .load() + .writeStream + .format("console") + .start() + assert(spark.getTags() == Set("foo", "bar")) + + spark.clearTags() + + spark.addTag("zoo") + val q3 = spark.readStream + .format("rate") + .option("rowsPerSecond", 1) + .load() + .writeStream + .format("console") + .start() + assert(spark.getTags() == Set("zoo")) + + assert(q1.isActive) + assert(q2.isActive) + assert(q3.isActive) + + val interrupted = spark.interruptTag("foo") + + q1.awaitTermination(timeoutMs = 20 * 1000) + q2.awaitTermination(timeoutMs = 20 * 1000) + assert(!q1.isActive) + assert(!q2.isActive) + assert(q3.isActive) + assert(interrupted.length == 2, s"Interrupted operations: $interrupted.") + } + test("progress is available for the spark result") { val result = spark .range(10000) @@ -305,4 +382,43 @@ class SparkSessionE2ESuite extends RemoteSparkSession { .create() } } + + test("SPARK-47986: get or create after session changed") { + val remote = s"sc://localhost:$serverPort" + + SparkSession.clearDefaultSession() + SparkSession.clearActiveSession() + + val session1 = SparkSession + .builder() + .remote(remote) + .getOrCreate() + + assert(session1 eq 
SparkSession.getActiveSession.get) + assert(session1 eq SparkSession.getDefaultSession.get) + assert(session1.range(3).collect().length == 3) + + session1.client.hijackServerSideSessionIdForTesting("-testing") + + val e = intercept[SparkException] { + session1.range(3).analyze + } + + assert(e.getMessage.contains("[INVALID_HANDLE.SESSION_CHANGED]")) + assert(!session1.client.isSessionValid) + assert(SparkSession.getActiveSession.isEmpty) + assert(SparkSession.getDefaultSession.isEmpty) + + val session2 = SparkSession + .builder() + .remote(remote) + .getOrCreate() + + assert(session1 ne session2) + assert(session2.client.isSessionValid) + assert(session2 eq SparkSession.getActiveSession.get) + assert(session2 eq SparkSession.getDefaultSession.get) + assert(session2.range(3).collect().length == 3) + } + } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/StubbingTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/StubbingTestSuite.scala index b9c5888e5cb77..5bcb17672d6a9 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/StubbingTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/StubbingTestSuite.scala @@ -17,9 +17,9 @@ package org.apache.spark.sql import org.apache.spark.sql.connect.client.ToStub -import org.apache.spark.sql.test.RemoteSparkSession +import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession} -class StubbingTestSuite extends RemoteSparkSession { +class StubbingTestSuite extends ConnectFunSuite with RemoteSparkSession { private def eval[T](f: => T): T = f test("capture of to-be stubbed class") { diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UDFClassLoadingE2ESuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UDFClassLoadingE2ESuite.scala index a76e046db2e3a..1d8d164c9541c 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UDFClassLoadingE2ESuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UDFClassLoadingE2ESuite.scala @@ -22,10 +22,10 @@ import java.nio.file.{Files, Paths} import scala.util.Properties import org.apache.spark.sql.connect.common.ProtoDataTypes -import org.apache.spark.sql.expressions.ScalarUserDefinedFunction -import org.apache.spark.sql.test.RemoteSparkSession +import org.apache.spark.sql.expressions.ScalaUserDefinedFunction +import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession} -class UDFClassLoadingE2ESuite extends RemoteSparkSession { +class UDFClassLoadingE2ESuite extends ConnectFunSuite with RemoteSparkSession { private val scalaVersion = Properties.versionNumberString .split("\\.") @@ -39,13 +39,14 @@ class UDFClassLoadingE2ESuite extends RemoteSparkSession { new File(s"src/test/resources/udf$scalaVersion.jar").toURI.toURL private def registerUdf(session: SparkSession): Unit = { - val udf = ScalarUserDefinedFunction( + val udf = ScalaUserDefinedFunction( serializedUdfPacket = udfByteArray, inputTypes = Seq(ProtoDataTypes.IntegerType), outputType = ProtoDataTypes.IntegerType, name = Some("dummyUdf"), nullable = true, - deterministic = true) + deterministic = true, + aggregate = false) session.registerUdf(udf.toProto) } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala index f7ffe7aa12719..4032a9499c448 100644 --- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionE2ETestSuite.scala @@ -26,14 +26,15 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.api.java.function._ import org.apache.spark.sql.api.java.UDF2 import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{PrimitiveIntEncoder, PrimitiveLongEncoder} -import org.apache.spark.sql.functions.{col, struct, udf} -import org.apache.spark.sql.test.QueryTest +import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.functions.{col, struct, udaf, udf} +import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} import org.apache.spark.sql.types.IntegerType /** * All tests in this class requires client UDF defined in this test class synced with the server. */ -class UserDefinedFunctionE2ETestSuite extends QueryTest { +class UserDefinedFunctionE2ETestSuite extends QueryTest with RemoteSparkSession { test("Dataset typed filter") { val rows = spark.range(10).filter(n => n % 2 == 0).collectAsList() assert(rows == Arrays.asList[Long](0, 2, 4, 6, 8)) @@ -346,4 +347,47 @@ class UserDefinedFunctionE2ETestSuite extends QueryTest { val result = df.select(f($"id")).as[Long].head() assert(result == 1L) } + + test("UDAF custom Aggregator - primitive types") { + val session: SparkSession = spark + import session.implicits._ + val agg = new Aggregator[Long, Long, Long] { + override def zero: Long = 0L + override def reduce(b: Long, a: Long): Long = b + a + override def merge(b1: Long, b2: Long): Long = b1 + b2 + override def finish(reduction: Long): Long = reduction + override def bufferEncoder: Encoder[Long] = Encoders.scalaLong + override def outputEncoder: Encoder[Long] = Encoders.scalaLong + } + spark.udf.register("agg", udaf(agg)) + val result = spark.range(10).selectExpr("agg(id)").as[Long].head() + assert(result == 45) + } + + test("UDAF custom Aggregator - case class as input types") { + val session: SparkSession = spark + import session.implicits._ + val agg = new Aggregator[UdafTestInput, (Long, Long), Long] { + override def zero: (Long, Long) = (0L, 0L) + override def reduce(b: (Long, Long), a: UdafTestInput): (Long, Long) = + (b._1 + a.id, b._2 + a.extra) + override def merge(b1: (Long, Long), b2: (Long, Long)): (Long, Long) = + (b1._1 + b2._1, b1._2 + b2._2) + override def finish(reduction: (Long, Long)): Long = reduction._1 + reduction._2 + override def bufferEncoder: Encoder[(Long, Long)] = + Encoders.tuple(Encoders.scalaLong, Encoders.scalaLong) + override def outputEncoder: Encoder[Long] = Encoders.scalaLong + } + spark.udf.register("agg", udaf(agg)) + val result = spark + .range(10) + .withColumn("extra", col("id") * 2) + .as[UdafTestInput] + .selectExpr("agg(id, extra)") + .as[Long] + .head() + assert(result == 135) // 45 + 90 + } } + +case class UdafTestInput(id: Long, extra: Long) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala index 76958f055f2ef..d7977fbeb108f 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala @@ -25,13 +25,13 @@ import scala.util.Properties import org.apache.commons.io.output.ByteArrayOutputStream import 
org.scalatest.BeforeAndAfterEach -import org.apache.spark.sql.test.{IntegrationTestUtils, RemoteSparkSession} +import org.apache.spark.sql.test.{ConnectFunSuite, IntegrationTestUtils, RemoteSparkSession} import org.apache.spark.tags.AmmoniteTest import org.apache.spark.util.IvyTestUtils import org.apache.spark.util.MavenUtils.MavenCoordinate @AmmoniteTest -class ReplE2ESuite extends RemoteSparkSession with BeforeAndAfterEach { +class ReplE2ESuite extends ConnectFunSuite with RemoteSparkSession with BeforeAndAfterEach { private val executorService = Executors.newSingleThreadExecutor() private val TIMEOUT_SECONDS = 30 diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala index 0f383d007f295..7bf7673a7a121 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala @@ -196,9 +196,6 @@ object CheckConnectJvmClientCompatibility { ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.COL_POS_KEY"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.DATASET_ID_KEY"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.curId"), - ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.observe"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.Observation"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.Observation$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ObservationListener"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ObservationListener$"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.queryExecution"), @@ -210,7 +207,6 @@ object CheckConnectJvmClientCompatibility { // functions ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.unwrap_udt"), - ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.udaf"), // KeyValueGroupedDataset ProblemFilters.exclude[Problem]( @@ -304,6 +300,7 @@ object CheckConnectJvmClientCompatibility { // MergeIntoWriter ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.MergeIntoWriter"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.MergeIntoWriter$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.WhenMatched"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.WhenMatched$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.WhenNotMatched"), @@ -336,18 +333,28 @@ object CheckConnectJvmClientCompatibility { ProblemFilters.exclude[ReversedMissingMethodProblem]( "org.apache.spark.sql.SQLImplicits._sqlContext" // protected ), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.internal.SessionCleaner"), + + // private + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.internal.CleanupTask"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.internal.CleanupTaskWeakReference"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.internal.CleanupCachedRemoteRelation"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.internal.CleanupCachedRemoteRelation$"), // Catalyst Refactoring 
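Stepping back to the `Aggregator`-based UDAF support exercised in `UserDefinedFunctionE2ETestSuite` a few hunks above, a minimal registration sketch (assuming an existing Connect `SparkSession`; the names here are illustrative):

```scala
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.udaf

object UdafSketch {
  // Sums Long inputs; buffer and output both use the plain Long encoder.
  private val sumAgg = new Aggregator[Long, Long, Long] {
    override def zero: Long = 0L
    override def reduce(b: Long, a: Long): Long = b + a
    override def merge(b1: Long, b2: Long): Long = b1 + b2
    override def finish(reduction: Long): Long = reduction
    override def bufferEncoder: Encoder[Long] = Encoders.scalaLong
    override def outputEncoder: Encoder[Long] = Encoders.scalaLong
  }

  def run(spark: SparkSession): Unit = {
    spark.udf.register("sum_agg", udaf(sumAgg))
    spark.range(10).selectExpr("sum_agg(id)").show() // single row: 45
  }
}
```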
ProblemFilters.exclude[Problem]("org.apache.spark.sql.catalyst.util.SparkCollectionUtils"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.catalyst.util.SparkCollectionUtils$"), // New public APIs added in the client - // ScalarUserDefinedFunction + // ScalaUserDefinedFunction ProblemFilters .exclude[MissingClassProblem]( - "org.apache.spark.sql.expressions.ScalarUserDefinedFunction"), + "org.apache.spark.sql.expressions.ScalaUserDefinedFunction"), ProblemFilters.exclude[MissingClassProblem]( - "org.apache.spark.sql.expressions.ScalarUserDefinedFunction$"), + "org.apache.spark.sql.expressions.ScalaUserDefinedFunction$"), // New private API added in the client ProblemFilters @@ -357,6 +364,11 @@ object CheckConnectJvmClientCompatibility { .exclude[MissingClassProblem]( "org.apache.spark.sql.expressions.SparkConnectClosureCleaner$"), + // Column + // developer API + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.Column.apply"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.Column.expr"), + // Dataset ProblemFilters.exclude[DirectMissingMethodProblem]( "org.apache.spark.sql.Dataset.plan" @@ -439,6 +451,9 @@ object CheckConnectJvmClientCompatibility { "org.apache.spark.sql.streaming.RemoteStreamingQuery"), ProblemFilters.exclude[MissingClassProblem]( "org.apache.spark.sql.streaming.RemoteStreamingQuery$"), + // Skip client side listener specific class + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.streaming.StreamingQueryListenerBus"), // Encoders are in the wrong JAR ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.Encoders"), diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala index 55f962b2a52c8..46aeaeff43d2f 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala @@ -530,6 +530,25 @@ class SparkConnectClientSuite extends ConnectFunSuite with BeforeAndAfterEach { assert(reattachableIter.resultComplete) } + test("SPARK-48056: Client execute gets INVALID_HANDLE.SESSION_NOT_FOUND and proceeds") { + startDummyServer(0) + client = SparkConnectClient + .builder() + .connectionString(s"sc://localhost:${server.getPort}") + .enableReattachableExecute() + .build() + service.errorToThrowOnExecute = Some( + new StatusRuntimeException( + Status.INTERNAL.withDescription("INVALID_HANDLE.SESSION_NOT_FOUND"))) + + val plan = buildPlan("select * from range(1)") + val iter = client.execute(plan) + val reattachableIter = + ExecutePlanResponseReattachableIterator.fromIterator(iter) + reattachableIter.foreach(_ => ()) + assert(reattachableIter.resultComplete) + } + test("GRPC stub unary call throws error immediately") { // Spark Connect error retry handling depends on the error being returned from the unary // call immediately. 
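The `SESSION_NOT_FOUND` test above depends on the server surfacing the error class in the gRPC status description; a small sketch of the error shape being simulated, using standard grpc-java calls (the wrapper object is illustrative):

```scala
import io.grpc.{Status, StatusRuntimeException}

object SessionNotFoundError {
  // When the reattachable iterator sees this description before any result has
  // arrived, it proceeds and marks the result complete instead of retrying.
  def apply(): StatusRuntimeException =
    new StatusRuntimeException(
      Status.INTERNAL.withDescription("INVALID_HANDLE.SESSION_NOT_FOUND"))
}
```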
@@ -609,6 +628,8 @@ class DummySparkConnectService() extends SparkConnectServiceGrpc.SparkConnectSer private val inputArtifactRequests: mutable.ListBuffer[AddArtifactsRequest] = mutable.ListBuffer.empty + var errorToThrowOnExecute: Option[Throwable] = None + private[sql] def getAndClearLatestInputPlan(): proto.Plan = { val plan = inputPlan inputPlan = null @@ -624,6 +645,13 @@ class DummySparkConnectService() extends SparkConnectServiceGrpc.SparkConnectSer override def executePlan( request: ExecutePlanRequest, responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { + if (errorToThrowOnExecute.isDefined) { + val error = errorToThrowOnExecute.get + errorToThrowOnExecute = None + responseObserver.onError(error) + return + } + // Reply with a dummy response using the same client ID val requestSessionId = request.getSessionId val operationId = if (request.hasOperationId) { diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala index 38712a0f1f633..e6009a967d156 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala @@ -33,11 +33,11 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, ForeachWriter, Row, SparkSession} import org.apache.spark.sql.functions.{col, lit, udf, window} import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryIdleEvent, QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent} -import org.apache.spark.sql.test.{IntegrationTestUtils, QueryTest, SQLHelper} +import org.apache.spark.sql.test.{IntegrationTestUtils, QueryTest, RemoteSparkSession} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.util.SparkFileUtils -class ClientStreamingQuerySuite extends QueryTest with SQLHelper with Logging { +class ClientStreamingQuerySuite extends QueryTest with RemoteSparkSession with Logging { private val testDataPath = Paths .get( @@ -508,6 +508,33 @@ class ClientStreamingQuerySuite extends QueryTest with SQLHelper with Logging { assert(spark.streams.listListeners().length == 0) } + test("listener events") { + val listener = new MyListener() + spark.streams.addListener(listener) + + val q = spark.readStream + .format("rate") + .load() + .writeStream + .format("console") + .start() + + try { + q.processAllAvailable() + eventually(timeout(30.seconds)) { + assert(q.isActive) + assert(listener.start.length == 1) + assert(listener.progress.nonEmpty) + } + } finally { + q.stop() + eventually(timeout(30.seconds)) { + assert(!q.isActive) + assert(listener.terminate.nonEmpty) + } + } + } + test("foreachBatch") { // Starts a streaming query with a foreachBatch function, which writes batchId and row count // to a temp view. The test verifies that the view is populated with data. 
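The comment above describes the `foreachBatch` test that follows; for reference, this is roughly what such a query looks like from the client (a sketch assuming a Connect `SparkSession`; the handler is declared as a typed function value to select the Scala overload):

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object ForeachBatchSketch {
  def run(spark: SparkSession): Unit = {
    // Runs on the server for every micro-batch of the query.
    val handleBatch: (DataFrame, Long) => Unit = (batch, batchId) =>
      println(s"batch $batchId contained ${batch.count()} rows")

    val query = spark.readStream
      .format("rate")
      .option("rowsPerSecond", 1)
      .load()
      .writeStream
      .foreachBatch(handleBatch)
      .start()

    query.processAllAvailable()
    query.stop()
  }
}
```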
@@ -543,6 +570,78 @@ class ClientStreamingQuerySuite extends QueryTest with SQLHelper with Logging { q.stop() } } + + abstract class EventCollector extends StreamingQueryListener { + protected def tablePostfix: String + + protected def handleOnQueryStarted(event: QueryStartedEvent): Unit = { + val df = spark.createDataFrame(Seq((event.json, 0))) + df.write.mode("append").saveAsTable(s"listener_start_events$tablePostfix") + } + + protected def handleOnQueryProgress(event: QueryProgressEvent): Unit = { + val df = spark.createDataFrame(Seq((event.json, 0))) + df.write.mode("append").saveAsTable(s"listener_progress_events$tablePostfix") + } + + protected def handleOnQueryTerminated(event: QueryTerminatedEvent): Unit = { + val df = spark.createDataFrame(Seq((event.json, 0))) + df.write.mode("append").saveAsTable(s"listener_terminated_events$tablePostfix") + } + } + + /** + * V1: Initial interface of StreamingQueryListener containing methods `onQueryStarted`, + * `onQueryProgress`, `onQueryTerminated`. It is prior to Spark 3.5. + */ + class EventCollectorV1 extends EventCollector { + override protected def tablePostfix: String = "_v1" + + override def onQueryStarted(event: QueryStartedEvent): Unit = handleOnQueryStarted(event) + + override def onQueryProgress(event: QueryProgressEvent): Unit = handleOnQueryProgress(event) + + override def onQueryTerminated(event: QueryTerminatedEvent): Unit = + handleOnQueryTerminated(event) + } + + /** + * V2: The interface after the method `onQueryIdle` is added. It is Spark 3.5+. + */ + class EventCollectorV2 extends EventCollector { + override protected def tablePostfix: String = "_v2" + + override def onQueryStarted(event: QueryStartedEvent): Unit = handleOnQueryStarted(event) + + override def onQueryProgress(event: QueryProgressEvent): Unit = handleOnQueryProgress(event) + + override def onQueryIdle(event: QueryIdleEvent): Unit = {} + + override def onQueryTerminated(event: QueryTerminatedEvent): Unit = + handleOnQueryTerminated(event) + } + + class MyListener extends StreamingQueryListener { + var start: Seq[String] = Seq.empty + var progress: Seq[String] = Seq.empty + var terminate: Seq[String] = Seq.empty + + override def onQueryStarted(event: QueryStartedEvent): Unit = { + start = start :+ event.json + } + + override def onQueryProgress(event: QueryProgressEvent): Unit = { + progress = progress :+ event.json + } + + override def onQueryIdle(event: QueryIdleEvent): Unit = { + // Do nothing + } + + override def onQueryTerminated(event: QueryTerminatedEvent): Unit = { + terminate = terminate :+ event.json + } + } } class TestForeachWriter[T] extends ForeachWriter[T] { @@ -570,58 +669,6 @@ case class TestClass(value: Int) { override def toString: String = value.toString } -abstract class EventCollector extends StreamingQueryListener { - private lazy val spark = SparkSession.builder().getOrCreate() - - protected def tablePostfix: String - - protected def handleOnQueryStarted(event: QueryStartedEvent): Unit = { - val df = spark.createDataFrame(Seq((event.json, 0))) - df.write.mode("append").saveAsTable(s"listener_start_events$tablePostfix") - } - - protected def handleOnQueryProgress(event: QueryProgressEvent): Unit = { - val df = spark.createDataFrame(Seq((event.json, 0))) - df.write.mode("append").saveAsTable(s"listener_progress_events$tablePostfix") - } - - protected def handleOnQueryTerminated(event: QueryTerminatedEvent): Unit = { - val df = spark.createDataFrame(Seq((event.json, 0))) - 
df.write.mode("append").saveAsTable(s"listener_terminated_events$tablePostfix") - } -} - -/** - * V1: Initial interface of StreamingQueryListener containing methods `onQueryStarted`, - * `onQueryProgress`, `onQueryTerminated`. It is prior to Spark 3.5. - */ -class EventCollectorV1 extends EventCollector { - override protected def tablePostfix: String = "_v1" - - override def onQueryStarted(event: QueryStartedEvent): Unit = handleOnQueryStarted(event) - - override def onQueryProgress(event: QueryProgressEvent): Unit = handleOnQueryProgress(event) - - override def onQueryTerminated(event: QueryTerminatedEvent): Unit = - handleOnQueryTerminated(event) -} - -/** - * V2: The interface after the method `onQueryIdle` is added. It is Spark 3.5+. - */ -class EventCollectorV2 extends EventCollector { - override protected def tablePostfix: String = "_v2" - - override def onQueryStarted(event: QueryStartedEvent): Unit = handleOnQueryStarted(event) - - override def onQueryProgress(event: QueryProgressEvent): Unit = handleOnQueryProgress(event) - - override def onQueryIdle(event: QueryIdleEvent): Unit = {} - - override def onQueryTerminated(event: QueryTerminatedEvent): Unit = - handleOnQueryTerminated(event) -} - class ForeachBatchFn(val viewName: String) extends VoidFunction2[DataFrame, java.lang.Long] with Serializable { diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala index 2fab6e8e3c843..dc74463f1a25b 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateStreamingSuite.scala @@ -25,14 +25,14 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.Append -import org.apache.spark.sql.test.{QueryTest, SQLHelper} +import org.apache.spark.sql.test.{QueryTest, RemoteSparkSession} import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} case class ClickEvent(id: String, timestamp: Timestamp) case class ClickState(id: String, count: Int) -class FlatMapGroupsWithStateStreamingSuite extends QueryTest with SQLHelper { +class FlatMapGroupsWithStateStreamingSuite extends QueryTest with RemoteSparkSession { val flatMapGroupsWithStateSchema: StructType = StructType( Array(StructField("id", StringType), StructField("timestamp", TimestampType))) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/ConnectFunSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/ConnectFunSuite.scala index 8d69d91a34f7d..f40738b983b39 100755 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/ConnectFunSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/ConnectFunSuite.scala @@ -34,7 +34,7 @@ trait ConnectFunSuite extends AnyFunSuite { // scalastyle:ignore funsuite java.nio.file.Paths.get(sparkHome, first +: more: _*) } - protected val baseResourcePath: Path = { + protected def baseResourcePath: Path = { getWorkspaceFilePath( "connector", "connect", @@ -45,7 +45,7 @@ trait ConnectFunSuite extends AnyFunSuite { // scalastyle:ignore funsuite "resources").toAbsolutePath } - protected val commonResourcePath: Path = { + 
protected def commonResourcePath: Path = { getWorkspaceFilePath( "connector", "connect", diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala index 54fc97c50b3ec..8837c76b76aeb 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/QueryTest.scala @@ -21,11 +21,13 @@ import java.util.TimeZone import org.scalatest.Assertions -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.util.SparkStringUtils.sideBySide import org.apache.spark.util.ArrayImplicits._ -abstract class QueryTest extends RemoteSparkSession { +abstract class QueryTest extends ConnectFunSuite with SQLHelper { + + def spark: SparkSession /** * Runs the plan and makes sure the answer matches the expected result. diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/RemoteSparkSession.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/RemoteSparkSession.scala index 300de6e9b0812..ecc84e8418013 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/RemoteSparkSession.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/test/RemoteSparkSession.scala @@ -23,7 +23,7 @@ import java.util.concurrent.TimeUnit import scala.concurrent.duration.FiniteDuration -import org.scalatest.BeforeAndAfterAll +import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.SparkBuildInfo import org.apache.spark.sql.SparkSession @@ -204,7 +204,7 @@ object SparkConnectServerUtils { } } -trait RemoteSparkSession extends ConnectFunSuite with BeforeAndAfterAll { +trait RemoteSparkSession extends BeforeAndAfterAll { self: Suite => import SparkConnectServerUtils._ var spark: SparkSession = _ protected lazy val serverPort: Int = port diff --git a/connector/connect/common/src/main/protobuf/spark/connect/base.proto b/connector/connect/common/src/main/protobuf/spark/connect/base.proto index 49a33d3419b6f..33ed73836616a 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/base.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/base.proto @@ -381,6 +381,9 @@ message ExecutePlanResponse { // (Optional) Intermediate query progress reports. ExecutionProgress execution_progress = 18; + // Response for command that checkpoints a DataFrame. + CheckpointCommandResult checkpoint_command_result = 19; + // Support arbitrary result objects. google.protobuf.Any extension = 999; } @@ -434,6 +437,7 @@ message ExecutePlanResponse { string name = 1; repeated Expression.Literal values = 2; repeated string keys = 3; + int64 plan_id = 4; } message ResultComplete { @@ -1047,6 +1051,11 @@ message FetchErrorDetailsResponse { } } +message CheckpointCommandResult { + // (Required) The logical plan checkpointed. + CachedRemoteRelation relation = 1; +} + // Main interface for the SparkConnect service. 
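`CheckpointCommandResult` arrives as a new entry in the `ExecutePlanResponse` oneof; a sketch of how a client-side handler might pick it out (the accessor names follow what protoc generates for the field added above):

```scala
import org.apache.spark.connect.proto.ExecutePlanResponse

object CheckpointResultSketch {
  /** Returns the server-side relation id of a checkpointed DataFrame, if present. */
  def cachedRelationId(response: ExecutePlanResponse): Option[String] =
    if (response.hasCheckpointCommandResult) {
      Some(response.getCheckpointCommandResult.getRelation.getRelationId)
    } else {
      None
    }
}
```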
service SparkConnectService { diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index acff0a2089e95..0e0c55fa34f00 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -45,6 +45,8 @@ message Command { StreamingQueryListenerBusCommand streaming_query_listener_bus_command = 11; CommonInlineUserDefinedDataSource register_data_source = 12; CreateResourceProfileCommand create_resource_profile_command = 13; + CheckpointCommand checkpoint_command = 14; + RemoveCachedRemoteRelationCommand remove_cached_remote_relation_command = 15; // This field is used to mark extensions to the protocol. When plugins generate arbitrary // Commands they can add them here. During the planning the correct resolution is done. @@ -484,3 +486,21 @@ message CreateResourceProfileCommandResult { // (Required) Server-side generated resource profile id. int32 profile_id = 1; } + +// Command to remove `CachedRemoteRelation` +message RemoveCachedRemoteRelationCommand { + // (Required) The cached remote relation to be removed. + CachedRemoteRelation relation = 1; +} + +message CheckpointCommand { + // (Required) The logical plan to checkpoint. + Relation relation = 1; + + // (Required) Whether to checkpoint locally using a local temporary + // directory on the Spark Connect server (Spark Driver). + bool local = 2; + + // (Required) Whether to checkpoint this DataFrame immediately. + bool eager = 3; +} diff --git a/connector/connect/common/src/main/protobuf/spark/connect/common.proto b/connector/connect/common/src/main/protobuf/spark/connect/common.proto index da334bfd9ee8e..b2848370b01dc 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/common.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/common.proto @@ -81,3 +81,18 @@ message ResourceProfile { // (e.g., cores, memory, CPU) to its specific request. map task_resources = 2; } + +message Origin { + // (Required) Indicates the origin type. + oneof function { + PythonOrigin python_origin = 1; + } +} + +message PythonOrigin { + // (Required) Name of the origin, for example, the name of the function. + string fragment = 1; + + // (Required) Callsite to show to end users, for example, the stacktrace. + string call_site = 2; +} diff --git a/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto b/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto index 726ae5dd1c219..257634813e742 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto @@ -19,6 +19,7 @@ syntax = 'proto3'; import "google/protobuf/any.proto"; import "spark/connect/types.proto"; +import "spark/connect/common.proto"; package spark.connect; @@ -30,6 +31,7 @@ option go_package = "internal/generated"; // expressions in SQL appear. message Expression { + ExpressionCommon common = 18; oneof expr_type { Literal literal = 1; UnresolvedAttribute unresolved_attribute = 2; @@ -342,6 +344,11 @@ message Expression { } } +message ExpressionCommon { + // (Required) The origin information for this expression, such as the stacktrace. + Origin origin = 1; +} + message CommonInlineUserDefinedFunction { // (Required) Name of the user-defined function.
string function_name = 1; @@ -366,6 +373,8 @@ message PythonUDF { bytes command = 3; // (Required) Python version being used in the client. string python_ver = 4; + // (Optional) Additional includes for the Python UDF. + repeated string additional_includes = 5; } message ScalarScalaUDF { @@ -377,6 +386,8 @@ message ScalarScalaUDF { DataType outputType = 3; // (Required) True if the UDF can return null value bool nullable = 4; + // (Required) Indicates whether the UDF is an aggregate function. + bool aggregate = 5; } message JavaUDF { diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto index 5cbe6459d226b..04fe21086097c 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -108,10 +108,13 @@ message Unknown {} // Common metadata of all relations. message RelationCommon { // (Required) Shared relation metadata. - string source_info = 1; + string source_info = 1 [deprecated=true]; // (Optional) A per-client globally unique id for a given connect plan. optional int64 plan_id = 2; + + // (Optional) The origin information for this relation, such as the stacktrace. + Origin origin = 3; } // Relation that uses a SQL query to generate the output. @@ -468,7 +471,9 @@ message Sample { // (Optional) Whether to sample with replacement. optional bool with_replacement = 4; - // (Optional) The random seed. + // (Required) The random seed. + // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details); + // however, it is still declared 'optional' here for backward compatibility. optional int64 seed = 5; // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. @@ -688,7 +693,9 @@ message StatSampleBy { // If a stratum is not specified, we treat its fraction as zero. repeated Fraction fractions = 3; - // (Optional) The random seed. + // (Required) The random seed. + // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details); + // however, it is still declared 'optional' here for backward compatibility. optional int64 seed = 5; message Fraction { diff --git a/connector/connect/common/src/main/protobuf/spark/connect/types.proto b/connector/connect/common/src/main/protobuf/spark/connect/types.proto index 48f7385330c86..4f768f201575b 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/types.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/types.proto @@ -101,7 +101,7 @@ message DataType { message String { uint32 type_variation_reference = 1; - uint32 collation_id = 2; + string collation = 2; } message Binary { diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/ConnectProtoUtils.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/ConnectProtoUtils.scala new file mode 100644 index 0000000000000..053e03fc08e4b --- /dev/null +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/ConnectProtoUtils.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.connect.proto +import org.apache.spark.sql.connect.common.ProtoUtils + +/** + * Utility functions for parsing Spark Connect protocol buffers with a recursion limit. This is + * intended to be used by plugins, as they cannot use `ProtoUtils.parseWithRecursionLimit` due to + * the shading of the `com.google.protobuf` package. + */ +object ConnectProtoUtils { + @DeveloperApi + def parsePlanWithRecursionLimit(bytes: Array[Byte], recursionLimit: Int): proto.Plan = { + ProtoUtils.parseWithRecursionLimit(bytes, proto.Plan.parser(), recursionLimit) + } + + @DeveloperApi + def parseRelationWithRecursionLimit(bytes: Array[Byte], recursionLimit: Int): proto.Relation = { + ProtoUtils.parseWithRecursionLimit(bytes, proto.Relation.parser(), recursionLimit) + } + + @DeveloperApi + def parseCommandWithRecursionLimit(bytes: Array[Byte], recursionLimit: Int): proto.Command = { + ProtoUtils.parseWithRecursionLimit(bytes, proto.Command.parser(), recursionLimit) + } + + @DeveloperApi + def parseExpressionWithRecursionLimit( + bytes: Array[Byte], + recursionLimit: Int): proto.Expression = { + ProtoUtils.parseWithRecursionLimit(bytes, proto.Expression.parser(), recursionLimit) + } + + @DeveloperApi + def parseDataTypeWithRecursionLimit(bytes: Array[Byte], recursionLimit: Int): proto.DataType = { + ProtoUtils.parseWithRecursionLimit(bytes, proto.DataType.parser(), recursionLimit) + } +} diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ExecutePlanResponseReattachableIterator.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ExecutePlanResponseReattachableIterator.scala index 74f13272a3655..f3c13c9c2c4d8 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ExecutePlanResponseReattachableIterator.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ExecutePlanResponseReattachableIterator.scala @@ -42,7 +42,8 @@ import org.apache.spark.sql.connect.client.GrpcRetryHandler.RetryException * ReattachExecute request. ReattachExecute request is provided the responseId of last returned * ExecutePlanResponse on the iterator to return a new iterator from server that continues after * that. If the initial ExecutePlan did not even reach the server, and hence reattach fails with - * INVALID_HANDLE.OPERATION_NOT_FOUND, we attempt to retry ExecutePlan. + * INVALID_HANDLE.OPERATION_NOT_FOUND or INVALID_HANDLE.SESSION_NOT_FOUND, we attempt to retry + * ExecutePlan. * * In reattachable execute the server does buffer some responses in case the client needs to * backtrack. To let server release this buffer sooner, this iterator asynchronously sends @@ -66,7 +67,8 @@ class ExecutePlanResponseReattachableIterator( // Add operation id, if not present. 
// with operationId set by the client, the client can use it to try to reattach on error // even before getting the first response. If the operation in fact didn't even reach the - // server, that will end with INVALID_HANDLE.OPERATION_NOT_FOUND error. + // server, that will end with INVALID_HANDLE.OPERATION_NOT_FOUND or + // INVALID_HANDLE.SESSION_NOT_FOUND error. UUID.randomUUID.toString } @@ -234,10 +236,14 @@ class ExecutePlanResponseReattachableIterator( } catch { case ex: StatusRuntimeException if Option(StatusProto.fromThrowable(ex)) - .exists(_.getMessage.contains("INVALID_HANDLE.OPERATION_NOT_FOUND")) => + .exists(ex => { + ex.getMessage.contains("INVALID_HANDLE.OPERATION_NOT_FOUND") || + ex.getMessage.contains("INVALID_HANDLE.SESSION_NOT_FOUND") + }) => if (lastReturnedResponseId.isDefined) { throw new IllegalStateException( - "OPERATION_NOT_FOUND on the server but responses were already received from it.", + "OPERATION_NOT_FOUND/SESSION_NOT_FOUND on the server but responses were already " + + "received from it.", ex) } // Try a new ExecutePlan, and throw upstream for retry. diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala index 508dad3d748d2..7e0a356b9e493 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala @@ -22,7 +22,7 @@ import scala.util.control.NonFatal import io.grpc.stub.StreamObserver import org.apache.spark.internal.Logging -import org.apache.spark.internal.LogKey.{ERROR, POLICY, RETRY_COUNT, WAIT_TIME} +import org.apache.spark.internal.LogKeys.{ERROR, NUM_RETRY, POLICY, RETRY_WAIT_TIME} import org.apache.spark.internal.MDC private[sql] class GrpcRetryHandler( @@ -190,7 +190,7 @@ private[sql] object GrpcRetryHandler extends Logging { // retry exception is considered immediately retriable without any policies. 
logWarning( log"Non-Fatal error during RPC execution: ${MDC(ERROR, lastException)}, " + - log"retrying (currentRetryNum=${MDC(RETRY_COUNT, currentRetryNum)})") + log"retrying (currentRetryNum=${MDC(NUM_RETRY, currentRetryNum)})") return } @@ -200,8 +200,8 @@ private[sql] object GrpcRetryHandler extends Logging { if (time.isDefined) { logWarning( log"Non-Fatal error during RPC execution: ${MDC(ERROR, lastException)}, " + - log"retrying (wait=${MDC(WAIT_TIME, time.get.toMillis)} ms, " + - log"currentRetryNum=${MDC(RETRY_COUNT, currentRetryNum)}, " + + log"retrying (wait=${MDC(RETRY_WAIT_TIME, time.get.toMillis)} ms, " + + log"currentRetryNum=${MDC(NUM_RETRY, currentRetryNum)}, " + log"policy=${MDC(POLICY, policy.getName)}).") sleep(time.get.toMillis) return @@ -210,7 +210,7 @@ logWarning( log"Non-Fatal error during RPC execution: ${MDC(ERROR, lastException)}, " + - log"exceeded retries (currentRetryNum=${MDC(RETRY_COUNT, currentRetryNum)})") + log"exceeded retries (currentRetryNum=${MDC(NUM_RETRY, currentRetryNum)})") val error = new RetriesExceeded() exceptionList.foreach(error.addSuppressed) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ResponseValidator.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ResponseValidator.scala index 29272c96132bc..42c3387335be9 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ResponseValidator.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ResponseValidator.scala @@ -16,7 +16,10 @@ */ package org.apache.spark.sql.connect.client +import java.util.concurrent.atomic.AtomicBoolean + import com.google.protobuf.GeneratedMessageV3 +import io.grpc.{Status, StatusRuntimeException} import io.grpc.stub.StreamObserver import org.apache.spark.internal.Logging @@ -30,6 +33,12 @@ class ResponseValidator extends Logging { // do not use server-side streaming. private var serverSideSessionId: Option[String] = None + // Indicates whether the client and the client information on the server correspond to each other. + // This flag being false means that the server has restarted and lost the client information, or + // that there is a logic error in the code; in both cases, the user should establish a new connection + // to the server. Access to the value has to be synchronized since it can be shared. + private val isSessionActive: AtomicBoolean = new AtomicBoolean(true) + // Returns the server side session ID, used to send it back to the server in the follow-up // requests so the server can validate its session id against the previous requests. def getServerSideSessionId: Option[String] = serverSideSessionId @@ -42,8 +51,25 @@ class ResponseValidator extends Logging { serverSideSessionId = Some(serverSideSessionId.getOrElse("") + suffix) } + /** + * Returns true if the session is valid on both the client and the server. + */ + private[sql] def isSessionValid: Boolean = { + // An active session is considered valid.
+ isSessionActive.getAcquire + } + def verifyResponse[RespT <: GeneratedMessageV3](fn: => RespT): RespT = { - val response = fn + val response = + try { + fn + } catch { + case e: StatusRuntimeException + if e.getStatus.getCode == Status.Code.INTERNAL && + e.getMessage.contains("[INVALID_HANDLE.SESSION_CHANGED]") => + isSessionActive.setRelease(false) + throw e + } val field = response.getDescriptorForType.findFieldByName("server_side_session_id") // If the field does not exist, we ignore it. New / Old message might not contain it and this // behavior allows us to be compatible. @@ -54,6 +80,7 @@ serverSideSessionId match { case Some(id) => if (value != id) { + isSessionActive.setRelease(false) throw new IllegalStateException( s"Server side session ID changed from $id to $value") } diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala index d9d51c15a880b..7c3108fdb1b0e 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.connect.common.config.ConnectCommon */ private[sql] class SparkConnectClient( private[sql] val configuration: SparkConnectClient.Configuration, - private val channel: ManagedChannel) { + private[sql] val channel: ManagedChannel) { private val userContext: UserContext = configuration.userContext @@ -71,6 +71,17 @@ private[sql] class SparkConnectClient( stubState.responseValidator.hijackServerSideSessionIdForTesting(suffix) } + /** + * Returns true if the session is valid on both the client and the server. A session becomes + * invalid if the server side information about the client, e.g., session ID, does not + * correspond to the actual client state. + */ + private[sql] def isSessionValid: Boolean = { + // The last known state of the session is stored in `responseValidator`, because it is where the + // client gets responses from the server.
+ stubState.responseValidator.isSessionValid + } + private[sql] val artifactManager: ArtifactManager = { new ArtifactManager(configuration, sessionId, bstub, stub) } @@ -566,6 +577,13 @@ object SparkConnectClient { def grpcMaxMessageSize: Int = _configuration.grpcMaxMessageSize + def grpcMaxRecursionLimit(recursionLimit: Int): Builder = { + _configuration = _configuration.copy(grpcMaxRecursionLimit = recursionLimit) + this + } + + def grpcMaxRecursionLimit: Int = _configuration.grpcMaxRecursionLimit + def option(key: String, value: String): Builder = { _configuration = _configuration.copy(metadata = _configuration.metadata + ((key, value))) this @@ -703,7 +721,8 @@ object SparkConnectClient { useReattachableExecute: Boolean = true, interceptors: List[ClientInterceptor] = List.empty, sessionId: Option[String] = None, - grpcMaxMessageSize: Int = ConnectCommon.CONNECT_GRPC_MAX_MESSAGE_SIZE) { + grpcMaxMessageSize: Int = ConnectCommon.CONNECT_GRPC_MAX_MESSAGE_SIZE, + grpcMaxRecursionLimit: Int = ConnectCommon.CONNECT_GRPC_MARSHALLER_RECURSION_LIMIT) { def userContext: proto.UserContext = { val builder = proto.UserContext.newBuilder() diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala index 93d1075aea025..0905ee76c3f34 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala @@ -27,10 +27,13 @@ import org.apache.arrow.vector.ipc.message.{ArrowMessage, ArrowRecordBatch} import org.apache.arrow.vector.types.pojo import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.ExecutePlanResponse.ObservedMetrics +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ProductEncoder, UnboundRowEncoder} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.connect.client.arrow.{AbstractMessageIterator, ArrowDeserializingIterator, ConcatenatingArrowStreamReader, MessageIterator} -import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, LiteralValueProtoConverter} import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.util.ArrowUtils @@ -38,7 +41,8 @@ private[sql] class SparkResult[T]( responses: CloseableIterator[proto.ExecutePlanResponse], allocator: BufferAllocator, encoder: AgnosticEncoder[T], - timeZoneId: String) + timeZoneId: String, + setObservationMetricsOpt: Option[(Long, Map[String, Any]) => Unit] = None) extends AutoCloseable { self => case class StageInfo( @@ -79,6 +83,7 @@ private[sql] class SparkResult[T]( private[this] var arrowSchema: pojo.Schema = _ private[this] var nextResultIndex: Int = 0 private val resultMap = mutable.Map.empty[Int, (Long, Seq[ArrowMessage])] + private val observedMetrics = mutable.Map.empty[String, Row] private val cleanable = SparkResult.cleaner.register(this, new SparkResultCloseable(resultMap, responses)) @@ -117,6 +122,9 @@ private[sql] class SparkResult[T]( while (!stop && responses.hasNext) { val response = responses.next() + // Collect metrics for this response + observedMetrics ++= processObservedMetrics(response.getObservedMetricsList) + // Save and validate operationId if (opId 
== null) { opId = response.getOperationId @@ -198,6 +206,29 @@ private[sql] class SparkResult[T]( nonEmpty } + private def processObservedMetrics( + metrics: java.util.List[ObservedMetrics]): Iterable[(String, Row)] = { + metrics.asScala.map { metric => + assert(metric.getKeysCount == metric.getValuesCount) + var schema = new StructType() + val keys = mutable.ListBuffer.empty[String] + val values = mutable.ListBuffer.empty[Any] + (0 until metric.getKeysCount).map { i => + val key = metric.getKeys(i) + val value = LiteralValueProtoConverter.toCatalystValue(metric.getValues(i)) + schema = schema.add(key, LiteralValueProtoConverter.toDataType(value.getClass)) + keys += key + values += value + } + // If the metrics are registered by an Observation object, attach them and unblock any + // blocked thread. + setObservationMetricsOpt.foreach { setObservationMetrics => + setObservationMetrics(metric.getPlanId, keys.zip(values).toMap) + } + metric.getName -> new GenericRowWithSchema(values.toArray, schema) + } + } + /** * Returns the number of elements in the result. */ @@ -248,6 +279,15 @@ private[sql] class SparkResult[T]( result } + /** + * Returns all observed metrics in the result. + */ + def getObservedMetrics: Map[String, Row] = { + // We need to process all responses to get all metrics. + processResponses() + observedMetrics.toMap + } + /** * Returns an iterator over the contents of the result. */ diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala index 1f580a0ffc0a3..f63692717947a 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.connect.common import scala.jdk.CollectionConverters._ import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.types._ import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.SparkClassUtils @@ -80,7 +81,7 @@ object DataTypeProtoConverter { } private def toCatalystStringType(t: proto.DataType.String): StringType = - StringType(t.getCollationId) + StringType(if (t.getCollation.nonEmpty) t.getCollation else "UTF8_BINARY") private def toCatalystYearMonthIntervalType(t: proto.DataType.YearMonthInterval) = { (t.hasStartField, t.hasEndField) match { @@ -177,7 +178,11 @@ object DataTypeProtoConverter { case s: StringType => proto.DataType .newBuilder() - .setString(proto.DataType.String.newBuilder().setCollationId(s.collationId).build()) + .setString( + proto.DataType.String + .newBuilder() + .setCollation(CollationFactory.fetchCollation(s.collationId).collationName) + .build()) .build() case CharType(length) => diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala index ce42cc797bf38..1f3496fa89847 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala @@ -204,7 +204,7 @@ object LiteralValueProtoConverter { def toLiteralProto(literal:
Any, dataType: DataType): proto.Expression.Literal = toLiteralProtoBuilder(literal, dataType).build() - private def toDataType(clz: Class[_]): DataType = clz match { + private[sql] def toDataType(clz: Class[_]): DataType = clz match { // primitive types case JShort.TYPE => ShortType case JInteger.TYPE => IntegerType diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala index 96bd06b01535e..af07ef11cdf30 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala @@ -17,9 +17,9 @@ package org.apache.spark.sql.connect.common -import com.google.protobuf.Message +import com.google.protobuf.{CodedInputStream, InvalidProtocolBufferException, Message, Parser} -private[connect] object ProtoUtils { +private[sql] object ProtoUtils { def abbreviate[T <: Message](message: T, maxStringSize: Int = 1024): T = { abbreviate[T](message, Map("STRING" -> maxStringSize)) } @@ -51,4 +51,25 @@ private[connect] object ProtoUtils { throw new IllegalArgumentException("Spark Connect tag cannot be an empty string.") } } + + def parseWithRecursionLimit[T <: Message]( + bytes: Array[Byte], + parser: Parser[T], + recursionLimit: Int): T = { + val cis = CodedInputStream.newInstance(bytes) + cis.setSizeLimit(Integer.MAX_VALUE) + cis.setRecursionLimit(recursionLimit) + val message = parser.parseFrom(cis) + try { + // If the last tag is 0, it means the message is correctly parsed. + // If the last tag is not 0, it means the message is not correctly + // parsed, and we should throw an exception. + cis.checkLastTagWas(0) + message + } catch { + case e: InvalidProtocolBufferException => + e.setUnfinishedMessage(message) + throw e + } + } } diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala index dca65cf905fc8..e244fd13595b2 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala @@ -18,5 +18,7 @@ package org.apache.spark.sql.connect.common.config private[sql] object ConnectCommon { val CONNECT_GRPC_BINDING_PORT: Int = 15002 - val CONNECT_GRPC_MAX_MESSAGE_SIZE: Int = 128 * 1024 * 1024; + val CONNECT_GRPC_PORT_MAX_RETRIES: Int = 0 + val CONNECT_GRPC_MAX_MESSAGE_SIZE: Int = 128 * 1024 * 1024 + val CONNECT_GRPC_MARSHALLER_RECURSION_LIMIT: Int = 1024 } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt.explain index 31e03b79eb987..8321eb8beb926 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, 
g, GCM, DEFAULT, )#0] +Project [static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary))) AS aes_decrypt(g, g, GCM, DEFAULT, )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode.explain index fc572e8fe7c67..1a721c372c106 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, DEFAULT, )#0] +Project [static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary))) AS aes_decrypt(g, g, g, DEFAULT, )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding.explain index c6c693013dd0a..0d87c8b40853a 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, g, )#0] +Project [static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary))) AS aes_decrypt(g, g, g, g, )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding_aad.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding_aad.explain index 97bb528b84b3f..3afae44e97ddf 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding_aad.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_decrypt_with_mode_padding_aad.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, g, g)#0] +Project [static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary))) AS aes_decrypt(g, g, g, g, g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt.explain index 44084a8e60fb0..9f88193ce3e3f 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, GCM, DEFAULT, , )#0] +Project [static_invoke(ExpressionImplUtils.aesEncrypt(cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), cast( as binary))) AS aes_encrypt(g, g, GCM, DEFAULT, , )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode.explain index 29ccf0c1c833f..97163bf0f7c32 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, DEFAULT, , )#0] +Project [static_invoke(ExpressionImplUtils.aesEncrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), cast( as binary))) AS aes_encrypt(g, g, g, DEFAULT, , )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding.explain index 5591363426ab5..35fdd3df3e6b2 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, , )#0] +Project [static_invoke(ExpressionImplUtils.aesEncrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), cast( as binary))) AS aes_encrypt(g, g, g, g, , )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv.explain index 54b08d7bdb48e..0d566721e51d4 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, 0x434445, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, X'434445', )#0] +Project [static_invoke(ExpressionImplUtils.aesEncrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, 0x434445, cast( as binary))) AS aes_encrypt(g, g, g, g, X'434445', )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv_aad.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv_aad.explain index 024089170bc75..755332cca5edd 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv_aad.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aes_encrypt_with_mode_padding_iv_aad.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, 0x434445, cast(g#0 as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, X'434445', g)#0] +Project [static_invoke(ExpressionImplUtils.aesEncrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, 0x434445, cast(g#0 as binary))) AS aes_encrypt(g, g, g, g, X'434445', g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bit_position.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bit_position.explain index 61a15dd4c945e..76b460ad4d043 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bit_position.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bit_position.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.BitmapExpressionUtils, LongType, bitmapBitPosition, id#0L, LongType, true, false, true) AS bitmap_bit_position(id)#0L] +Project [static_invoke(BitmapExpressionUtils.bitmapBitPosition(id#0L)) AS bitmap_bit_position(id)#0L] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bucket_number.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bucket_number.explain index 61a15dd4c945e..76b460ad4d043 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bucket_number.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_bucket_number.explain @@ -1,2 
+1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.BitmapExpressionUtils, LongType, bitmapBitPosition, id#0L, LongType, true, false, true) AS bitmap_bit_position(id)#0L] +Project [static_invoke(BitmapExpressionUtils.bitmapBitPosition(id#0L)) AS bitmap_bit_position(id)#0L] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_count.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_count.explain index da43425c3ec04..c2783bff65eec 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_count.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitmap_count.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.BitmapExpressionUtils, LongType, bitmapCount, bytes#0, BinaryType, true, false, true) AS bitmap_count(bytes)#0L] +Project [static_invoke(BitmapExpressionUtils.bitmapCount(bytes#0)) AS bitmap_count(bytes)#0L] +- LocalRelation , [id#0L, bytes#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain index f2ada15eccb7d..a9fd2eeb669aa 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_if.explain @@ -1,3 +1,4 @@ -Aggregate [count(if ((_common_expr_0#0 = false)) null else _common_expr_0#0) AS count_if((a > 0))#0L] -+- Project [id#0L, a#0, b#0, d#0, e#0, f#0, g#0, (a#0 > 0) AS _common_expr_0#0] - +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] +Project [_aggregateexpression#0L AS count_if((a > 0))#0L] ++- Aggregate [count(if ((_common_expr_0#0 = false)) null else _common_expr_0#0) AS _aggregateexpression#0L] + +- Project [id#0L, a#0, b#0, d#0, e#0, f#0, g#0, (a#0 > 0) AS _common_expr_0#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain index 165be9b9e12f1..c7f2e4cf9c769 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain @@ -1,2 +1,2 @@ -Project [decode(cast(g#0 as binary), UTF-8, false) AS decode(g, UTF-8)#0] +Project [static_invoke(StringDecode.decode(cast(g#0 as binary), UTF-8, false, false)) AS decode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain index 2f65436059230..3f36f5e4451ba 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0] +Project [static_invoke(Encode.encode(g#0, UTF-8, false, false)) AS encode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, 
b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_is_variant_null.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_is_variant_null.explain new file mode 100644 index 0000000000000..e750021ce22bb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_is_variant_null.explain @@ -0,0 +1,2 @@ +Project [static_invoke(VariantExpressionEvalUtils.isVariantNull(static_invoke(VariantExpressionEvalUtils.parseJson(g#0, true)))) AS is_variant_null(parse_json(g))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain index 4efc5a3709b6f..50b50a19a49ce 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.unsafe.types.ByteArray, BinaryType, lpad, bytes#0, 5, 0x0C0A0F0E, BinaryType, IntegerType, BinaryType, true, false, true) AS lpad(bytes, 5, X'0C0A0F0E')#0] +Project [static_invoke(ByteArray.lpad(bytes#0, 5, 0x0C0A0F0E)) AS lpad(bytes, 5, X'0C0A0F0E')#0] +- LocalRelation , [id#0L, bytes#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_json.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_json.explain new file mode 100644 index 0000000000000..cbcf803b39010 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_json.explain @@ -0,0 +1,2 @@ +Project [static_invoke(VariantExpressionEvalUtils.parseJson(g#0, true)) AS parse_json(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_positive.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_positive.explain index 8e1df4a043575..1f17ca72867da 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_positive.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_positive.explain @@ -1,2 +1,2 @@ -Project [positive(a#0) AS (+ a)#0] +Project [a#0 AS (+ a)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain index 10d77eef1cb65..5726552fe429d 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.unsafe.types.ByteArray, BinaryType, rpad, bytes#0, 5, 0x0B0A0B0E, BinaryType, IntegerType, BinaryType, true, false, true) AS rpad(bytes, 5, X'0B0A0B0E')#0] +Project [static_invoke(ByteArray.rpad(bytes#0, 5, 0x0B0A0B0E)) AS rpad(bytes, 5, X'0B0A0B0E')#0] +- LocalRelation , [id#0L, bytes#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_variant.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_variant.explain new file mode 100644 index 0000000000000..04b33fdd70678 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_variant.explain @@ -0,0 +1,2 @@ +Project [static_invoke(SchemaOfVariant.schemaOfVariant(static_invoke(VariantExpressionEvalUtils.parseJson(g#0, true)))) AS schema_of_variant(parse_json(g))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_variant_agg.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_variant_agg.explain new file mode 100644 index 0000000000000..18e8801bb2986 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_variant_agg.explain @@ -0,0 +1,2 @@ +Aggregate [schema_of_variant_agg(static_invoke(VariantExpressionEvalUtils.parseJson(g#0, true)), 0, 0) AS schema_of_variant_agg(parse_json(g))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_using_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_using_columns.explain new file mode 100644 index 0000000000000..2ce3052d7d75e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_using_columns.explain @@ -0,0 +1,2 @@ +Project [split(g#0, g#0, -1) AS split(g, g, -1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_with_limit_using_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_with_limit_using_columns.explain new file mode 100644 index 0000000000000..2d16b9eed332d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_with_limit_using_columns.explain @@ -0,0 +1,2 @@ +Project [split(g#0, ;, a#0) AS split(g, ;, a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain new file mode 100644 index 0000000000000..3050d15d9754c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain new file mode 100644 index 0000000000000..fe07244fc9cec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain new file mode 100644 index 0000000000000..36dde1393cdb2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_add.explain @@ -0,0 +1,2 @@ +Project [timestampadd(week, cast(x#0L as int), t#0, Some(America/Los_Angeles)) AS timestampadd(week, x, t)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_diff.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_diff.explain new file mode 100644 index 0000000000000..7a0a3ff8c53d3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_diff.explain @@ -0,0 +1,2 @@ +Project [timestampdiff(year, t#0, t#0, Some(America/Los_Angeles)) AS timestampdiff(year, t, t)#0L] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain index b62ccccc0c15e..3017720acbafb 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0] +Project [static_invoke(Encode.encode(g#0, UTF-8, false, false)) AS to_binary(g, utf-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt.explain index b45be28453089..8ab4b477bb557 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt.explain @@ -1,2 +1,2 @@ -Project [tryeval(staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true)) AS try_aes_decrypt(g, g, GCM, DEFAULT, )#0] +Project [tryeval(static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary)))) AS try_aes_decrypt(g, g, GCM, DEFAULT, )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode.explain index 82b7ed1ea893e..e45fef8af254c 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode.explain @@ -1,2 +1,2 @@ -Project [tryeval(staticinvoke(class 
org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true)) AS try_aes_decrypt(g, g, g, DEFAULT, )#0] +Project [tryeval(static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary)))) AS try_aes_decrypt(g, g, g, DEFAULT, )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding.explain index 9087d743d941f..cdee84b92bc2a 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding.explain @@ -1,2 +1,2 @@ -Project [tryeval(staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true)) AS try_aes_decrypt(g, g, g, g, )#0] +Project [tryeval(static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary)))) AS try_aes_decrypt(g, g, g, g, )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding_aad.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding_aad.explain index 8854da9b423d0..b5eb4258b5250 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding_aad.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_aes_decrypt_with_mode_padding_aad.explain @@ -1,2 +1,2 @@ -Project [tryeval(staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true)) AS try_aes_decrypt(g, g, g, g, g)#0] +Project [tryeval(static_invoke(ExpressionImplUtils.aesDecrypt(cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary)))) AS try_aes_decrypt(g, g, g, g, g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_parse_json.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_parse_json.explain new file mode 100644 index 0000000000000..826ec4fc81d83 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_parse_json.explain @@ -0,0 +1,2 @@ +Project [static_invoke(VariantExpressionEvalUtils.parseJson(g#0, false)) AS try_parse_json(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_variant_get.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_variant_get.explain new file mode 100644 index 0000000000000..933fbff8e1f3d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_try_variant_get.explain @@ -0,0 +1,2 @@ +Project [try_variant_get(static_invoke(VariantExpressionEvalUtils.parseJson(g#0, true)), $, IntegerType, false, Some(America/Los_Angeles)) AS try_variant_get(parse_json(g), $)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain index d612190396d2b..6111cc1374fb6 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.UrlCodec$, StringType, decode, g#0, UTF-8, StringType, StringType, true, true, true) AS url_decode(g)#0] +Project [static_invoke(UrlCodec.decode(g#0, UTF-8)) AS url_decode(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain index bd2c63e19c609..871842d41ba4f 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain @@ -1,2 +1,2 @@ -Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.UrlCodec$, StringType, encode, g#0, UTF-8, StringType, StringType, true, true, true) AS url_encode(g)#0] +Project [static_invoke(UrlCodec.encode(g#0, UTF-8)) AS url_encode(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_variant_get.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_variant_get.explain new file mode 100644 index 0000000000000..2e0baf058f72a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_variant_get.explain @@ -0,0 +1,2 @@ +Project [variant_get(static_invoke(VariantExpressionEvalUtils.parseJson(g#0, true)), $, IntegerType, true, Some(America/Los_Angeles)) AS variant_get(parse_json(g), $)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/streaming_table_API_with_options.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/streaming_table_API_with_options.explain index 2a20daaefa8c6..2cc166efa99ec 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/streaming_table_API_with_options.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/streaming_table_API_with_options.explain @@ -1,2 +1,2 @@ -SubqueryAlias primary.tempdb.myStreamingTable -+- StreamingRelationV2 primary.tempdb.myStreamingTable, org.apache.spark.sql.connector.catalog.InMemoryTable, [p1=v1, p2=v2], [id#0L], org.apache.spark.sql.connector.catalog.InMemoryCatalog, 
tempdb.myStreamingTable +~SubqueryAlias primary.tempdb.myStreamingTable ++- ~StreamingRelationV2 primary.tempdb.myStreamingTable, org.apache.spark.sql.connector.catalog.InMemoryTable, [p1=v1, p2=v2], [id#0L], org.apache.spark.sql.connector.catalog.InMemoryCatalog, tempdb.myStreamingTable diff --git a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json index 33f6007ec68a1..e4b31258f984a 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json @@ -18,7 +18,7 @@ "name": "c1", "dataType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "nullable": true diff --git a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin index da4ad9bf9a4ed..c39243a10a8e4 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json b/connector/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json new file mode 100644 index 0000000000000..7ae72f8f88e5c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "is_variant_null", + "arguments": [{ + "unresolvedFunction": { + "functionName": "parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin new file mode 100644 index 0000000000000..4d3d2624609e7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_is_variant_null.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json index adf8cabd97b1c..2a5a0ddd15f8a 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json @@ -305,7 +305,7 @@ "array": { "elementType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "elements": [{ @@ -324,7 +324,7 @@ "array": { "elementType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "elements": [{ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin index d8b4407f6cfa2..359ddd61d8b74 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_parse_json.json b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_json.json new file mode 100644 index 0000000000000..dfcf56c19223e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_json.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin new file mode 100644 index 0000000000000..a7187fa2c1af0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_json.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json new file mode 100644 index 0000000000000..c4ea467bc1a24 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "schema_of_variant", + "arguments": [{ + "unresolvedFunction": { + "functionName": "parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin new file mode 100644 index 0000000000000..0971460bf4112 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json new file mode 100644 index 0000000000000..19bf62f70b20f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + 
"input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "schema_of_variant_agg", + "arguments": [{ + "unresolvedFunction": { + "functionName": "parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin new file mode 100644 index 0000000000000..68c872ef0d4d2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_variant_agg.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json new file mode 100644 index 0000000000000..98ef0e54e6211 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "split", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin new file mode 100644 index 0000000000000..a87702f83d1bd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_split_using_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json new file mode 100644 index 0000000000000..138f9d70b2c85 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "split", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": ";" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin new file mode 100644 index 0000000000000..04e24be40e9d8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit_using_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json new file mode 100644 index 0000000000000..ba28b1c7f5700 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin new file mode 100644 index 0000000000000..f14b44ef5a501 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json new file mode 100644 index 0000000000000..8fd71bb36d85e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "timestampadd", + "arguments": [{ + "literal": { + "string": "week" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "x" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin new file mode 100644 index 0000000000000..5ab8ec531e073 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_add.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json new file mode 100644 index 0000000000000..635cbb45460e6 
--- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "timestampdiff", + "arguments": [{ + "literal": { + "string": "year" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin new file mode 100644 index 0000000000000..3a81fd8b318c0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_diff.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json b/connector/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json new file mode 100644 index 0000000000000..91177eb4a5857 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "try_parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin new file mode 100644 index 0000000000000..cc1f159cfd78c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_try_parse_json.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json b/connector/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json new file mode 100644 index 0000000000000..9a4a4e25f19e6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "try_variant_get", + "arguments": [{ + "unresolvedFunction": { + "functionName": "parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }, { + "literal": { + "string": "$" + } + }, { + "literal": { + "string": "int" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin new file mode 100644 index 0000000000000..b16bbf4c7a4e9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_try_variant_get.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json index 1e651f0455c7b..aaf3a91c4fe19 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json @@ -200,7 +200,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -228,7 +228,7 @@ "name": "_1", "dataType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "nullable": true @@ -404,7 +404,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -417,7 +417,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -439,7 +439,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -461,7 +461,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -493,7 +493,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -511,7 +511,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -533,7 +533,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -576,7 +576,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -594,7 +594,7 @@ "name": "_1", "dataType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "nullable": true @@ -608,7 +608,7 @@ }, "valueType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueContainsNull": true @@ -640,7 +640,7 @@ "map": { "keyType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueType": { @@ -666,7 +666,7 @@ "name": "_1", "dataType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "nullable": true @@ -680,7 +680,7 @@ }, "valueType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "valueContainsNull": true @@ -700,7 +700,7 @@ }, "valueType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "keys": [{ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin index b3f61830bee0b..71640717c12ea 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_variant_get.json b/connector/connect/common/src/test/resources/query-tests/queries/function_variant_get.json new file mode 100644 index 0000000000000..ab0acd29d505b --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_variant_get.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "variant_get", + "arguments": [{ + "unresolvedFunction": { + "functionName": "parse_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }, { + "literal": { + "string": "$" + } + }, { + "literal": { + "string": "int" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin new file mode 100644 index 0000000000000..fe9b76bb97c4a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_variant_get.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json index 537c218952a42..f29245374e6e2 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json @@ -18,7 +18,7 @@ "name": "c1", "dataType": { "string": { - "collationId": 0 + "collation": "UTF8_BINARY" } }, "nullable": true diff --git a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin index 297ab2bf02622..1ce2e676ce30a 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json index 86595d46654c0..2ccad0345af62 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json @@ -8,7 +8,7 @@ "planId": "0" }, "localRelation": { - "schema": "struct\u003cs:string collate UTF8_BINARY_LCASE\u003e" + "schema": "struct\u003cs:string collate UTF8_LCASE\u003e" } }, "expressions": [{ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin index 30d816526ccea..3708878a2de2d 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin differ diff --git a/connector/connect/docs/client-connection-string.md b/connector/connect/docs/client-connection-string.md index ebab7cbff4fc1..37b2956a5c44a 100644 --- a/connector/connect/docs/client-connection-string.md +++ 
b/connector/connect/docs/client-connection-string.md
@@ -22,8 +22,8 @@ cannot contain arbitrary characters. Configuration parameter are passed in the
 style of the HTTP URL Path Parameter Syntax. This is similar to the JDBC connection strings.
 The path component must be empty. All parameters are interpreted **case sensitive**.
 
-```shell
-sc://hostname:port/;param1=value;param2=value
+```text
+sc://host:port/;param1=value;param2=value
 ```
 
@@ -34,7 +34,7 @@ sc://hostname:port/;param1=value;param2=value
       <td>Examples</td>
     </tr>
     <tr>
-      <td>hostname</td>
+      <td>host</td>
       <td>String</td>
       <td>The hostname of the endpoint for Spark Connect. Since the endpoint
@@ -49,8 +49,8 @@ sc://hostname:port/;param1=value;param2=value
     <tr>
       <td>port</td>
       <td>Numeric</td>
-      <td>The portname to be used when connecting to the GRPC endpoint. The
+      <td>The port to be used when connecting to the GRPC endpoint. The
       default values is: 15002. Any valid port number can be used.</td>
       <td>15002<br/>443</td>
     </tr>
@@ -75,7 +75,7 @@ sc://hostname:port/;param1=value;param2=value
       <td>user_id</td>
       <td>String</td>
       <td>User ID to automatically set in the Spark Connect UserContext message.
-      This is necssary for the appropriate Spark Session management. This is an
+      This is necessary for the appropriate Spark Session management. This is an
      *optional* parameter and depending on the deployment this parameter might
      be automatically injected using other means.
@@ -99,9 +99,16 @@ sc://hostname:port/;param1=value;param2=value
       allows to provide this session ID to allow sharing Spark Sessions for the same users
       for example across multiple languages. The value must be provided in a valid
       UUID string format.
-      Default: A UUID generated randomly.
+      Default:<br/>
+      A UUID generated randomly.
       </td>
       <td>session_id=550e8400-e29b-41d4-a716-446655440000</td>
     </tr>
+    <tr>
+      <td>grpc_max_message_size</td>
+      <td>Numeric</td>
+      <td>Maximum message size allowed for gRPC messages in bytes.<br/>
+      Default:<br/>
+      128 * 1024 * 1024
+      </td>
+      <td>grpc_max_message_size=134217728</td>
+    </tr>
   </table>
 
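As a companion to the connection-string table above, the sketch below shows how a Connect Scala client might pass these path parameters, including the newly documented grpc_max_message_size. It is a minimal sketch only: the host, port, user_id, and session_id values are illustrative placeholders, and it assumes the Spark Connect Scala client (which exposes a builder-based remote(...) entry point) is on the classpath.

```scala
import org.apache.spark.sql.SparkSession

// Illustrative values only: host, port, user_id and session_id are placeholders.
// grpc_max_message_size=134217728 matches the documented example (128 * 1024 * 1024 bytes).
val connectionString =
  "sc://localhost:15002/;user_id=example_user" +
    ";session_id=550e8400-e29b-41d4-a716-446655440000" +
    ";grpc_max_message_size=134217728"

// Assumes the Connect client's SparkSession builder, which accepts a full
// connection string via remote(...).
val spark = SparkSession.builder().remote(connectionString).getOrCreate()
spark.range(5).show()
```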
## Examples diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala index e94e865873937..dc45684a75ebd 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala @@ -38,6 +38,14 @@ object Connect { .intConf .createWithDefault(ConnectCommon.CONNECT_GRPC_BINDING_PORT) + val CONNECT_GRPC_PORT_MAX_RETRIES = + buildStaticConf("spark.connect.grpc.port.maxRetries") + .doc("The max port retry attempts for the gRPC server binding." + + "By default, it's set to 0, and the server will fail fast in case of port conflicts.") + .version("4.0.0") + .intConf + .createWithDefault(ConnectCommon.CONNECT_GRPC_PORT_MAX_RETRIES) + val CONNECT_GRPC_INTERCEPTOR_CLASSES = buildStaticConf("spark.connect.grpc.interceptor.classes") .doc( @@ -73,7 +81,7 @@ object Connect { |""".stripMargin) .version("3.5.0") .intConf - .createWithDefault(1024) + .createWithDefault(ConnectCommon.CONNECT_GRPC_MARSHALLER_RECURSION_LIMIT) val CONNECT_SESSION_MANAGER_DEFAULT_SESSION_TIMEOUT = buildStaticConf("spark.connect.session.manager.defaultSessionTimeout") @@ -279,6 +287,7 @@ object Connect { .doc("Sets the maximum number of cached resolved logical plans in Spark Connect Session." + " If set to a value less or equal than zero will disable the plan cache.") .version("4.0.0") + .internal() .intConf .createWithDefault(5) @@ -289,6 +298,7 @@ object Connect { s" When false, the cache is disabled even if '${CONNECT_SESSION_PLAN_CACHE_SIZE.key}' is" + " greater than zero. The caching is best-effort and not guaranteed.") .version("4.0.0") + .internal() .booleanConf .createWithDefault(true) } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala index d4709db081fc8..3e360372d5600 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala @@ -25,7 +25,7 @@ import io.grpc.stub.{ServerCallStreamObserver, StreamObserver} import org.apache.spark.{SparkEnv, SparkSQLException} import org.apache.spark.connect.proto.ExecutePlanResponse import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS import org.apache.spark.sql.connect.common.ProtoUtils import org.apache.spark.sql.connect.config.Connect.{CONNECT_EXECUTE_REATTACHABLE_SENDER_MAX_STREAM_DURATION, CONNECT_EXECUTE_REATTACHABLE_SENDER_MAX_STREAM_SIZE, CONNECT_PROGRESS_REPORT_INTERVAL} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala index 30a899a2ac136..9d83d93083dc4 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala @@ -27,7 +27,7 @@ import 
io.grpc.stub.StreamObserver import org.apache.spark.{SparkEnv, SparkSQLException} import org.apache.spark.connect.proto import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey +import org.apache.spark.internal.LogKeys import org.apache.spark.sql.connect.config.Connect.CONNECT_EXECUTE_REATTACHABLE_OBSERVER_RETRY_BUFFER_SIZE import org.apache.spark.sql.connect.service.ExecuteHolder @@ -245,13 +245,13 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: removeResponsesUntilIndex(lastProducedIndex) // scalastyle:off line.size.limit logInfo( - log"Release all for opId=${MDC(LogKey.OP_ID, executeHolder.operationId)}. Execution stats: " + - log"total=${MDC(LogKey.TOTAL_SIZE, totalSize)} " + - log"autoRemoved=${MDC(LogKey.CACHE_AUTO_REMOVED_SIZE, autoRemovedSize)} " + - log"cachedUntilConsumed=${MDC(LogKey.CACHE_UNTIL_HIGHEST_CONSUMED_SIZE, cachedSizeUntilHighestConsumed)} " + - log"cachedUntilProduced=${MDC(LogKey.CACHE_UNTIL_LAST_PRODUCED_SIZE, cachedSizeUntilLastProduced)} " + - log"maxCachedUntilConsumed=${MDC(LogKey.MAX_CACHE_UNTIL_HIGHEST_CONSUMED_SIZE, cachedSizeUntilHighestConsumed.max)} " + - log"maxCachedUntilProduced=${MDC(LogKey.MAX_CACHE_UNTIL_LAST_PRODUCED_SIZE, cachedSizeUntilLastProduced.max)}") + log"Release all for opId=${MDC(LogKeys.OP_ID, executeHolder.operationId)}. Execution stats: " + + log"total=${MDC(LogKeys.TOTAL, totalSize)} " + + log"autoRemoved=${MDC(LogKeys.CACHE_AUTO_REMOVED_SIZE, autoRemovedSize)} " + + log"cachedUntilConsumed=${MDC(LogKeys.CACHE_UNTIL_HIGHEST_CONSUMED_SIZE, cachedSizeUntilHighestConsumed)} " + + log"cachedUntilProduced=${MDC(LogKeys.CACHE_UNTIL_LAST_PRODUCED_SIZE, cachedSizeUntilLastProduced)} " + + log"maxCachedUntilConsumed=${MDC(LogKeys.MAX_CACHE_UNTIL_HIGHEST_CONSUMED_SIZE, cachedSizeUntilHighestConsumed.max)} " + + log"maxCachedUntilProduced=${MDC(LogKeys.MAX_CACHE_UNTIL_LAST_PRODUCED_SIZE, cachedSizeUntilLastProduced.max)}") // scalastyle:on line.size.limit } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala index 0a6d12cbb1918..4ef4f632204b3 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala @@ -220,6 +220,7 @@ private[connect] class ExecuteThreadRunner(executeHolder: ExecuteHolder) extends .createObservedMetricsResponse( executeHolder.sessionHolder.sessionId, executeHolder.sessionHolder.serverSessionId, + executeHolder.request.getPlan.getRoot.getCommon.getPlanId, observedMetrics ++ accumulatedInPython)) } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala index 23390bf7aba8f..660951f229849 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala @@ -35,8 +35,9 @@ import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_ import org.apache.spark.sql.connect.planner.SparkConnectPlanner import org.apache.spark.sql.connect.service.ExecuteHolder import 
org.apache.spark.sql.connect.utils.MetricGenerator -import org.apache.spark.sql.execution.{LocalTableScanExec, SQLExecution} +import org.apache.spark.sql.execution.{DoNotCleanup, LocalTableScanExec, RemoveShuffleFiles, SkipMigration, SQLExecution} import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.util.ThreadUtils @@ -58,11 +59,21 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) } val planner = new SparkConnectPlanner(executeHolder) val tracker = executeHolder.eventsManager.createQueryPlanningTracker() + val conf = session.sessionState.conf + val shuffleCleanupMode = + if (conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED)) { + RemoveShuffleFiles + } else if (conf.getConf(SQLConf.SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED)) { + SkipMigration + } else { + DoNotCleanup + } val dataframe = Dataset.ofRows( sessionHolder.session, - planner.transformRelation(request.getPlan.getRoot), - tracker) + planner.transformRelation(request.getPlan.getRoot, cachePlan = true), + tracker, + shuffleCleanupMode) responseObserver.onNext(createSchemaResponse(request.getSessionId, dataframe.schema)) processAsArrowBatches(dataframe, responseObserver, executeHolder) responseObserver.onNext(MetricGenerator.createMetricsResponse(sessionHolder, dataframe)) @@ -253,8 +264,14 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) name -> values } if (observedMetrics.nonEmpty) { - Some(SparkConnectPlanExecution - .createObservedMetricsResponse(sessionId, sessionHolder.serverSessionId, observedMetrics)) + val planId = executeHolder.request.getPlan.getRoot.getCommon.getPlanId + Some( + SparkConnectPlanExecution + .createObservedMetricsResponse( + sessionId, + sessionHolder.serverSessionId, + planId, + observedMetrics)) } else None } } @@ -263,11 +280,13 @@ object SparkConnectPlanExecution { def createObservedMetricsResponse( sessionId: String, serverSessionId: String, + planId: Long, metrics: Map[String, Seq[(Option[String], Any)]]): ExecutePlanResponse = { val observedMetrics = metrics.map { case (name, values) => val metrics = ExecutePlanResponse.ObservedMetrics .newBuilder() .setName(name) + .setPlanId(planId) values.foreach { case (key, value) => metrics.addValues(toLiteralProto(value)) key.foreach(metrics.addKeys) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 1ef4bbec3e039..eaeb1c775ddb6 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.connect.planner +import java.util.UUID + import scala.collection.mutable import scala.jdk.CollectionConverters._ import scala.util.Try @@ -30,19 +32,19 @@ import io.grpc.stub.StreamObserver import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.spark.{Partition, SparkEnv, TaskContext} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.api.python.{PythonEvalType, SimplePythonFunction} import org.apache.spark.connect.proto -import org.apache.spark.connect.proto.{CreateResourceProfileCommand, 
ExecutePlanResponse, SqlCommand, StreamingForeachFunction, StreamingQueryCommand, StreamingQueryCommandResult, StreamingQueryInstanceId, StreamingQueryManagerCommand, StreamingQueryManagerCommandResult, WriteStreamOperationStart, WriteStreamOperationStartResult} +import org.apache.spark.connect.proto.{CheckpointCommand, CreateResourceProfileCommand, ExecutePlanResponse, SqlCommand, StreamingForeachFunction, StreamingQueryCommand, StreamingQueryCommandResult, StreamingQueryInstanceId, StreamingQueryManagerCommand, StreamingQueryManagerCommandResult, WriteStreamOperationStart, WriteStreamOperationStartResult} import org.apache.spark.connect.proto.ExecutePlanResponse.SqlCommandResult import org.apache.spark.connect.proto.Parse.ParseFormat import org.apache.spark.connect.proto.StreamingQueryManagerCommandResult.StreamingQueryInstance import org.apache.spark.connect.proto.WriteStreamOperationStart.TriggerCase import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.SESSION_ID +import org.apache.spark.internal.LogKeys.{DATAFRAME_ID, SESSION_ID} import org.apache.spark.ml.{functions => MLFunctions} import org.apache.spark.resource.{ExecutorResourceRequest, ResourceProfile, TaskResourceProfile, TaskResourceRequest} -import org.apache.spark.sql.{Column, Dataset, Encoders, ForeachWriter, Observation, RelationalGroupedDataset, SparkSession} +import org.apache.spark.sql.{withOrigin, Column, Dataset, Encoders, ForeachWriter, Observation, RelationalGroupedDataset, SparkSession} import org.apache.spark.sql.avro.{AvroDataToCatalyst, CatalystDataToAvro} import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier, QueryPlanningTracker} import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDataFrameStar, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar} @@ -55,6 +57,7 @@ import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, JoinType, L import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.{AppendColumns, CoGroup, CollectMetrics, CommandResult, Deduplicate, DeduplicateWithinWatermark, DeserializeToObject, Except, FlatMapGroupsWithState, Intersect, JoinWith, LocalRelation, LogicalGroupState, LogicalPlan, MapGroups, MapPartitions, Project, Sample, SerializeFromObject, Sort, SubqueryAlias, TypedFilter, Union, Unpivot, UnresolvedHint} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes +import org.apache.spark.sql.catalyst.trees.PySparkCurrentOrigin import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, ForeachWriterPacket, InvalidPlanInput, LiteralValueProtoConverter, StorageLevelProtoConverter, StreamingListenerPacket, UdfPacket} @@ -73,7 +76,7 @@ import org.apache.spark.sql.execution.python.{PythonForeachWriter, UserDefinedPy import org.apache.spark.sql.execution.stat.StatFunctions import org.apache.spark.sql.execution.streaming.GroupStateImpl.groupStateTimeoutFromString import org.apache.spark.sql.execution.streaming.StreamingQueryWrapper -import org.apache.spark.sql.expressions.{ReduceAggregator, SparkUserDefinedFunction} +import org.apache.spark.sql.expressions.{Aggregator, ReduceAggregator, SparkUserDefinedFunction, 
UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.internal.{CatalogImpl, TypedAggUtils} import org.apache.spark.sql.protobuf.{CatalystDataToProtobuf, ProtobufDataToCatalyst} import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StreamingQuery, StreamingQueryListener, StreamingQueryProgress, Trigger} @@ -101,7 +104,9 @@ class SparkConnectPlanner( throw new IllegalArgumentException("executeHolder does not belong to sessionHolder") } - private[connect] def session: SparkSession = sessionHolder.session + @Since("4.0.0") + @DeveloperApi + def session: SparkSession = sessionHolder.session private[connect] def parser = session.sessionState.sqlParser @@ -125,6 +130,7 @@ class SparkConnectPlanner( * @return * The resolved logical plan. */ + @DeveloperApi def transformRelation(rel: proto.Relation): LogicalPlan = transformRelation(rel, cachePlan = false) @@ -138,6 +144,7 @@ class SparkConnectPlanner( * @return * The resolved logical plan. */ + @DeveloperApi def transformRelation(rel: proto.Relation, cachePlan: Boolean): LogicalPlan = { sessionHolder.usePlanCache(rel, cachePlan) { rel => val plan = rel.getRelTypeCase match { @@ -230,11 +237,6 @@ class SparkConnectPlanner( } } - @DeveloperApi - def transformRelation(bytes: Array[Byte]): LogicalPlan = { - transformRelation(proto.Relation.parseFrom(bytes)) - } - private def transformRelationPlugin(extension: ProtoAny): LogicalPlan = { SparkConnectPluginRegistry.relationRegistry // Lazily traverse the collection. @@ -1469,7 +1471,22 @@ class SparkConnectPlanner( * @return * Catalyst expression */ - def transformExpression(exp: proto.Expression): Expression = { + @DeveloperApi + def transformExpression(exp: proto.Expression): Expression = if (exp.hasCommon) { + try { + val origin = exp.getCommon.getOrigin + PySparkCurrentOrigin.set( + origin.getPythonOrigin.getFragment, + origin.getPythonOrigin.getCallSite) + withOrigin { doTransformExpression(exp) } + } finally { + PySparkCurrentOrigin.clear() + } + } else { + doTransformExpression(exp) + } + + private def doTransformExpression(exp: proto.Expression): Expression = { exp.getExprTypeCase match { case proto.Expression.ExprTypeCase.LITERAL => transformLiteral(exp.getLiteral) case proto.Expression.ExprTypeCase.UNRESOLVED_ATTRIBUTE => @@ -1510,11 +1527,6 @@ class SparkConnectPlanner( } } - @DeveloperApi - def transformExpression(bytes: Array[Byte]): Expression = { - transformExpression(proto.Expression.parseFrom(bytes)) - } - private def toNamedExpression(expr: Expression): NamedExpression = expr match { case named: NamedExpression => named case expr => UnresolvedAlias(expr) @@ -1603,7 +1615,7 @@ class SparkConnectPlanner( case proto.CommonInlineUserDefinedFunction.FunctionCase.PYTHON_UDF => transformPythonFuncExpression(fun) case proto.CommonInlineUserDefinedFunction.FunctionCase.SCALAR_SCALA_UDF => - transformScalarScalaUDF(fun) + transformScalaUDF(fun) case _ => throw InvalidPlanInput( s"Function with ID: ${fun.getFunctionCase.getNumber} is not supported") @@ -1632,14 +1644,14 @@ class SparkConnectPlanner( } private def unpackUdf(fun: proto.CommonInlineUserDefinedFunction): UdfPacket = { - unpackScalarScalaUDF[UdfPacket](fun.getScalarScalaUdf) + unpackScalaUDF[UdfPacket](fun.getScalarScalaUdf) } private def unpackForeachWriter(fun: proto.ScalarScalaUDF): ForeachWriterPacket = { - unpackScalarScalaUDF[ForeachWriterPacket](fun) + unpackScalaUDF[ForeachWriterPacket](fun) } - private def unpackScalarScalaUDF[T](fun: proto.ScalarScalaUDF): T = { + private def 
unpackScalaUDF[T](fun: proto.ScalarScalaUDF): T = { try { logDebug(s"Unpack using class loader: ${Utils.getContextOrSparkClassLoader}") Utils.deserialize[T](fun.getPayload.toByteArray, Utils.getContextOrSparkClassLoader) @@ -1662,39 +1674,56 @@ class SparkConnectPlanner( } /** - * Translates a Scalar Scala user-defined function from proto to the Catalyst expression. + * Translates a Scala user-defined function from proto to the Catalyst expression. * * @param fun - * Proto representation of the Scalar Scalar user-defined function. + * Proto representation of the Scala user-defined function. * @return * ScalaUDF. */ - private def transformScalarScalaUDF(fun: proto.CommonInlineUserDefinedFunction): ScalaUDF = { + private def transformScalaUDF(fun: proto.CommonInlineUserDefinedFunction): Expression = { val udf = fun.getScalarScalaUdf val udfPacket = unpackUdf(fun) - ScalaUDF( - function = udfPacket.function, - dataType = transformDataType(udf.getOutputType), - children = fun.getArgumentsList.asScala.map(transformExpression).toSeq, - inputEncoders = udfPacket.inputEncoders.map(e => Try(ExpressionEncoder(e)).toOption), - outputEncoder = Option(ExpressionEncoder(udfPacket.outputEncoder)), - udfName = Option(fun.getFunctionName), - nullable = udf.getNullable, - udfDeterministic = fun.getDeterministic) + if (udf.getAggregate) { + transformScalaFunction(fun) + .asInstanceOf[UserDefinedAggregator[Any, Any, Any]] + .scalaAggregator(fun.getArgumentsList.asScala.map(transformExpression).toSeq) + .toAggregateExpression() + } else { + ScalaUDF( + function = udfPacket.function, + dataType = transformDataType(udf.getOutputType), + children = fun.getArgumentsList.asScala.map(transformExpression).toSeq, + inputEncoders = udfPacket.inputEncoders.map(e => Try(ExpressionEncoder(e)).toOption), + outputEncoder = Option(ExpressionEncoder(udfPacket.outputEncoder)), + udfName = Option(fun.getFunctionName), + nullable = udf.getNullable, + udfDeterministic = fun.getDeterministic) + } } - private def transformScalarScalaFunction( - fun: proto.CommonInlineUserDefinedFunction): SparkUserDefinedFunction = { + private def transformScalaFunction( + fun: proto.CommonInlineUserDefinedFunction): UserDefinedFunction = { val udf = fun.getScalarScalaUdf val udfPacket = unpackUdf(fun) - SparkUserDefinedFunction( - f = udfPacket.function, - dataType = transformDataType(udf.getOutputType), - inputEncoders = udfPacket.inputEncoders.map(e => Try(ExpressionEncoder(e)).toOption), - outputEncoder = Option(ExpressionEncoder(udfPacket.outputEncoder)), - name = Option(fun.getFunctionName), - nullable = udf.getNullable, - deterministic = fun.getDeterministic) + if (udf.getAggregate) { + assert(udfPacket.inputEncoders.size == 1, "UDAF should have exactly one input encoder") + UserDefinedAggregator( + aggregator = udfPacket.function.asInstanceOf[Aggregator[Any, Any, Any]], + inputEncoder = ExpressionEncoder(udfPacket.inputEncoders.head), + name = Option(fun.getFunctionName), + nullable = udf.getNullable, + deterministic = fun.getDeterministic) + } else { + SparkUserDefinedFunction( + f = udfPacket.function, + dataType = transformDataType(udf.getOutputType), + inputEncoders = udfPacket.inputEncoders.map(e => Try(ExpressionEncoder(e)).toOption), + outputEncoder = Option(ExpressionEncoder(udfPacket.outputEncoder)), + name = Option(fun.getFunctionName), + nullable = udf.getNullable, + deterministic = fun.getDeterministic) + } } /** @@ -1735,8 +1764,10 @@ class SparkConnectPlanner( command = fun.getCommand.toByteArray.toImmutableArraySeq, // Empty 
environment variables envVars = Maps.newHashMap(), - pythonIncludes = sessionHolder.artifactManager.getPythonIncludes.asJava, pythonExec = pythonExec, + // Merge the user specified includes with the includes managed by the artifact manager. + pythonIncludes = (fun.getAdditionalIncludesList.asScala.toSeq ++ + sessionHolder.artifactManager.getPythonIncludes).asJava, pythonVer = fun.getPythonVer, // Empty broadcast variables broadcastVars = Lists.newArrayList(), @@ -1828,6 +1859,16 @@ class SparkConnectPlanner( new BloomFilterAggregate(children(0), children(1), children(2)) .toAggregateExpression()) + case "timestampdiff" if fun.getArgumentsCount == 3 => + val children = fun.getArgumentsList.asScala.map(transformExpression) + val unit = extractString(children(0), "unit") + Some(TimestampDiff(unit, children(1), children(2))) + + case "timestampadd" if fun.getArgumentsCount == 3 => + val children = fun.getArgumentsList.asScala.map(transformExpression) + val unit = extractString(children(0), "unit") + Some(TimestampAdd(unit, children(1), children(2))) + case "window" if Seq(2, 3, 4).contains(fun.getArgumentsCount) => val children = fun.getArgumentsList.asScala.map(transformExpression) val timeCol = children.head @@ -1973,11 +2014,6 @@ class SparkConnectPlanner( val children = fun.getArgumentsList.asScala.map(transformExpression) Some(NullIndex(children(0))) - case "timestampdiff" if fun.getArgumentsCount == 3 => - val children = fun.getArgumentsList.asScala.map(transformExpression) - val unit = extractString(children(0), "unit") - Some(TimestampDiff(unit, children(1), children(2))) - // ML-specific functions case "vector_to_array" if fun.getArgumentsCount == 2 => val expr = transformExpression(fun.getArguments(0)) @@ -2586,6 +2622,10 @@ class SparkConnectPlanner( handleCreateResourceProfileCommand( command.getCreateResourceProfileCommand, responseObserver) + case proto.Command.CommandTypeCase.CHECKPOINT_COMMAND => + handleCheckpointCommand(command.getCheckpointCommand, responseObserver) + case proto.Command.CommandTypeCase.REMOVE_CACHED_REMOTE_RELATION_COMMAND => + handleRemoveCachedRemoteRelationCommand(command.getRemoveCachedRemoteRelationCommand) case _ => throw new UnsupportedOperationException(s"$command not supported.") } @@ -2780,7 +2820,7 @@ class SparkConnectPlanner( case proto.CommonInlineUserDefinedFunction.FunctionCase.JAVA_UDF => handleRegisterJavaUDF(fun) case proto.CommonInlineUserDefinedFunction.FunctionCase.SCALAR_SCALA_UDF => - handleRegisterScalarScalaUDF(fun) + handleRegisterScalaUDF(fun) case _ => throw InvalidPlanInput( s"Function with ID: ${fun.getFunctionCase.getNumber} is not supported") @@ -2857,8 +2897,8 @@ class SparkConnectPlanner( } } - private def handleRegisterScalarScalaUDF(fun: proto.CommonInlineUserDefinedFunction): Unit = { - val udf = transformScalarScalaFunction(fun) + private def handleRegisterScalaUDF(fun: proto.CommonInlineUserDefinedFunction): Unit = { + val udf = transformScalaFunction(fun) session.udf.register(fun.getFunctionName, udf) } @@ -3140,7 +3180,11 @@ class SparkConnectPlanner( } // Register the new query so that its reference is cached and is stopped on session timeout. - SparkConnectService.streamingSessionManager.registerNewStreamingQuery(sessionHolder, query) + SparkConnectService.streamingSessionManager.registerNewStreamingQuery( + sessionHolder, + query, + executeHolder.sparkSessionTags, + executeHolder.operationId) // Register the runner with the query if Python foreachBatch is enabled. 
foreachBatchRunnerCleaner.foreach { cleaner => sessionHolder.streamingForeachBatchRunnerCleanerCache.registerCleanerForQuery( @@ -3205,7 +3249,9 @@ class SparkConnectPlanner( // Find the query in connect service level cache, otherwise check session's active streams. val query = SparkConnectService.streamingSessionManager - .getCachedQuery(id, runId, session) // Common case: query is cached in the cache. + // Common case: query is cached in the cache. + .getCachedQuery(id, runId, executeHolder.sparkSessionTags, session) + .map(_.query) .orElse { // Else try to find it in active streams. Mostly will not be found here either. Option(session.streams.get(id)) } match { @@ -3512,6 +3558,41 @@ class SparkConnectPlanner( .build()) } + private def handleCheckpointCommand( + checkpointCommand: CheckpointCommand, + responseObserver: StreamObserver[proto.ExecutePlanResponse]): Unit = { + val target = Dataset + .ofRows(session, transformRelation(checkpointCommand.getRelation)) + val checkpointed = target.checkpoint( + eager = checkpointCommand.getEager, + reliableCheckpoint = !checkpointCommand.getLocal) + + val dfId = UUID.randomUUID().toString + logInfo(log"Caching DataFrame with id ${MDC(DATAFRAME_ID, dfId)}") + sessionHolder.cacheDataFrameById(dfId, checkpointed) + + executeHolder.eventsManager.postFinished() + responseObserver.onNext( + proto.ExecutePlanResponse + .newBuilder() + .setSessionId(sessionId) + .setServerSideSessionId(sessionHolder.serverSessionId) + .setCheckpointCommandResult( + proto.CheckpointCommandResult + .newBuilder() + .setRelation(proto.CachedRemoteRelation.newBuilder().setRelationId(dfId).build()) + .build()) + .build()) + } + + private def handleRemoveCachedRemoteRelationCommand( + removeCachedRemoteRelationCommand: proto.RemoveCachedRemoteRelationCommand): Unit = { + val dfId = removeCachedRemoteRelationCommand.getRelation.getRelationId + logInfo(log"Removing DataFrame with id ${MDC(DATAFRAME_ID, dfId)} from the cache") + sessionHolder.removeCachedDataFrame(dfId) + executeHolder.eventsManager.postFinished() + } + private val emptyLocalRelation = LocalRelation( output = AttributeReference("value", StringType, false)() :: Nil, data = Seq.empty) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectStreamingQueryListenerHandler.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectStreamingQueryListenerHandler.scala index 94f01026b7a5f..ce5aa0888ca53 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectStreamingQueryListenerHandler.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectStreamingQueryListenerHandler.scala @@ -24,7 +24,7 @@ import io.grpc.stub.StreamObserver import org.apache.spark.connect.proto.ExecutePlanResponse import org.apache.spark.connect.proto.StreamingQueryListenerBusCommand import org.apache.spark.connect.proto.StreamingQueryListenerEventsResult -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.connect.service.ExecuteHolder /** @@ -57,9 +57,10 @@ class SparkConnectStreamingQueryListenerHandler(executeHolder: ExecuteHolder) ex case StreamingQueryListenerBusCommand.CommandCase.ADD_LISTENER_BUS_LISTENER => listenerHolder.isServerSideListenerRegistered match { case true => - logWarning( - s"[SessionId: $sessionId][UserId: $userId][operationId: " + - s"${executeHolder.operationId}] Redundant 
server side listener added. Exiting.") + logWarning(log"[SessionId: ${MDC(LogKeys.SESSION_ID, sessionId)}]" + + log"[UserId: ${MDC(LogKeys.USER_ID, userId)}]" + + log"[operationId: ${MDC(LogKeys.OPERATION_HANDLE_ID, executeHolder.operationId)}] " + + log"Redundant server side listener added. Exiting.") return case false => // This transfers sending back the response to the client until @@ -83,29 +84,38 @@ class SparkConnectStreamingQueryListenerHandler(executeHolder: ExecuteHolder) ex } catch { case NonFatal(e) => logError( - s"[SessionId: $sessionId][UserId: $userId][operationId: " + - s"${executeHolder.operationId}] Error sending listener added response.", + log"[SessionId: ${MDC(LogKeys.SESSION_ID, sessionId)}]" + + log"[UserId: ${MDC(LogKeys.USER_ID, userId)}]" + + log"[operationId: " + + log"${MDC(LogKeys.OPERATION_HANDLE_ID, executeHolder.operationId)}] " + + log"Error sending listener added response.", e) listenerHolder.cleanUp() return } } - logInfo(s"[SessionId: $sessionId][UserId: $userId][operationId: " + - s"${executeHolder.operationId}] Server side listener added. Now blocking until " + - "all client side listeners are removed or there is error transmitting the event back.") + logInfo(log"[SessionId: ${MDC(LogKeys.SESSION_ID, sessionId)}]" + + log"[UserId: ${MDC(LogKeys.USER_ID, userId)}]" + + log"[operationId: ${MDC(LogKeys.OPERATION_HANDLE_ID, executeHolder.operationId)}] " + + log"Server side listener added. Now blocking until " + + log"all client side listeners are removed or there is error transmitting the event back.") // Block the handling thread, and have serverListener continuously send back new events listenerHolder.streamingQueryListenerLatch.await() - logInfo(s"[SessionId: $sessionId][UserId: $userId][operationId: " + - s"${executeHolder.operationId}] Server side listener long-running handling thread ended.") + logInfo( + log"[SessionId: ${MDC(LogKeys.SESSION_ID, sessionId)}]" + + log"[UserId: ${MDC(LogKeys.USER_ID, userId)}]" + + log"[operationId: ${MDC(LogKeys.OPERATION_HANDLE_ID, executeHolder.operationId)}] " + + log"Server side listener long-running handling thread ended.") case StreamingQueryListenerBusCommand.CommandCase.REMOVE_LISTENER_BUS_LISTENER => listenerHolder.isServerSideListenerRegistered match { case true => sessionHolder.streamingServersideListenerHolder.cleanUp() case false => - logWarning( - s"[SessionId: $sessionId][UserId: $userId][operationId: " + - s"${executeHolder.operationId}] No active server side listener bus listener " + - s"but received remove listener call. Exiting.") + logWarning(log"[SessionId: ${MDC(LogKeys.SESSION_ID, sessionId)}]" + + log"[UserId: ${MDC(LogKeys.USER_ID, userId)}]" + + log"[operationId: ${MDC(LogKeys.OPERATION_HANDLE_ID, executeHolder.operationId)}] " + + log"No active server side listener bus listener but received remove listener call. 
" + + log"Exiting.") return } case StreamingQueryListenerBusCommand.CommandCase.COMMAND_NOT_SET => diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala index ef5faac77e3e0..df883a5c86814 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala @@ -27,7 +27,7 @@ import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.api.python.{PythonException, PythonWorkerUtils, SimplePythonFunction, SpecialLengths, StreamingPythonRunner} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{DATAFRAME_ID, QUERY_ID, RUN_ID, SESSION_ID} +import org.apache.spark.internal.LogKeys.{DATAFRAME_ID, QUERY_ID, RUN_ID, SESSION_ID} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.connect.service.SessionHolder import org.apache.spark.sql.connect.service.SparkConnectService diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingQueryListenerHelper.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingQueryListenerHelper.scala index 74e9e32f208df..c342050a212ef 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingQueryListenerHelper.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingQueryListenerHelper.scala @@ -22,7 +22,7 @@ import java.io.EOFException import org.apache.spark.SparkException import org.apache.spark.api.python.{PythonException, PythonWorkerUtils, SimplePythonFunction, SpecialLengths, StreamingPythonRunner} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.FUNCTION_NAME +import org.apache.spark.internal.LogKeys.FUNCTION_NAME import org.apache.spark.sql.connect.service.{SessionHolder, SparkConnectService} import org.apache.spark.sql.streaming.StreamingQueryListener diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala index 3112d12bb0e67..ec7ebbe92d72e 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala @@ -296,7 +296,7 @@ private[connect] class ExecuteHolder( object ExecuteJobTag { private val prefix = "SparkConnect_OperationTag" - def apply(sessionId: String, userId: String, operationId: String): String = { + def apply(userId: String, sessionId: String, operationId: String): String = { s"${prefix}_" + s"User_${userId}_" + s"Session_${sessionId}_" + diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala index c82cadbd5f7ab..a071579692fb1 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala @@ -29,7 
+29,7 @@ import io.grpc.ServerCallHandler import io.grpc.ServerInterceptor import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{DESCRIPTION, MESSAGE} +import org.apache.spark.internal.LogKeys.{DESCRIPTION, MESSAGE} /** * A gRPC interceptor to log RPC requests and responses. It logs the protobufs as JSON. Useful for diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala index bb32ac1275fbe..681f7e29630ff 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala @@ -23,6 +23,7 @@ import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.mutable +import scala.concurrent.{ExecutionContext, Future} import scala.jdk.CollectionConverters._ import scala.util.Try @@ -33,7 +34,7 @@ import org.apache.spark.{SparkEnv, SparkException, SparkSQLException} import org.apache.spark.api.python.PythonFunction.PythonAccumulator import org.apache.spark.connect.proto import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -105,8 +106,10 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio val eventManager: SessionEventsManager = SessionEventsManager(this, new SystemClock()) // Mapping from relation ID (passed to client) to runtime dataframe. Used for callbacks like - // foreachBatch() in Streaming. Lazy since most sessions don't need it. - private lazy val dataFrameCache: ConcurrentMap[String, DataFrame] = new ConcurrentHashMap() + // foreachBatch() in Streaming, and DataFrame.checkpoint API. Lazy since most sessions don't + // need it. + private[spark] lazy val dataFrameCache: ConcurrentMap[String, DataFrame] = + new ConcurrentHashMap() // Mapping from id to StreamingQueryListener. Used for methods like removeListener() in // StreamingQueryManager. 
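The checkpoint-related changes in this patch revolve around the per-session dataFrameCache noted above: the server keeps the materialized DataFrame and hands the client only an opaque relation ID (a UUID), which later plans can reference through CachedRemoteRelation and release with RemoveCachedRemoteRelation. The sketch below illustrates that bookkeeping in isolation; the RelationCache class and its method names are illustrative stand-ins and are not part of this patch.

import java.util.UUID
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.sql.DataFrame

// Minimal stand-in for the relation-ID-to-DataFrame bookkeeping kept per session.
class RelationCache {
  private val cache = new ConcurrentHashMap[String, DataFrame]()

  // Register a DataFrame under a fresh UUID and hand the id back to the caller,
  // similar in spirit to what handleCheckpointCommand does via cacheDataFrameById.
  def put(df: DataFrame): String = {
    val id = UUID.randomUUID().toString
    cache.put(id, df)
    id
  }

  // Look up a previously cached DataFrame when a plan references it by id.
  def get(id: String): Option[DataFrame] = Option(cache.get(id))

  // Drop the entry once the client asks for the cached relation to be removed.
  def remove(id: String): Unit = cache.remove(id)
}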
@@ -177,12 +180,14 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio */ private[service] def interruptAll(): Seq[String] = { val interruptedIds = new mutable.ArrayBuffer[String]() + val operationsIds = + SparkConnectService.streamingSessionManager.cleanupRunningQueries(this, blocking = false) executions.asScala.values.foreach { execute => if (execute.interrupt()) { interruptedIds += execute.operationId } } - interruptedIds.toSeq + interruptedIds.toSeq ++ operationsIds } /** @@ -192,6 +197,8 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio */ private[service] def interruptTag(tag: String): Seq[String] = { val interruptedIds = new mutable.ArrayBuffer[String]() + val queries = SparkConnectService.streamingSessionManager.getTaggedQuery(tag, session) + queries.foreach(q => Future(q.query.stop())(ExecutionContext.global)) executions.asScala.values.foreach { execute => if (execute.sparkSessionTags.contains(tag)) { if (execute.interrupt()) { @@ -199,7 +206,7 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio } } } - interruptedIds.toSeq + interruptedIds.toSeq ++ queries.map(_.operationId) } /** @@ -296,7 +303,7 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio // Clean up running streaming queries. // Note: there can be concurrent streaming queries being started. - SparkConnectService.streamingSessionManager.cleanupRunningQueries(this) + SparkConnectService.streamingSessionManager.cleanupRunningQueries(this, blocking = true) streamingForeachBatchRunnerCleanerCache.cleanUpAll() // Clean up any streaming workers. removeAllListeners() // removes all listener and stop python listener processes if necessary. diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectExecutionManager.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectExecutionManager.scala index 4fe7f3eceb81a..6681a5f509c6e 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectExecutionManager.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectExecutionManager.scala @@ -29,7 +29,7 @@ import com.google.common.cache.CacheBuilder import org.apache.spark.{SparkEnv, SparkSQLException} import org.apache.spark.connect.proto -import org.apache.spark.internal.{Logging, LogKey, MDC} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.connect.config.Connect.{CONNECT_EXECUTE_MANAGER_ABANDONED_TOMBSTONES_SIZE, CONNECT_EXECUTE_MANAGER_DETACHED_TIMEOUT, CONNECT_EXECUTE_MANAGER_MAINTENANCE_INTERVAL} import org.apache.spark.util.ThreadUtils @@ -95,7 +95,7 @@ private[connect] class SparkConnectExecutionManager() extends Logging { sessionHolder.addExecuteHolder(executeHolder) executions.put(executeHolder.key, executeHolder) lastExecutionTimeMs = None - logInfo(log"ExecuteHolder ${MDC(LogKey.EXECUTE_KEY, executeHolder.key)} is created.") + logInfo(log"ExecuteHolder ${MDC(LogKeys.EXECUTE_KEY, executeHolder.key)} is created.") } schedulePeriodicChecks() // Starts the maintenance thread if it hasn't started. 
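The SessionHolder changes above distinguish a blocking stop of a streaming query (used when the whole session is released) from a fire-and-forget stop scheduled on ExecutionContext.global (used by interruptAll and interruptTag so the interrupt call can return promptly). A minimal sketch of that pattern follows; the QueryStopper object is an illustrative name, not something introduced by this patch.

import scala.concurrent.{ExecutionContext, Future}

import org.apache.spark.sql.streaming.StreamingQuery

// Illustrative helper: stop a streaming query either synchronously or on a background thread.
object QueryStopper {
  def stop(query: StreamingQuery, blocking: Boolean): Unit = {
    if (blocking) {
      // Waits until stop() returns; suitable when tearing down the whole session.
      query.stop()
    } else {
      // Hands the potentially slow stop() call to the global execution context so the
      // caller (e.g. an interrupt handler) does not block on it.
      Future(query.stop())(ExecutionContext.global)
    }
  }
}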
@@ -122,7 +122,7 @@ private[connect] class SparkConnectExecutionManager() extends Logging { if (executions.isEmpty) { lastExecutionTimeMs = Some(System.currentTimeMillis()) } - logInfo(log"ExecuteHolder ${MDC(LogKey.EXECUTE_KEY, key)} is removed.") + logInfo(log"ExecuteHolder ${MDC(LogKeys.EXECUTE_KEY, key)} is removed.") } // close the execution outside the lock executeHolder.foreach { e => @@ -147,7 +147,7 @@ private[connect] class SparkConnectExecutionManager() extends Logging { sessionExecutionHolders.foreach { case (_, executeHolder) => val info = executeHolder.getExecuteInfo logInfo( - log"Execution ${MDC(LogKey.EXECUTE_INFO, info)} removed in removeSessionExecutions.") + log"Execution ${MDC(LogKeys.EXECUTE_INFO, info)} removed in removeSessionExecutions.") removeExecuteHolder(executeHolder.key, abandoned = true) } } @@ -202,7 +202,7 @@ private[connect] class SparkConnectExecutionManager() extends Logging { val interval = SparkEnv.get.conf.get(CONNECT_EXECUTE_MANAGER_MAINTENANCE_INTERVAL) logInfo( log"Starting thread for cleanup of abandoned executions every " + - log"${MDC(LogKey.INTERVAL, interval)} ms") + log"${MDC(LogKeys.INTERVAL, interval)} ms") scheduledExecutor = Some(Executors.newSingleThreadScheduledExecutor()) scheduledExecutor.get.scheduleAtFixedRate( () => { @@ -242,7 +242,7 @@ private[connect] class SparkConnectExecutionManager() extends Logging { toRemove.foreach { executeHolder => val info = executeHolder.getExecuteInfo logInfo( - log"Found execution ${MDC(LogKey.EXECUTE_INFO, info)} that was abandoned " + + log"Found execution ${MDC(LogKeys.EXECUTE_INFO, info)} that was abandoned " + log"and expired and will be removed.") removeExecuteHolder(executeHolder.key, abandoned = true) } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala index c1b1bacba3b6d..90759c00ccfca 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala @@ -46,8 +46,14 @@ object SparkConnectInterceptorRegistry { * @param sb */ def chainInterceptors(sb: NettyServerBuilder): Unit = { + chainInterceptors(sb, createConfiguredInterceptors()) + } + + def chainInterceptors( + sb: NettyServerBuilder, + additionalInterceptors: Seq[ServerInterceptor]): Unit = { interceptorChain.foreach(i => sb.intercept(i())) - createConfiguredInterceptors().foreach(sb.intercept(_)) + additionalInterceptors.foreach(sb.intercept(_)) } // Type used to identify the closure responsible to instantiate a ServerInterceptor. 
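The chainInterceptors refactor above lets callers supply a precomputed list of interceptors instead of re-reading them from the configuration. A minimal sketch of chaining gRPC server interceptors onto a NettyServerBuilder follows; the MethodNameLoggingInterceptor and InterceptorChaining names are illustrative and not part of this patch.

import io.grpc.{Metadata, ServerCall, ServerCallHandler, ServerInterceptor}
import io.grpc.netty.NettyServerBuilder

// Illustrative interceptor that records the gRPC method name before delegating the call.
class MethodNameLoggingInterceptor extends ServerInterceptor {
  override def interceptCall[ReqT, RespT](
      call: ServerCall[ReqT, RespT],
      headers: Metadata,
      next: ServerCallHandler[ReqT, RespT]): ServerCall.Listener[ReqT] = {
    println(s"RPC: ${call.getMethodDescriptor.getFullMethodName}")
    next.startCall(call, headers)
  }
}

object InterceptorChaining {
  // Attach every interceptor to the builder, in order, before the server is built.
  def chain(sb: NettyServerBuilder, interceptors: Seq[ServerInterceptor]): NettyServerBuilder = {
    interceptors.foreach(sb.intercept(_))
    sb
  }
}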
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala index c55600886a393..4f05ea927e12b 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala @@ -17,12 +17,8 @@ package org.apache.spark.sql.connect.service -import java.net.InetSocketAddress - -import scala.jdk.CollectionConverters._ - import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{HOST, PORT} +import org.apache.spark.internal.LogKeys.{HOST, PORT} import org.apache.spark.sql.SparkSession /** @@ -36,12 +32,10 @@ object SparkConnectServer extends Logging { try { try { SparkConnectService.start(session.sparkContext) - SparkConnectService.server.getListenSockets.asScala.foreach { sa => - val isa = sa.asInstanceOf[InetSocketAddress] - logInfo( - log"Spark Connect server started at: " + - log"${MDC(HOST, isa.getAddress.getHostAddress)}:${MDC(PORT, isa.getPort)}") - } + val isa = SparkConnectService.bindingAddress + logInfo( + log"Spark Connect server started at: " + + log"${MDC(HOST, isa.getAddress.getHostAddress)}:${MDC(PORT, isa.getPort)}") } catch { case e: Exception => logError("Error starting Spark Connect server", e) @@ -49,8 +43,10 @@ object SparkConnectServer extends Logging { } SparkConnectService.server.awaitTermination() } finally { + if (SparkConnectService.started) { + SparkConnectService.stop() + } session.stop() - SparkConnectService.uiTab.foreach(_.detach()) } } } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala index 4b35971286ddf..e9c92f8d007ea 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala @@ -31,18 +31,20 @@ import io.grpc.protobuf.services.ProtoReflectionService import io.grpc.stub.StreamObserver import org.apache.commons.lang3.StringUtils -import org.apache.spark.{SparkContext, SparkEnv} +import org.apache.spark.{SparkConf, SparkContext, SparkEnv} import org.apache.spark.connect.proto import org.apache.spark.connect.proto.{AddArtifactsRequest, AddArtifactsResponse, SparkConnectServiceGrpc} import org.apache.spark.connect.proto.SparkConnectServiceGrpc.AsyncService import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.HOST +import org.apache.spark.internal.LogKeys.HOST import org.apache.spark.internal.config.UI.UI_ENABLED -import org.apache.spark.sql.connect.config.Connect.{CONNECT_GRPC_BINDING_ADDRESS, CONNECT_GRPC_BINDING_PORT, CONNECT_GRPC_MARSHALLER_RECURSION_LIMIT, CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE} +import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerEvent} +import org.apache.spark.sql.connect.config.Connect.{CONNECT_GRPC_BINDING_ADDRESS, CONNECT_GRPC_BINDING_PORT, CONNECT_GRPC_MARSHALLER_RECURSION_LIMIT, CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE, CONNECT_GRPC_PORT_MAX_RETRIES} import org.apache.spark.sql.connect.execution.ConnectProgressExecutionListener import org.apache.spark.sql.connect.ui.{SparkConnectServerAppStatusStore, SparkConnectServerListener, 
SparkConnectServerTab} import org.apache.spark.sql.connect.utils.ErrorUtils import org.apache.spark.status.ElementTrackingStore +import org.apache.spark.util.Utils /** * The SparkConnectService implementation. @@ -283,10 +285,12 @@ class SparkConnectService(debug: Boolean) extends AsyncService with BindableServ object SparkConnectService extends Logging { private[connect] var server: Server = _ + private[connect] var bindingAddress: InetSocketAddress = _ private[connect] var uiTab: Option[SparkConnectServerTab] = None private[connect] var listener: SparkConnectServerListener = _ private[connect] var executionListener: Option[ConnectProgressExecutionListener] = None + private[connect] var listenerBus: LiveListenerBus = _ // For testing purpose, it's package level private. private[connect] def localPort: Int = { @@ -296,6 +300,10 @@ object SparkConnectService extends Logging { server.getPort } + private[connect] def hostAddress: String = { + Utils.localCanonicalHostName() + } + private[connect] lazy val executionManager = new SparkConnectExecutionManager() private[connect] lazy val sessionManager = new SparkConnectSessionManager() @@ -303,6 +311,10 @@ object SparkConnectService extends Logging { private[connect] val streamingSessionManager = new SparkConnectStreamingQueryCache() + // Package level private for testing purpose. + @volatile private[connect] var started = false + @volatile private[connect] var stopped = false + /** * Based on the userId and sessionId, find or create a new SparkSession. */ @@ -315,6 +327,13 @@ object SparkConnectService extends Logging { previoslyObservedSessionId) } + // For testing + private[spark] def getOrCreateIsolatedSession( + userId: String, + sessionId: String): SessionHolder = { + getOrCreateIsolatedSession(userId, sessionId, None) + } + /** * If there are no executions, return Left with System.currentTimeMillis of last active * execution. Otherwise return Right with list of ExecuteInfo of all executions. @@ -336,6 +355,7 @@ object SparkConnectService extends Logging { // Add the execution listener needed for query progress. 
executionListener = Some(new ConnectProgressExecutionListener) sc.addSparkListener(executionListener.get) + listenerBus = sc.listenerBus } /** @@ -344,35 +364,75 @@ object SparkConnectService extends Logging { private def startGRPCService(): Unit = { val debugMode = SparkEnv.get.conf.getBoolean("spark.connect.grpc.debug.enabled", true) val bindAddress = SparkEnv.get.conf.get(CONNECT_GRPC_BINDING_ADDRESS) - val port = SparkEnv.get.conf.get(CONNECT_GRPC_BINDING_PORT) - val sb = bindAddress match { - case Some(hostname) => - logInfo(log"start GRPC service at: ${MDC(HOST, hostname)}") - NettyServerBuilder.forAddress(new InetSocketAddress(hostname, port)) - case _ => NettyServerBuilder.forPort(port) + val startPort = SparkEnv.get.conf.get(CONNECT_GRPC_BINDING_PORT) + val sparkConnectService = new SparkConnectService(debugMode) + val protoReflectionService = + if (debugMode) Some(ProtoReflectionService.newInstance()) else None + val configuredInterceptors = SparkConnectInterceptorRegistry.createConfiguredInterceptors() + + val startServiceFn = (port: Int) => { + val sb = bindAddress match { + case Some(hostname) => + logInfo(log"start GRPC service at: ${MDC(HOST, hostname)}") + NettyServerBuilder.forAddress(new InetSocketAddress(hostname, port)) + case _ => NettyServerBuilder.forPort(port) + } + sb.maxInboundMessageSize(SparkEnv.get.conf.get(CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE).toInt) + .addService(sparkConnectService) + + // Add all registered interceptors to the server builder. + SparkConnectInterceptorRegistry.chainInterceptors(sb, configuredInterceptors) + + // If debug mode is configured, load the ProtoReflection service so that tools like + // grpcurl can introspect the API for debugging. + protoReflectionService.foreach(service => sb.addService(service)) + + server = sb.build + server.start() + + // It will throw an IllegalStateException if you want to access the binding address + // while the server is in a terminated state, so record the actual binding address + // immediately after the server starts. + // There should only be one address, get the actual binding address + // of the server according the `server.port()` + bindingAddress = server.getListenSockets.asScala + .find(_.isInstanceOf[InetSocketAddress]) + .get + .asInstanceOf[InetSocketAddress] + + (server, server.getPort) } - sb.maxInboundMessageSize(SparkEnv.get.conf.get(CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE).toInt) - .addService(new SparkConnectService(debugMode)) - - // Add all registered interceptors to the server builder. - SparkConnectInterceptorRegistry.chainInterceptors(sb) - // If debug mode is configured, load the ProtoReflection service so that tools like - // grpcurl can introspect the API for debugging. 
- if (debugMode) { - sb.addService(ProtoReflectionService.newInstance()) - } - server = sb.build - server.start() + val maxRetries: Int = SparkEnv.get.conf.get(CONNECT_GRPC_PORT_MAX_RETRIES) + Utils.startServiceOnPort[Server](startPort, startServiceFn, maxRetries, getClass.getName) } // Starts the service - def start(sc: SparkContext): Unit = { + def start(sc: SparkContext): Unit = synchronized { + if (started) { + logWarning("The Spark Connect service has already started.") + return + } + startGRPCService() createListenerAndUI(sc) + + started = true + stopped = false + postSparkConnectServiceStarted(sc) } - def stop(timeout: Option[Long] = None, unit: Option[TimeUnit] = None): Unit = { + def stop(timeout: Option[Long] = None, unit: Option[TimeUnit] = None): Unit = synchronized { + if (stopped) { + logWarning("The Spark Connect service has already been stopped.") + return + } + + if (!started) { + throw new IllegalStateException( + "Attempting to stop the Spark Connect service that has not been started.") + } + if (server != null) { if (timeout.isDefined && unit.isDefined) { server.shutdown() @@ -385,6 +445,57 @@ object SparkConnectService extends Logging { executionManager.shutdown() sessionManager.shutdown() uiTab.foreach(_.detach()) + + started = false + stopped = true + postSparkConnectServiceEnd() + } + + /** + * Post the event that the Spark Connect service has started. This is expected to be called only + * once after the service is ready. + */ + private def postSparkConnectServiceStarted(sc: SparkContext): Unit = { + postServiceEvent(isa => + SparkListenerConnectServiceStarted( + hostAddress, + isa.getPort, + sc.conf, + System.currentTimeMillis())) + } + + /** + * Post the event that the Spark Connect service is offline. + */ + private[connect] def postSparkConnectServiceEnd(): Unit = { + postServiceEvent(isa => + SparkListenerConnectServiceEnd(hostAddress, isa.getPort, System.currentTimeMillis())) + } + + /** + * Post the event to the Spark listener bus. To deliver the event to the listeners, the listener + * bus must be active in this time. + */ + private def postServiceEvent(eventBuilder: InetSocketAddress => SparkListenerEvent): Unit = { + // Sanity checks + if (server == null) { + logWarning( + "The Spark Connect event was dropped because the server bus has not been created and set.") + return + } + + if (bindingAddress == null) { + logWarning( + "The Spark Connect event was dropped because the internal server address is not set.") + return + } + + if (listenerBus == null) { + logWarning("The Spark Connect event was dropped because the listener bus has not been set.") + return + } + + listenerBus.post(eventBuilder(bindingAddress)) } def extractErrorMessage(st: Throwable): String = { @@ -400,3 +511,38 @@ object SparkConnectService extends Logging { } } } + +/** + * The event is sent after the Spark Connect service has started and is ready to receive the + * inbound requests. + * + * @param hostAddress: + * The host address of the started Spark Connect service. + * @param bindingPort: + * The binding port of the started Spark Connect service. + * @param sparkConf: + * The SparkConf of the active SparkContext that associated with the service. + * @param eventTime: + * The time in ms when the event was generated. + */ +case class SparkListenerConnectServiceStarted( + hostAddress: String, + bindingPort: Int, + sparkConf: SparkConf, + eventTime: Long) + extends SparkListenerEvent + +/** + * The event is sent to inform that Spark Connect service has already been shutdown. 
This event + * indicates the end of the service, and any in-processing requests or upcoming requests are not + * guaranteed to be handled properly by the service. + * + * @param hostAddress: + * The host address of the Spark Connect service. + * @param bindingPort: + * The binding port of the Spark Connect service. + * @param eventTime: + * The time in ms when the event was generated. + */ +case class SparkListenerConnectServiceEnd(hostAddress: String, bindingPort: Int, eventTime: Long) + extends SparkListenerEvent diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala index 1a34964932ef2..edaaa640bf12e 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManager.scala @@ -30,7 +30,7 @@ import com.google.common.cache.CacheBuilder import org.apache.spark.{SparkEnv, SparkSQLException} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{INTERVAL, SESSION_HOLD_INFO} +import org.apache.spark.internal.LogKeys.{INTERVAL, SESSION_HOLD_INFO} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connect.config.Connect.{CONNECT_SESSION_MANAGER_CLOSED_SESSIONS_TOMBSTONES_SIZE, CONNECT_SESSION_MANAGER_DEFAULT_SESSION_TIMEOUT, CONNECT_SESSION_MANAGER_MAINTENANCE_INTERVAL} import org.apache.spark.util.ThreadUtils diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCache.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCache.scala index 4c9b3baa689b3..03719ddd87419 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCache.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCache.scala @@ -23,11 +23,12 @@ import java.util.concurrent.TimeUnit import javax.annotation.concurrent.GuardedBy import scala.collection.mutable +import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.{Duration, DurationInt, FiniteDuration} import scala.util.control.NonFatal import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{DURATION, NEW_VALUE, OLD_VALUE, QUERY_CACHE_VALUE, QUERY_ID, SESSION_ID} +import org.apache.spark.internal.LogKeys.{DURATION, NEW_VALUE, OLD_VALUE, QUERY_CACHE_VALUE, QUERY_ID, SESSION_ID} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.StreamingQuery import org.apache.spark.util.{Clock, SystemClock, ThreadUtils} @@ -55,16 +56,28 @@ private[connect] class SparkConnectStreamingQueryCache( import SparkConnectStreamingQueryCache._ - def registerNewStreamingQuery(sessionHolder: SessionHolder, query: StreamingQuery): Unit = { - queryCacheLock.synchronized { + def registerNewStreamingQuery( + sessionHolder: SessionHolder, + query: StreamingQuery, + tags: Set[String], + operationId: String): Unit = queryCacheLock.synchronized { + taggedQueriesLock.synchronized { val value = QueryCacheValue( userId = sessionHolder.userId, sessionId = sessionHolder.sessionId, session = sessionHolder.session, query = query, + operationId = operationId, expiresAtMs = None) - 
queryCache.put(QueryCacheKey(query.id.toString, query.runId.toString), value) match { + val queryKey = QueryCacheKey(query.id.toString, query.runId.toString) + tags.foreach { tag => + taggedQueries + .getOrElseUpdate(tag, new mutable.ArrayBuffer[QueryCacheKey]) + .addOne(queryKey) + } + + queryCache.put(queryKey, value) match { case Some(existing) => // Query is being replace. Not really expected. logWarning(log"Replacing existing query in the cache (unexpected). " + log"Query Id: ${MDC(QUERY_ID, query.id)}.Existing value ${MDC(OLD_VALUE, existing)}, " + @@ -80,7 +93,7 @@ private[connect] class SparkConnectStreamingQueryCache( } /** - * Returns [[StreamingQuery]] if it is cached and session matches the cached query. It ensures + * Returns [[QueryCacheValue]] if it is cached and session matches the cached query. It ensures * the session associated with it matches the session passed into the call. If the query is * inactive (i.e. it has a cache expiry time set), this access extends its expiry time. So if a * client keeps accessing a query, it stays in the cache. @@ -88,8 +101,35 @@ private[connect] class SparkConnectStreamingQueryCache( def getCachedQuery( queryId: String, runId: String, - session: SparkSession): Option[StreamingQuery] = { - val key = QueryCacheKey(queryId, runId) + tags: Set[String], + session: SparkSession): Option[QueryCacheValue] = { + taggedQueriesLock.synchronized { + val key = QueryCacheKey(queryId, runId) + val result = getCachedQuery(QueryCacheKey(queryId, runId), session) + tags.foreach { tag => + taggedQueries.getOrElseUpdate(tag, new mutable.ArrayBuffer[QueryCacheKey]).addOne(key) + } + result + } + } + + /** + * Similar with [[getCachedQuery]] but it gets queries tagged previously. + */ + def getTaggedQuery(tag: String, session: SparkSession): Seq[QueryCacheValue] = { + taggedQueriesLock.synchronized { + taggedQueries + .get(tag) + .map { k => + k.flatMap(getCachedQuery(_, session)).toSeq + } + .getOrElse(Seq.empty[QueryCacheValue]) + } + } + + private def getCachedQuery( + key: QueryCacheKey, + session: SparkSession): Option[QueryCacheValue] = { queryCacheLock.synchronized { queryCache.get(key).flatMap { v => if (v.session == session) { @@ -98,7 +138,7 @@ private[connect] class SparkConnectStreamingQueryCache( val expiresAtMs = clock.getTimeMillis() + stoppedQueryInactivityTimeout.toMillis queryCache.put(key, v.copy(expiresAtMs = Some(expiresAtMs))) } - Some(v.query) + Some(v) } else None // Should be rare, may be client is trying access from a different session. } } @@ -109,7 +149,10 @@ private[connect] class SparkConnectStreamingQueryCache( * the queryCache. This is used when session is expired and we need to cleanup resources of that * session. 
*/ - def cleanupRunningQueries(sessionHolder: SessionHolder): Unit = { + def cleanupRunningQueries( + sessionHolder: SessionHolder, + blocking: Boolean = true): Seq[String] = { + val operationIds = new mutable.ArrayBuffer[String]() for ((k, v) <- queryCache) { if (v.userId.equals(sessionHolder.userId) && v.sessionId.equals(sessionHolder.sessionId)) { if (v.query.isActive && Option(v.session.streams.get(k.queryId)).nonEmpty) { @@ -117,7 +160,12 @@ private[connect] class SparkConnectStreamingQueryCache( log"Stopping the query with id ${MDC(QUERY_ID, k.queryId)} " + log"since the session has timed out") try { - v.query.stop() + if (blocking) { + v.query.stop() + } else { + Future(v.query.stop())(ExecutionContext.global) + } + operationIds.addOne(v.operationId) } catch { case NonFatal(ex) => logWarning( @@ -128,6 +176,7 @@ private[connect] class SparkConnectStreamingQueryCache( } } } + operationIds.toSeq } // Visible for testing @@ -146,6 +195,10 @@ private[connect] class SparkConnectStreamingQueryCache( private val queryCache = new mutable.HashMap[QueryCacheKey, QueryCacheValue] private val queryCacheLock = new Object + @GuardedBy("queryCacheLock") + private val taggedQueries = new mutable.HashMap[String, mutable.ArrayBuffer[QueryCacheKey]] + private val taggedQueriesLock = new Object + @GuardedBy("queryCacheLock") private var scheduledExecutor: Option[ScheduledExecutorService] = None @@ -176,7 +229,7 @@ private[connect] class SparkConnectStreamingQueryCache( * - Update status of query if it is inactive. Sets an expiry time for such queries * - Drop expired queries from the cache. */ - private def periodicMaintenance(): Unit = { + private def periodicMaintenance(): Unit = taggedQueriesLock.synchronized { queryCacheLock.synchronized { val nowMs = clock.getTimeMillis() @@ -212,6 +265,18 @@ private[connect] class SparkConnectStreamingQueryCache( } } } + + taggedQueries.toArray.foreach { case (key, value) => + value.zipWithIndex.toArray.foreach { case (queryKey, i) => + if (queryCache.contains(queryKey)) { + value.remove(i) + } + } + + if (value.isEmpty) { + taggedQueries.remove(key) + } + } } } } @@ -225,6 +290,7 @@ private[connect] object SparkConnectStreamingQueryCache { sessionId: String, session: SparkSession, // Holds the reference to the session. query: StreamingQuery, // Holds the reference to the query. + operationId: String, expiresAtMs: Option[Long] = None // Expiry time for a stopped query. 
) { override def toString(): String = diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListener.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListener.scala index a1bbab7dbdbc2..65db08be7f904 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListener.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListener.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkConf, SparkContext, SparkEnv} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{OP_ID, SESSION_ID} +import org.apache.spark.internal.LogKeys.{OP_ID, SESSION_ID} import org.apache.spark.internal.config.Status.LIVE_ENTITY_UPDATE_PERIOD import org.apache.spark.scheduler._ import org.apache.spark.sql.connect.config.Connect.{CONNECT_UI_SESSION_LIMIT, CONNECT_UI_STATEMENT_LIMIT} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/ErrorUtils.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/ErrorUtils.scala index b1bfe71930fb1..355048cf30363 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/ErrorUtils.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/ErrorUtils.scala @@ -35,11 +35,11 @@ import org.apache.commons.lang3.exception.ExceptionUtils import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods -import org.apache.spark.{SparkEnv, SparkException, SparkThrowable} +import org.apache.spark.{QueryContextType, SparkEnv, SparkException, SparkThrowable} import org.apache.spark.api.python.PythonException import org.apache.spark.connect.proto.FetchErrorDetailsResponse import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{OP_TYPE, SESSION_ID, USER_ID} +import org.apache.spark.internal.LogKeys.{OP_TYPE, SESSION_ID, USER_ID} import org.apache.spark.sql.connect.config.Connect import org.apache.spark.sql.connect.service.{ExecuteEventsManager, SessionHolder, SessionKey, SparkConnectService} import org.apache.spark.sql.internal.SQLConf @@ -118,15 +118,27 @@ private[connect] object ErrorUtils extends Logging { sparkThrowableBuilder.setErrorClass(sparkThrowable.getErrorClass) } for (queryCtx <- sparkThrowable.getQueryContext) { - sparkThrowableBuilder.addQueryContexts( - FetchErrorDetailsResponse.QueryContext - .newBuilder() + val builder = FetchErrorDetailsResponse.QueryContext + .newBuilder() + val context = if (queryCtx.contextType() == QueryContextType.SQL) { + builder + .setContextType(FetchErrorDetailsResponse.QueryContext.ContextType.SQL) .setObjectType(queryCtx.objectType()) .setObjectName(queryCtx.objectName()) .setStartIndex(queryCtx.startIndex()) .setStopIndex(queryCtx.stopIndex()) .setFragment(queryCtx.fragment()) - .build()) + .setSummary(queryCtx.summary()) + .build() + } else { + builder + .setContextType(FetchErrorDetailsResponse.QueryContext.ContextType.DATAFRAME) + .setFragment(queryCtx.fragment()) + .setCallSite(queryCtx.callSite()) + .setSummary(queryCtx.summary()) + .build() + } + sparkThrowableBuilder.addQueryContexts(context) } if (sparkThrowable.getSqlState != null) { sparkThrowableBuilder.setSqlState(sparkThrowable.getSqlState) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala 
b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala index e2e4128311871..d76bec5454abb 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala @@ -70,6 +70,7 @@ private[connect] object MetricGenerator extends AdaptiveSparkPlanHelper { .newBuilder() .setName(p.nodeName) .setPlanId(p.id) + .setParent(parentId) .putAllExecutionMetrics(mv.asJava) .build() Seq(mo) ++ transformChildren(p) diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala index cc9decb4c98bc..264c6aa70ae2e 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala @@ -26,7 +26,7 @@ import scala.util.{Failure, Success, Try} import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.connect.proto -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.{catalog, QueryPlanningTracker} import org.apache.spark.sql.catalyst.analysis.{caseSensitiveResolution, Analyzer, FunctionRegistry, Resolver, TableFunctionRegistry} @@ -126,6 +126,7 @@ class ProtoToParsedPlanTestSuite Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES.key, "org.apache.spark.sql.connect.plugin.ExampleExpressionPlugin") .set(org.apache.spark.sql.internal.SQLConf.ANSI_ENABLED.key, false.toString) + .set(org.apache.spark.sql.internal.SQLConf.USE_COMMON_EXPR_ID_FOR_ALIAS.key, false.toString) } protected val suiteBaseResourcePath = commonResourcePath.resolve("query-tests") diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/dsl/package.scala similarity index 95% rename from connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala rename to connector/connect/server/src/test/scala/org/apache/spark/sql/connect/dsl/package.scala index 6aadb6c34b779..3edb63ee8e815 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/dsl/package.scala @@ -332,6 +332,21 @@ package object dsl { def sql(sqlText: String): Relation = { Relation.newBuilder().setSql(SQL.newBuilder().setQuery(sqlText)).build() } + + def table(name: String): Relation = { + proto.Relation + .newBuilder() + .setRead( + proto.Read + .newBuilder() + .setNamedTable( + proto.Read.NamedTable + .newBuilder() + .setUnparsedIdentifier(name) + .build()) + .build()) + .build() + } } implicit class DslNAFunctions(val logicalPlan: Relation) { @@ -513,6 +528,25 @@ package object dsl { freqItems(cols.toArray, support) def freqItems(cols: Seq[String]): Relation = freqItems(cols, 0.01) + + def sampleBy(col: String, fractions: Map[Any, Double], seed: Long): Relation = { + Relation + .newBuilder() + .setSampleBy( + StatSampleBy + .newBuilder() + .setInput(logicalPlan) + .addAllFractions(fractions.toSeq.map { case (k, v) => + StatSampleBy.Fraction + .newBuilder() + .setStratum(toLiteralProto(k)) + .setFraction(v) + .build() + 
}.asJava) + .setSeed(seed) + .build()) + .build() + } } def select(exprs: Expression*): Relation = { @@ -587,6 +621,10 @@ package object dsl { .build() } + def filter(condition: Expression): Relation = { + where(condition) + } + def deduplicate(colNames: Seq[String]): Relation = Relation .newBuilder() @@ -641,6 +679,10 @@ package object dsl { join(otherPlan, joinType, usingColumns, None) } + def crossJoin(otherPlan: Relation): Relation = { + join(otherPlan, JoinType.JOIN_TYPE_CROSS, Seq(), None) + } + private def join( otherPlan: Relation, joinType: JoinType = JoinType.JOIN_TYPE_INNER, @@ -663,7 +705,7 @@ package object dsl { def as(alias: String): Relation = { Relation - .newBuilder(logicalPlan) + .newBuilder() .setSubqueryAlias(SubqueryAlias.newBuilder().setAlias(alias).setInput(logicalPlan)) .build() } @@ -693,9 +735,10 @@ package object dsl { .setNullOrdering(Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST) .setDirection(Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING) .setChild( - Expression.newBuilder + Expression + .newBuilder() .setUnresolvedAttribute( - Expression.UnresolvedAttribute.newBuilder.setUnparsedIdentifier(col).build()) + Expression.UnresolvedAttribute.newBuilder().setUnparsedIdentifier(col).build()) .build()) .build() } @@ -992,7 +1035,13 @@ package object dsl { WithColumnsRenamed .newBuilder() .setInput(logicalPlan) - .putAllRenameColumnsMap(renameColumnsMap.asJava)) + .addAllRenames(renameColumnsMap.toSeq.map { case (k, v) => + WithColumnsRenamed.Rename + .newBuilder() + .setColName(k) + .setNewColName(v) + .build() + }.asJava)) .build() } diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala index dfada825df47d..70da1f0a2a1d0 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala @@ -20,11 +20,9 @@ package org.apache.spark.sql.connect.planner import scala.jdk.CollectionConverters._ import com.google.protobuf.ByteString -import io.grpc.stub.StreamObserver import org.apache.spark.SparkFunSuite import org.apache.spark.connect.proto -import org.apache.spark.connect.proto.ExecutePlanResponse import org.apache.spark.connect.proto.Expression.{Alias, ExpressionString, UnresolvedStar} import org.apache.spark.sql.{AnalysisException, Dataset, Row} import org.apache.spark.sql.catalyst.InternalRow @@ -34,7 +32,7 @@ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.connect.common.InvalidPlanInput import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto -import org.apache.spark.sql.connect.service.{ExecuteHolder, ExecuteStatus, SessionHolder, SessionStatus, SparkConnectService} +import org.apache.spark.sql.connect.service.SessionHolder import org.apache.spark.sql.execution.arrow.ArrowConverters import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} @@ -45,21 +43,12 @@ import org.apache.spark.unsafe.types.UTF8String * test cases. 
*/ trait SparkConnectPlanTest extends SharedSparkSession { - - class MockObserver extends StreamObserver[proto.ExecutePlanResponse] { - override def onNext(value: ExecutePlanResponse): Unit = {} - override def onError(t: Throwable): Unit = {} - override def onCompleted(): Unit = {} - } - def transform(rel: proto.Relation): logical.LogicalPlan = { - new SparkConnectPlanner(SessionHolder.forTesting(spark)).transformRelation(rel) + SparkConnectPlannerTestUtils.transform(spark, rel) } def transform(cmd: proto.Command): Unit = { - val executeHolder = buildExecutePlanHolder(cmd) - new SparkConnectPlanner(executeHolder) - .process(cmd, new MockObserver()) + SparkConnectPlannerTestUtils.transform(spark, cmd) } def readRel: proto.Relation = @@ -114,29 +103,6 @@ trait SparkConnectPlanTest extends SharedSparkSession { localRelationBuilder.setData(ByteString.copyFrom(bytes)) proto.Relation.newBuilder().setLocalRelation(localRelationBuilder.build()).build() } - - def buildExecutePlanHolder(command: proto.Command): ExecuteHolder = { - val sessionHolder = SessionHolder.forTesting(spark) - sessionHolder.eventManager.status_(SessionStatus.Started) - - val context = proto.UserContext - .newBuilder() - .setUserId(sessionHolder.userId) - .build() - val plan = proto.Plan - .newBuilder() - .setCommand(command) - .build() - val request = proto.ExecutePlanRequest - .newBuilder() - .setPlan(plan) - .setSessionId(sessionHolder.sessionId) - .setUserContext(context) - .build() - val executeHolder = SparkConnectService.executionManager.createExecuteHolder(request) - executeHolder.eventsManager.status_(ExecuteStatus.Started) - executeHolder - } } /** diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerTestUtils.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerTestUtils.scala new file mode 100644 index 0000000000000..c9d282af2e5ea --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerTestUtils.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.planner + +import io.grpc.stub.StreamObserver + +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.ExecutePlanResponse +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connect.service.{ExecuteHolder, ExecuteStatus, SessionHolder, SessionStatus, SparkConnectService} + +object SparkConnectPlannerTestUtils { + def transform(spark: SparkSession, relation: proto.Relation): LogicalPlan = { + new SparkConnectPlanner(SessionHolder.forTesting(spark)).transformRelation(relation) + } + + def transform(spark: SparkSession, command: proto.Command): Unit = { + val executeHolder = buildExecutePlanHolder(spark, command) + new SparkConnectPlanner(executeHolder).process(command, new MockObserver()) + } + + private def buildExecutePlanHolder( + spark: SparkSession, + command: proto.Command): ExecuteHolder = { + val sessionHolder = SessionHolder.forTesting(spark) + sessionHolder.eventManager.status_(SessionStatus.Started) + + val context = proto.UserContext + .newBuilder() + .setUserId(sessionHolder.userId) + .build() + val plan = proto.Plan + .newBuilder() + .setCommand(command) + .build() + val request = proto.ExecutePlanRequest + .newBuilder() + .setPlan(plan) + .setSessionId(sessionHolder.sessionId) + .setUserContext(context) + .build() + + val executeHolder = SparkConnectService.executionManager.createExecuteHolder(request) + executeHolder.eventsManager.status_(ExecuteStatus.Started) + executeHolder + } + + private class MockObserver extends StreamObserver[proto.ExecutePlanResponse] { + override def onNext(value: ExecutePlanResponse): Unit = {} + override def onError(t: Throwable): Unit = {} + override def onCompleted(): Unit = {} + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala index 632e2308fc76b..7e862bcfc533f 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala @@ -1036,7 +1036,7 @@ class SparkConnectProtoSuite extends PlanTest with SparkConnectPlanTest { } test("SPARK-47144: Collated string") { - Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").map(collationName => + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").map(collationName => Seq( s"select 'abc' collate $collationName", s"select collation('abc' collate $collationName)").map(query => diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala index af18fca9dd216..71ca0f44af680 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala @@ -826,7 +826,9 @@ class SparkConnectServiceSuite when(restartedQuery.runId).thenReturn(DEFAULT_UUID) SparkConnectService.streamingSessionManager.registerNewStreamingQuery( SparkConnectService.getOrCreateIsolatedSession("c1", sessionId, None), - restartedQuery) + restartedQuery, + Set.empty[String], + "") f(verifyEvents) } } diff 
--git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectWithSessionExtensionSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectWithSessionExtensionSuite.scala index 37c7fe25097c4..c234b4f068bc9 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectWithSessionExtensionSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectWithSessionExtensionSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.parser.{CompoundBody, ParserInterface} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.connect.service.SessionHolder import org.apache.spark.sql.types.{DataType, StructType} @@ -54,6 +54,9 @@ class SparkConnectWithSessionExtensionSuite extends SparkFunSuite { override def parseQuery(sqlText: String): LogicalPlan = delegate.parseQuery(sqlText) + + override def parseScript(sqlScriptText: String): CompoundBody = + delegate.parseScript(sqlScriptText) } test("Parse table name with test parser") { diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala index a213a36168e8d..512cdad62b921 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.connect.proto.Relation import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connect.ConnectProtoUtils import org.apache.spark.sql.connect.common.InvalidPlanInput import org.apache.spark.sql.connect.config.Connect import org.apache.spark.sql.connect.planner.{SparkConnectPlanner, SparkConnectPlanTest} @@ -68,7 +69,10 @@ class ExampleRelationPlugin extends RelationPlugin { return Optional.empty() } val plugin = rel.unpack(classOf[proto.ExamplePluginRelation]) - Optional.of(planner.transformRelation(plugin.getInput.toByteArray)) + val input = ConnectProtoUtils.parseRelationWithRecursionLimit( + plugin.getInput.toByteArray, + recursionLimit = 1024) + Optional.of(planner.transformRelation(input)) } } @@ -81,8 +85,10 @@ class ExampleExpressionPlugin extends ExpressionPlugin { return Optional.empty() } val exp = rel.unpack(classOf[proto.ExamplePluginExpression]) - Optional.of( - Alias(planner.transformExpression(exp.getChild.toByteArray), exp.getCustomField)()) + val child = ConnectProtoUtils.parseExpressionWithRecursionLimit( + exp.getChild.toByteArray, + recursionLimit = 1024) + Optional.of(Alias(planner.transformExpression(child), exp.getCustomField)()) } } @@ -198,9 +204,7 @@ class SparkConnectPluginRegistrySuite extends SharedSparkSession with SparkConne .build())) .build() - val executeHolder = buildExecutePlanHolder(plan) - new SparkConnectPlanner(executeHolder) - .process(plan, new MockObserver()) + transform(plan) 
assert(spark.sparkContext.getLocalProperty("testingProperty").equals("Martin")) } } diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceInternalServerSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceInternalServerSuite.scala new file mode 100644 index 0000000000000..3240b33f3f090 --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceInternalServerSuite.scala @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.service + +import java.net.ServerSocket +import java.util.concurrent.CopyOnWriteArrayList +import java.util.concurrent.Semaphore + +import scala.collection.mutable + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher +import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} +import org.apache.spark.sql.connect.SparkConnectPlugin +import org.apache.spark.sql.connect.config.Connect.{CONNECT_GRPC_BINDING_PORT, CONNECT_GRPC_PORT_MAX_RETRIES} +import org.apache.spark.util.Utils + +class SparkConnectServiceInternalServerSuite extends SparkFunSuite with LocalSparkContext { + + override def afterEach(): Unit = { + super.afterEach() + SparkConnectServiceLifeCycleListener.reset() + } + + test("The SparkConnectService will retry using different ports in case of conflicts") { + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local[1]") + sc = new SparkContext(conf) + + // 1. By default there is no retry, the SparkConnectService will fail to start + // if the port is already in use. + val startPort = 15002 + withSparkEnvConfs((CONNECT_GRPC_BINDING_PORT.key, startPort.toString)) { + withPortOccupied(startPort, startPort) { + val portConflicts = intercept[Throwable] { + SparkConnectService.start(sc) + } + portConflicts.printStackTrace() + assert(Utils.isBindCollision(portConflicts)) + } + } + + // 2. Enable the port retry, the SparkConnectService will retry using different ports + // until it finds an available port before reaching the maximum number of retries. + withSparkEnvConfs( + (CONNECT_GRPC_BINDING_PORT.key, startPort.toString), + (CONNECT_GRPC_PORT_MAX_RETRIES.key, "3")) { + // 15002, 15003, 15004 occupied + withPortOccupied(startPort, startPort + 2) { + SparkConnectService.start(sc) + assert(SparkConnectService.started) + assert(SparkConnectService.server.getPort == startPort + 3) // 15005 available + SparkConnectService.stop() + } + } + + // 3. 
It will fail if not able to find an available port + // before reaching the maximum number of retries. + withSparkEnvConfs( + (CONNECT_GRPC_BINDING_PORT.key, startPort.toString), + (CONNECT_GRPC_PORT_MAX_RETRIES.key, "1")) { + // 15002, 15003 occupied but only retried on 15003 and reach the maximum number of retries + withPortOccupied(startPort, startPort + 1) { + val portConflicts = intercept[Throwable] { + SparkConnectService.start(sc) + } + portConflicts.printStackTrace() + assert(Utils.isBindCollision(portConflicts)) + } + } + + // 4. The value of port will be validated before the service starts + Seq( + (CONNECT_GRPC_BINDING_PORT.key, (1024 - 1).toString), + (CONNECT_GRPC_BINDING_PORT.key, (65535 + 1).toString)).foreach(conf => { + withSparkEnvConfs(conf) { + val invalidPort = intercept[IllegalArgumentException] { + SparkConnectService.start(sc) + } + assert( + invalidPort.getMessage.contains( + "requirement failed: startPort should be between 1024 and 65535 (inclusive)," + + " or 0 for a random free port.")) + } + }) + } + + test("The SparkConnectService will post events for each pair of start and stop") { + // Future validations when listener receive the `SparkListenerConnectServiceStarted` event + val startedEventValidations: CopyOnWriteArrayList[(String, Boolean)] = + new CopyOnWriteArrayList[(String, Boolean)]() + val startedEventSignal = new Semaphore(0) + SparkConnectServiceLifeCycleListener.checksOnServiceStartedEvent = Some( + Seq( + _ => { + startedEventSignal.release() + startedEventValidations.add( + ("The listener should receive the `SparkListenerConnectServiceStarted` event.", true)) + }, + _ => { + startedEventValidations.add( + ( + "The server should has already been started" + + " if the listener receives the `SparkListenerConnectServiceStarted` event.", + SparkConnectService.started && + !SparkConnectService.stopped && + SparkConnectService.server != null)) + }, + serviceStarted => { + startedEventValidations.add( + ( + "The SparkConnectService should post it's address " + + "by the `SparkListenerConnectServiceStarted` event", + serviceStarted.bindingPort == SparkConnectService.server.getPort && + serviceStarted.hostAddress == SparkConnectService.hostAddress)) + })) + + // Future validations when listener receive the `SparkListenerConnectServiceEnd` event + val endEventValidations: CopyOnWriteArrayList[(String, Boolean)] = + new CopyOnWriteArrayList[(String, Boolean)]() + val endEventSignal = new Semaphore(0) + SparkConnectServiceLifeCycleListener.checksOnServiceEndEvent = Some( + Seq( + _ => { + endEventSignal.release() + startedEventValidations.add( + ("The listener should receive the `SparkListenerConnectServiceEnd` event.", true)) + }, + _ => { + endEventValidations.add( + ( + "The server has already been stopped" + + " if the listener receives the `SparkListenerConnectServiceEnd` event.", + SparkConnectService.stopped && + !SparkConnectService.started && + SparkConnectService.server.isShutdown)) + }, + serviceEnd => { + endEventValidations.add( + ( + "The SparkConnectService should post it's address " + + "by the `SparkListenerConnectServiceEnd` event", + serviceEnd.bindingPort == SparkConnectService.server.getPort && + serviceEnd.hostAddress == SparkConnectService.hostAddress)) + })) + + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local[1]") + sc = new SparkContext(conf) + + val listenerInstance = new SparkConnectServiceLifeCycleListener() + sc.addSparkListener(listenerInstance) + + // Start the 
+ // to receive the `SparkListenerConnectServiceStarted` event.
+ SparkConnectService.start(sc)
+ startedEventSignal.acquire()
+ // Now the listener should have already received the `SparkListenerConnectServiceStarted` event.
+
+ // The internal server of SparkConnectService should have
+ // already been created and started at this point.
+ assert(SparkConnectService.started && SparkConnectService.server != null)
+
+ // The event `SparkListenerConnectServiceStarted` should be posted
+ // during the startup of the SparkConnectService.
+ assert(listenerInstance.serviceStartedEvents.size() == 1)
+ // The server should have already been started when the listener receives the event,
+ // and the server address should be the same as the address of the service
+ startedEventValidations.forEach { case (msg, validated) =>
+ assert(validated, msg)
+ }
+ // In the meantime, no end event should be posted
+ assert(listenerInstance.serviceEndEvents.size() == 0)
+
+ // The listener is able to get the SparkConf from the event
+ val event = listenerInstance.serviceStartedEvents.get(0)
+ assert(event.sparkConf != null)
+ val sparkConf = event.sparkConf
+ assert(sparkConf.contains("spark.driver.host"))
+ assert(sparkConf.contains("spark.app.id"))
+
+ // Try to start an already started SparkConnectService
+ SparkConnectService.start(sc)
+ // The listener should still receive only one started event,
+ // because the server has not been stopped yet and the service start is not duplicated
+ assert(listenerInstance.serviceStartedEvents.size() == 1)
+
+ // Stop the SparkConnectService
+ SparkConnectService.stop()
+ assert(SparkConnectService.stopped)
+ // The listener should receive the `SparkListenerConnectServiceEnd` event
+ endEventSignal.acquire()
+
+ // The event `SparkListenerConnectServiceEnd` should be posted and received by the listener
+ assert(listenerInstance.serviceEndEvents.size() == 1)
+ // The server should have already been stopped when the listener receives the event,
+ // and the server address should be the same as the address of the service
+ endEventValidations.forEach { case (msg, validated) =>
+ assert(validated, msg)
+ }
+
+ // Try to stop an already stopped SparkConnectService
+ SparkConnectService.stop()
+ // The listener should still receive only one end event,
+ // i.e. no duplicate `SparkListenerConnectServiceEnd` event is posted
+ assert(listenerInstance.serviceEndEvents.size() == 1)
+ }
+
+ test("SparkConnectPlugin will post started and end events that can be received by listeners") {
+ // Future validations for when the listener receives the `SparkListenerConnectServiceStarted` event
+ val startedEventSignal = new Semaphore(0)
+ SparkConnectServiceLifeCycleListener.checksOnServiceStartedEvent = Some(Seq(_ => {
+ startedEventSignal.release()
+ }))
+
+ // Future validations for when the listener receives the `SparkListenerConnectServiceEnd` event
+ val endEventSignal = new Semaphore(0)
+ SparkConnectServiceLifeCycleListener.checksOnServiceEndEvent = Some(Seq(_ => {
+ endEventSignal.release()
+ }))
+
+ val conf = new SparkConf()
+ .setAppName(getClass().getName())
+ // Start the SparkConnectService from the SparkConnectPlugin
+ .set(PLUGINS, Seq(classOf[SparkConnectPlugin].getName()))
+ // In this case, the listener needs to be registered via the configuration;
+ // otherwise, the listener will not be able to receive the events posted during
+ // the initialization of the SparkConnectPlugin
+ .set(EXTRA_LISTENERS, Seq(classOf[SparkConnectServiceLifeCycleListener].getName()))
+ .set(SparkLauncher.SPARK_MASTER, "local[1]")
+
+ // Create the SparkContext, initialize the SparkConnectPlugin and start the SparkConnectService
+ sc = new SparkContext(conf)
+
+ val listenerInstance = SparkConnectServiceLifeCycleListener.currentInstance
+ assert(listenerInstance != null)
+ // The internal server of SparkConnectService should have
+ // already been created and started during the initialization of the SparkConnectPlugin.
+ assert(SparkConnectService.started && SparkConnectService.server != null)
+ // The event `SparkListenerConnectServiceStarted` should be posted and received by the listener
+ startedEventSignal.acquire()
+ // Only one `SparkListenerConnectServiceStarted` event should be received by the listener
+ assert(listenerInstance.serviceStartedEvents.size() == 1)
+
+ // Stop the SparkContext; the SparkConnectService will be stopped during the shutdown of the
+ // SparkConnectPlugin and the event will be posted to the listener via the active ListenerBus.
+ // This requires that the ListenerBus still accepts events while the SparkPlugins have not been shut down.
+ sc.stop()
+ assert(SparkConnectService.stopped)
+ // The listener should receive the `SparkListenerConnectServiceEnd` event
+ endEventSignal.acquire()
+ }
+
+ def withPortOccupied(startPort: Int, endPort: Int)(f: => Unit): Unit = {
+ val startedServers = new mutable.ArrayBuffer[ServerSocket]()
+ try {
+ for (toBeOccupiedPort <- startPort to endPort) {
+ val server = new ServerSocket(toBeOccupiedPort)
+ startedServers += server
+ }
+ f
+ } finally {
+ startedServers.foreach(server => {
+ try {
+ server.close()
+ } catch {
+ case _: Throwable =>
+ }
+ })
+ }
+ }
+}
+
+private class SparkConnectServiceLifeCycleListener extends SparkListener {
+
+ SparkConnectServiceLifeCycleListener.currentInstance = this
+
+ val serviceStartedEvents: CopyOnWriteArrayList[SparkListenerConnectServiceStarted] =
+ new CopyOnWriteArrayList[SparkListenerConnectServiceStarted]()
+ val serviceEndEvents: CopyOnWriteArrayList[SparkListenerConnectServiceEnd] =
+ new CopyOnWriteArrayList[SparkListenerConnectServiceEnd]()
+
+ override def onOtherEvent(event: SparkListenerEvent): Unit = {
+ event match {
+ case serviceStarted: SparkListenerConnectServiceStarted =>
+ serviceStartedEvents.add(serviceStarted)
+ SparkConnectServiceLifeCycleListener.checksOnServiceStartedEvent.foreach { checks =>
+ checks.foreach(_(serviceStarted))
+ }
+ case serviceEnd: SparkListenerConnectServiceEnd =>
+ serviceEndEvents.add(serviceEnd)
+ SparkConnectServiceLifeCycleListener.checksOnServiceEndEvent.foreach { checks =>
+ checks.foreach(_(serviceEnd))
+ }
+ }
+ }
+}
+
+private object SparkConnectServiceLifeCycleListener {
+
+ var currentInstance: SparkConnectServiceLifeCycleListener = _
+ var checksOnServiceStartedEvent: Option[Seq[(SparkListenerConnectServiceStarted) => Unit]] =
+ None
+ var checksOnServiceEndEvent: Option[Seq[(SparkListenerConnectServiceEnd) => Unit]] = None
+
+ def reset(): Unit = {
+ currentInstance = null
+ checksOnServiceStartedEvent = None
+ checksOnServiceEndEvent = None
+ }
+}
diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCacheSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCacheSuite.scala
index ed3da2c0f7156..512a0a80c4a91 100644
--- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCacheSuite.scala
+++ 
b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectStreamingQueryCacheSuite.scala @@ -67,7 +67,7 @@ class SparkConnectStreamingQueryCacheSuite extends SparkFunSuite with MockitoSug // Register the query. - sessionMgr.registerNewStreamingQuery(sessionHolder, mockQuery) + sessionMgr.registerNewStreamingQuery(sessionHolder, mockQuery, Set.empty[String], "") sessionMgr.getCachedValue(queryId, runId) match { case Some(v) => @@ -78,9 +78,14 @@ class SparkConnectStreamingQueryCacheSuite extends SparkFunSuite with MockitoSug } // Verify query is returned only with the correct session, not with a different session. - assert(sessionMgr.getCachedQuery(queryId, runId, mock[SparkSession]).isEmpty) + assert( + sessionMgr.getCachedQuery(queryId, runId, Set.empty[String], mock[SparkSession]).isEmpty) // Query is returned when correct session is used - assert(sessionMgr.getCachedQuery(queryId, runId, mockSession).contains(mockQuery)) + assert( + sessionMgr + .getCachedQuery(queryId, runId, Set.empty[String], mockSession) + .map(_.query) + .contains(mockQuery)) // Cleanup the query and verify if stop() method has been called. when(mockQuery.isActive).thenReturn(false) @@ -99,7 +104,11 @@ class SparkConnectStreamingQueryCacheSuite extends SparkFunSuite with MockitoSug clock.advance(30.seconds.toMillis) // Access the query. This should advance expiry time by 30 seconds. - assert(sessionMgr.getCachedQuery(queryId, runId, mockSession).contains(mockQuery)) + assert( + sessionMgr + .getCachedQuery(queryId, runId, Set.empty[String], mockSession) + .map(_.query) + .contains(mockQuery)) val expiresAtMs = sessionMgr.getCachedValue(queryId, runId).get.expiresAtMs.get assert(expiresAtMs == prevExpiryTimeMs + 30.seconds.toMillis) @@ -112,7 +121,7 @@ class SparkConnectStreamingQueryCacheSuite extends SparkFunSuite with MockitoSug when(restartedQuery.isActive).thenReturn(true) when(mockStreamingQueryManager.get(queryId)).thenReturn(restartedQuery) - sessionMgr.registerNewStreamingQuery(sessionHolder, restartedQuery) + sessionMgr.registerNewStreamingQuery(sessionHolder, restartedQuery, Set.empty[String], "") // Both queries should existing in the cache. 
assert(sessionMgr.getCachedValue(queryId, runId).map(_.query).contains(mockQuery)) diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListenerSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListenerSuite.scala index 3b75c37b2aa00..c9c110dd1e626 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListenerSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ui/SparkConnectServerListenerSuite.scala @@ -37,7 +37,7 @@ class SparkConnectServerListenerSuite private var kvstore: ElementTrackingStore = _ - private val jobTag = ExecuteJobTag("sessionId", "userId", "operationId") + private val jobTag = ExecuteJobTag("userId", "sessionId", "operationId") after { if (kvstore != null) { @@ -174,7 +174,7 @@ class SparkConnectServerListenerSuite SparkListenerJobStart(0, System.currentTimeMillis(), Nil, createProperties)) listener.onOtherEvent( SparkListenerConnectSessionClosed("sessionId", "userId", System.currentTimeMillis())) - val exec = statusStore.getExecution(ExecuteJobTag("sessionId", "userId", "operationId")) + val exec = statusStore.getExecution(ExecuteJobTag("userId", "sessionId", "operationId")) assert(exec.isDefined) assert(exec.get.jobId === Seq("0")) assert(exec.get.sqlExecId === Set("0")) @@ -190,7 +190,7 @@ class SparkConnectServerListenerSuite listener.onOtherEvent(SparkListenerConnectSessionClosed(unknownSession, "userId", 0)) listener.onOtherEvent( SparkListenerConnectOperationStarted( - ExecuteJobTag("sessionId", "userId", "operationId"), + ExecuteJobTag("userId", "sessionId", "operationId"), "operationId", System.currentTimeMillis(), unknownSession, diff --git a/connector/docker-integration-tests/README.md b/connector/docker-integration-tests/README.md index 0192947bdbf90..03d3fe706a606 100644 --- a/connector/docker-integration-tests/README.md +++ b/connector/docker-integration-tests/README.md @@ -45,7 +45,7 @@ the container bootstrapping. To run an individual Docker integration test, use t Besides the default Docker images, the integration tests can be run with custom Docker images. 
For example, - ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-free:23.3-slim-faststart ./build/sbt -Pdocker-integration-tests "docker-integration-tests/testOnly *OracleIntegrationSuite" + ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-free:23.4-slim-faststart ./build/sbt -Pdocker-integration-tests "docker-integration-tests/testOnly *OracleIntegrationSuite" The following environment variables can be used to specify the custom Docker images for different databases: diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index bb7647c72491a..9003c2190be22 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -39,7 +39,7 @@ com.google.guava guava - 33.0.0-jre + 33.1.0-jre test diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala index cedb33d491fbc..72b2ac8074f4a 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala @@ -21,11 +21,11 @@ import java.math.BigDecimal import java.sql.{Connection, Date, Timestamp} import java.util.Properties -import org.scalatest.time.SpanSugar._ - import org.apache.spark.sql.{Row, SaveMode} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ -import org.apache.spark.sql.types.{BooleanType, ByteType, ShortType, StructType} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{ByteType, ShortType, StructType} import org.apache.spark.tags.DockerTest /** @@ -40,8 +40,6 @@ import org.apache.spark.tags.DockerTest class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { override val db = new DB2DatabaseOnDocker - override val connectionTimeout = timeout(3.minutes) - override def dataPreparation(conn: Connection): Unit = { conn.prepareStatement("CREATE TABLE tbl (x INTEGER, y VARCHAR(8))").executeUpdate() conn.prepareStatement("INSERT INTO tbl VALUES (42,'fred')").executeUpdate() @@ -64,6 +62,20 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { .executeUpdate() conn.prepareStatement("INSERT INTO strings VALUES ('the', 'quick', 'brown', BLOB('fox')," + "'Kathy')").executeUpdate() + + conn.prepareStatement("CREATE TABLE booleans (a BOOLEAN)").executeUpdate() + conn.prepareStatement("INSERT INTO booleans VALUES (true)").executeUpdate() + // VARGRAPHIC + conn.prepareStatement("CREATE TABLE graphics (a GRAPHIC(16), b VARGRAPHIC(16))") + .executeUpdate() + conn.prepareStatement("INSERT INTO graphics VALUES ('a', 'b')").executeUpdate() + // CHAR(n) FOR BIT DATA + conn.prepareStatement("CREATE TABLE binarys (" + + "a CHAR(10) FOR BIT DATA, b VARCHAR(10) FOR BIT DATA, c BINARY(10), d VARBINARY(10))") + .executeUpdate() + conn.prepareStatement("INSERT INTO binarys VALUES (" + + "'ABC', 'ABC', BINARY('ABC', 10), VARBINARY('ABC', 10))") + .executeUpdate() } test("Basic test") { @@ -77,32 +89,44 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { } test("Numeric types") { - val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties) - val rows = df.collect() - assert(rows.length == 1) - val types = rows(0).toSeq.map(x => x.getClass.toString) - assert(types.length == 10) - assert(types(0).equals("class java.lang.Integer")) - 
assert(types(1).equals("class java.lang.Integer")) - assert(types(2).equals("class java.lang.Long")) - assert(types(3).equals("class java.math.BigDecimal")) - assert(types(4).equals("class java.lang.Double")) - assert(types(5).equals("class java.lang.Double")) - assert(types(6).equals("class java.lang.Float")) - assert(types(7).equals("class java.math.BigDecimal")) - assert(types(8).equals("class java.math.BigDecimal")) - assert(types(9).equals("class java.math.BigDecimal")) - assert(rows(0).getInt(0) == 17) - assert(rows(0).getInt(1) == 77777) - assert(rows(0).getLong(2) == 922337203685477580L) - val bd = new BigDecimal("123456745.56789012345000000000") - assert(rows(0).getAs[BigDecimal](3).equals(bd)) - assert(rows(0).getDouble(4) == 42.75) - assert(rows(0).getDouble(5) == 5.4E-70) - assert(rows(0).getFloat(6) == 3.4028234663852886e+38) - assert(rows(0).getDecimal(7) == new BigDecimal("4.299900000000000000")) - assert(rows(0).getDecimal(8) == new BigDecimal("99999999999999990000.000000000000000000")) - assert(rows(0).getDecimal(9) == new BigDecimal("1234567891234567.123456789123456789")) + Seq(true, false).foreach { legacy => + withSQLConf(SQLConf.LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED.key -> legacy.toString) { + val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 10) + if (legacy) { + assert(types(0).equals("class java.lang.Integer")) + } else { + assert(types(0).equals("class java.lang.Short")) + } + assert(types(1).equals("class java.lang.Integer")) + assert(types(2).equals("class java.lang.Long")) + assert(types(3).equals("class java.math.BigDecimal")) + assert(types(4).equals("class java.lang.Double")) + assert(types(5).equals("class java.lang.Double")) + assert(types(6).equals("class java.lang.Float")) + assert(types(7).equals("class java.math.BigDecimal")) + assert(types(8).equals("class java.math.BigDecimal")) + assert(types(9).equals("class java.math.BigDecimal")) + if (legacy) { + assert(rows(0).getInt(0) == 17) + } else { + assert(rows(0).getShort(0) == 17) + } + assert(rows(0).getInt(1) == 77777) + assert(rows(0).getLong(2) == 922337203685477580L) + val bd = new BigDecimal("123456745.56789012345000000000") + assert(rows(0).getAs[BigDecimal](3).equals(bd)) + assert(rows(0).getDouble(4) == 42.75) + assert(rows(0).getDouble(5) == 5.4E-70) + assert(rows(0).getFloat(6) == 3.4028234663852886e+38) + assert(rows(0).getDecimal(7) == new BigDecimal("4.299900000000000000")) + assert(rows(0).getDecimal(8) == new BigDecimal("99999999999999990000.000000000000000000")) + assert(rows(0).getDecimal(9) == new BigDecimal("1234567891234567.123456789123456789")) + } + } } test("Date types") { @@ -150,13 +174,12 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { df3.write.jdbc(jdbcUrl, "stringscopy", new Properties) // spark types that does not have exact matching db2 table types. 
val df4 = sqlContext.createDataFrame( - sparkContext.parallelize(Seq(Row("1".toShort, "20".toByte, true))), - new StructType().add("c1", ShortType).add("b", ByteType).add("c3", BooleanType)) + sparkContext.parallelize(Seq(Row("1".toShort, "20".toByte))), + new StructType().add("c1", ShortType).add("b", ByteType)) df4.write.jdbc(jdbcUrl, "otherscopy", new Properties) val rows = sqlContext.read.jdbc(jdbcUrl, "otherscopy", new Properties).collect() - assert(rows(0).getInt(0) == 1) - assert(rows(0).getInt(1) == 20) - assert(rows(0).getString(2) == "1") + assert(rows(0).getShort(0) == 1) + assert(rows(0).getShort(1) == 20) } test("query JDBC option") { @@ -224,4 +247,37 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { assert(actual === expected) } + + test("SPARK-48269: boolean type") { + val df = sqlContext.read.jdbc(jdbcUrl, "booleans", new Properties) + checkAnswer(df, Row(true)) + Seq(true, false).foreach { legacy => + withSQLConf(SQLConf.LEGACY_DB2_BOOLEAN_MAPPING_ENABLED.key -> legacy.toString) { + val tbl = "booleanscopy" + legacy + df.write.jdbc(jdbcUrl, tbl, new Properties) + if (legacy) { + checkAnswer(sqlContext.read.jdbc(jdbcUrl, tbl, new Properties), Row("1")) + } else { + checkAnswer(sqlContext.read.jdbc(jdbcUrl, tbl, new Properties), Row(true)) + } + } + } + } + + test("SPARK-48269: GRAPHIC types") { + val df = sqlContext.read.jdbc(jdbcUrl, "graphics", new Properties) + checkAnswer(df, Row("a".padTo(16, ' '), "b")) + // the padding happens in the source not because of reading as char type + assert(!df.schema.exists { + _.metadata.contains(CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY) }) + } + + test("SPARK-48269: binary types") { + val df = sqlContext.read.jdbc(jdbcUrl, "binarys", new Properties) + checkAnswer(df, Row( + "ABC".padTo(10, ' ').getBytes, + "ABC".getBytes, + "ABC".getBytes ++ Array.fill(7)(0), + "ABC".getBytes)) + } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala index abb683c064955..4899de2b2a14c 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala @@ -24,7 +24,6 @@ import javax.security.auth.login.Configuration import com.github.dockerjava.api.model.{AccessMode, Bind, ContainerConfig, HostConfig, Volume} import org.apache.hadoop.security.{SecurityUtil, UserGroupInformation} import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod.KERBEROS -import org.scalatest.time.SpanSugar._ import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.execution.datasources.jdbc.connection.{DB2ConnectionProvider, SecureConnectionProvider} @@ -68,8 +67,6 @@ class DB2KrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { } } - override val connectionTimeout = timeout(3.minutes) - override protected def setAuthentication(keytabFile: String, principal: String): Unit = { val config = new SecureConnectionProvider.JDBCConfiguration( Configuration.getConfiguration, "JaasClient", keytabFile, principal, true) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala index 
fc095c5f5b310..8d17e0b4e36e6 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala @@ -36,7 +36,7 @@ import com.github.dockerjava.zerodep.ZerodepDockerHttpClient import org.scalatest.concurrent.{Eventually, PatienceConfiguration} import org.scalatest.time.SpanSugar._ -import org.apache.spark.internal.LogKey.{CLASS_NAME, CONTAINER, STATUS} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, CONTAINER, STATUS} import org.apache.spark.internal.MDC import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSparkSession @@ -115,7 +115,7 @@ abstract class DockerJDBCIntegrationSuite protected val startContainerTimeout: Long = timeStringAsSeconds(sys.props.getOrElse("spark.test.docker.startContainerTimeout", "5min")) protected val connectionTimeout: PatienceConfiguration.Timeout = { - val timeoutStr = sys.props.getOrElse("spark.test.docker.conn", "5min") + val timeoutStr = sys.props.getOrElse("spark.test.docker.connectionTimeout", "5min") timeout(timeStringAsSeconds(timeoutStr).seconds) } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala index 6825c001f7670..efb2fa09f6a3f 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala @@ -25,9 +25,9 @@ import org.apache.spark.sql.execution.datasources.jdbc.connection.SecureConnecti import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., mariadb:10.5.12): + * To run this test suite for a specific version (e.g., mariadb:10.5.25): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 MARIADB_DOCKER_IMAGE_NAME=mariadb:10.5.12 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 MARIADB_DOCKER_IMAGE_NAME=mariadb:10.5.25 * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly org.apache.spark.sql.jdbc.MariaDBKrbIntegrationSuite" * }}} @@ -38,7 +38,7 @@ class MariaDBKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override protected val keytabFileName = "mariadb.keytab" override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MARIADB_DOCKER_IMAGE_NAME", "mariadb:10.5.12") + override val imageName = sys.env.getOrElse("MARIADB_DOCKER_IMAGE_NAME", "mariadb:10.5.25") override val env = Map( "MYSQL_ROOT_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSQLServerDatabaseOnDocker.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSQLServerDatabaseOnDocker.scala new file mode 100644 index 0000000000000..61530f713eb86 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSQLServerDatabaseOnDocker.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.jdbc + +class MsSQLServerDatabaseOnDocker extends DatabaseOnDocker { + override val imageName = sys.env.getOrElse("MSSQLSERVER_DOCKER_IMAGE_NAME", + "mcr.microsoft.com/mssql/server:2022-CU12-GDR1-ubuntu-22.04") + override val env = Map( + "SA_PASSWORD" -> "Sapass123", + "ACCEPT_EULA" -> "Y" + ) + override val usesIpc = false + override val jdbcPort: Int = 1433 + + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:sqlserver://$ip:$port;user=sa;password=Sapass123;" + + "encrypt=true;trustServerCertificate=true" +} diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala index 8bceb9506e850..623f404339e9e 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala @@ -19,12 +19,15 @@ package org.apache.spark.sql.jdbc import java.math.BigDecimal import java.sql.{Connection, Date, Timestamp} +import java.time.LocalDateTime import java.util.Properties +import org.apache.spark.SparkSQLException import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{BinaryType, DecimalType} import org.apache.spark.tags.DockerTest /** @@ -38,19 +41,7 @@ import org.apache.spark.tags.DockerTest */ @DockerTest class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite { - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MSSQLSERVER_DOCKER_IMAGE_NAME", - "mcr.microsoft.com/mssql/server:2019-CU13-ubuntu-20.04") - override val env = Map( - "SA_PASSWORD" -> "Sapass123", - "ACCEPT_EULA" -> "Y" - ) - override val usesIpc = false - override val jdbcPort: Int = 1433 - - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:sqlserver://$ip:$port;user=sa;password=Sapass123;" - } + override val db = new MsSQLServerDatabaseOnDocker override def dataPreparation(conn: Connection): Unit = { conn.prepareStatement("CREATE TABLE tbl (x INT, y VARCHAR (50))").executeUpdate() @@ -150,6 +141,11 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite { """ |INSERT INTO bits VALUES (1, 2, 1) """.stripMargin).executeUpdate() + conn.prepareStatement( + """CREATE TABLE test_rowversion (myKey int PRIMARY KEY,myValue int, RV rowversion)""") + .executeUpdate() + conn.prepareStatement("""INSERT INTO test_rowversion (myKey, myValue) VALUES (1, 0)""") + .executeUpdate() } test("Basic test") { @@ -227,24 +223,43 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite { test("Date types") { withDefaultTimeZone(UTC) { - val 
df = spark.read.jdbc(jdbcUrl, "dates", new Properties) - val rows = df.collect() - assert(rows.length == 1) - val row = rows(0) - val types = row.toSeq.map(x => x.getClass.toString) - assert(types.length == 6) - assert(types(0).equals("class java.sql.Date")) - assert(types(1).equals("class java.sql.Timestamp")) - assert(types(2).equals("class java.sql.Timestamp")) - assert(types(3).equals("class java.lang.String")) - assert(types(4).equals("class java.sql.Timestamp")) - assert(types(5).equals("class java.sql.Timestamp")) - assert(row.getAs[Date](0).equals(Date.valueOf("1991-11-09"))) - assert(row.getAs[Timestamp](1).equals(Timestamp.valueOf("1999-01-01 13:23:35.0"))) - assert(row.getAs[Timestamp](2).equals(Timestamp.valueOf("9999-12-31 23:59:59.0"))) - assert(row.getString(3).equals("1901-05-09 23:59:59.0000000 +14:00")) - assert(row.getAs[Timestamp](4).equals(Timestamp.valueOf("1996-01-01 23:24:00.0"))) - assert(row.getAs[Timestamp](5).equals(Timestamp.valueOf("1970-01-01 13:31:24.0"))) + Seq(true, false).foreach { ntz => + Seq(true, false).foreach { legacy => + withSQLConf( + SQLConf.LEGACY_MSSQLSERVER_DATETIMEOFFSET_MAPPING_ENABLED.key -> legacy.toString) { + val df = spark.read + .option("preferTimestampNTZ", ntz) + .jdbc(jdbcUrl, "dates", new Properties) + checkAnswer(df, Row( + Date.valueOf("1991-11-09"), + if (ntz) { + LocalDateTime.of(1999, 1, 1, 13, 23, 35) + } else { + Timestamp.valueOf("1999-01-01 13:23:35") + }, + if (ntz) { + LocalDateTime.of(9999, 12, 31, 23, 59, 59) + } else { + Timestamp.valueOf("9999-12-31 23:59:59") + }, + if (legacy) { + "1901-05-09 23:59:59.0000000 +14:00" + } else { + Timestamp.valueOf("1901-05-09 09:59:59") + }, + if (ntz) { + LocalDateTime.of(1996, 1, 1, 23, 24, 0) + } else { + Timestamp.valueOf("1996-01-01 23:24:00") + }, + if (ntz) { + LocalDateTime.of(1970, 1, 1, 13, 31, 24) + } else { + Timestamp.valueOf("1970-01-01 13:31:24") + })) + } + } + } } } @@ -287,93 +302,96 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite { } test("SPARK-33813: MsSqlServerDialect should support spatial types") { - val df = spark.read.jdbc(jdbcUrl, "spatials", new Properties) - val rows = df.collect() - assert(rows.length == 1) - val row = rows(0) - val types = row.toSeq.map(x => x.getClass.toString) - assert(types.length == 10) - assert(types(0) == "class [B") - assert(row.getAs[Array[Byte]](0) === - Array(0, 0, 0, 0, 1, 15, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, - 16, 64, 0, 0, 0, 0, 0, 0, 28, 64, 0, 0, 0, 0, 0, 0, 4, 64)) - assert(types(1) == "class [B") - assert(row.getAs[Array[Byte]](1) === - Array[Byte](0, 0, 0, 0, 1, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -16, 63, 0, 0, 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 2)) - assert(types(2) == "class [B") - assert(row.getAs[Array[Byte]](2) === - Array[Byte](0, 0, 0, 0, 2, 4, 5, 0, 0, 0, -12, -3, -44, 120, -23, -106, - 94, -64, -35, 36, 6, -127, -107, -45, 71, 64, -125, -64, -54, -95, 69, - -106, 94, -64, 80, -115, -105, 110, 18, -45, 71, 64, -125, -64, -54, - -95, 69, -106, 94, -64, 78, 98, 16, 88, 57, -44, 71, 64, -12, -3, -44, - 120, -23, -106, 94, -64, 78, 98, 16, 88, 57, -44, 71, 64, -12, -3, -44, - 120, -23, -106, 94, -64, -35, 36, 6, -127, -107, -45, 71, 64, 1, 0, 0, - 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 8)) - assert(types(3) == "class [B") - assert(row.getAs[Array[Byte]](3) === - Array[Byte](-26, 16, 0, 0, 2, 4, 5, 0, 0, 0, 
-35, 36, 6, -127, -107, -45, - 71, 64, -12, -3, -44, 120, -23, -106, 94, -64, 80, -115, -105, 110, 18, - -45, 71, 64, -125, -64, -54, -95, 69, -106, 94, -64, 78, 98, 16, 88, 57, - -44, 71, 64, -125, -64, -54, -95, 69, -106, 94, -64, 78, 98, 16, 88, 57, - -44, 71, 64, -12, -3, -44, 120, -23, -106, 94, -64, -35, 36, 6, -127, -107, - -45, 71, 64, -12, -3, -44, 120, -23, -106, 94, -64, 1, 0, 0, 0, 3, 0, 0, - 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 9, 2, 0, 0, 0, 3, 1)) - assert(types(5) == "class [B") - assert(row.getAs[Array[Byte]](4) === - Array[Byte](0, 0, 0, 0, 1, 4, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, - 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, 0, 52, 64, - 0, 0, 0, 0, 0, 0, 52, 64, 0, 0, 0, 0, 0, 0, 52, 64, 0, 0, 0, 0, 0, 0, 52, - 64, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, - 0, 52, -64, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 36, -64, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, - 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 3)) - assert(types(6) === "class [B") - assert(row.getAs[Array[Byte]](5) === - Array[Byte](-26, 16, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, -128, 71, 64, 51, - 51, 51, 51, 51, -109, 94, -64, 0, 0, 0, 0, 0, -128, 71, 64, 51, 51, 51, 51, - 51, -109, 94, 64, 0, 0, 0, 0, 0, -128, 72, 64, -51, -52, -52, -52, -52, 108, - 95, 64, 0, 0, 0, 0, 0, 0, 67, 64, 0, 0, 0, 0, 0, 64, 94, 64, 0, 0, 0, 0, 0, - -128, 71, 64, 51, 51, 51, 51, 51, -109, 94, -64, 1, 0, 0, 0, 1, 0, 0, 0, 0, - 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 10)) - assert(types(6) === "class [B") - assert(row.getAs[Array[Byte]](6) === - Array[Byte](0, 0, 0, 0, 1, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, - 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, 28, 64, 0, 0, 0, 0, 0, 0, 32, 64, 0, 0, 0, 0, - 0, 0, -8, -1, 0, 0, 0, 0, 0, 0, 35, 64, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, - 0, 0, 3, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 1, 0, 0, 0, 1)) - assert(types(6) === "class [B") - assert(row.getAs[Array[Byte]](7) === - Array[Byte](0, 0, 0, 0, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, - 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, - 0, 0, 0, 0, -16, 63, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 3, 0, 0, 0, - -1, -1, -1, -1, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)) - assert(types(6) === "class [B") - assert(row.getAs[Array[Byte]](8) === - Array[Byte](0, 0, 0, 0, 1, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, - 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, -64, 0, 0, 0, - 0, 0, 0, 0, -64, 0, 0, 0, 0, 0, 0, 0, -64, 0, 0, 0, 0, 0, 0, 0, -64, 0, 0, - 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, - 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, - 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, 8, 64, 0, - 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, -16, 63, - 0, 0, 0, 0, 0, 0, -16, 63, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 5, 0, 0, 0, 3, 0, - 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 3)) - assert(types(6) === "class [B") - assert(row.getAs[Array[Byte]](9) === - Array[Byte](0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, - 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, 
0, 0, 0, 0, 0, 20, 64, 0, 0, - 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, -16, -65, - 0, 0, 0, 0, 0, 0, 20, -64, 0, 0, 0, 0, 0, 0, 20, -64, 0, 0, 0, 0, 0, 0, 20, - -64, 0, 0, 0, 0, 0, 0, 20, -64, 0, 0, 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, - -16, -65, 0, 0, 0, 0, 0, 0, -16, -65, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, - 0, 3, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, - 0, 0, 0, 1, 0, 0, 0, 3)) + Seq("true", "false").foreach { legacy => + val df = spark.read.jdbc(jdbcUrl, "spatials", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val row = rows(0) + val types = row.toSeq.map(x => x.getClass.toString) + assert(types.length == 10) + assert(types(0) == "class [B") + assert(row.getAs[Array[Byte]](0) === + Array(0, 0, 0, 0, 1, 15, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, + 16, 64, 0, 0, 0, 0, 0, 0, 28, 64, 0, 0, 0, 0, 0, 0, 4, 64)) + assert(types(1) == "class [B") + assert(row.getAs[Array[Byte]](1) === + Array[Byte](0, 0, 0, 0, 1, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -16, 63, 0, 0, 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 2)) + assert(types(2) == "class [B") + assert(row.getAs[Array[Byte]](2) === + Array[Byte](0, 0, 0, 0, 2, 4, 5, 0, 0, 0, -12, -3, -44, 120, -23, -106, + 94, -64, -35, 36, 6, -127, -107, -45, 71, 64, -125, -64, -54, -95, 69, + -106, 94, -64, 80, -115, -105, 110, 18, -45, 71, 64, -125, -64, -54, + -95, 69, -106, 94, -64, 78, 98, 16, 88, 57, -44, 71, 64, -12, -3, -44, + 120, -23, -106, 94, -64, 78, 98, 16, 88, 57, -44, 71, 64, -12, -3, -44, + 120, -23, -106, 94, -64, -35, 36, 6, -127, -107, -45, 71, 64, 1, 0, 0, + 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 8)) + assert(types(3) == "class [B") + assert(row.getAs[Array[Byte]](3) === + Array[Byte](-26, 16, 0, 0, 2, 4, 5, 0, 0, 0, -35, 36, 6, -127, -107, -45, + 71, 64, -12, -3, -44, 120, -23, -106, 94, -64, 80, -115, -105, 110, 18, + -45, 71, 64, -125, -64, -54, -95, 69, -106, 94, -64, 78, 98, 16, 88, 57, + -44, 71, 64, -125, -64, -54, -95, 69, -106, 94, -64, 78, 98, 16, 88, 57, + -44, 71, 64, -12, -3, -44, 120, -23, -106, 94, -64, -35, 36, 6, -127, -107, + -45, 71, 64, -12, -3, -44, 120, -23, -106, 94, -64, 1, 0, 0, 0, 3, 0, 0, + 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 9, 2, 0, 0, 0, 3, 1)) + assert(types(5) == "class [B") + assert(row.getAs[Array[Byte]](4) === + Array[Byte](0, 0, 0, 0, 1, 4, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, + 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, 0, 52, 64, + 0, 0, 0, 0, 0, 0, 52, 64, 0, 0, 0, 0, 0, 0, 52, 64, 0, 0, 0, 0, 0, 0, 52, + 64, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, 0, 52, -64, 0, 0, 0, 0, 0, + 0, 52, -64, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 36, -64, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, + 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 3)) + assert(types(6) === "class [B") + assert(row.getAs[Array[Byte]](5) === + Array[Byte](-26, 16, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, -128, 71, 64, 51, + 51, 51, 51, 51, -109, 94, -64, 0, 0, 0, 0, 0, -128, 71, 64, 51, 51, 51, 51, + 51, -109, 94, 64, 0, 0, 0, 0, 0, -128, 72, 64, -51, -52, -52, -52, -52, 108, + 95, 64, 0, 0, 0, 0, 0, 0, 67, 64, 0, 0, 0, 0, 0, 64, 94, 64, 0, 0, 0, 0, 0, + -128, 71, 64, 51, 51, 51, 51, 51, -109, 94, 
-64, 1, 0, 0, 0, 1, 0, 0, 0, 0, + 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 10)) + assert(types(6) === "class [B") + assert(row.getAs[Array[Byte]](6) === + Array[Byte](0, 0, 0, 0, 1, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, + 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, 28, 64, 0, 0, 0, 0, 0, 0, 32, 64, 0, 0, 0, 0, + 0, 0, -8, -1, 0, 0, 0, 0, 0, 0, 35, 64, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, + 0, 0, 3, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 1, 0, 0, 0, 1)) + assert(types(6) === "class [B") + assert(row.getAs[Array[Byte]](7) === + Array[Byte](0, 0, 0, 0, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, + 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, + 0, 0, 0, 0, -16, 63, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 3, 0, 0, 0, + -1, -1, -1, -1, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)) + assert(types(6) === "class [B") + assert(row.getAs[Array[Byte]](8) === + Array[Byte](0, 0, 0, 0, 1, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, + 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, -64, 0, 0, 0, + 0, 0, 0, 0, -64, 0, 0, 0, 0, 0, 0, 0, -64, 0, 0, 0, 0, 0, 0, 0, -64, 0, 0, + 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, + 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, + 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, 8, 64, 0, + 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, -16, 63, + 0, 0, 0, 0, 0, 0, -16, 63, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 5, 0, 0, 0, 3, 0, + 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, + 1, 0, 0, 0, 3)) + assert(types(6) === "class [B") + assert(row.getAs[Array[Byte]](9) === + Array[Byte](0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, 63, 0, 0, 0, + 0, 0, 0, -16, 63, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, 20, 64, 0, 0, + 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, -16, -65, + 0, 0, 0, 0, 0, 0, 20, -64, 0, 0, 0, 0, 0, 0, 20, -64, 0, 0, 0, 0, 0, 0, 20, + -64, 0, 0, 0, 0, 0, 0, 20, -64, 0, 0, 0, 0, 0, 0, -16, -65, 0, 0, 0, 0, 0, 0, + -16, -65, 0, 0, 0, 0, 0, 0, -16, -65, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, + 0, 3, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 1, 0, 0, 0, 3)) + } } test("SPARK-38889: MsSqlServerDialect should handle boolean filter push down") { @@ -437,4 +455,42 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite { .load() assert(df.collect().toSet === expectedResult) } + + test("SPARK-47938: Fix 'Cannot find data type BYTE' in SQL Server") { + spark.sql("select cast(1 as byte) as c0") + .write + .jdbc(jdbcUrl, "test_byte", new Properties) + val df = spark.read.jdbc(jdbcUrl, "test_byte", new Properties) + checkAnswer(df, Row(1.toShort)) + } + + test("SPARK-47945: money types") { + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("prepareQuery", "DECLARE @mymoney_sm SMALLMONEY = 3148.29, @mymoney MONEY = 3148.29 ") + .option("query", "SELECT @mymoney_sm as smallmoney, @mymoney as money") + .load() + checkAnswer(df, Row(BigDecimal.valueOf(3148.29), BigDecimal.valueOf(3148.29))) + assert(df.schema.fields(0).dataType === DecimalType(10, 4)) + assert(df.schema.fields(1).dataType === DecimalType(19, 4)) + } + + test("SPARK-47945: rowversion") { + val df = spark.read.jdbc(jdbcUrl, "test_rowversion", new Properties) + 
assert(df.schema.fields(2).dataType === BinaryType) + } + + test("SPARK-47945: sql_variant") { + checkError( + exception = intercept[SparkSQLException] { + spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("prepareQuery", + "DECLARE @myvariant1 SQL_VARIANT = 1, @myvariant2 SQL_VARIANT = 'test'") + .option("query", "SELECT @myvariant1 as variant1, @myvariant2 as variant2") + .load() + }, + errorClass = "UNRECOGNIZED_SQL_TYPE", + parameters = Map("typeName" -> "sql_variant", "jdbcType" -> "-156")) + } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLDatabaseOnDocker.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLDatabaseOnDocker.scala index 568eb5f109731..570a81ac3947f 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLDatabaseOnDocker.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLDatabaseOnDocker.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.jdbc class MySQLDatabaseOnDocker extends DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:8.3.0") + override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:8.4.0") override val env = Map( "MYSQL_ROOT_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala index 684cec37c1703..e6cca2ac9cd0a 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala @@ -74,9 +74,9 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite { .executeUpdate() conn.prepareStatement("CREATE TABLE dates (d DATE, t TIME, dt DATETIME, ts TIMESTAMP, " - + "yr YEAR)").executeUpdate() - conn.prepareStatement("INSERT INTO dates VALUES ('1991-11-09', '13:31:24', " - + "'1996-01-01 01:23:45', '2009-02-13 23:31:30', '2001')").executeUpdate() + + "yr YEAR, t1 TIME(3))").executeUpdate() + conn.prepareStatement("INSERT INTO dates VALUES ('1991-11-09', '13:31:24.123', " + + "'1996-01-01 01:23:45', '2009-02-13 23:31:30', '2001', '13:31:24.123')").executeUpdate() // TODO: Test locale conversion for strings. 
conn.prepareStatement("CREATE TABLE strings (a CHAR(10), b VARCHAR(10), c TINYTEXT, " @@ -185,21 +185,13 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite { test("Date types") { withDefaultTimeZone(UTC) { val df = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties) - val rows = df.collect() - assert(rows.length == 1) - val types = rows(0).toSeq.map(x => x.getClass.toString) - assert(types.length == 5) - assert(types(0).equals("class java.sql.Date")) - assert(types(1).equals("class java.sql.Timestamp")) - assert(types(2).equals("class java.sql.Timestamp")) - assert(types(3).equals("class java.sql.Timestamp")) - assert(types(4).equals("class java.sql.Date")) - assert(rows(0).getAs[Date](0).equals(Date.valueOf("1991-11-09"))) - assert( - rows(0).getAs[Timestamp](1) === Timestamp.valueOf("1970-01-01 13:31:24")) - assert(rows(0).getAs[Timestamp](2).equals(Timestamp.valueOf("1996-01-01 01:23:45"))) - assert(rows(0).getAs[Timestamp](3).equals(Timestamp.valueOf("2009-02-13 23:31:30"))) - assert(rows(0).getAs[Date](4).equals(Date.valueOf("2001-01-01"))) + checkAnswer(df, Row( + Date.valueOf("1991-11-09"), + Timestamp.valueOf("1970-01-01 13:31:24"), + Timestamp.valueOf("1996-01-01 01:23:45"), + Timestamp.valueOf("2009-02-13 23:31:30"), + Date.valueOf("2001-01-01"), + Timestamp.valueOf("1970-01-01 13:31:24.123"))) } val df = spark.read.format("jdbc") .option("url", jdbcUrl) @@ -218,7 +210,8 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite { LocalDateTime.of(1970, 1, 1, 13, 31, 24), LocalDateTime.of(1996, 1, 1, 1, 23, 45), Timestamp.valueOf("2009-02-13 23:31:30"), - Date.valueOf("2001-01-01"))) + Date.valueOf("2001-01-01"), + LocalDateTime.of(1970, 1, 1, 13, 31, 24, 123000000))) } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleDatabaseOnDocker.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleDatabaseOnDocker.scala index bfbcf5b533d73..dd6bbf0af8a33 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleDatabaseOnDocker.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleDatabaseOnDocker.scala @@ -17,16 +17,11 @@ package org.apache.spark.sql.jdbc -import java.io.{File, PrintWriter} - -import com.github.dockerjava.api.model._ - import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils class OracleDatabaseOnDocker extends DatabaseOnDocker with Logging { lazy override val imageName = - sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-free:23.3-slim") + sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-free:23.4-slim") val oracle_password = "Th1s1sThe0racle#Pass" override val env = Map( "ORACLE_PWD" -> oracle_password, // oracle images uses this @@ -38,30 +33,4 @@ class OracleDatabaseOnDocker extends DatabaseOnDocker with Logging { override def getJdbcUrl(ip: String, port: Int): String = { s"jdbc:oracle:thin:system/$oracle_password@//$ip:$port/freepdb1" } - - override def beforeContainerStart( - hostConfigBuilder: HostConfig, - containerConfigBuilder: ContainerConfig): Unit = { - try { - val dir = Utils.createTempDir() - val writer = new PrintWriter(new File(dir, "install.sql")) - // SPARK-46592: gvenzl/oracle-free occasionally fails to start with the following error: - // 'ORA-04021: timeout occurred while waiting to lock object', when initializing the - // SYSTEM user. 
This is due to the fact that the default DDL_LOCK_TIMEOUT is 0, which - // means that the lock will no wait. We set the timeout to 30 seconds to try again. - // TODO: This workaround should be removed once the issue is fixed in the image. - // https://github.com/gvenzl/oci-oracle-free/issues/35 - writer.write("ALTER SESSION SET DDL_LOCK_TIMEOUT = 30;\n") - writer.write(s"""ALTER USER SYSTEM IDENTIFIED BY "$oracle_password";""") - writer.close() - val newBind = new Bind( - dir.getAbsolutePath, - new Volume("/docker-entrypoint-initdb.d"), - AccessMode.DEFAULT) - hostConfigBuilder.withBinds(hostConfigBuilder.getBinds :+ newBind: _*) - } catch { - case e: Exception => - logWarning("Failed to create install.sql file", e) - } - } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index 496498e5455b4..2b2596289548c 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -22,8 +22,6 @@ import java.sql.{Connection, Date, Timestamp} import java.time.{Duration, Period} import java.util.{Properties, TimeZone} -import org.scalatest.time.SpanSugar._ - import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ @@ -56,11 +54,11 @@ import org.apache.spark.tags.DockerTest * A sequence of commands to build the Oracle Database Free container image: * $ git clone https://github.com/oracle/docker-images.git * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles - * $ ./buildContainerImage.sh -v 23.2.0 -f - * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:23.2.0-free + * $ ./buildContainerImage.sh -v 23.4.0 -f + * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:23.4.0-free * - * This procedure has been validated with Oracle Database Free version 23.2.0, - * and with Oracle Express Edition versions 18.4.0 and 21.3.0 + * This procedure has been validated with Oracle Database Free version 23.4.0, + * and with Oracle Express Edition versions 18.4.0 and 21.4.0 */ @DockerTest class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSparkSession { @@ -68,8 +66,6 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSpark override val db = new OracleDatabaseOnDocker - override val connectionTimeout = timeout(7.minutes) - private val rsOfTsWithTimezone = Seq( Row(BigDecimal.valueOf(1), new Timestamp(944046000000L)), Row(BigDecimal.valueOf(2), new Timestamp(944078400000L)) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala index 1cd8a77e8442e..12a71dbd7c7f8 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala @@ -18,20 +18,23 @@ package org.apache.spark.sql.jdbc import java.math.{BigDecimal => JBigDecimal} -import java.sql.{Connection, Date, Timestamp} +import java.sql.{Connection, Date, SQLException, Timestamp} import java.text.SimpleDateFormat 
import java.time.LocalDateTime import java.util.Properties -import org.apache.spark.sql.{Column, Row} +import org.apache.spark.SparkException +import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:16.2): + * To run this test suite for a specific version (e.g., postgres:16.3-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.2 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.3-alpine * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly org.apache.spark.sql.jdbc.PostgresIntegrationSuite" * }}} @@ -39,7 +42,7 @@ import org.apache.spark.tags.DockerTest @DockerTest class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.2-alpine") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.3-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) @@ -71,7 +74,7 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { + "'((100.3, 40.2), (20.198, 83.1), (500.821, 311.38))', '<500, 200, 100>', '16/B374D848', " + "'ab', 'efg', '2021-02-02', '1 minute', '00:11:22:33:44:55', " + "'00:11:22:33:44:55:66:77', 12.3456, '10:20:10,14,15', 1E+37, " - + "'17:22:31', '2016-08-12 10:22:31.949271', 'cat:AB & dog:CD', " + + "'17:22:31.123', '2016-08-12 10:22:31.949271', 'cat:AB & dog:CD', " + "'dog and cat and fox', '10:20:10,14,15', 'id10')" ).executeUpdate() conn.prepareStatement("INSERT INTO bar VALUES (null, null, null, null, null, " @@ -280,7 +283,7 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { assert(rows(0).getDecimal(33) == new JBigDecimal("12.3456")) assert(rows(0).getString(34) == "10:20:10,14,15") assert(rows(0).getFloat(35) == 1E+37F) - assert(rows(0).getTimestamp(36) == Timestamp.valueOf("1970-01-01 17:22:31.0")) + assert(rows(0).getTimestamp(36) == Timestamp.valueOf("1970-01-01 17:22:31.123")) assert(rows(0).getTimestamp(37) == Timestamp.valueOf("2016-08-12 10:22:31.949271")) assert(rows(0).getString(38) == "'cat':AB & 'dog':CD") assert(rows(0).getString(39) == "'and' 'cat' 'dog' 'fox'") @@ -314,11 +317,13 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { test("SPARK-47390: Convert TIMESTAMP/TIME WITH TIME ZONE regardless of preferTimestampNTZ") { Seq(true, false).foreach { prefer => - val rows = sqlContext.read + val df = sqlContext.read .option("preferTimestampNTZ", prefer) .jdbc(jdbcUrl, "ts_with_timezone", new Properties) - .collect() - rows.head.toSeq.tail.foreach(c => assert(c.isInstanceOf[java.sql.Timestamp])) + checkAnswer(df, Row( + 1, + DateTimeUtils.toJavaTimestamp(1471022551949271L), + DateTimeUtils.toJavaTimestamp(62551949000L))) } } @@ -554,4 +559,74 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { .option("query", "SELECT 1::oid, 'bar'::regclass, 'integer'::regtype").load() checkAnswer(df, Row(1, "bar", "integer")) } + + test("SPARK-47886: special number values") { + def toDF(qry: String): DataFrame = { + spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("query", qry) + .load() + } + checkAnswer( + toDF("SELECT 
'NaN'::float8 c1, 'infinity'::float8 c2, '-infinity'::float8 c3"), + Row(Double.NaN, Double.PositiveInfinity, Double.NegativeInfinity)) + checkAnswer( + toDF("SELECT 'NaN'::float4 c1, 'infinity'::float4 c2, '-infinity'::float4 c3"), + Row(Float.NaN, Float.PositiveInfinity, Float.NegativeInfinity) + ) + + Seq("NaN", "infinity", "-infinity").foreach { v => + val df = toDF(s"SELECT '$v'::numeric c1") + val e = intercept[SparkException](df.collect()) + checkError(e, null) + val cause = e.getCause.asInstanceOf[SQLException] + assert(cause.getMessage.contains("Bad value for type BigDecimal")) + assert(cause.getSQLState === "22003") + } + } + + test("SPARK-48387: Timestamp write as timestamp with time zone") { + val df = spark.sql("select TIMESTAMP '2018-11-17 13:33:33' as col0") + // write timestamps for preparation + withSQLConf(SQLConf.LEGACY_POSTGRES_DATETIME_MAPPING_ENABLED.key -> "false") { + // write timestamp as timestamp with time zone + df.write.jdbc(jdbcUrl, "ts_with_timezone_copy_false", new Properties) + } + withSQLConf(SQLConf.LEGACY_POSTGRES_DATETIME_MAPPING_ENABLED.key -> "true") { + // write timestamp as timestamp without time zone + df.write.jdbc(jdbcUrl, "ts_with_timezone_copy_true", new Properties) + } + + // read timestamps for test + withSQLConf(SQLConf.LEGACY_POSTGRES_DATETIME_MAPPING_ENABLED.key -> "true") { + val df1 = spark.read.option("preferTimestampNTZ", false) + .jdbc(jdbcUrl, "ts_with_timezone_copy_false", new Properties) + checkAnswer(df1, Row(Timestamp.valueOf("2018-11-17 13:33:33"))) + val df2 = spark.read.option("preferTimestampNTZ", true) + .jdbc(jdbcUrl, "ts_with_timezone_copy_false", new Properties) + checkAnswer(df2, Row(LocalDateTime.of(2018, 11, 17, 13, 33, 33))) + + val df3 = spark.read.option("preferTimestampNTZ", false) + .jdbc(jdbcUrl, "ts_with_timezone_copy_true", new Properties) + checkAnswer(df3, Row(Timestamp.valueOf("2018-11-17 13:33:33"))) + val df4 = spark.read.option("preferTimestampNTZ", true) + .jdbc(jdbcUrl, "ts_with_timezone_copy_true", new Properties) + checkAnswer(df4, Row(LocalDateTime.of(2018, 11, 17, 13, 33, 33))) + } + withSQLConf(SQLConf.LEGACY_POSTGRES_DATETIME_MAPPING_ENABLED.key -> "false") { + Seq("true", "false").foreach { prefer => + val prop = new Properties + prop.setProperty("preferTimestampNTZ", prefer) + val dfCopy = spark.read.jdbc(jdbcUrl, "ts_with_timezone_copy_false", prop) + checkAnswer(dfCopy, Row(Timestamp.valueOf("2018-11-17 13:33:33"))) + } + + val df5 = spark.read.option("preferTimestampNTZ", false) + .jdbc(jdbcUrl, "ts_with_timezone_copy_true", new Properties) + checkAnswer(df5, Row(Timestamp.valueOf("2018-11-17 13:33:33"))) + val df6 = spark.read.option("preferTimestampNTZ", true) + .jdbc(jdbcUrl, "ts_with_timezone_copy_true", new Properties) + checkAnswer(df6, Row(LocalDateTime.of(2018, 11, 17, 13, 33, 33))) + } + } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala index d08be3b5f40e3..af1cd464ad5fe 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala @@ -25,9 +25,9 @@ import org.apache.spark.sql.execution.datasources.jdbc.connection.SecureConnecti import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version 
(e.g., postgres:16.2): + * To run this test suite for a specific version (e.g., postgres:16.3-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.2 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.3-alpine * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly *PostgresKrbIntegrationSuite" * }}} @@ -38,7 +38,7 @@ class PostgresKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override protected val keytabFileName = "postgres.keytab" override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.2") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.3-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala index 7ae03e974845b..8b27e9cb0e0a3 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/GeneratedSubquerySuite.scala @@ -28,9 +28,9 @@ import org.apache.spark.tags.DockerTest /** * This suite is used to generate subqueries, and test Spark against Postgres. - * To run this test suite for a specific version (e.g., postgres:16.2): + * To run this test suite for a specific version (e.g., postgres:16.3-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.2 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.3-alpine * ./build/sbt -Pdocker-integration-tests * "docker-integration-tests/testOnly org.apache.spark.sql.jdbc.GeneratedSubquerySuite" * }}} @@ -39,7 +39,7 @@ import org.apache.spark.tags.DockerTest class GeneratedSubquerySuite extends DockerJDBCIntegrationSuite with QueryGeneratorHelper { override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.2-alpine") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.3-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala index f2a7e14cfc4b9..de28e16b325ce 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/querytest/PostgreSQLQueryTestSuite.scala @@ -30,9 +30,9 @@ import org.apache.spark.tags.DockerTest * confidence, and you won't have to manually verify the golden files generated with your test. * 2. 
Add this line to your .sql file: --ONLY_IF spark * - * Note: To run this test suite for a specific version (e.g., postgres:16.2): + * Note: To run this test suite for a specific version (e.g., postgres:16.3-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.2 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.3-alpine * ./build/sbt -Pdocker-integration-tests * "testOnly org.apache.spark.sql.jdbc.PostgreSQLQueryTestSuite" * }}} @@ -45,7 +45,7 @@ class PostgreSQLQueryTestSuite extends CrossDbmsQueryTestSuite { protected val customInputFilePath: String = new File(inputFilePath, "subquery").getAbsolutePath override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.2-alpine") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.3-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala index 6c1b7fdd1be5a..57129e9d846f6 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala @@ -20,8 +20,6 @@ package org.apache.spark.sql.jdbc.v2 import java.sql.Connection import java.util.Locale -import org.scalatest.time.SpanSugar._ - import org.apache.spark.SparkConf import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog @@ -52,7 +50,6 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { override val catalogName: String = "db2" override val namespaceOpt: Option[String] = Some("DB2INST1") override val db = new DB2DatabaseOnDocker - override val connectionTimeout = timeout(3.minutes) override def sparkConf: SparkConf = super.sparkConf .set("spark.sql.catalog.db2", classOf[JDBCTableCatalog].getName) @@ -65,6 +62,12 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { connection.prepareStatement( "CREATE TABLE employee (dept INTEGER, name VARCHAR(10), salary DECIMAL(20, 2), bonus DOUBLE)") .executeUpdate() + connection.prepareStatement( + s"""CREATE TABLE pattern_testing_table ( + |pattern_testing_col VARCHAR(50) + |) + """.stripMargin + ).executeUpdate() } override def testUpdateColumnType(tbl: String): Unit = { diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala index 72edfc9f1bf1c..60345257f2dc4 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala @@ -38,6 +38,25 @@ abstract class DockerJDBCIntegrationV2Suite extends DockerJDBCIntegrationSuite { .executeUpdate() connection.prepareStatement("INSERT INTO employee VALUES (6, 'jen', 12000, 1200)") .executeUpdate() + + connection.prepareStatement("INSERT INTO pattern_testing_table " + + "VALUES ('special_character_quote''_present')") + .executeUpdate() + connection.prepareStatement("INSERT 
INTO pattern_testing_table " + + "VALUES ('special_character_quote_not_present')") + .executeUpdate() + connection.prepareStatement("INSERT INTO pattern_testing_table " + + "VALUES ('special_character_percent%_present')") + .executeUpdate() + connection.prepareStatement("INSERT INTO pattern_testing_table " + + "VALUES ('special_character_percent_not_present')") + .executeUpdate() + connection.prepareStatement("INSERT INTO pattern_testing_table " + + "VALUES ('special_character_underscore_present')") + .executeUpdate() + connection.prepareStatement("INSERT INTO pattern_testing_table " + + "VALUES ('special_character_underscorenot_present')") + .executeUpdate() } def tablePreparation(connection: Connection): Unit diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala index 0dc3a39f4db5d..9ddd79fb257d8 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala @@ -19,12 +19,10 @@ package org.apache.spark.sql.jdbc.v2 import java.sql.Connection -import org.scalatest.time.SpanSugar._ - import org.apache.spark.{SparkConf, SparkSQLFeatureNotSupportedException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog -import org.apache.spark.sql.jdbc.DatabaseOnDocker +import org.apache.spark.sql.jdbc.MsSQLServerDatabaseOnDocker import org.apache.spark.sql.types._ import org.apache.spark.tags.DockerTest @@ -60,19 +58,7 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JD "scan with aggregate push-down: REGR_SXY without DISTINCT") override val catalogName: String = "mssql" - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MSSQLSERVER_DOCKER_IMAGE_NAME", - "mcr.microsoft.com/mssql/server:2019-CU13-ubuntu-20.04") - override val env = Map( - "SA_PASSWORD" -> "Sapass123", - "ACCEPT_EULA" -> "Y" - ) - override val usesIpc = false - override val jdbcPort: Int = 1433 - - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:sqlserver://$ip:$port;user=sa;password=Sapass123;" - } + override val db = new MsSQLServerDatabaseOnDocker override def sparkConf: SparkConf = super.sparkConf .set("spark.sql.catalog.mssql", classOf[JDBCTableCatalog].getName) @@ -80,12 +66,16 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JD .set("spark.sql.catalog.mssql.pushDownAggregate", "true") .set("spark.sql.catalog.mssql.pushDownLimit", "true") - override val connectionTimeout = timeout(7.minutes) - override def tablePreparation(connection: Connection): Unit = { connection.prepareStatement( "CREATE TABLE employee (dept INT, name VARCHAR(32), salary NUMERIC(20, 2), bonus FLOAT)") .executeUpdate() + connection.prepareStatement( + s"""CREATE TABLE pattern_testing_table ( + |pattern_testing_col VARCHAR(50) + |) + """.stripMargin + ).executeUpdate() } override def notSupportsTableComment: Boolean = true @@ -143,4 +133,17 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JD "WHERE (dept > 1 AND ((name LIKE 'am%') = (name LIKE '%y')))") assert(df3.collect().length == 3) } + + test("SPARK-47994: SQLServer does not support 1 or 0 as boolean type in CASE WHEN filter") 
{ + val df = sql( + s""" + |WITH tbl AS ( + |SELECT CASE + |WHEN e.dept = 1 THEN 'first' WHEN e.dept = 2 THEN 'second' ELSE 'third' END + |AS deptString FROM $catalogName.employee as e) + |SELECT * FROM tbl + |WHERE deptString = 'first' + |""".stripMargin) + assert(df.collect().length == 2) + } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala index 4bdc80dedfbe1..e010a0caf13fa 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala @@ -21,7 +21,7 @@ import java.sql.Connection import scala.jdk.CollectionConverters._ -import org.apache.spark.sql.jdbc.{DatabaseOnDocker, DockerJDBCIntegrationSuite} +import org.apache.spark.sql.jdbc.{DockerJDBCIntegrationSuite, MsSQLServerDatabaseOnDocker} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.tags.DockerTest @@ -35,20 +35,7 @@ import org.apache.spark.tags.DockerTest */ @DockerTest class MsSqlServerNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { - override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MSSQLSERVER_DOCKER_IMAGE_NAME", - "mcr.microsoft.com/mssql/server:2019-CU13-ubuntu-20.04") - override val env = Map( - "SA_PASSWORD" -> "Sapass123", - "ACCEPT_EULA" -> "Y" - ) - override val usesIpc = false - override val jdbcPort: Int = 1433 - - override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:sqlserver://$ip:$port;user=sa;password=Sapass123;" - } - + override val db = new MsSQLServerDatabaseOnDocker val map = new CaseInsensitiveStringMap( Map("url" -> db.getJdbcUrl(dockerIp, externalPort), "driver" -> "com.microsoft.sqlserver.jdbc.SQLServerDriver").asJava) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala index 4997d335fda6b..d5478e664221d 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.jdbc.v2 import java.sql.{Connection, SQLFeatureNotSupportedException} -import org.scalatest.time.SpanSugar._ - import org.apache.spark.{SparkConf, SparkSQLFeatureNotSupportedException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog @@ -68,8 +66,6 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest .set("spark.sql.catalog.mysql.pushDownLimit", "true") .set("spark.sql.catalog.mysql.pushDownOffset", "true") - override val connectionTimeout = timeout(7.minutes) - private var mySQLVersion = -1 override def tablePreparation(connection: Connection): Unit = { @@ -77,6 +73,12 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest connection.prepareStatement( "CREATE TABLE employee (dept INT, name VARCHAR(32), salary DECIMAL(20, 2)," + " bonus DOUBLE)").executeUpdate() + connection.prepareStatement( + s"""CREATE TABLE pattern_testing_table ( + |pattern_testing_col 
LONGTEXT + |) + """.stripMargin + ).executeUpdate() } override def testUpdateColumnType(tbl: String): Unit = { diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala index d2a7aa7758263..2b607fccd1710 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala @@ -40,7 +40,7 @@ class MySQLNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespac val map = new CaseInsensitiveStringMap( Map("url" -> db.getJdbcUrl(dockerIp, externalPort), - "driver" -> "com.mysql.jdbc.Driver").asJava) + "driver" -> "com.mysql.cj.jdbc.Driver").asJava) catalog.initialize("mysql", map) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala index 0aa2905f93b85..342fb4bb38e60 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala @@ -20,10 +20,8 @@ package org.apache.spark.sql.jdbc.v2 import java.sql.Connection import java.util.Locale -import org.scalatest.time.SpanSugar._ - import org.apache.spark.{SparkConf, SparkRuntimeException} -import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.util.CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog import org.apache.spark.sql.jdbc.OracleDatabaseOnDocker @@ -50,12 +48,12 @@ import org.apache.spark.tags.DockerTest * * A sequence of commands to build the Oracle Database Free container image: * $ git clone https://github.com/oracle/docker-images.git - * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles - * $ ./buildContainerImage.sh -v 23.2.0 -f - * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:23.2.0-free + * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles + * $ ./buildContainerImage.sh -v 23.4.0 -f + * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:23.4.0-free * - * This procedure has been validated with Oracle Database Free version 23.2.0, - * and with Oracle Express Edition versions 18.4.0 and 21.3.0 + * This procedure has been validated with Oracle Database Free version 23.4.0, + * and with Oracle Express Edition versions 18.4.0 and 21.4.0 */ @DockerTest class OracleIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { @@ -91,12 +89,16 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTes .set("spark.sql.catalog.oracle.pushDownLimit", "true") .set("spark.sql.catalog.oracle.pushDownOffset", "true") - override val connectionTimeout = timeout(7.minutes) - override def tablePreparation(connection: Connection): Unit = { connection.prepareStatement( "CREATE TABLE employee (dept NUMBER(32), name VARCHAR2(32), salary NUMBER(20, 2)," + " bonus BINARY_DOUBLE)").executeUpdate() + connection.prepareStatement( + s"""CREATE TABLE pattern_testing_table ( + |pattern_testing_col VARCHAR(50) + |) + """.stripMargin + ).executeUpdate() } override def 
testUpdateColumnType(tbl: String): Unit = { @@ -142,4 +144,13 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTes ) } } + + test("SPARK-47879: Use VARCHAR2 instead of VARCHAR") { + val tableName = catalogName + ".t1" + withTable(tableName) { + sql(s"CREATE TABLE $tableName(c1 varchar(10), c2 char(3))") + sql(s"INSERT INTO $tableName SELECT 'Eason' as c1, 'Y' as c2") + checkAnswer(sql(s"SELECT * FROM $tableName"), Seq(Row("Eason", "Y "))) + } + } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala index 05f38102d4101..48f8282e58804 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala @@ -46,11 +46,11 @@ import org.apache.spark.tags.DockerTest * A sequence of commands to build the Oracle Database Free container image: * $ git clone https://github.com/oracle/docker-images.git * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles - * $ ./buildContainerImage.sh -v 23.2.0 -f - * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:23.2.0-free + * $ ./buildContainerImage.sh -v 23.4.0 -f + * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:23.4.0-free * - * This procedure has been validated with Oracle Database Free version 23.2.0, - * and with Oracle Express Edition versions 18.4.0 and 21.3.0 + * This procedure has been validated with Oracle Database Free version 23.4.0, + * and with Oracle Express Edition versions 18.4.0 and 21.4.0 */ @DockerTest class OracleNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala index 1f09c2fd3fc59..7c439d449d86f 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala @@ -28,9 +28,9 @@ import org.apache.spark.sql.types._ import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:16.2) + * To run this test suite for a specific version (e.g., postgres:16.3-alpine) * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.2 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.3-alpine * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresIntegrationSuite" * }}} */ @@ -38,7 +38,7 @@ import org.apache.spark.tags.DockerTest class PostgresIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { override val catalogName: String = "postgresql" override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.2-alpine") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.3-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) @@ -59,6 +59,12 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCT connection.prepareStatement( "CREATE TABLE employee (dept INTEGER, name VARCHAR(32), salary 
NUMERIC(20, 2)," + " bonus double precision)").executeUpdate() + connection.prepareStatement( + s"""CREATE TABLE pattern_testing_table ( + |pattern_testing_col VARCHAR(50) + |) + """.stripMargin + ).executeUpdate() } override def testUpdateColumnType(tbl: String): Unit = { diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala index 838de5acab0df..8a2d0ded84381 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala @@ -26,16 +26,16 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:16.2): + * To run this test suite for a specific version (e.g., postgres:16.3-alpine): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.2 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:16.3-alpine * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresNamespaceSuite" * }}} */ @DockerTest class PostgresNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.2-alpine") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:16.3-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala index c80fbfc748dd1..88ba00a8a1aea 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.jdbc.v2 import org.apache.logging.log4j.Level import org.apache.spark.sql.{AnalysisException, DataFrame} -import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NoSuchIndexException, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NoSuchIndexException} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Sample, Sort} -import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.connector.catalog.{Catalogs, Identifier, TableCatalog} import org.apache.spark.sql.connector.catalog.index.SupportsIndex import org.apache.spark.sql.connector.expressions.NullOrdering @@ -84,6 +83,19 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu def testCreateTableWithProperty(tbl: String): Unit = {} + private def checkErrorFailedJDBC( + e: AnalysisException, + errorClass: String, + tbl: String): Unit = { + checkErrorMatchPVals( + exception = e, + errorClass = errorClass, + parameters = Map( + "url" -> "jdbc:.*", + "tableName" -> s"`$tbl`") + ) + } + test("SPARK-33034: ALTER TABLE ... 
add new columns") { withTable(s"$catalogName.alt_table") { sql(s"CREATE TABLE $catalogName.alt_table (ID STRING)") @@ -107,7 +119,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu exception = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C3 DOUBLE)") }, - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", parameters = Map( "op" -> "add", "fieldNames" -> "`C3`", @@ -122,9 +134,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu val e = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.not_existing_table ADD COLUMNS (C4 STRING)") } - checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", - ExpectedContext(s"$catalogName.not_existing_table", 12, - 11 + s"$catalogName.not_existing_table".length)) + checkErrorFailedJDBC(e, "FAILED_JDBC.LOAD_TABLE", "not_existing_table") } test("SPARK-33034: ALTER TABLE ... drop column") { @@ -146,9 +156,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu val e = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.not_existing_table DROP COLUMN C1") } - checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", - ExpectedContext(s"$catalogName.not_existing_table", 12, - 11 + s"$catalogName.not_existing_table".length)) + checkErrorFailedJDBC(e, "FAILED_JDBC.LOAD_TABLE", "not_existing_table") } test("SPARK-33034: ALTER TABLE ... update column type") { @@ -164,9 +172,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu val e = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.not_existing_table ALTER COLUMN id TYPE DOUBLE") } - checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", - ExpectedContext(s"$catalogName.not_existing_table", 12, - 11 + s"$catalogName.not_existing_table".length)) + checkErrorFailedJDBC(e, "FAILED_JDBC.LOAD_TABLE", "not_existing_table") } test("SPARK-33034: ALTER TABLE ... rename column") { @@ -179,7 +185,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu exception = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.alt_table RENAME COLUMN ID1 TO ID2") }, - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", parameters = Map( "op" -> "rename", "fieldNames" -> "`ID2`", @@ -194,11 +200,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu val e = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.not_existing_table RENAME COLUMN ID TO C") } - checkErrorTableNotFound(e, - UnresolvedAttribute.parseAttributeName(s"$catalogName.not_existing_table") - .map(part => quoteIdentifier(part)).mkString("."), - ExpectedContext(s"$catalogName.not_existing_table", 12, - 11 + s"$catalogName.not_existing_table".length)) + checkErrorFailedJDBC(e, "FAILED_JDBC.LOAD_TABLE", "not_existing_table") } test("SPARK-33034: ALTER TABLE ... 
update column nullability") { @@ -209,9 +211,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu val e = intercept[AnalysisException] { sql(s"ALTER TABLE $catalogName.not_existing_table ALTER COLUMN ID DROP NOT NULL") } - checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", - ExpectedContext(s"$catalogName.not_existing_table", 12, - 11 + s"$catalogName.not_existing_table".length)) + checkErrorFailedJDBC(e, "FAILED_JDBC.LOAD_TABLE", "not_existing_table") } test("CREATE TABLE with table comment") { @@ -233,7 +233,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu val e = intercept[AnalysisException] { sql(s"CREATE TABLE $catalogName.new_table (i INT) TBLPROPERTIES('a'='1')") } - assert(e.getErrorClass == "FAILED_JDBC.UNCLASSIFIED") + checkErrorFailedJDBC(e, "FAILED_JDBC.CREATE_TABLE", "new_table") testCreateTableWithProperty(s"$catalogName.new_table") } } @@ -359,6 +359,235 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu assert(scan.schema.names.sameElements(Seq(col))) } + test("SPARK-48172: Test CONTAINS") { + val df1 = spark.sql( + s""" + |SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE contains(pattern_testing_col, 'quote\\'')""".stripMargin) + df1.explain("formatted") + val rows1 = df1.collect() + assert(rows1.length === 1) + assert(rows1(0).getString(0) === "special_character_quote'_present") + + val df2 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE contains(pattern_testing_col, 'percent%')""".stripMargin) + val rows2 = df2.collect() + assert(rows2.length === 1) + assert(rows2(0).getString(0) === "special_character_percent%_present") + + val df3 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE contains(pattern_testing_col, 'underscore_')""".stripMargin) + val rows3 = df3.collect() + assert(rows3.length === 1) + assert(rows3(0).getString(0) === "special_character_underscore_present") + + val df4 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE contains(pattern_testing_col, 'character') + |ORDER BY pattern_testing_col""".stripMargin) + val rows4 = df4.collect() + assert(rows4.length === 6) + assert(rows4(0).getString(0) === "special_character_percent%_present") + assert(rows4(1).getString(0) === "special_character_percent_not_present") + assert(rows4(2).getString(0) === "special_character_quote'_present") + assert(rows4(3).getString(0) === "special_character_quote_not_present") + assert(rows4(4).getString(0) === "special_character_underscore_present") + assert(rows4(5).getString(0) === "special_character_underscorenot_present") + } + + test("SPARK-48172: Test ENDSWITH") { + val df1 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE endswith(pattern_testing_col, 'quote\\'_present')""".stripMargin) + val rows1 = df1.collect() + assert(rows1.length === 1) + assert(rows1(0).getString(0) === "special_character_quote'_present") + + val df2 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE endswith(pattern_testing_col, 'percent%_present')""".stripMargin) + val rows2 = df2.collect() + assert(rows2.length === 1) + assert(rows2(0).getString(0) === "special_character_percent%_present") + + val df3 = spark. 
+ sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE endswith(pattern_testing_col, 'underscore_present')""".stripMargin) + val rows3 = df3.collect() + assert(rows3.length === 1) + assert(rows3(0).getString(0) === "special_character_underscore_present") + + val df4 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE endswith(pattern_testing_col, 'present') + |ORDER BY pattern_testing_col""".stripMargin) + val rows4 = df4.collect() + assert(rows4.length === 6) + assert(rows4(0).getString(0) === "special_character_percent%_present") + assert(rows4(1).getString(0) === "special_character_percent_not_present") + assert(rows4(2).getString(0) === "special_character_quote'_present") + assert(rows4(3).getString(0) === "special_character_quote_not_present") + assert(rows4(4).getString(0) === "special_character_underscore_present") + assert(rows4(5).getString(0) === "special_character_underscorenot_present") + } + + test("SPARK-48172: Test STARTSWITH") { + val df1 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE startswith(pattern_testing_col, 'special_character_quote\\'')""".stripMargin) + val rows1 = df1.collect() + assert(rows1.length === 1) + assert(rows1(0).getString(0) === "special_character_quote'_present") + + val df2 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE startswith(pattern_testing_col, 'special_character_percent%')""".stripMargin) + val rows2 = df2.collect() + assert(rows2.length === 1) + assert(rows2(0).getString(0) === "special_character_percent%_present") + + val df3 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE startswith(pattern_testing_col, 'special_character_underscore_')""".stripMargin) + val rows3 = df3.collect() + assert(rows3.length === 1) + assert(rows3(0).getString(0) === "special_character_underscore_present") + + val df4 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE startswith(pattern_testing_col, 'special_character') + |ORDER BY pattern_testing_col""".stripMargin) + val rows4 = df4.collect() + assert(rows4.length === 6) + assert(rows4(0).getString(0) === "special_character_percent%_present") + assert(rows4(1).getString(0) === "special_character_percent_not_present") + assert(rows4(2).getString(0) === "special_character_quote'_present") + assert(rows4(3).getString(0) === "special_character_quote_not_present") + assert(rows4(4).getString(0) === "special_character_underscore_present") + assert(rows4(5).getString(0) === "special_character_underscorenot_present") + } + + test("SPARK-48172: Test LIKE") { + // this one should map to contains + val df1 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%quote\\'%'""".stripMargin) + val rows1 = df1.collect() + assert(rows1.length === 1) + assert(rows1(0).getString(0) === "special_character_quote'_present") + + val df2 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%percent\\%%'""".stripMargin) + val rows2 = df2.collect() + assert(rows2.length === 1) + assert(rows2(0).getString(0) === "special_character_percent%_present") + + val df3 = spark. 
+ sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%underscore\\_%'""".stripMargin) + val rows3 = df3.collect() + assert(rows3.length === 1) + assert(rows3(0).getString(0) === "special_character_underscore_present") + + val df4 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%character%' + |ORDER BY pattern_testing_col""".stripMargin) + val rows4 = df4.collect() + assert(rows4.length === 6) + assert(rows4(0).getString(0) === "special_character_percent%_present") + assert(rows4(1).getString(0) === "special_character_percent_not_present") + assert(rows4(2).getString(0) === "special_character_quote'_present") + assert(rows4(3).getString(0) === "special_character_quote_not_present") + assert(rows4(4).getString(0) === "special_character_underscore_present") + assert(rows4(5).getString(0) === "special_character_underscorenot_present") + + // map to startsWith + // this one should map to contains + val df5 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE 'special_character_quote\\'%'""".stripMargin) + val rows5 = df5.collect() + assert(rows5.length === 1) + assert(rows5(0).getString(0) === "special_character_quote'_present") + + val df6 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE 'special_character_percent\\%%'""".stripMargin) + val rows6 = df6.collect() + assert(rows6.length === 1) + assert(rows6(0).getString(0) === "special_character_percent%_present") + + val df7 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE 'special_character_underscore\\_%'""".stripMargin) + val rows7 = df7.collect() + assert(rows7.length === 1) + assert(rows7(0).getString(0) === "special_character_underscore_present") + + val df8 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE 'special_character%' + |ORDER BY pattern_testing_col""".stripMargin) + val rows8 = df8.collect() + assert(rows8.length === 6) + assert(rows8(0).getString(0) === "special_character_percent%_present") + assert(rows8(1).getString(0) === "special_character_percent_not_present") + assert(rows8(2).getString(0) === "special_character_quote'_present") + assert(rows8(3).getString(0) === "special_character_quote_not_present") + assert(rows8(4).getString(0) === "special_character_underscore_present") + assert(rows8(5).getString(0) === "special_character_underscorenot_present") + // map to endsWith + // this one should map to contains + val df9 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%quote\\'_present'""".stripMargin) + val rows9 = df9.collect() + assert(rows9.length === 1) + assert(rows9(0).getString(0) === "special_character_quote'_present") + + val df10 = spark.sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%percent\\%_present'""".stripMargin) + val rows10 = df10.collect() + assert(rows10.length === 1) + assert(rows10(0).getString(0) === "special_character_percent%_present") + + val df11 = spark. 
+ sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%underscore\\_present'""".stripMargin) + val rows11 = df11.collect() + assert(rows11.length === 1) + assert(rows11(0).getString(0) === "special_character_underscore_present") + + val df12 = spark. + sql( + s"""SELECT * FROM $catalogAndNamespace.${caseConvert("pattern_testing_table")} + |WHERE pattern_testing_col LIKE '%present' ORDER BY pattern_testing_col""".stripMargin) + val rows12 = df12.collect() + assert(rows12.length === 6) + assert(rows12(0).getString(0) === "special_character_percent%_present") + assert(rows12(1).getString(0) === "special_character_percent_not_present") + assert(rows12(2).getString(0) === "special_character_quote'_present") + assert(rows12(3).getString(0) === "special_character_quote_not_present") + assert(rows12(4).getString(0) === "special_character_underscore_present") + assert(rows12(5).getString(0) === "special_character_underscorenot_present") + } + test("SPARK-37038: Test TABLESAMPLE") { if (supportsTableSample) { withTable(s"$catalogName.new_table") { diff --git a/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-conditions.json b/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-conditions.json index a7b22e1370fd8..2fa44d7bd66a6 100644 --- a/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-conditions.json +++ b/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-conditions.json @@ -23,6 +23,13 @@ "latest offset: , end offset: " ] }, + "KAFKA_START_OFFSET_DOES_NOT_MATCH_ASSIGNED" : { + "message" : [ + "Partitions specified for Kafka start offsets don't match what are assigned. Maybe topic partitions are created ", + "or deleted while the query is running. 
Use -1 for latest, -2 for earliest.", + "Specified: <specifiedPartitions> Assigned: <assignedPartitions>" + ] + }, "KAFKA_DATA_LOSS" : { "message" : [ "Some data may have been lost because they are not available in Kafka any more;", diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala index d3fe3264afe14..cb1c7055483b3 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala @@ -21,7 +21,7 @@ import java.{util => ju} import org.apache.spark.TaskContext import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.connector.metric.CustomTaskMetric diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala index e5e22243a5826..10bdbb1d9d447 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala @@ -25,7 +25,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.TaskContext import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, OFFSETS, TIP} +import org.apache.spark.internal.LogKeys.{ERROR, OFFSETS, TIP} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.connector.read.InputPartition diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala index 735184db3c1af..13a68e72269f0 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala @@ -27,8 +27,8 @@ private object KafkaExceptionsHelper { val errorClassesJsonReader: ErrorClassesJsonReader = new ErrorClassesJsonReader( // Note that though we call them "error classes" here, the proper name is "error conditions", - // hence why the name of the JSON file different. We will address this inconsistency as part - // of this ticket: https://issues.apache.org/jira/browse/SPARK-47429 + // hence why the name of the JSON file is different. 
We will address this inconsistency as + // part of this ticket: https://issues.apache.org/jira/browse/SPARK-47429 Seq(getClass.getClassLoader.getResource("error/kafka-error-conditions.json"))) } @@ -97,7 +97,7 @@ object KafkaExceptions { "startOffset" -> startOffset.toString, "endOffset" -> endOffset.toString, "topicPartition" -> topicPartition.toString, - "groupId" -> groupId), + "groupId" -> Option(groupId).getOrElse("null")), cause = cause) } @@ -155,6 +155,16 @@ object KafkaExceptions { "prevOffset" -> prevOffset.toString, "newOffset" -> newOffset.toString)) } + + def startOffsetDoesNotMatchAssigned( + specifiedPartitions: Set[TopicPartition], + assignedPartitions: Set[TopicPartition]): KafkaIllegalStateException = { + new KafkaIllegalStateException( + errorClass = "KAFKA_START_OFFSET_DOES_NOT_MATCH_ASSIGNED", + messageParameters = Map( + "specifiedPartitions" -> specifiedPartitions.toString, + "assignedPartitions" -> assignedPartitions.toString)) + } } /** diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala index 3313d42d1a30e..c79da13017b97 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala @@ -26,7 +26,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.SparkEnv import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, OFFSETS, TIP} +import org.apache.spark.internal.LogKeys.{ERROR, OFFSETS, TIP} import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory} diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala index 5ed8576e88888..bb4f14686f976 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala @@ -31,7 +31,7 @@ import org.apache.kafka.common.requests.OffsetFetchResponse import org.apache.spark.SparkEnv import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{OFFSETS, RETRY_COUNT, TOPIC_PARTITION_OFFSET} +import org.apache.spark.internal.LogKeys.{NUM_RETRY, OFFSETS, TOPIC_PARTITION_OFFSET} import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.kafka010.KafkaSourceProvider.StrategyOnNoMatchStartingOffset @@ -120,10 +120,9 @@ private[kafka010] class KafkaOffsetReaderAdmin( isStartingOffsets: Boolean): Map[TopicPartition, Long] = { def validateTopicPartitions(partitions: Set[TopicPartition], partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { - assert(partitions == partitionOffsets.keySet, - "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + - "Use -1 for latest, -2 for earliest.\n" + - s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions}") + if (partitions != partitionOffsets.keySet) { + throw KafkaExceptions.startOffsetDoesNotMatchAssigned(partitionOffsets.keySet, 
partitions) + } logDebug(s"Assigned partitions: $partitions. Seeking to $partitionOffsets") partitionOffsets } @@ -536,7 +535,7 @@ private[kafka010] class KafkaOffsetReaderAdmin( case NonFatal(e) => lastException = e logWarning( - log"Error in attempt ${MDC(RETRY_COUNT, attempt)} getting Kafka offsets: ", e) + log"Error in attempt ${MDC(NUM_RETRY, attempt)} getting Kafka offsets: ", e) attempt += 1 Thread.sleep(offsetFetchAttemptIntervalMs) resetAdmin() diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala index 34d44fdf10591..fa53d6373176e 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala @@ -28,7 +28,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.SparkEnv import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{OFFSETS, RETRY_COUNT, TOPIC_PARTITION_OFFSET} +import org.apache.spark.internal.LogKeys.{NUM_RETRY, OFFSETS, TOPIC_PARTITION_OFFSET} import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.kafka010.KafkaSourceProvider.StrategyOnNoMatchStartingOffset @@ -142,10 +142,9 @@ private[kafka010] class KafkaOffsetReaderConsumer( isStartingOffsets: Boolean): Map[TopicPartition, Long] = { def validateTopicPartitions(partitions: Set[TopicPartition], partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { - assert(partitions == partitionOffsets.keySet, - "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + - "Use -1 for latest, -2 for earliest.\n" + - s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions}") + if (partitions != partitionOffsets.keySet) { + throw KafkaExceptions.startOffsetDoesNotMatchAssigned(partitionOffsets.keySet, partitions) + } logDebug(s"Partitions assigned to consumer: $partitions. 
Seeking to $partitionOffsets") partitionOffsets } @@ -613,7 +612,7 @@ private[kafka010] class KafkaOffsetReaderConsumer( case NonFatal(e) => lastException = e logWarning( - log"Error in attempt ${MDC(RETRY_COUNT, attempt)} getting Kafka offsets: ", e) + log"Error in attempt ${MDC(NUM_RETRY, attempt)} getting Kafka offsets: ", e) attempt += 1 Thread.sleep(offsetFetchAttemptIntervalMs) resetConsumer() diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala index 97b866067ea88..b77bb94aaf46f 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.TOPIC_PARTITIONS +import org.apache.spark.internal.LogKeys.TOPIC_PARTITIONS import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala index 5a75682f54f9a..fb473e71d5a75 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.BATCH_ID +import org.apache.spark.internal.LogKeys.BATCH_ID import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.streaming.Sink diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index b0ab469690e2a..d43b22d9de922 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -23,7 +23,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.SparkContext import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, FROM_OFFSET, OFFSETS, TIP, TOPIC_PARTITIONS, UNTIL_OFFSET} +import org.apache.spark.internal.LogKeys.{ERROR, FROM_OFFSET, OFFSETS, TIP, TOPIC_PARTITIONS, UNTIL_OFFSET} import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala index 4eb73e6d39f02..bc7f8b6b44f90 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala @@ -22,7 +22,7 @@ import java.{util => ju} import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.{Partition, SparkContext, TaskContext} -import org.apache.spark.internal.LogKey.{FROM_OFFSET, PARTITION_ID, TOPIC} +import 
org.apache.spark.internal.LogKeys.{FROM_OFFSET, PARTITION_ID, TOPIC} import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala index 981aa71bf9479..9f68cb6fd0882 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala @@ -27,7 +27,7 @@ import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{DATA, KEY} +import org.apache.spark.internal.LogKeys.{DATA, KEY} import org.apache.spark.sql.kafka010.{FETCHED_DATA_CACHE_EVICTOR_THREAD_RUN_INTERVAL, FETCHED_DATA_CACHE_TIMEOUT} import org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.{AvailableOffsetRange, CacheKey, UNKNOWN_OFFSET} import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala index 661fe731b97b9..edd5121cfbeee 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala @@ -183,8 +183,8 @@ private[consumer] object InternalKafkaConsumerPool { setMaxTotal(-1) // Set minimum evictable idle time which will be referred from evictor thread - setMinEvictableIdleTime(Duration.ofMillis(minEvictableIdleTimeMillis)) - setSoftMinEvictableIdleTime(BaseObjectPoolConfig.DEFAULT_SOFT_MIN_EVICTABLE_IDLE_DURATION) + setMinEvictableIdleDuration(Duration.ofMillis(minEvictableIdleTimeMillis)) + setSoftMinEvictableIdleDuration(BaseObjectPoolConfig.DEFAULT_SOFT_MIN_EVICTABLE_IDLE_DURATION) // evictor thread will run test with ten idle objects setTimeBetweenEvictionRuns(Duration.ofMillis(evictorThreadRunIntervalMillis)) diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala index 72ceebb700d69..ceb9d96660ae3 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala @@ -31,7 +31,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.{Logging, MDC, MessageWithContext} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.kafka010.{KafkaConfigUpdater, KafkaTokenUtil} import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS import org.apache.spark.sql.kafka010.KafkaExceptions @@ -393,9 +393,9 @@ private[kafka010] class KafkaDataConsumer( val walTime = System.nanoTime() - startTimestampNano logInfo(log"From Kafka 
${MDC(CONSUMER, kafkaMeta)} read " + - log"${MDC(TOTAL_RECORDS_READ, totalRecordsRead)} records through " + - log"${MDC(KAFKA_PULLS_COUNT, numPolls)} polls " + - log"(polled out ${MDC(KAFKA_RECORDS_PULLED_COUNT, numRecordsPolled)} records), " + + log"${MDC(NUM_RECORDS_READ, totalRecordsRead)} records through " + + log"${MDC(NUM_KAFKA_PULLS, numPolls)} polls " + + log"(polled out ${MDC(NUM_KAFKA_RECORDS_PULLED, numRecordsPolled)} records), " + log"taking ${MDC(TOTAL_TIME_READ, totalTimeReadNanos / NANOS_PER_MILLIS.toDouble)} ms, " + log"during time span of ${MDC(TIME, walTime / NANOS_PER_MILLIS.toDouble)} ms." ) diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala index afd426694d7b0..c3457cf8982d9 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala @@ -24,7 +24,7 @@ import scala.util.control.NonFatal import org.apache.kafka.clients.producer.KafkaProducer import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PRODUCER_ID +import org.apache.spark.internal.LogKeys.PRODUCER_ID private[kafka010] class CachedKafkaProducer( val cacheKey: Seq[(String, Object)], diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala index f35023d744b63..79e0a91dd8968 100644 --- a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala @@ -28,7 +28,7 @@ import org.apache.kafka.clients.producer.KafkaProducer import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PRODUCER_ID +import org.apache.spark.internal.LogKeys.PRODUCER_ID import org.apache.spark.kafka010.{KafkaConfigUpdater, KafkaRedactionUtil} import org.apache.spark.sql.kafka010.{PRODUCER_CACHE_EVICTOR_THREAD_RUN_INTERVAL, PRODUCER_CACHE_TIMEOUT} import org.apache.spark.util.{Clock, ShutdownHookManager, SystemClock, ThreadUtils, Utils} diff --git a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala index 691e81f02a8c9..320485a79e59d 100644 --- a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala @@ -135,6 +135,31 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk KafkaOffsetRange(tp, 2, LATEST, None)).sortBy(_.topicPartition.toString)) } + testWithAllOffsetFetchingSQLConf( + "SPARK-48383: START_OFFSET_DOES_NOT_MATCH_ASSIGNED error class" + ) { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + val reader = createKafkaReader(topic, minPartitions = Some(4)) + + // There are three topic partitions, but we only include two in offsets. 
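// Editorial sketch, not part of the patch above: the KAFKA_START_OFFSET_DOES_NOT_MATCH_ASSIGNED
// condition exercised by this test can also be reached from the user-facing Kafka source options by
// listing only a subset of a topic's partitions in `startingOffsets`. The topic name and bootstrap
// servers below are illustrative assumptions; -2 means earliest and -1 means latest, as the new
// error message notes.
//
//   spark.readStream
//     .format("kafka")
//     .option("kafka.bootstrap.servers", "localhost:9092")
//     .option("subscribe", "topic-with-three-partitions")
//     .option("startingOffsets", """{"topic-with-three-partitions":{"0":-2,"1":-2}}""")  // partition 2 omitted
//     .load()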
+ val tp1 = new TopicPartition(topic, 0) + val tp2 = new TopicPartition(topic, 1) + val startingOffsets = SpecificOffsetRangeLimit(Map(tp1 -> EARLIEST, tp2 -> EARLIEST)) + val endingOffsets = SpecificOffsetRangeLimit(Map(tp1 -> LATEST, tp2 -> 3)) + + val ex = intercept[KafkaIllegalStateException] { + reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, endingOffsets) + } + checkError( + exception = ex, + errorClass = "KAFKA_START_OFFSET_DOES_NOT_MATCH_ASSIGNED", + parameters = Map( + "specifiedPartitions" -> "Set\\(.*,.*\\)", + "assignedPartitions" -> "Set\\(.*,.*,.*\\)"), + matchPVals = true) + } + testWithAllOffsetFetchingSQLConf("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - " + "multiple topic partitions") { val topic = newTopic() diff --git a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index 068e3423cd26c..0737658e65256 100644 --- a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -52,7 +52,7 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey +import org.apache.spark.internal.LogKeys import org.apache.spark.kafka010.KafkaTokenUtil import org.apache.spark.util.{SecurityUtils, ShutdownHookManager, Utils} import org.apache.spark.util.ArrayImplicits._ @@ -70,7 +70,7 @@ class KafkaTestUtils( private val JAVA_AUTH_CONFIG = "java.security.auth.login.config" private val localHostNameForURI = Utils.localHostNameForURI() - logInfo(log"Local host name is ${MDC(LogKey.URI, localHostNameForURI)}") + logInfo(log"Local host name is ${MDC(LogKeys.URI, localHostNameForURI)}") // MiniKDC uses canonical host name on host part, hence we need to provide canonical host name // on the 'host' part of the principal. 
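The hunks above and below all apply the same mechanical migration: the singular `org.apache.spark.internal.LogKey` object becomes `LogKeys`, and log messages are built with the `log` interpolator plus `MDC(...)` wrappers so each interpolated value carries a structured key. A minimal sketch of the resulting pattern, using only imports and overloads that appear elsewhere in this patch; the class and method names below are illustrative and not part of the change:

```scala
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys.{ERROR, TOPIC}

// Illustrative component; any class that mixes in Logging can use the pattern.
class TopicMonitor extends Logging {
  def reportFailure(topic: String, e: Exception): Unit = {
    // Each value is wrapped in MDC(<LogKeys entry>, value) so that structured
    // logging backends can index the message by key, and the throwable is
    // passed as a separate argument, as in the KinesisCheckpointer hunk below.
    logWarning(log"Failed to process topic ${MDC(TOPIC, topic)}: " +
      log"${MDC(ERROR, e.getMessage)}", e)
  }
}
```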
@@ -333,7 +333,7 @@ class KafkaTestUtils( Utils.deleteRecursively(new File(f)) } catch { case e: IOException if Utils.isWindows => - logWarning(log"${MDC(LogKey.ERROR, e.getMessage)}") + logWarning(log"${MDC(LogKeys.ERROR, e.getMessage)}") } } @@ -654,13 +654,13 @@ class KafkaTestUtils( Utils.deleteRecursively(snapshotDir) } catch { case e: IOException if Utils.isWindows => - logWarning(log"${MDC(LogKey.ERROR, e.getMessage)}") + logWarning(log"${MDC(LogKeys.ERROR, e.getMessage)}") } try { Utils.deleteRecursively(logDir) } catch { case e: IOException if Utils.isWindows => - logWarning(log"${MDC(LogKey.ERROR, e.getMessage)}") + logWarning(log"${MDC(LogKeys.ERROR, e.getMessage)}") } System.clearProperty(ZOOKEEPER_AUTH_PROVIDER) } diff --git a/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala index d0bcf90babc13..3616f93659fbb 100644 --- a/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala +++ b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala @@ -25,7 +25,7 @@ import org.apache.kafka.common.security.auth.SecurityProtocol.{SASL_PLAINTEXT, S import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CLUSTER_ID, SERVICE_NAME} +import org.apache.spark.internal.LogKeys.{CLUSTER_ID, SERVICE_NAME} import org.apache.spark.security.HadoopDelegationTokenProvider private[spark] class KafkaDelegationTokenProvider diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala index 2bc2acf9aaf91..2320f1908da5a 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala @@ -26,7 +26,7 @@ import org.apache.kafka.clients.consumer._ import org.apache.kafka.common.TopicPartition import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CONFIG +import org.apache.spark.internal.LogKeys.CONFIG import org.apache.spark.kafka010.KafkaConfigUpdater /** diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 86ee208496263..cefaa3de182a5 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -28,7 +28,7 @@ import org.apache.kafka.clients.consumer._ import org.apache.kafka.common.TopicPartition import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{OFFSET, TIME, TOPIC_PARTITION, TOPIC_PARTITION_OFFSET_RANGE} +import org.apache.spark.internal.LogKeys.{OFFSET, TIME, TOPIC_PARTITION, TOPIC_PARTITION_OFFSET_RANGE} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{StreamingContext, Time} import org.apache.spark.streaming.dstream._ diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala 
b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala index 91df53c9e06bb..75b046430ef50 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala @@ -27,7 +27,7 @@ import org.apache.kafka.common.{KafkaException, TopicPartition} import org.apache.spark.TaskContext import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.kafka010.KafkaConfigUpdater private[kafka010] sealed trait KafkaDataConsumer[K, V] { diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala index 5bc89864cf0af..2637034766574 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala @@ -24,7 +24,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{FROM_OFFSET, PARTITION_ID, TOPIC, UNTIL_OFFSET} +import org.apache.spark.internal.LogKeys.{FROM_OFFSET, PARTITION_ID, TOPIC, UNTIL_OFFSET} import org.apache.spark.internal.config.Network._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala index f3e4c45b3aa99..d15e5e25f561d 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala @@ -25,7 +25,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.spark.SparkContext import org.apache.spark.api.java.{ JavaRDD, JavaSparkContext } import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CONFIG, GROUP_ID} +import org.apache.spark.internal.LogKeys.{CONFIG, GROUP_ID} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.{ JavaInputDStream, JavaStreamingContext } diff --git a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 6b0c091534b78..b391203b4b968 100644 --- a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -30,7 +30,7 @@ import com.amazonaws.services.kinesis.model._ import org.apache.spark._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, RETRY_COUNT} +import org.apache.spark.internal.LogKeys.{ERROR, NUM_RETRY} import org.apache.spark.rdd.{BlockRDD, BlockRDDPartition} import org.apache.spark.storage.BlockId import org.apache.spark.util.NextIterator @@ -279,7 +279,7 @@ class KinesisSequenceRangeIterator( t match { case ptee: ProvisionedThroughputExceededException => 
logWarning(log"Error while ${MDC(ERROR, message)} " + - log"[attempt = ${MDC(RETRY_COUNT, retryCount + 1)}]", ptee) + log"[attempt = ${MDC(NUM_RETRY, retryCount + 1)}]", ptee) case e: Throwable => throw new SparkException(s"Error while $message", e) } diff --git a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala index f6740999ce1c7..c52eeca1e48a1 100644 --- a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala @@ -23,7 +23,7 @@ import scala.util.control.NonFatal import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{SHARD_ID, WORKER_URL} +import org.apache.spark.internal.LogKeys.{SHARD_ID, WORKER_URL} import org.apache.spark.streaming.Duration import org.apache.spark.streaming.util.RecurringTimer import org.apache.spark.util.{Clock, SystemClock} @@ -103,7 +103,7 @@ private[kinesis] class KinesisCheckpointer( } } catch { case NonFatal(e) => - logWarning(s"Failed to checkpoint shardId $shardId to DynamoDB.", e) + logWarning(log"Failed to checkpoint shardId ${MDC(SHARD_ID, shardId)} to DynamoDB.", e) } } diff --git a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 47b03c2b75376..953817e625e48 100644 --- a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -29,7 +29,7 @@ import com.amazonaws.services.kinesis.metrics.interfaces.MetricsLevel import com.amazonaws.services.kinesis.model.Record import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.WORKER_URL +import org.apache.spark.internal.LogKeys.WORKER_URL import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming.Duration import org.apache.spark.streaming.kinesis.KinesisInitialPositions.AtTimestamp diff --git a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index 8424dde7d9c40..aaafb3215d031 100644 --- a/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -27,7 +27,7 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason import com.amazonaws.services.kinesis.model.Record import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{REASON, RETRY_INTERVAL, SHARD_ID, WORKER_URL} +import org.apache.spark.internal.LogKeys.{REASON, RETRY_INTERVAL, SHARD_ID, WORKER_URL} /** * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor. 
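The `KinesisSequenceRangeIterator` hunk above sits inside a retry helper: a `ProvisionedThroughputExceededException` is treated as transient throttling (the attempt number is logged and the call retried), while any other error is rethrown wrapped in a `SparkException`. A rough standalone sketch of that control flow; the helper name, retry limit, and backoff below are assumptions for illustration, not values from the patch:

```scala
import com.amazonaws.services.kinesis.model.ProvisionedThroughputExceededException

import org.apache.spark.SparkException

object RetryExample {
  // Hypothetical helper mirroring the branch structure in the hunk above:
  // throttling errors are retried with a short backoff; anything else (or an
  // exhausted retry budget) is rethrown wrapped in SparkException.
  def retryOnThrottle[T](message: String, maxRetries: Int = 3)(body: => T): T = {
    var attempt = 0
    while (true) {
      try {
        return body
      } catch {
        case _: ProvisionedThroughputExceededException if attempt < maxRetries =>
          // The patched code logs the attempt number here via MDC(NUM_RETRY, ...).
          attempt += 1
          Thread.sleep(100L * attempt)
        case e: Throwable =>
          throw new SparkException(s"Error while $message", e)
      }
    }
    throw new IllegalStateException("unreachable")
  }
}
```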
diff --git a/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala index cd4c61396a12f..652822c5fdc97 100644 --- a/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala +++ b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala @@ -34,7 +34,7 @@ import com.amazonaws.services.kinesis.{AmazonKinesis, AmazonKinesisClient} import com.amazonaws.services.kinesis.model._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{STREAM_NAME, TABLE_NAME} +import org.apache.spark.internal.LogKeys.{STREAM_NAME, TABLE_NAME} /** * Shared utility methods for performing Kinesis tests that actually transfer data. diff --git a/connector/profiler/README.md b/connector/profiler/README.md index 527f8b487d4d4..d928a47cab7d2 100644 --- a/connector/profiler/README.md +++ b/connector/profiler/README.md @@ -23,7 +23,7 @@ Code profiling is currently only supported for To get maximum profiling information set the following jvm options for the executor : ``` - -XX:+UnlockDiagnosticVMOptions -XX:+DebugNonSafepoints -XX:+PreserveFramePointer +spark.executor.extraJavaOptions=-XX:+UnlockDiagnosticVMOptions -XX:+DebugNonSafepoints -XX:+PreserveFramePointer ``` For more information on async_profiler see the [Async Profiler Manual](https://krzysztofslusarski.github.io/2022/12/12/async-manual.html) diff --git a/connector/profiler/pom.xml b/connector/profiler/pom.xml index 933a74edc0a94..6b254dbae128c 100644 --- a/connector/profiler/pom.xml +++ b/connector/profiler/pom.xml @@ -31,6 +31,9 @@ jar Spark Profiler + + Enables code profiling of executors based on the async profiler.
+ https://spark.apache.org/ @@ -44,7 +47,8 @@ me.bechberger ap-loader-all - 3.0-8 + ${ap-loader.version} + provided diff --git a/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala b/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala index 15ffbbd9d730c..20b6db5221fa9 100644 --- a/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala +++ b/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorJVMProfiler.scala @@ -25,8 +25,8 @@ import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.LogKey.PATH import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.util.ThreadUtils diff --git a/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorProfilerPlugin.scala b/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorProfilerPlugin.scala index fb9abfe59aa78..b6b6221277968 100644 --- a/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorProfilerPlugin.scala +++ b/connector/profiler/src/main/scala/org/apache/spark/executor/profiler/ExecutorProfilerPlugin.scala @@ -23,8 +23,8 @@ import scala.util.Random import org.apache.spark.SparkConf import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} -import org.apache.spark.internal.LogKey.EXECUTOR_ID import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.EXECUTOR_ID /** diff --git a/connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java b/connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java index 019ee08e09188..48c61e80d6655 100644 --- a/connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java +++ b/connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java @@ -11,9 +11,6 @@ import info.ganglia.gmetric4j.gmetric.GMetricType; import info.ganglia.gmetric4j.gmetric.GangliaException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.util.Collections; import java.util.Map; import java.util.Set; @@ -22,6 +19,11 @@ import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; + import static com.codahale.metrics.MetricRegistry.name; import static com.codahale.metrics.MetricAttribute.*; @@ -201,7 +203,7 @@ public GangliaReporter build(GMetric... 
gmetrics) { } } - private static final Logger LOGGER = LoggerFactory.getLogger(GangliaReporter.class); + private static final SparkLogger LOGGER = SparkLoggerFactory.getLogger(GangliaReporter.class); private final GMetric gmetric; private final GMetric[] gmetrics; @@ -292,7 +294,8 @@ private void reportTimer(String name, Timer timer) { reportMetered(sanitizedName, timer, group, "calls"); } catch (GangliaException e) { - LOGGER.warn("Unable to report timer {}", sanitizedName, e); + LOGGER.warn("Unable to report timer {}", e, + MDC.of(LogKeys.METRIC_NAME$.MODULE$, sanitizedName)); } } @@ -302,7 +305,8 @@ private void reportMeter(String name, Meter meter) { try { reportMetered(sanitizedName, meter, group, "events"); } catch (GangliaException e) { - LOGGER.warn("Unable to report meter {}", name, e); + LOGGER.warn("Unable to report meter {}", e, + MDC.of(LogKeys.METRIC_NAME$.MODULE$, name)); } } @@ -333,7 +337,8 @@ private void reportHistogram(String name, Histogram histogram) { announceIfEnabled(P99, sanitizedName, group, snapshot.get99thPercentile(), ""); announceIfEnabled(P999, sanitizedName, group, snapshot.get999thPercentile(), ""); } catch (GangliaException e) { - LOGGER.warn("Unable to report histogram {}", sanitizedName, e); + LOGGER.warn("Unable to report histogram {}", e, + MDC.of(LogKeys.METRIC_NAME$.MODULE$, sanitizedName)); } } @@ -343,7 +348,8 @@ private void reportCounter(String name, Counter counter) { try { announce(prefix(sanitizedName, COUNT.getCode()), group, Long.toString(counter.getCount()), GMetricType.DOUBLE, ""); } catch (GangliaException e) { - LOGGER.warn("Unable to report counter {}", name, e); + LOGGER.warn("Unable to report counter {}", e, + MDC.of(LogKeys.METRIC_NAME$.MODULE$, name)); } } @@ -356,7 +362,8 @@ private void reportGauge(String name, Gauge gauge) { try { announce(name(prefix, sanitizedName), group, value, type, ""); } catch (GangliaException e) { - LOGGER.warn("Unable to report gauge {}", name, e); + LOGGER.warn("Unable to report gauge {}", e, + MDC.of(LogKeys.METRIC_NAME$.MODULE$, name)); } } diff --git a/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt b/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt index e24325c07ff7c..2c1c69ac42db7 100644 --- a/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-jdk21-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 253 256 4 0.4 2529.6 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 117 121 4 0.9 1166.9 2.2X -Coalesce Num Partitions: 100 Num Hosts: 10 110 111 2 0.9 1097.0 2.3X -Coalesce Num Partitions: 100 Num Hosts: 20 92 96 4 1.1 915.7 2.8X -Coalesce Num Partitions: 100 Num Hosts: 40 89 91 3 1.1 889.1 2.8X -Coalesce Num Partitions: 100 Num Hosts: 80 87 89 2 1.1 869.9 2.9X -Coalesce Num Partitions: 500 Num Hosts: 1 871 880 11 0.1 8714.5 0.3X -Coalesce Num Partitions: 500 Num Hosts: 5 255 260 7 0.4 2552.2 1.0X -Coalesce Num Partitions: 500 Num Hosts: 10 178 179 1 0.6 1780.8 1.4X -Coalesce Num Partitions: 500 Num Hosts: 20 146 148 3 0.7 
1460.5 1.7X -Coalesce Num Partitions: 500 Num Hosts: 40 114 117 3 0.9 1135.5 2.2X -Coalesce Num Partitions: 500 Num Hosts: 80 106 108 2 0.9 1058.1 2.4X -Coalesce Num Partitions: 1000 Num Hosts: 1 1651 1672 31 0.1 16508.4 0.2X -Coalesce Num Partitions: 1000 Num Hosts: 5 420 425 7 0.2 4198.6 0.6X -Coalesce Num Partitions: 1000 Num Hosts: 10 256 256 0 0.4 2558.1 1.0X -Coalesce Num Partitions: 1000 Num Hosts: 20 188 189 1 0.5 1877.5 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 40 137 139 1 0.7 1369.9 1.8X -Coalesce Num Partitions: 1000 Num Hosts: 80 118 124 9 0.8 1182.9 2.1X -Coalesce Num Partitions: 5000 Num Hosts: 1 7631 7716 127 0.0 76309.6 0.0X -Coalesce Num Partitions: 5000 Num Hosts: 5 1854 1867 12 0.1 18541.2 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 10 993 999 5 0.1 9928.0 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 20 546 549 3 0.2 5457.1 0.5X -Coalesce Num Partitions: 5000 Num Hosts: 40 336 337 2 0.3 3360.1 0.8X -Coalesce Num Partitions: 5000 Num Hosts: 80 217 221 4 0.5 2171.0 1.2X -Coalesce Num Partitions: 10000 Num Hosts: 1 14258 14329 65 0.0 142581.8 0.0X -Coalesce Num Partitions: 10000 Num Hosts: 5 3579 3594 14 0.0 35793.0 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 10 1813 1824 14 0.1 18134.3 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 20 965 967 4 0.1 9647.8 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 40 535 540 5 0.2 5348.5 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 80 314 315 1 0.3 3142.6 0.8X +Coalesce Num Partitions: 100 Num Hosts: 1 269 278 10 0.4 2693.1 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 135 140 5 0.7 1345.0 2.0X +Coalesce Num Partitions: 100 Num Hosts: 10 109 115 6 0.9 1091.6 2.5X +Coalesce Num Partitions: 100 Num Hosts: 20 100 101 1 1.0 999.4 2.7X +Coalesce Num Partitions: 100 Num Hosts: 40 96 98 3 1.0 961.5 2.8X +Coalesce Num Partitions: 100 Num Hosts: 80 93 100 9 1.1 933.4 2.9X +Coalesce Num Partitions: 500 Num Hosts: 1 875 902 28 0.1 8754.7 0.3X +Coalesce Num Partitions: 500 Num Hosts: 5 262 265 3 0.4 2619.9 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 179 182 4 0.6 1792.2 1.5X +Coalesce Num Partitions: 500 Num Hosts: 20 136 139 6 0.7 1357.2 2.0X +Coalesce Num Partitions: 500 Num Hosts: 40 115 116 1 0.9 1145.7 2.4X +Coalesce Num Partitions: 500 Num Hosts: 80 105 110 7 1.0 1047.2 2.6X +Coalesce Num Partitions: 1000 Num Hosts: 1 1655 1656 2 0.1 16546.0 0.2X +Coalesce Num Partitions: 1000 Num Hosts: 5 425 428 4 0.2 4251.4 0.6X +Coalesce Num Partitions: 1000 Num Hosts: 10 263 267 3 0.4 2634.4 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 182 188 9 0.5 1822.5 1.5X +Coalesce Num Partitions: 1000 Num Hosts: 40 142 143 1 0.7 1424.5 1.9X +Coalesce Num Partitions: 1000 Num Hosts: 80 123 131 8 0.8 1226.2 2.2X +Coalesce Num Partitions: 5000 Num Hosts: 1 7484 7491 10 0.0 74836.3 0.0X +Coalesce Num Partitions: 5000 Num Hosts: 5 1873 1880 11 0.1 18725.7 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 10 995 1005 11 0.1 9950.1 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 20 557 561 6 0.2 5570.9 0.5X +Coalesce Num Partitions: 5000 Num Hosts: 40 337 341 4 0.3 3369.7 0.8X +Coalesce Num Partitions: 5000 Num Hosts: 80 222 223 1 0.5 2222.0 1.2X +Coalesce Num Partitions: 10000 Num Hosts: 1 14102 14133 45 0.0 141020.7 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 3681 3702 31 0.0 36811.9 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 10 1871 1884 22 0.1 18706.8 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 20 998 1004 5 0.1 9980.9 0.3X +Coalesce Num Partitions: 10000 Num Hosts: 40 570 575 5 0.2 5696.7 0.5X +Coalesce Num Partitions: 
10000 Num Hosts: 80 345 346 2 0.3 3447.7 0.8X diff --git a/core/benchmarks/CoalescedRDDBenchmark-results.txt b/core/benchmarks/CoalescedRDDBenchmark-results.txt index 2f1280b3817df..aba428c1729e7 100644 --- a/core/benchmarks/CoalescedRDDBenchmark-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 135 137 2 0.7 1352.2 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 108 109 0 0.9 1083.3 1.2X -Coalesce Num Partitions: 100 Num Hosts: 10 92 101 9 1.1 923.1 1.5X -Coalesce Num Partitions: 100 Num Hosts: 20 92 93 1 1.1 920.9 1.5X -Coalesce Num Partitions: 100 Num Hosts: 40 90 99 12 1.1 903.6 1.5X -Coalesce Num Partitions: 100 Num Hosts: 80 89 98 9 1.1 888.8 1.5X -Coalesce Num Partitions: 500 Num Hosts: 1 313 319 5 0.3 3127.5 0.4X -Coalesce Num Partitions: 500 Num Hosts: 5 135 137 2 0.7 1346.4 1.0X -Coalesce Num Partitions: 500 Num Hosts: 10 113 120 9 0.9 1130.4 1.2X -Coalesce Num Partitions: 500 Num Hosts: 20 102 109 11 1.0 1016.9 1.3X -Coalesce Num Partitions: 500 Num Hosts: 40 95 97 2 1.1 948.9 1.4X -Coalesce Num Partitions: 500 Num Hosts: 80 94 95 1 1.1 938.5 1.4X -Coalesce Num Partitions: 1000 Num Hosts: 1 528 529 1 0.2 5279.9 0.3X -Coalesce Num Partitions: 1000 Num Hosts: 5 193 195 3 0.5 1925.2 0.7X -Coalesce Num Partitions: 1000 Num Hosts: 10 134 135 0 0.7 1343.9 1.0X -Coalesce Num Partitions: 1000 Num Hosts: 20 112 115 4 0.9 1122.3 1.2X -Coalesce Num Partitions: 1000 Num Hosts: 40 104 105 2 1.0 1039.0 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 80 99 100 2 1.0 986.8 1.4X -Coalesce Num Partitions: 5000 Num Hosts: 1 2332 2376 49 0.0 23322.6 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 5 727 733 8 0.1 7269.1 0.2X -Coalesce Num Partitions: 5000 Num Hosts: 10 408 410 4 0.2 4082.9 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 20 255 259 4 0.4 2548.7 0.5X -Coalesce Num Partitions: 5000 Num Hosts: 40 176 180 4 0.6 1764.4 0.8X -Coalesce Num Partitions: 5000 Num Hosts: 80 135 143 10 0.7 1351.9 1.0X -Coalesce Num Partitions: 10000 Num Hosts: 1 4249 4278 26 0.0 42489.3 0.0X -Coalesce Num Partitions: 10000 Num Hosts: 5 1512 1517 4 0.1 15121.1 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 10 764 769 10 0.1 7636.8 0.2X -Coalesce Num Partitions: 10000 Num Hosts: 20 435 438 2 0.2 4352.9 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 40 268 272 4 0.4 2678.4 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 80 186 190 5 0.5 1860.7 0.7X +Coalesce Num Partitions: 100 Num Hosts: 1 134 143 8 0.7 1343.4 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 96 97 2 1.0 962.0 1.4X +Coalesce Num Partitions: 100 Num Hosts: 10 90 92 3 1.1 898.7 1.5X +Coalesce Num Partitions: 100 Num Hosts: 20 90 91 1 1.1 898.9 1.5X +Coalesce Num Partitions: 100 Num Hosts: 40 96 100 4 1.0 957.0 1.4X +Coalesce Num Partitions: 100 Num Hosts: 80 87 93 9 1.2 866.3 1.6X +Coalesce Num Partitions: 500 Num Hosts: 1 312 314 3 0.3 3115.2 0.4X +Coalesce Num Partitions: 500 Num Hosts: 5 135 136 1 0.7 1352.9 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 110 111 1 0.9 1103.7 1.2X 
+Coalesce Num Partitions: 500 Num Hosts: 20 103 109 9 1.0 1034.3 1.3X +Coalesce Num Partitions: 500 Num Hosts: 40 95 97 2 1.1 948.3 1.4X +Coalesce Num Partitions: 500 Num Hosts: 80 90 93 3 1.1 899.9 1.5X +Coalesce Num Partitions: 1000 Num Hosts: 1 527 535 13 0.2 5267.7 0.3X +Coalesce Num Partitions: 1000 Num Hosts: 5 179 180 1 0.6 1788.4 0.8X +Coalesce Num Partitions: 1000 Num Hosts: 10 132 138 8 0.8 1321.6 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 116 122 5 0.9 1157.1 1.2X +Coalesce Num Partitions: 1000 Num Hosts: 40 99 104 7 1.0 988.1 1.4X +Coalesce Num Partitions: 1000 Num Hosts: 80 95 97 2 1.1 948.9 1.4X +Coalesce Num Partitions: 5000 Num Hosts: 1 2326 2336 10 0.0 23263.2 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 735 743 7 0.1 7351.3 0.2X +Coalesce Num Partitions: 5000 Num Hosts: 10 400 405 6 0.2 4002.9 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 20 263 266 2 0.4 2631.4 0.5X +Coalesce Num Partitions: 5000 Num Hosts: 40 175 180 7 0.6 1746.4 0.8X +Coalesce Num Partitions: 5000 Num Hosts: 80 139 141 1 0.7 1389.1 1.0X +Coalesce Num Partitions: 10000 Num Hosts: 1 4250 4263 21 0.0 42497.5 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 1508 1512 6 0.1 15082.5 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 10 765 770 7 0.1 7645.5 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 20 420 424 4 0.2 4198.5 0.3X +Coalesce Num Partitions: 10000 Num Hosts: 40 277 279 3 0.4 2768.6 0.5X +Coalesce Num Partitions: 10000 Num Hosts: 80 187 189 2 0.5 1868.9 0.7X diff --git a/core/benchmarks/KryoBenchmark-jdk21-results.txt b/core/benchmarks/KryoBenchmark-jdk21-results.txt index 9910ed6e6e162..8488d15602228 100644 --- a/core/benchmarks/KryoBenchmark-jdk21-results.txt +++ b/core/benchmarks/KryoBenchmark-jdk21-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 164 166 1 6.1 164.4 1.0X -basicTypes: Long with unsafe:true 180 181 0 5.5 180.3 0.9X -basicTypes: Float with unsafe:true 185 188 2 5.4 184.7 0.9X -basicTypes: Double with unsafe:true 187 188 1 5.3 187.1 0.9X -Array: Int with unsafe:true 1 1 0 761.6 1.3 125.2X -Array: Long with unsafe:true 2 2 0 481.6 2.1 79.2X -Array: Float with unsafe:true 1 1 0 749.8 1.3 123.3X -Array: Double with unsafe:true 2 2 0 483.7 2.1 79.5X -Map of string->Double with unsafe:true 27 27 1 37.2 26.9 6.1X -basicTypes: Int with unsafe:false 212 214 2 4.7 212.4 0.8X -basicTypes: Long with unsafe:false 232 233 1 4.3 232.1 0.7X -basicTypes: Float with unsafe:false 226 227 1 4.4 226.1 0.7X -basicTypes: Double with unsafe:false 222 223 1 4.5 221.6 0.7X -Array: Int with unsafe:false 13 13 0 79.8 12.5 13.1X -Array: Long with unsafe:false 22 23 1 45.7 21.9 7.5X -Array: Float with unsafe:false 6 6 0 172.2 5.8 28.3X -Array: Double with unsafe:false 15 15 0 65.2 15.3 10.7X -Map of string->Double with unsafe:false 29 31 1 34.7 28.9 5.7X +basicTypes: Int with unsafe:true 173 174 1 5.8 172.9 1.0X +basicTypes: Long with unsafe:true 178 179 1 5.6 177.7 1.0X +basicTypes: Float with unsafe:true 180 182 1 5.6 179.6 1.0X +basicTypes: Double 
with unsafe:true 183 191 15 5.5 182.8 0.9X +Array: Int with unsafe:true 1 1 0 753.9 1.3 130.3X +Array: Long with unsafe:true 2 2 0 486.0 2.1 84.0X +Array: Float with unsafe:true 1 1 0 759.7 1.3 131.3X +Array: Double with unsafe:true 2 2 0 473.8 2.1 81.9X +Map of string->Double with unsafe:true 27 27 1 37.5 26.7 6.5X +basicTypes: Int with unsafe:false 204 205 1 4.9 203.7 0.8X +basicTypes: Long with unsafe:false 229 230 1 4.4 229.1 0.8X +basicTypes: Float with unsafe:false 208 209 1 4.8 208.1 0.8X +basicTypes: Double with unsafe:false 204 205 2 4.9 203.7 0.8X +Array: Int with unsafe:false 13 13 0 79.4 12.6 13.7X +Array: Long with unsafe:false 21 21 0 47.8 20.9 8.3X +Array: Float with unsafe:false 6 6 0 168.3 5.9 29.1X +Array: Double with unsafe:false 11 12 0 87.2 11.5 15.1X +Map of string->Double with unsafe:false 30 31 0 32.9 30.4 5.7X diff --git a/core/benchmarks/KryoBenchmark-results.txt b/core/benchmarks/KryoBenchmark-results.txt index 7391c558b1942..5c35cc6affc1f 100644 --- a/core/benchmarks/KryoBenchmark-results.txt +++ b/core/benchmarks/KryoBenchmark-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 172 173 0 5.8 171.7 1.0X -basicTypes: Long with unsafe:true 186 195 14 5.4 185.8 0.9X -basicTypes: Float with unsafe:true 194 201 11 5.2 193.5 0.9X -basicTypes: Double with unsafe:true 192 194 1 5.2 191.8 0.9X -Array: Int with unsafe:true 1 1 0 725.9 1.4 124.6X -Array: Long with unsafe:true 2 2 0 484.4 2.1 83.2X -Array: Float with unsafe:true 1 1 0 728.4 1.4 125.1X -Array: Double with unsafe:true 2 2 0 483.6 2.1 83.0X -Map of string->Double with unsafe:true 26 26 0 39.2 25.5 6.7X -basicTypes: Int with unsafe:false 206 207 1 4.9 205.7 0.8X -basicTypes: Long with unsafe:false 228 229 1 4.4 227.6 0.8X -basicTypes: Float with unsafe:false 225 226 0 4.4 225.1 0.8X -basicTypes: Double with unsafe:false 217 218 1 4.6 217.2 0.8X -Array: Int with unsafe:false 14 14 0 72.2 13.8 12.4X -Array: Long with unsafe:false 20 21 1 49.5 20.2 8.5X -Array: Float with unsafe:false 6 6 0 169.3 5.9 29.1X -Array: Double with unsafe:false 9 9 0 107.7 9.3 18.5X -Map of string->Double with unsafe:false 27 28 0 36.7 27.3 6.3X +basicTypes: Int with unsafe:true 171 172 1 5.9 170.5 1.0X +basicTypes: Long with unsafe:true 185 187 2 5.4 185.5 0.9X +basicTypes: Float with unsafe:true 190 192 1 5.3 190.0 0.9X +basicTypes: Double with unsafe:true 199 200 1 5.0 198.8 0.9X +Array: Int with unsafe:true 1 1 0 751.1 1.3 128.1X +Array: Long with unsafe:true 2 2 0 483.9 2.1 82.5X +Array: Float with unsafe:true 1 1 0 734.7 1.4 125.3X +Array: Double with unsafe:true 2 2 0 478.2 2.1 81.6X +Map of string->Double with unsafe:true 26 26 0 38.8 25.8 6.6X +basicTypes: Int with unsafe:false 207 209 1 4.8 207.5 0.8X +basicTypes: Long with unsafe:false 239 241 2 4.2 239.1 0.7X +basicTypes: Float with unsafe:false 213 213 1 4.7 212.6 0.8X +basicTypes: Double with unsafe:false 224 226 1 4.5 224.2 0.8X +Array: Int with unsafe:false 14 14 0 73.5 13.6 12.5X +Array: Long with unsafe:false 21 21 0 47.8 
20.9 8.1X +Array: Float with unsafe:false 6 6 0 169.9 5.9 29.0X +Array: Double with unsafe:false 10 10 0 101.4 9.9 17.3X +Map of string->Double with unsafe:false 28 28 0 35.9 27.9 6.1X diff --git a/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt b/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt index 5ca8793bc3772..da82b05fd59e6 100644 --- a/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt +++ b/core/benchmarks/KryoIteratorBenchmark-jdk21-results.txt @@ -2,27 +2,27 @@ Benchmark of kryo asIterator on deserialization stream ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark of kryo asIterator on deserialization stream: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------- -Colletion of int with 1 elements, useIterator: true 6 6 0 1.7 597.3 1.0X -Colletion of int with 10 elements, useIterator: true 13 14 0 0.7 1347.0 0.4X -Colletion of int with 100 elements, useIterator: true 85 86 1 0.1 8454.2 0.1X -Colletion of string with 1 elements, useIterator: true 8 8 1 1.3 754.5 0.8X -Colletion of string with 10 elements, useIterator: true 22 22 0 0.5 2176.6 0.3X -Colletion of string with 100 elements, useIterator: true 161 161 1 0.1 16078.6 0.0X -Colletion of Array[int] with 1 elements, useIterator: true 7 8 0 1.4 731.1 0.8X -Colletion of Array[int] with 10 elements, useIterator: true 20 20 1 0.5 1970.7 0.3X -Colletion of Array[int] with 100 elements, useIterator: true 148 150 1 0.1 14839.8 0.0X -Colletion of int with 1 elements, useIterator: false 6 7 1 1.6 608.2 1.0X -Colletion of int with 10 elements, useIterator: false 13 14 0 0.7 1337.8 0.4X -Colletion of int with 100 elements, useIterator: false 83 84 0 0.1 8349.0 0.1X -Colletion of string with 1 elements, useIterator: false 7 8 0 1.4 725.7 0.8X -Colletion of string with 10 elements, useIterator: false 21 22 0 0.5 2149.3 0.3X -Colletion of string with 100 elements, useIterator: false 160 161 1 0.1 16031.1 0.0X -Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.4 711.9 0.8X -Colletion of Array[int] with 10 elements, useIterator: false 19 19 1 0.5 1891.2 0.3X -Colletion of Array[int] with 100 elements, useIterator: false 141 142 1 0.1 14076.4 0.0X +Colletion of int with 1 elements, useIterator: true 6 6 0 1.6 618.2 1.0X +Colletion of int with 10 elements, useIterator: true 14 15 0 0.7 1444.0 0.4X +Colletion of int with 100 elements, useIterator: true 92 92 1 0.1 9168.2 0.1X +Colletion of string with 1 elements, useIterator: true 8 8 0 1.3 777.9 0.8X +Colletion of string with 10 elements, useIterator: true 22 23 0 0.5 2221.0 0.3X +Colletion of string with 100 elements, useIterator: true 166 167 1 0.1 16617.2 0.0X +Colletion of Array[int] with 1 elements, useIterator: true 7 8 0 1.4 730.2 0.8X +Colletion of Array[int] with 10 elements, useIterator: true 20 20 0 0.5 1967.8 0.3X +Colletion of Array[int] with 100 elements, useIterator: true 145 146 1 0.1 14469.9 0.0X +Colletion of int with 1 elements, useIterator: false 7 7 0 1.5 653.6 0.9X +Colletion of int with 10 elements, useIterator: false 15 16 0 0.7 1528.7 0.4X +Colletion of int with 100 elements, useIterator: false 98 98 1 0.1 9755.3 0.1X +Colletion of string with 1 elements, 
useIterator: false 7 7 0 1.4 718.5 0.9X +Colletion of string with 10 elements, useIterator: false 21 22 2 0.5 2093.0 0.3X +Colletion of string with 100 elements, useIterator: false 157 157 1 0.1 15666.5 0.0X +Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.4 698.1 0.9X +Colletion of Array[int] with 10 elements, useIterator: false 18 19 0 0.5 1831.7 0.3X +Colletion of Array[int] with 100 elements, useIterator: false 134 135 0 0.1 13430.8 0.0X diff --git a/core/benchmarks/KryoIteratorBenchmark-results.txt b/core/benchmarks/KryoIteratorBenchmark-results.txt index fb6073c50f767..e2cbfb871e7ba 100644 --- a/core/benchmarks/KryoIteratorBenchmark-results.txt +++ b/core/benchmarks/KryoIteratorBenchmark-results.txt @@ -2,27 +2,27 @@ Benchmark of kryo asIterator on deserialization stream ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark of kryo asIterator on deserialization stream: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------- -Colletion of int with 1 elements, useIterator: true 6 6 0 1.6 620.2 1.0X -Colletion of int with 10 elements, useIterator: true 13 13 0 0.8 1312.1 0.5X -Colletion of int with 100 elements, useIterator: true 78 79 0 0.1 7833.8 0.1X -Colletion of string with 1 elements, useIterator: true 7 8 0 1.3 746.6 0.8X -Colletion of string with 10 elements, useIterator: true 22 23 0 0.4 2235.0 0.3X -Colletion of string with 100 elements, useIterator: true 166 168 3 0.1 16582.0 0.0X -Colletion of Array[int] with 1 elements, useIterator: true 7 8 0 1.4 723.4 0.9X -Colletion of Array[int] with 10 elements, useIterator: true 19 19 0 0.5 1882.2 0.3X -Colletion of Array[int] with 100 elements, useIterator: true 143 144 1 0.1 14251.0 0.0X -Colletion of int with 1 elements, useIterator: false 6 6 0 1.6 611.0 1.0X -Colletion of int with 10 elements, useIterator: false 13 14 0 0.7 1336.8 0.5X -Colletion of int with 100 elements, useIterator: false 83 84 1 0.1 8336.2 0.1X -Colletion of string with 1 elements, useIterator: false 7 8 0 1.4 730.6 0.8X -Colletion of string with 10 elements, useIterator: false 21 21 0 0.5 2094.0 0.3X -Colletion of string with 100 elements, useIterator: false 173 179 18 0.1 17327.1 0.0X -Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.4 691.7 0.9X -Colletion of Array[int] with 10 elements, useIterator: false 19 19 0 0.5 1858.8 0.3X -Colletion of Array[int] with 100 elements, useIterator: false 138 139 1 0.1 13756.6 0.0X +Colletion of int with 1 elements, useIterator: true 6 6 1 1.6 611.4 1.0X +Colletion of int with 10 elements, useIterator: true 14 15 0 0.7 1443.2 0.4X +Colletion of int with 100 elements, useIterator: true 93 94 0 0.1 9331.3 0.1X +Colletion of string with 1 elements, useIterator: true 8 8 0 1.3 753.6 0.8X +Colletion of string with 10 elements, useIterator: true 22 22 0 0.5 2150.8 0.3X +Colletion of string with 100 elements, useIterator: true 163 164 1 0.1 16325.7 0.0X +Colletion of Array[int] with 1 elements, useIterator: true 7 8 0 1.3 741.1 0.8X +Colletion of Array[int] with 10 elements, useIterator: true 20 20 0 0.5 1989.2 0.3X +Colletion of Array[int] with 100 elements, useIterator: true 147 147 1 0.1 14659.2 0.0X +Colletion of int with 
1 elements, useIterator: false 6 6 0 1.7 597.3 1.0X +Colletion of int with 10 elements, useIterator: false 13 14 0 0.8 1323.4 0.5X +Colletion of int with 100 elements, useIterator: false 83 84 3 0.1 8272.9 0.1X +Colletion of string with 1 elements, useIterator: false 7 7 0 1.4 714.5 0.9X +Colletion of string with 10 elements, useIterator: false 21 22 1 0.5 2146.0 0.3X +Colletion of string with 100 elements, useIterator: false 157 157 0 0.1 15690.5 0.0X +Colletion of Array[int] with 1 elements, useIterator: false 7 7 0 1.5 668.5 0.9X +Colletion of Array[int] with 10 elements, useIterator: false 18 18 0 0.6 1802.1 0.3X +Colletion of Array[int] with 100 elements, useIterator: false 134 135 1 0.1 13393.9 0.0X diff --git a/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt b/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt index 9434b9c3484aa..bb234fd6a13b3 100644 --- a/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 3927 5573 1781 0.0 7853845.5 1.0X -KryoPool:false 6170 7745 1218 0.0 12340812.7 0.6X +KryoPool:true 3937 5805 1793 0.0 7874251.0 1.0X +KryoPool:false 6392 8372 1513 0.0 12783860.9 0.6X diff --git a/core/benchmarks/KryoSerializerBenchmark-results.txt b/core/benchmarks/KryoSerializerBenchmark-results.txt index a9f20882f7929..79b87a83b34fa 100644 --- a/core/benchmarks/KryoSerializerBenchmark-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 3881 5584 1898 0.0 7761840.3 1.0X -KryoPool:false 5852 7519 1549 0.0 11704736.8 0.7X +KryoPool:true 4180 6016 1956 0.0 8359077.2 1.0X +KryoPool:false 6204 8232 1447 0.0 12408361.0 0.7X diff --git a/core/benchmarks/LZFBenchmark-jdk21-results.txt b/core/benchmarks/LZFBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..e1566f201a1f6 --- /dev/null +++ b/core/benchmarks/LZFBenchmark-jdk21-results.txt @@ -0,0 +1,19 @@ +================================================================================================ +Benchmark LZFCompressionCodec +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Compress small objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+-------------------------------------------------------------------------------------------------------------------------------- +Compression 256000000 int values in parallel 598 600 2 428.2 2.3 1.0X +Compression 256000000 int values single-threaded 568 570 2 451.0 2.2 1.1X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Compress large objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------- +Compression 1024 array values in 1 threads 39 45 5 0.0 38475.4 1.0X +Compression 1024 array values single-threaded 32 33 1 0.0 31154.5 1.2X + + diff --git a/core/benchmarks/LZFBenchmark-results.txt b/core/benchmarks/LZFBenchmark-results.txt new file mode 100644 index 0000000000000..facc67f9cf4a8 --- /dev/null +++ b/core/benchmarks/LZFBenchmark-results.txt @@ -0,0 +1,19 @@ +================================================================================================ +Benchmark LZFCompressionCodec +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Compress small objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------------- +Compression 256000000 int values in parallel 602 612 6 425.1 2.4 1.0X +Compression 256000000 int values single-threaded 610 617 5 419.8 2.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Compress large objects: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------- +Compression 1024 array values in 1 threads 35 43 6 0.0 33806.8 1.0X +Compression 1024 array values single-threaded 32 32 0 0.0 30990.4 1.1X + + diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt index 502d10c1c58ca..a15442496b244 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt @@ -2,12 +2,12 @@ MapStatuses Convert Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Num Maps: 50000 Fetch partitions:500 708 715 8 0.0 707870326.0 1.0X -Num Maps: 50000 Fetch partitions:1000 1610 1623 12 0.0 1610312472.0 0.4X -Num Maps: 50000 Fetch partitions:1500 2443 2461 23 0.0 2442675908.0 0.3X +Num Maps: 50000 Fetch partitions:500 674 685 12 0.0 673772738.0 1.0X +Num Maps: 50000 Fetch partitions:1000 1579 1590 12 0.0 1579383970.0 0.4X +Num Maps: 50000 Fetch partitions:1500 2435 2472 37 0.0 2434530380.0 0.3X diff --git a/core/benchmarks/MapStatusesConvertBenchmark-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-results.txt index 
9fe4175bb5d9e..b9f36af4a6531 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-results.txt @@ -2,12 +2,12 @@ MapStatuses Convert Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Num Maps: 50000 Fetch partitions:500 775 778 5 0.0 774980756.0 1.0X -Num Maps: 50000 Fetch partitions:1000 1765 1765 1 0.0 1765011999.0 0.4X -Num Maps: 50000 Fetch partitions:1500 2671 2682 15 0.0 2671372452.0 0.3X +Num Maps: 50000 Fetch partitions:500 703 716 11 0.0 703103575.0 1.0X +Num Maps: 50000 Fetch partitions:1000 1707 1723 14 0.0 1707060398.0 0.4X +Num Maps: 50000 Fetch partitions:1500 2626 2638 14 0.0 2625981097.0 0.3X diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt index 8c4230316d00b..f4846ce8b0fb3 100644 --- a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk21-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Serialization 83 86 5 2.4 417.0 1.0X -Deserialization 139 147 13 1.4 694.0 0.6X +Serialization 84 88 7 2.4 419.8 1.0X +Deserialization 140 148 8 1.4 700.9 0.6X -Compressed Serialized MapStatus sizes: 428.0 B +Compressed Serialized MapStatus sizes: 427.0 B Compressed Serialized Broadcast MapStatus sizes: 2.5 MiB -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 77 79 3 2.6 384.9 1.0X -Deserialization 138 146 13 1.4 690.0 0.6X +Serialization 80 81 2 2.5 400.1 1.0X +Deserialization 139 146 6 1.4 694.8 0.6X Compressed Serialized MapStatus sizes: 2.5 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 157 162 8 1.3 782.9 1.0X -Deserialization 155 160 13 1.3 774.7 1.0X +Serialization 153 158 11 1.3 767.3 1.0X +Deserialization 159 165 5 1.3 794.1 1.0X Compressed Serialized MapStatus sizes: 442.0 B Compressed Serialized Broadcast MapStatus sizes: 13.6 MiB -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 143 144 1 1.4 714.5 1.0X -Deserialization 154 158 9 1.3 772.0 0.9X +Serialization 140 143 4 1.4 701.0 1.0X +Deserialization 154 159 7 1.3 771.3 0.9X Compressed Serialized MapStatus sizes: 13.6 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 713 736 38 0.3 3564.0 1.0X -Deserialization 314 337 16 0.6 1571.5 2.3X +Serialization 707 726 21 0.3 3533.4 1.0X +Deserialization 316 342 17 0.6 1582.2 2.2X -Compressed Serialized MapStatus sizes: 572.0 B +Compressed Serialized MapStatus sizes: 570.0 B Compressed Serialized Broadcast MapStatus sizes: 122.3 MiB -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 570 575 6 0.4 2850.0 1.0X -Deserialization 322 345 18 0.6 1611.9 1.8X +Serialization 566 569 5 0.4 2828.3 1.0X +Deserialization 324 343 19 0.6 1617.8 1.7X Compressed Serialized MapStatus sizes: 122.3 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt index 3f55d4c405a35..3b0b9b756d32b 100644 --- a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Serialization 85 91 5 2.3 427.2 1.0X -Deserialization 141 142 2 1.4 703.2 0.6X +Serialization 89 95 4 2.2 445.9 1.0X +Deserialization 138 145 6 1.4 689.8 0.6X -Compressed Serialized MapStatus sizes: 428.0 B +Compressed Serialized MapStatus sizes: 427.0 B Compressed Serialized Broadcast MapStatus sizes: 2.5 MiB -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 77 78 1 2.6 386.1 1.0X -Deserialization 140 145 9 1.4 698.5 0.6X +Serialization 80 83 2 2.5 
399.4 1.0X +Deserialization 134 145 10 1.5 671.0 0.6X Compressed Serialized MapStatus sizes: 2.5 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 156 161 5 1.3 777.6 1.0X -Deserialization 157 165 13 1.3 786.6 1.0X +Serialization 159 163 7 1.3 793.4 1.0X +Deserialization 155 163 8 1.3 776.8 1.0X Compressed Serialized MapStatus sizes: 442.0 B Compressed Serialized Broadcast MapStatus sizes: 13.6 MiB -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 145 147 3 1.4 725.2 1.0X -Deserialization 156 160 5 1.3 779.4 0.9X +Serialization 144 147 1 1.4 718.8 1.0X +Deserialization 154 159 4 1.3 770.4 0.9X Compressed Serialized MapStatus sizes: 13.6 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 721 746 27 0.3 3603.9 1.0X -Deserialization 330 348 15 0.6 1651.7 2.2X +Serialization 710 713 4 0.3 3549.7 1.0X +Deserialization 346 355 7 0.6 1730.4 2.1X -Compressed Serialized MapStatus sizes: 571.0 B +Compressed Serialized MapStatus sizes: 569.0 B Compressed Serialized Broadcast MapStatus sizes: 122.3 MiB -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 575 581 8 0.3 2875.6 1.0X -Deserialization 326 341 10 0.6 1630.7 1.8X +Serialization 557 564 5 0.4 2783.1 1.0X +Deserialization 325 337 10 0.6 1626.5 1.7X Compressed Serialized MapStatus sizes: 122.3 MiB Compressed Serialized Broadcast MapStatus sizes: 0.0 B diff --git a/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt b/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt index 2a6bd778fc8a2..8da4b4953cad0 100644 --- a/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt +++ b/core/benchmarks/PersistenceEngineBenchmark-jdk21-results.txt @@ -2,24 +2,17 @@ PersistenceEngineBenchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 1000 Workers: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -ZooKeeperPersistenceEngine with JavaSerializer 5036 5232 229 0.0 5035730.1 1.0X -ZooKeeperPersistenceEngine with KryoSerializer 4038 4053 16 0.0 4038447.8 1.2X -FileSystemPersistenceEngine with JavaSerializer 2902 2906 5 0.0 2902453.3 1.7X -FileSystemPersistenceEngine with JavaSerializer (lz4) 816 829 19 0.0 816173.1 6.2X -FileSystemPersistenceEngine with JavaSerializer (lzf) 755 780 33 0.0 755209.0 6.7X -FileSystemPersistenceEngine with JavaSerializer (snappy) 814 832 16 0.0 813672.5 6.2X -FileSystemPersistenceEngine with JavaSerializer (zstd) 987 1014 45 0.0 986834.7 5.1X -FileSystemPersistenceEngine with KryoSerializer 687 698 14 0.0 687313.5 7.3X -FileSystemPersistenceEngine with KryoSerializer (lz4) 590 599 15 0.0 589867.9 8.5X -FileSystemPersistenceEngine with KryoSerializer (lzf) 915 922 9 0.0 915432.2 5.5X -FileSystemPersistenceEngine with KryoSerializer (snappy) 768 795 37 0.0 768494.4 6.6X -FileSystemPersistenceEngine with KryoSerializer (zstd) 898 950 45 0.0 898118.6 5.6X -RocksDBPersistenceEngine with JavaSerializer 299 299 0 0.0 298800.0 16.9X -RocksDBPersistenceEngine with KryoSerializer 112 113 1 0.0 111779.6 45.1X -BlackHolePersistenceEngine 0 0 0 5.5 180.3 27924.2X +ZooKeeperPersistenceEngine with JavaSerializer 6876 7518 567 0.0 6875910.8 1.0X +FileSystemPersistenceEngine with JavaSerializer 2973 3015 55 0.0 2973365.8 2.3X +FileSystemPersistenceEngine with JavaSerializer (lz4) 813 836 26 0.0 813019.4 8.5X +FileSystemPersistenceEngine with JavaSerializer (lzf) 756 772 21 0.0 755574.7 9.1X +FileSystemPersistenceEngine with JavaSerializer (snappy) 755 788 46 0.0 754897.8 9.1X +FileSystemPersistenceEngine with JavaSerializer (zstd) 930 964 35 0.0 930157.5 7.4X +RocksDBPersistenceEngine with JavaSerializer 302 305 3 0.0 302099.4 22.8X +BlackHolePersistenceEngine 0 0 0 5.9 168.2 40871.1X diff --git a/core/benchmarks/PersistenceEngineBenchmark-results.txt b/core/benchmarks/PersistenceEngineBenchmark-results.txt index da1838608de1c..f927e3c57aa33 100644 --- a/core/benchmarks/PersistenceEngineBenchmark-results.txt +++ b/core/benchmarks/PersistenceEngineBenchmark-results.txt @@ -2,24 +2,17 @@ PersistenceEngineBenchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 1000 Workers: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -ZooKeeperPersistenceEngine with JavaSerializer 5192 5309 116 0.0 5192160.2 1.0X -ZooKeeperPersistenceEngine with KryoSerializer 4056 4059 5 0.0 4055626.8 1.3X -FileSystemPersistenceEngine with JavaSerializer 2926 2934 8 0.0 2926383.4 1.8X -FileSystemPersistenceEngine with JavaSerializer (lz4) 820 827 11 0.0 820359.8 6.3X -FileSystemPersistenceEngine with JavaSerializer (lzf) 772 781 9 0.0 772349.1 6.7X -FileSystemPersistenceEngine with JavaSerializer (snappy) 802 812 10 0.0 801815.8 6.5X -FileSystemPersistenceEngine with JavaSerializer (zstd) 972 994 31 0.0 972042.3 5.3X -FileSystemPersistenceEngine with KryoSerializer 708 726 15 0.0 707927.8 7.3X -FileSystemPersistenceEngine with KryoSerializer (lz4) 584 596 11 0.0 
583999.8 8.9X -FileSystemPersistenceEngine with KryoSerializer (lzf) 880 896 14 0.0 880189.2 5.9X -FileSystemPersistenceEngine with KryoSerializer (snappy) 772 821 46 0.0 772130.1 6.7X -FileSystemPersistenceEngine with KryoSerializer (zstd) 906 928 29 0.0 905578.7 5.7X -RocksDBPersistenceEngine with JavaSerializer 302 302 0 0.0 301664.5 17.2X -RocksDBPersistenceEngine with KryoSerializer 109 111 2 0.0 108979.5 47.6X -BlackHolePersistenceEngine 0 0 0 6.3 158.3 32800.5X +ZooKeeperPersistenceEngine with JavaSerializer 6080 6179 119 0.0 6079694.4 1.0X +FileSystemPersistenceEngine with JavaSerializer 3011 3060 42 0.0 3011244.1 2.0X +FileSystemPersistenceEngine with JavaSerializer (lz4) 799 827 28 0.0 799357.3 7.6X +FileSystemPersistenceEngine with JavaSerializer (lzf) 800 839 35 0.0 800038.9 7.6X +FileSystemPersistenceEngine with JavaSerializer (snappy) 786 797 11 0.0 785847.0 7.7X +FileSystemPersistenceEngine with JavaSerializer (zstd) 1025 1028 3 0.0 1024806.3 5.9X +RocksDBPersistenceEngine with JavaSerializer 309 311 3 0.0 308522.6 19.7X +BlackHolePersistenceEngine 0 0 0 5.8 173.5 35032.8X diff --git a/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt b/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt index 24d94cd0ca462..06701357609a5 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-jdk21-results.txt @@ -2,39 +2,39 @@ Properties Cloning ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.3 3226.0 1.0X -Utils.cloneProperties 0 0 0 34.5 29.0 111.2X +SerializationUtils.clone 0 0 0 0.3 3356.0 1.0X +Utils.cloneProperties 0 0 0 34.5 29.0 115.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 142517.0 1.0X -Utils.cloneProperties 0 0 0 0.4 2705.0 52.7X +SerializationUtils.clone 0 0 0 0.0 160560.0 1.0X +Utils.cloneProperties 0 0 0 0.3 3085.0 52.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 243556.0 1.0X -Utils.cloneProperties 0 0 0 0.4 2615.0 93.1X +SerializationUtils.clone 0 0 0 0.0 253143.0 1.0X +Utils.cloneProperties 0 0 0 0.3 3696.0 68.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 0 0.0 860141.0 1.0X -Utils.cloneProperties 0 0 0 0.1 15128.0 56.9X +SerializationUtils.clone 1 1 0 0.0 907866.0 1.0X +Utils.cloneProperties 0 0 0 0.0 20328.0 44.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 2 2 0 0.0 1604435.0 1.0X -Utils.cloneProperties 0 0 0 0.0 30647.0 52.4X +SerializationUtils.clone 2 2 0 0.0 1699706.0 1.0X +Utils.cloneProperties 0 0 0 0.0 40867.0 41.6X diff --git a/core/benchmarks/PropertiesCloneBenchmark-results.txt b/core/benchmarks/PropertiesCloneBenchmark-results.txt index b4a712748306d..13c241f53d182 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-results.txt @@ -2,39 +2,39 @@ Properties Cloning ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.3 3246.0 1.0X -Utils.cloneProperties 0 0 0 34.5 29.0 111.9X +SerializationUtils.clone 0 0 0 0.3 3617.0 1.0X +Utils.cloneProperties 0 0 0 34.5 29.0 124.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 156702.0 1.0X -Utils.cloneProperties 0 0 0 0.4 2694.0 58.2X +SerializationUtils.clone 0 0 0 0.0 148347.0 1.0X +Utils.cloneProperties 0 0 0 0.4 2815.0 52.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 268443.0 1.0X -Utils.cloneProperties 0 0 0 0.3 3637.0 73.8X +SerializationUtils.clone 0 0 0 0.0 280142.0 1.0X +Utils.cloneProperties 0 0 0 0.3 3686.0 76.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 0 0.0 890198.0 1.0X -Utils.cloneProperties 0 0 0 0.1 19807.0 44.9X +SerializationUtils.clone 1 1 0 0.0 941434.0 1.0X +Utils.cloneProperties 0 0 0 0.0 20278.0 46.4X 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 2 2 0 0.0 1638110.0 1.0X -Utils.cloneProperties 0 0 0 0.0 40827.0 40.1X +SerializationUtils.clone 2 2 0 0.0 1737205.0 1.0X +Utils.cloneProperties 0 0 0 0.0 40696.0 42.7X diff --git a/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt b/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt index 609f39000d8ae..6f1f333363e96 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-jdk21-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 451 451 0 222.0 4.5 1.0X +java.util.Random 451 451 0 221.9 4.5 1.0X XORShiftRandom 185 185 0 539.4 1.9 2.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 897 897 0 111.5 9.0 1.0X -XORShiftRandom 371 371 0 269.7 3.7 2.4X +java.util.Random 894 899 7 111.8 8.9 1.0X +XORShiftRandom 371 372 2 269.4 3.7 2.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 902 902 0 110.9 9.0 1.0X -XORShiftRandom 371 372 1 269.5 3.7 2.4X +java.util.Random 900 900 0 111.1 9.0 1.0X +XORShiftRandom 371 371 0 269.6 3.7 2.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 3389 3391 3 29.5 33.9 1.0X -XORShiftRandom 2501 2504 2 40.0 25.0 1.4X +java.util.Random 3373 3374 2 29.7 33.7 1.0X +XORShiftRandom 2454 2460 11 40.8 24.5 1.4X ================================================================================================ hash seed ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 1 1 0 12974.4 0.1 1.0X +XORShiftRandom.hashSeed 1 1 0 12974.9 0.1 1.0X diff --git a/core/benchmarks/XORShiftRandomBenchmark-results.txt b/core/benchmarks/XORShiftRandomBenchmark-results.txt index 4c5dc35208ae6..9701c0ca237bf 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 442 442 0 226.5 4.4 1.0X -XORShiftRandom 185 185 0 539.5 1.9 2.4X +java.util.Random 442 442 0 226.4 4.4 1.0X +XORShiftRandom 185 185 0 539.4 1.9 2.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 909 910 1 110.0 9.1 1.0X -XORShiftRandom 371 371 0 269.7 3.7 2.5X +java.util.Random 913 913 0 109.6 9.1 1.0X +XORShiftRandom 371 372 1 269.6 3.7 2.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 905 905 0 110.5 9.0 1.0X -XORShiftRandom 371 371 1 269.7 3.7 2.4X +java.util.Random 906 906 0 110.4 9.1 1.0X +XORShiftRandom 371 371 1 269.6 3.7 2.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 4069 4071 2 24.6 40.7 1.0X -XORShiftRandom 2983 2991 8 33.5 29.8 1.4X +java.util.Random 4170 4171 1 24.0 41.7 1.0X +XORShiftRandom 2993 2996 3 33.4 29.9 1.4X ================================================================================================ hash seed ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 1 1 0 12961.6 0.1 1.0X +XORShiftRandom.hashSeed 1 1 0 12975.0 0.1 1.0X diff --git a/core/benchmarks/ZStandardBenchmark-jdk21-results.txt b/core/benchmarks/ZStandardBenchmark-jdk21-results.txt index 
de19d9149a0bd..4c986ca0e4e0a 100644 --- a/core/benchmarks/ZStandardBenchmark-jdk21-results.txt +++ b/core/benchmarks/ZStandardBenchmark-jdk21-results.txt @@ -2,48 +2,48 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 670 901 260 0.0 66991.5 1.0X -Compression 10000 times at level 2 without buffer pool 888 894 10 0.0 88794.6 0.8X -Compression 10000 times at level 3 without buffer pool 998 1001 3 0.0 99815.9 0.7X -Compression 10000 times at level 1 with buffer pool 941 941 1 0.0 94077.1 0.7X -Compression 10000 times at level 2 with buffer pool 977 978 1 0.0 97697.2 0.7X -Compression 10000 times at level 3 with buffer pool 1096 1096 0 0.0 109596.6 0.6X +Compression 10000 times at level 1 without buffer pool 649 748 156 0.0 64921.9 1.0X +Compression 10000 times at level 2 without buffer pool 689 689 0 0.0 68927.1 0.9X +Compression 10000 times at level 3 without buffer pool 782 782 0 0.0 78180.6 0.8X +Compression 10000 times at level 1 with buffer pool 580 582 2 0.0 57976.0 1.1X +Compression 10000 times at level 2 with buffer pool 614 618 4 0.0 61395.3 1.1X +Compression 10000 times at level 3 with buffer pool 725 734 11 0.0 72535.5 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 824 825 0 0.0 82440.7 1.0X -Decompression 10000 times from level 2 without buffer pool 826 827 1 0.0 82582.0 1.0X -Decompression 10000 times from level 3 without buffer pool 828 829 0 0.0 82846.0 1.0X -Decompression 10000 times from level 1 with buffer pool 751 751 0 0.0 75054.5 1.1X -Decompression 10000 times from level 2 with buffer pool 752 752 0 0.0 75223.2 1.1X -Decompression 10000 times from level 3 with buffer pool 751 752 0 0.0 75135.7 1.1X +Decompression 10000 times from level 1 without buffer pool 831 832 1 0.0 83114.9 1.0X +Decompression 10000 times from level 2 without buffer pool 834 835 1 0.0 83372.7 1.0X +Decompression 10000 times from level 3 without buffer pool 831 832 1 0.0 83092.3 1.0X +Decompression 10000 times from level 1 with buffer pool 759 760 1 0.0 75870.2 1.1X +Decompression 10000 times from level 2 with buffer pool 759 760 1 0.0 75877.3 1.1X +Decompression 10000 times from level 3 with buffer pool 759 759 0 0.0 75874.5 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 48 
50 1 0.0 376677.0 1.0X -Parallel Compression with 1 workers 42 43 2 0.0 329346.1 1.1X -Parallel Compression with 2 workers 39 41 2 0.0 304580.2 1.2X -Parallel Compression with 4 workers 38 39 1 0.0 297897.8 1.3X -Parallel Compression with 8 workers 41 42 1 0.0 317437.0 1.2X -Parallel Compression with 16 workers 45 47 1 0.0 351974.0 1.1X +Parallel Compression with 0 workers 48 50 1 0.0 376632.9 1.0X +Parallel Compression with 1 workers 35 37 2 0.0 272066.6 1.4X +Parallel Compression with 2 workers 34 38 2 0.0 263055.3 1.4X +Parallel Compression with 4 workers 37 39 2 0.0 286835.7 1.3X +Parallel Compression with 8 workers 38 40 1 0.0 299961.3 1.3X +Parallel Compression with 16 workers 43 45 1 0.0 335272.5 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 9: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 160 163 6 0.0 1248450.7 1.0X -Parallel Compression with 1 workers 198 198 1 0.0 1543157.9 0.8X -Parallel Compression with 2 workers 116 127 19 0.0 908815.6 1.4X -Parallel Compression with 4 workers 111 114 2 0.0 863559.0 1.4X -Parallel Compression with 8 workers 114 119 2 0.0 892995.2 1.4X -Parallel Compression with 16 workers 116 119 2 0.0 906657.5 1.4X +Parallel Compression with 0 workers 157 158 1 0.0 1224138.2 1.0X +Parallel Compression with 1 workers 187 188 1 0.0 1463264.4 0.8X +Parallel Compression with 2 workers 111 115 6 0.0 863722.6 1.4X +Parallel Compression with 4 workers 105 109 2 0.0 822422.6 1.5X +Parallel Compression with 8 workers 110 114 2 0.0 862852.1 1.4X +Parallel Compression with 16 workers 111 115 2 0.0 870311.3 1.4X diff --git a/core/benchmarks/ZStandardBenchmark-results.txt b/core/benchmarks/ZStandardBenchmark-results.txt index cbe95071f1806..5569f27bb0169 100644 --- a/core/benchmarks/ZStandardBenchmark-results.txt +++ b/core/benchmarks/ZStandardBenchmark-results.txt @@ -2,48 +2,48 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 674 680 8 0.0 67377.6 1.0X -Compression 10000 times at level 2 without buffer pool 712 713 1 0.0 71200.8 0.9X -Compression 10000 times at level 3 without buffer pool 822 828 5 0.0 82220.5 0.8X -Compression 10000 times at level 1 with buffer pool 591 592 1 0.0 59114.1 1.1X -Compression 10000 times at level 2 with buffer pool 627 628 1 0.0 62672.1 1.1X -Compression 10000 times at level 3 with buffer pool 748 750 2 0.0 74829.4 0.9X +Compression 10000 times at level 1 without buffer pool 656 660 3 0.0 65632.5 1.0X +Compression 10000 times at level 2 without buffer pool 695 696 1 0.0 69509.7 0.9X +Compression 10000 times at level 3 without buffer pool 803 807 7 0.0 80258.4 0.8X +Compression 10000 times at level 1 with buffer pool 584 586 2 0.0 58381.4 1.1X +Compression 10000 times at level 
2 with buffer pool 615 616 1 0.0 61463.0 1.1X +Compression 10000 times at level 3 with buffer pool 743 743 0 0.0 74310.9 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 608 610 2 0.0 60825.6 1.0X -Decompression 10000 times from level 2 without buffer pool 607 608 1 0.0 60673.5 1.0X -Decompression 10000 times from level 3 without buffer pool 607 607 0 0.0 60683.0 1.0X -Decompression 10000 times from level 1 with buffer pool 564 565 1 0.0 56405.9 1.1X -Decompression 10000 times from level 2 with buffer pool 563 564 1 0.0 56319.2 1.1X -Decompression 10000 times from level 3 with buffer pool 564 565 0 0.0 56394.2 1.1X +Decompression 10000 times from level 1 without buffer pool 620 621 1 0.0 61972.9 1.0X +Decompression 10000 times from level 2 without buffer pool 622 622 1 0.0 62168.8 1.0X +Decompression 10000 times from level 3 without buffer pool 621 622 1 0.0 62130.0 1.0X +Decompression 10000 times from level 1 with buffer pool 549 550 0 0.0 54939.0 1.1X +Decompression 10000 times from level 2 with buffer pool 550 550 0 0.0 54963.5 1.1X +Decompression 10000 times from level 3 with buffer pool 549 550 1 0.0 54927.7 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 51 52 1 0.0 395223.4 1.0X -Parallel Compression with 1 workers 43 46 5 0.0 338033.5 1.2X -Parallel Compression with 2 workers 42 44 1 0.0 325731.2 1.2X -Parallel Compression with 4 workers 42 44 1 0.0 325162.3 1.2X -Parallel Compression with 8 workers 44 46 1 0.0 342514.8 1.2X -Parallel Compression with 16 workers 48 50 1 0.0 375817.4 1.1X +Parallel Compression with 0 workers 47 48 1 0.0 365666.1 1.0X +Parallel Compression with 1 workers 34 36 3 0.0 268562.3 1.4X +Parallel Compression with 2 workers 32 35 2 0.0 251265.1 1.5X +Parallel Compression with 4 workers 35 38 1 0.0 273574.1 1.3X +Parallel Compression with 8 workers 37 40 1 0.0 288217.8 1.3X +Parallel Compression with 16 workers 42 44 1 0.0 330318.7 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parallel Compression at level 9: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parallel Compression with 0 workers 163 164 1 0.0 1270581.2 1.0X -Parallel Compression with 1 workers 198 200 2 0.0 1548312.0 0.8X -Parallel Compression with 2 workers 117 122 5 0.0 914281.7 1.4X -Parallel Compression with 4 workers 112 115 2 0.0 872458.1 1.5X -Parallel Compression with 8 workers 116 120 2 0.0 909773.7 1.4X -Parallel Compression with 16 workers 116 122 8 0.0 908698.0 1.4X +Parallel Compression with 0 workers 155 157 2 0.0 1214057.2 1.0X 
+Parallel Compression with 1 workers 192 193 2 0.0 1499524.2 0.8X +Parallel Compression with 2 workers 112 119 9 0.0 871848.8 1.4X +Parallel Compression with 4 workers 106 109 2 0.0 830699.8 1.5X +Parallel Compression with 8 workers 111 114 2 0.0 870700.3 1.4X +Parallel Compression with 16 workers 112 114 2 0.0 873315.6 1.4X diff --git a/core/pom.xml b/core/pom.xml index 6468f500db046..adb1b3034b427 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -399,7 +399,7 @@ net.razorvine pickle - 1.3 + 1.5 net.sf.py4j @@ -547,6 +547,8 @@ org.eclipse.jetty:jetty-server com.google.guava:guava com.google.protobuf:* + diff --git a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java index 441587cf7350e..4e251a1c2901b 100644 --- a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java +++ b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java @@ -1,9 +1,12 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java b/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java index 33dfa44229064..5e9f1b78273a5 100644 --- a/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java +++ b/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java @@ -1,9 +1,12 @@ /* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -13,13 +16,6 @@ */ package org.apache.spark.io; -import com.google.common.base.Preconditions; -import com.google.common.base.Throwables; -import org.apache.spark.util.ThreadUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.concurrent.GuardedBy; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; @@ -30,6 +26,16 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; +import javax.annotation.concurrent.GuardedBy; + +import com.google.common.base.Preconditions; +import com.google.common.base.Throwables; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; +import org.apache.spark.util.ThreadUtils; /** * {@link InputStream} implementation which asynchronously reads ahead from the underlying input @@ -42,7 +48,8 @@ */ public class ReadAheadInputStream extends InputStream { - private static final Logger logger = LoggerFactory.getLogger(ReadAheadInputStream.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ReadAheadInputStream.class); private ReentrantLock stateChangeLock = new ReentrantLock(); @@ -205,7 +212,7 @@ private void closeUnderlyingInputStreamIfNecessary() { try { underlyingInputStream.close(); } catch (IOException e) { - logger.warn(e.getMessage(), e); + logger.warn("{}", e, MDC.of(LogKeys.ERROR$.MODULE$, e.getMessage())); } } } diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index 83352611770fd..fe798e40a6ad7 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -18,6 +18,7 @@ package org.apache.spark.memory; import javax.annotation.concurrent.GuardedBy; +import java.io.InterruptedIOException; import java.io.IOException; import java.nio.channels.ClosedByInterruptException; import java.util.Arrays; @@ -29,9 +30,11 @@ import java.util.TreeMap; import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.util.Utils; @@ -58,7 +61,7 @@ */ public class TaskMemoryManager { - private static final Logger logger = LoggerFactory.getLogger(TaskMemoryManager.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(TaskMemoryManager.class); /** The number of bits used to address the page table. */ private static final int PAGE_NUMBER_BITS = 13; @@ -242,12 +245,14 @@ private long trySpillAndAcquire( cList.remove(idx); return 0; } - } catch (ClosedByInterruptException e) { + } catch (ClosedByInterruptException | InterruptedIOException e) { // This called by user to kill a task (e.g: speculative task). 
- logger.error("error while calling spill() on " + consumerToSpill, e); + logger.error("error while calling spill() on {}", e, + MDC.of(LogKeys.MEMORY_CONSUMER$.MODULE$, consumerToSpill)); throw new RuntimeException(e.getMessage()); } catch (IOException e) { - logger.error("error while calling spill() on " + consumerToSpill, e); + logger.error("error while calling spill() on {}", e, + MDC.of(LogKeys.MEMORY_CONSUMER$.MODULE$, consumerToSpill)); // checkstyle.off: RegexpSinglelineJava throw new SparkOutOfMemoryError("error while calling spill() on " + consumerToSpill + " : " + e.getMessage()); @@ -270,24 +275,29 @@ public void releaseExecutionMemory(long size, MemoryConsumer consumer) { * Dump the memory usage of all consumers. */ public void showMemoryUsage() { - logger.info("Memory used in task " + taskAttemptId); + logger.info("Memory used in task {}", + MDC.of(LogKeys.TASK_ATTEMPT_ID$.MODULE$, taskAttemptId)); synchronized (this) { long memoryAccountedForByConsumers = 0; for (MemoryConsumer c: consumers) { long totalMemUsage = c.getUsed(); memoryAccountedForByConsumers += totalMemUsage; if (totalMemUsage > 0) { - logger.info("Acquired by " + c + ": " + Utils.bytesToString(totalMemUsage)); + logger.info("Acquired by {}: {}", + MDC.of(LogKeys.MEMORY_CONSUMER$.MODULE$, c), + MDC.of(LogKeys.MEMORY_SIZE$.MODULE$, Utils.bytesToString(totalMemUsage))); } } long memoryNotAccountedFor = memoryManager.getExecutionMemoryUsageForTask(taskAttemptId) - memoryAccountedForByConsumers; logger.info( "{} bytes of memory were used by task {} but are not associated with specific consumers", - memoryNotAccountedFor, taskAttemptId); + MDC.of(LogKeys.MEMORY_SIZE$.MODULE$, memoryNotAccountedFor), + MDC.of(LogKeys.TASK_ATTEMPT_ID$.MODULE$, taskAttemptId)); logger.info( "{} bytes of memory are used for execution and {} bytes of memory are used for storage", - memoryManager.executionMemoryUsed(), memoryManager.storageMemoryUsed()); + MDC.of(LogKeys.EXECUTION_MEMORY_SIZE$.MODULE$, memoryManager.executionMemoryUsed()), + MDC.of(LogKeys.STORAGE_MEMORY_SIZE$.MODULE$, memoryManager.storageMemoryUsed())); } } @@ -333,7 +343,8 @@ public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { try { page = memoryManager.tungstenMemoryAllocator().allocate(acquired); } catch (OutOfMemoryError e) { - logger.warn("Failed to allocate a page ({} bytes), try again.", acquired); + logger.warn("Failed to allocate a page ({} bytes), try again.", + MDC.of(LogKeys.PAGE_SIZE$.MODULE$, acquired)); // there is no enough memory actually, it means the actual free memory is smaller than // MemoryManager thought, we should keep the acquired memory. 
synchronized (this) { diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index d067c870acc9e..86f7d5143eff5 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -33,9 +33,11 @@ import scala.collection.Iterator; import com.google.common.io.Closeables; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.Partitioner; import org.apache.spark.ShuffleDependency; import org.apache.spark.SparkConf; @@ -81,7 +83,8 @@ final class BypassMergeSortShuffleWriter extends ShuffleWriter implements ShuffleChecksumSupport { - private static final Logger logger = LoggerFactory.getLogger(BypassMergeSortShuffleWriter.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(BypassMergeSortShuffleWriter.class); private final int fileBufferSize; private final boolean transferToEnabled; @@ -223,7 +226,8 @@ private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) thro writePartitionedDataWithStream(file, writer); } if (!file.delete()) { - logger.error("Unable to delete file for partition {}", i); + logger.error("Unable to delete file for partition {}", + MDC.of(LogKeys.PARTITION_ID$.MODULE$, i)); } } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index b097089282ce3..f96513f1b1097 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -23,17 +23,19 @@ import java.util.LinkedList; import java.util.zip.Checksum; -import org.apache.spark.SparkException; import scala.Tuple2; import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.spark.SparkConf; +import org.apache.spark.SparkException; import org.apache.spark.TaskContext; import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.internal.config.package$; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; @@ -70,7 +72,8 @@ */ final class ShuffleExternalSorter extends MemoryConsumer implements ShuffleChecksumSupport { - private static final Logger logger = LoggerFactory.getLogger(ShuffleExternalSorter.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(ShuffleExternalSorter.class); @VisibleForTesting static final int DISK_WRITE_BUFFER_SIZE = 1024 * 1024; @@ -159,11 +162,11 @@ private void writeSortedFile(boolean isFinalFile) { if (!isFinalFile) { logger.info( "Task {} on Thread {} spilling sort data of {} to disk ({} {} so far)", - taskContext.taskAttemptId(), - Thread.currentThread().getId(), - Utils.bytesToString(getMemoryUsage()), - spills.size(), - spills.size() != 1 ? 
" times" : " time"); + MDC.of(LogKeys.TASK_ATTEMPT_ID$.MODULE$, taskContext.taskAttemptId()), + MDC.of(LogKeys.THREAD_ID$.MODULE$, Thread.currentThread().getId()), + MDC.of(LogKeys.MEMORY_SIZE$.MODULE$, Utils.bytesToString(getMemoryUsage())), + MDC.of(LogKeys.NUM_SPILL_INFOS$.MODULE$, spills.size()), + MDC.of(LogKeys.SPILL_TIMES$.MODULE$, spills.size() != 1 ? "times" : "time")); } // This call performs the actual sort. @@ -349,7 +352,8 @@ public void cleanupResources() { } for (SpillInfo spill : spills) { if (spill.file.exists() && !spill.file.delete()) { - logger.error("Unable to delete spill file {}", spill.file.getPath()); + logger.error("Unable to delete spill file {}", + MDC.of(LogKeys.PATH$.MODULE$, spill.file.getPath())); } } } @@ -416,8 +420,8 @@ public void insertRecord(Object recordBase, long recordOffset, int length, int p // for tests assert(inMemSorter != null); if (inMemSorter.numRecords() >= numElementsForSpillThreshold) { - logger.info("Spilling data because number of spilledRecords crossed the threshold " + - numElementsForSpillThreshold); + logger.info("Spilling data because number of spilledRecords crossed the threshold {}" + + MDC.of(LogKeys.NUM_ELEMENTS_SPILL_THRESHOLD$.MODULE$, numElementsForSpillThreshold)); spill(); } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index f5949d6ae7a5b..13fd18c0942b1 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -35,12 +35,14 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.io.ByteStreams; import com.google.common.io.Closeables; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.spark.*; import org.apache.spark.annotation.Private; import org.apache.spark.internal.config.package$; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.io.CompressionCodec; import org.apache.spark.io.CompressionCodec$; import org.apache.spark.io.NioBufferedFileInputStream; @@ -66,7 +68,7 @@ @Private public class UnsafeShuffleWriter extends ShuffleWriter { - private static final Logger logger = LoggerFactory.getLogger(UnsafeShuffleWriter.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(UnsafeShuffleWriter.class); private static final ClassTag OBJECT_CLASS_TAG = ClassTag$.MODULE$.Object(); @@ -226,7 +228,8 @@ void closeAndWriteOutput() throws IOException { sorter = null; for (SpillInfo spill : spills) { if (spill.file.exists() && !spill.file.delete()) { - logger.error("Error while deleting spill file {}", spill.file.getPath()); + logger.error("Error while deleting spill file {}", + MDC.of(LogKeys.PATH$.MODULE$, spill.file.getPath())); } } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java index eb4d9d9abc8e3..38f0a60f8b0dd 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java @@ -17,6 +17,7 @@ package org.apache.spark.shuffle.sort.io; +import java.util.Collections; import java.util.Map; 
import java.util.Optional; @@ -56,7 +57,10 @@ public void initializeExecutor(String appId, String execId, Map if (blockManager == null) { throw new IllegalStateException("No blockManager available from the SparkEnv."); } - blockResolver = new IndexShuffleBlockResolver(sparkConf, blockManager); + blockResolver = + new IndexShuffleBlockResolver( + sparkConf, blockManager, Collections.emptyMap() /* Shouldn't be accessed */ + ); } @Override diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java index efe508d1361c7..c0b9018c770a0 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java @@ -26,10 +26,11 @@ import java.nio.channels.WritableByteChannel; import java.util.Optional; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.spark.SparkConf; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.shuffle.api.ShuffleMapOutputWriter; import org.apache.spark.shuffle.api.ShufflePartitionWriter; import org.apache.spark.shuffle.api.WritableByteChannelWrapper; @@ -44,8 +45,8 @@ */ public class LocalDiskShuffleMapOutputWriter implements ShuffleMapOutputWriter { - private static final Logger log = - LoggerFactory.getLogger(LocalDiskShuffleMapOutputWriter.class); + private static final SparkLogger log = + SparkLoggerFactory.getLogger(LocalDiskShuffleMapOutputWriter.class); private final int shuffleId; private final long mapId; @@ -73,7 +74,7 @@ public LocalDiskShuffleMapOutputWriter( this.blockResolver = blockResolver; this.bufferSize = (int) (long) sparkConf.get( - package$.MODULE$.SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE()) * 1024; + package$.MODULE$.SHUFFLE_LOCAL_DISK_FILE_OUTPUT_BUFFER_SIZE()) * 1024; this.partitionLengths = new long[numPartitions]; this.outputFile = blockResolver.getDataFile(shuffleId, mapId); this.outputTempFile = null; @@ -123,7 +124,8 @@ public MapOutputCommitMessage commitAllPartitions(long[] checksums) throws IOExc public void abort(Throwable error) throws IOException { cleanUp(); if (outputTempFile != null && outputTempFile.exists() && !outputTempFile.delete()) { - log.warn("Failed to delete temporary shuffle file at {}", outputTempFile.getAbsolutePath()); + log.warn("Failed to delete temporary shuffle file at {}", + MDC.of(LogKeys.PATH$.MODULE$, outputTempFile.getAbsolutePath())); } } diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 35c5efc77f6f2..2a8e15cd09ccf 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -25,11 +25,13 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.io.Closeables; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.spark.SparkEnv; import org.apache.spark.executor.ShuffleWriteMetrics; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.MDC; import org.apache.spark.memory.MemoryConsumer; import 
org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; @@ -66,7 +68,7 @@ */ public final class BytesToBytesMap extends MemoryConsumer { - private static final Logger logger = LoggerFactory.getLogger(BytesToBytesMap.class); + private static final SparkLogger logger = SparkLoggerFactory.getLogger(BytesToBytesMap.class); private static final HashMapGrowthStrategy growthStrategy = HashMapGrowthStrategy.DOUBLING; @@ -392,7 +394,8 @@ private void handleFailedDelete() { // remove the spill file from disk File file = spillWriters.removeFirst().getFile(); if (file != null && file.exists() && !file.delete()) { - logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); + logger.error("Was unable to delete spill file {}", + MDC.of(LogKeys.PATH$.MODULE$, file.getAbsolutePath())); } } } @@ -893,7 +896,8 @@ public void free() { File file = spillWriters.removeFirst().getFile(); if (file != null && file.exists()) { if (!file.delete()) { - logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); + logger.error("Was unable to delete spill file {}", + MDC.of(LogKeys.PATH$.MODULE$, file.getAbsolutePath())); } } } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 2f9e1a9f45460..af421e903ba3f 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -28,11 +28,13 @@ import com.google.common.annotations.VisibleForTesting; import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.spark.TaskContext; import org.apache.spark.executor.ShuffleWriteMetrics; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.MDC; import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; @@ -50,7 +52,8 @@ */ public final class UnsafeExternalSorter extends MemoryConsumer { - private static final Logger logger = LoggerFactory.getLogger(UnsafeExternalSorter.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(UnsafeExternalSorter.class); @Nullable private final PrefixComparator prefixComparator; @@ -217,10 +220,10 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { } logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)", - Thread.currentThread().getId(), - Utils.bytesToString(getMemoryUsage()), - spillWriters.size(), - spillWriters.size() > 1 ? " times" : " time"); + MDC.of(LogKeys.THREAD_ID$.MODULE$, Thread.currentThread().getId()), + MDC.of(LogKeys.MEMORY_SIZE$.MODULE$, Utils.bytesToString(getMemoryUsage())), + MDC.of(LogKeys.NUM_SPILL_WRITERS$.MODULE$, spillWriters.size()), + MDC.of(LogKeys.SPILL_TIMES$.MODULE$, spillWriters.size() > 1 ? 
"times" : "time")); ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); @@ -335,7 +338,8 @@ private void deleteSpillFiles() { File file = spill.getFile(); if (file != null && file.exists()) { if (!file.delete()) { - logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); + logger.error("Was unable to delete spill file {}", + MDC.of(LogKeys.PATH$.MODULE$, file.getAbsolutePath())); } } } @@ -476,8 +480,8 @@ public void insertRecord( assert(inMemSorter != null); if (inMemSorter.numRecords() >= numElementsForSpillThreshold) { - logger.info("Spilling data because number of spilledRecords crossed the threshold " + - numElementsForSpillThreshold); + logger.info("Spilling data because number of spilledRecords crossed the threshold {}", + MDC.of(LogKeys.NUM_ELEMENTS_SPILL_THRESHOLD$.MODULE$, numElementsForSpillThreshold)); spill(); } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index cf29835b2ce89..0693f8cb1a808 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -23,13 +23,13 @@ import org.apache.spark.TaskContext; import org.apache.spark.internal.config.package$; import org.apache.spark.internal.config.ConfigEntry; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; import org.apache.spark.io.NioBufferedFileInputStream; import org.apache.spark.io.ReadAheadInputStream; import org.apache.spark.serializer.SerializerManager; import org.apache.spark.storage.BlockId; import org.apache.spark.unsafe.Platform; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.*; @@ -38,7 +38,8 @@ * of the file format). */ public final class UnsafeSorterSpillReader extends UnsafeSorterIterator implements Closeable { - private static final Logger logger = LoggerFactory.getLogger(UnsafeSorterSpillReader.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(UnsafeSorterSpillReader.class); public static final int MAX_BUFFER_SIZE_BYTES = 16777216; // 16 mb private InputStream in; diff --git a/core/src/main/resources/META-INF/LICENSE b/core/src/main/resources/META-INF/LICENSE new file mode 100644 index 0000000000000..8dbc84b910d41 --- /dev/null +++ b/core/src/main/resources/META-INF/LICENSE @@ -0,0 +1,240 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +------------------------------------------------------------------------------------ +This product bundles various third-party components under other open source licenses. +This section summarizes those components and their licenses. See licenses/ +for text of these licenses. 
+ + +Apache Software Foundation License 2.0 +-------------------------------------- +com.google.guava:guava +com.google.protobuf:protobuf-java +core/src/main/java/org/apache/spark/util/collection/TimSort.java +core/src/main/resources/org/apache/spark/ui/static/bootstrap* +core/src/main/resources/org/apache/spark/ui/static/vis* +core/src/main/resources/org/apache/spark/ui/static/d3-flamegraph.min.js +core/src/main/resources/org/apache/spark/ui/static/d3-flamegraph.css +org.eclipse.jetty:jetty-client +org.eclipse.jetty:jetty-http +org.eclipse.jetty:jetty-io +org.eclipse.jetty:jetty-plus +org.eclipse.jetty:jetty-proxy +org.eclipse.jetty:jetty-security +org.eclipse.jetty:jetty-server +org.eclipse.jetty:jetty-servlet +org.eclipse.jetty:jetty-servlets +org.eclipse.jetty:jetty-util + +MIT License +----------- +core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js +core/src/main/resources/org/apache/spark/ui/static/*dataTables* +core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js +core/src/main/resources/org/apache/spark/ui/static/jquery* +core/src/main/resources/org/apache/spark/ui/static/sorttable.js + +ISC License +----------- +core/src/main/resources/org/apache/spark/ui/static/d3.min.js diff --git a/core/src/main/resources/META-INF/NOTICE b/core/src/main/resources/META-INF/NOTICE new file mode 100644 index 0000000000000..5514b6dc7999b --- /dev/null +++ b/core/src/main/resources/META-INF/NOTICE @@ -0,0 +1,29 @@ +Apache Spark - Core +Copyright 2014 and onwards The Apache Software Foundation. + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +This module also bundles software with extra NOTICE as following: + +=== NOTICE FOR Jetty === +Notices for Eclipse Jetty +========================= +This content is produced and maintained by the Eclipse Jetty project. + +Cryptography +------------ +Content may contain encryption software. The country in which you are currently +may have restrictions on the import, possession, and use, and/or re-export to +another country, of encryption software. BEFORE using any encryption software, +please check the country's laws, regulations and policies concerning the import, +possession, or use, and re-export of encryption software, to see if this is +permitted. + +The UnixCrypt.java code implements the one way cryptography used by +Unix systems for simple password protection. Copyright 1996 Aki Yoshida, +modified April 2001 by Iris Van den Broeke, Daniel Deville. +Permission to use, copy, modify and distribute UnixCrypt +for non-commercial or commercial purposes and without fee is +granted provided that the copyright notice appears in all copies. 
+=== END OF NOTICE FOR Jetty === diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 942242107e22f..adce6c3f5ffdb 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -23,7 +23,8 @@ import java.util.function.Consumer import scala.collection.mutable.{ArrayBuffer, HashSet} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerStageCompleted} import org.apache.spark.util.ThreadUtils @@ -161,7 +162,8 @@ private[spark] class BarrierCoordinator( s"${request.numTasks} from Task $taskId, previously it was $numTasks.") // Check whether the epoch from the barrier tasks matches current barrierEpoch. - logInfo(s"Current barrier epoch for $barrierId is $barrierEpoch.") + logInfo(log"Current barrier epoch for ${MDC(BARRIER_ID, barrierId)}" + + log" is ${MDC(BARRIER_EPOCH, barrierEpoch)}.") if (epoch != barrierEpoch) { requester.sendFailure(new SparkException(s"The request to sync of $barrierId with " + s"barrier epoch $barrierEpoch has already finished. Maybe task $taskId is not " + @@ -176,14 +178,17 @@ private[spark] class BarrierCoordinator( // Add the requester to array of RPCCallContexts pending for reply. requesters += requester messages(request.partitionId) = request.message - logInfo(s"Barrier sync epoch $barrierEpoch from $barrierId received update from Task " + - s"$taskId, current progress: ${requesters.size}/$numTasks.") + logInfo(log"Barrier sync epoch ${MDC(BARRIER_EPOCH, barrierEpoch)}" + + log" from ${MDC(BARRIER_ID, barrierId)} received update from Task" + + log" ${MDC(TASK_ID, taskId)}, current progress:" + + log" ${MDC(REQUESTER_SIZE, requesters.size)}/${MDC(NUM_REQUEST_SYNC_TASK, numTasks)}.") if (requesters.size == numTasks) { requesters.foreach(_.reply(messages.clone())) // Finished current barrier() call successfully, clean up ContextBarrierState and // increase the barrier epoch. - logInfo(s"Barrier sync epoch $barrierEpoch from $barrierId received all updates from " + - s"tasks, finished successfully.") + logInfo(log"Barrier sync epoch ${MDC(BARRIER_EPOCH, barrierEpoch)}" + + log" from ${MDC(BARRIER_ID, barrierId)} received all updates from" + + log" tasks, finished successfully.") barrierEpoch += 1 requesters.clear() requestMethods.clear() diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index e083ece918b63..c8d6000cd6282 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -26,7 +26,8 @@ import scala.util.{Failure, Success => ScalaSuccess, Try} import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.executor.TaskMetrics -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.source.Source import org.apache.spark.resource.ResourceInformation @@ -56,19 +57,27 @@ class BarrierTaskContext private[spark] ( // with the driver side epoch. 
private var barrierEpoch = 0 + private def logProgressInfo(msg: MessageWithContext, startTime: Option[Long]): Unit = { + val waitMsg = startTime.fold(log"")(st => log", waited " + + log"for ${MDC(TOTAL_TIME, System.currentTimeMillis() - st)} ms,") + logInfo(log"Task ${MDC(TASK_ATTEMPT_ID, taskAttemptId())}" + + log" from Stage ${MDC(STAGE_ID, stageId())}" + + log"(Attempt ${MDC(STAGE_ATTEMPT, stageAttemptNumber())}) " + + msg + waitMsg + + log" current barrier epoch is ${MDC(BARRIER_EPOCH, barrierEpoch)}.") + } + private def runBarrier(message: String, requestMethod: RequestMethod.Value): Array[String] = { - logInfo(s"Task ${taskAttemptId()} from Stage ${stageId()}(Attempt ${stageAttemptNumber()}) " + - s"has entered the global sync, current barrier epoch is $barrierEpoch.") + logProgressInfo(log"has entered the global sync", None) logTrace("Current callSite: " + Utils.getCallSite()) val startTime = System.currentTimeMillis() val timerTask = new TimerTask { override def run(): Unit = { - logInfo(s"Task ${taskAttemptId()} from Stage ${stageId()}(Attempt " + - s"${stageAttemptNumber()}) waiting " + - s"under the global sync since $startTime, has been waiting for " + - s"${MILLISECONDS.toSeconds(System.currentTimeMillis() - startTime)} seconds, " + - s"current barrier epoch is $barrierEpoch.") + logProgressInfo( + log"waiting under the global sync since ${MDC(TIME, startTime)}", + Some(startTime) + ) } } // Log the update of global sync every 1 minute. @@ -104,17 +113,11 @@ class BarrierTaskContext private[spark] ( val messages = abortableRpcFuture.future.value.get.get barrierEpoch += 1 - logInfo(s"Task ${taskAttemptId()} from Stage ${stageId()}(Attempt ${stageAttemptNumber()}) " + - s"finished global sync successfully, waited for " + - s"${MILLISECONDS.toSeconds(System.currentTimeMillis() - startTime)} seconds, " + - s"current barrier epoch is $barrierEpoch.") + logProgressInfo(log"finished global sync successfully", Some(startTime)) messages } catch { case e: SparkException => - logInfo(s"Task ${taskAttemptId()} from Stage ${stageId()}(Attempt " + - s"${stageAttemptNumber()}) failed to perform global sync, waited for " + - s"${MILLISECONDS.toSeconds(System.currentTimeMillis() - startTime)} seconds, " + - s"current barrier epoch is $barrierEpoch.") + logProgressInfo(log"failed to perform global sync", Some(startTime)) throw e } finally { timerTask.cancel() diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index c16a84c13187b..fb56389cde77e 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -25,7 +25,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ACCUMULATOR_ID, BROADCAST_ID, LISTENER, RDD_ID, SHUFFLE_ID} +import org.apache.spark.internal.LogKeys.{ACCUMULATOR_ID, BROADCAST_ID, LISTENER, RDD_ID, SHUFFLE_ID} import org.apache.spark.internal.config._ import org.apache.spark.rdd.{RDD, ReliableRDDCheckpointData} import org.apache.spark.scheduler.SparkListener diff --git a/core/src/main/scala/org/apache/spark/Dependency.scala b/core/src/main/scala/org/apache/spark/Dependency.scala index 3b5bb6792928e..3b7c7778e26ce 100644 --- a/core/src/main/scala/org/apache/spark/Dependency.scala +++ b/core/src/main/scala/org/apache/spark/Dependency.scala @@ -24,7 +24,8 @@ import scala.reflect.ClassTag import 
org.roaringbitmap.RoaringBitmap import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{ShuffleHandle, ShuffleWriteProcessor} @@ -211,10 +212,13 @@ class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( // This may crash the driver with an OOM error. if (numPartitions.toLong * partitioner.numPartitions.toLong > (1L << 30)) { logWarning( - s"The number of shuffle blocks (${numPartitions.toLong * partitioner.numPartitions.toLong})" + - s" for shuffleId ${shuffleId} for ${_rdd} with ${numPartitions} partitions" + - " is possibly too large, which could cause the driver to crash with an out-of-memory" + - " error. Consider decreasing the number of partitions in this shuffle stage." + log"The number of shuffle blocks " + + log"(${MDC(NUM_PARTITIONS, numPartitions.toLong * partitioner.numPartitions.toLong)})" + + log" for shuffleId ${MDC(SHUFFLE_ID, shuffleId)} " + + log"for ${MDC(RDD_DESCRIPTION, _rdd)} " + + log"with ${MDC(NUM_PARTITIONS2, numPartitions)} partitions" + + log" is possibly too large, which could cause the driver to crash with an out-of-memory" + + log" error. Consider decreasing the number of partitions in this shuffle stage." ) } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 94927caff1d78..1fe02eec3a072 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -26,7 +26,8 @@ import scala.util.control.NonFatal import com.codahale.metrics.{Counter, Gauge, MetricRegistry} -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.DECOMMISSION_ENABLED import org.apache.spark.internal.config.Tests.TEST_DYNAMIC_ALLOCATION_SCHEDULE_ENABLED @@ -205,11 +206,13 @@ private[spark] class ExecutorAllocationManager( throw new SparkException( s"s${DYN_ALLOCATION_SUSTAINED_SCHEDULER_BACKLOG_TIMEOUT.key} must be > 0!") } + val shuffleTrackingEnabled = conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED) + val shuffleDecommissionEnabled = decommissionEnabled && + conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED) if (!conf.get(config.SHUFFLE_SERVICE_ENABLED) && !reliableShuffleStorage) { - if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)) { + if (shuffleTrackingEnabled) { logInfo("Dynamic allocation is enabled without a shuffle service.") - } else if (decommissionEnabled && - conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED)) { + } else if (shuffleDecommissionEnabled) { logInfo("Shuffle data decommission is enabled without a shuffle service.") } else if (!testing) { throw new SparkException("Dynamic allocation of executors requires one of the " + @@ -223,6 +226,12 @@ private[spark] class ExecutorAllocationManager( } } + if (shuffleTrackingEnabled && (shuffleDecommissionEnabled || reliableShuffleStorage)) { + logWarning("You are enabling both shuffle tracking and another DA-supported mechanism, " + + "which will cause idle executors not to be released in a timely manner; " + + "please check the configurations.") + } + if (executorAllocationRatio
> 1.0 || executorAllocationRatio <= 0.0) { throw new SparkException( s"${DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO.key} must be > 0 and <= 1.0") @@ -445,10 +454,12 @@ private[spark] class ExecutorAllocationManager( val delta = targetNum.delta totalDelta += delta if (delta > 0) { - val executorsString = "executor" + { if (delta > 1) "s" else "" } - logInfo(s"Requesting $delta new $executorsString because tasks are backlogged " + - s"(new desired total will be ${numExecutorsTargetPerResourceProfileId(rpId)} " + - s"for resource profile id: ${rpId})") + val executorsString = log" new executor" + { if (delta > 1) log"s" else log"" } + logInfo(log"Requesting ${MDC(TARGET_NUM_EXECUTOR_DELTA, delta)}" + + executorsString + log" because tasks are backlogged " + + log"(new desired total will be" + + log" ${MDC(TARGET_NUM_EXECUTOR, numExecutorsTargetPerResourceProfileId(rpId))} " + + log"for resource profile id: ${MDC(RESOURCE_PROFILE_ID, rpId)})") numExecutorsToAddPerResourceProfileId(rpId) = if (delta == numExecutorsToAddPerResourceProfileId(rpId)) { numExecutorsToAddPerResourceProfileId(rpId) * 2 @@ -542,8 +553,8 @@ private[spark] class ExecutorAllocationManager( if (testing) { throw new SparkException("ResourceProfile Id was UNKNOWN, this is not expected") } - logWarning(s"Not removing executor $executorIdToBeRemoved because the " + - "ResourceProfile was UNKNOWN!") + logWarning(log"Not removing executor ${MDC(EXECUTOR_IDS, executorIdToBeRemoved)} " + + log"because the ResourceProfile was UNKNOWN!") } else { // get the running total as we remove or initialize it to the count - pendingRemoval val newExecutorTotal = numExecutorsTotalPerRpId.getOrElseUpdate(rpId, @@ -603,11 +614,13 @@ private[spark] class ExecutorAllocationManager( } else { executorMonitor.executorsKilled(executorsRemoved.toSeq) } - logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout.") + logInfo(log"Executors ${MDC(EXECUTOR_IDS, executorsRemoved.mkString(","))}" + + log"removed due to idle timeout.") executorsRemoved.toSeq } else { - logWarning(s"Unable to reach the cluster manager to kill executor/s " + - s"${executorIdsToBeRemoved.mkString(",")} or no executor eligible to kill!") + logWarning(log"Unable to reach the cluster manager to kill executor/s " + + log"${MDC(EXECUTOR_IDS, executorIdsToBeRemoved.mkString(","))} " + + log"or no executor eligible to kill!") Seq.empty[String] } } @@ -870,8 +883,9 @@ private[spark] class ExecutorAllocationManager( // really complete and no tasks left resourceProfileIdToStageAttempt(rpForStage.head) -= stageAttempt } else { - logWarning(s"Should have exactly one resource profile for stage $stageAttempt," + - s" but have $rpForStage") + logWarning(log"Should have exactly one resource profile for stage " + + log"${MDC(STAGE_ATTEMPT, stageAttempt)}, but have " + + log"${MDC(RESOURCE_PROFILE_ID, rpForStage)}") } } } diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index 5999040894ae5..92aea5959aab7 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -23,7 +23,8 @@ import scala.collection.mutable.{HashMap, Map} import scala.concurrent.Future import org.apache.spark.executor.ExecutorMetrics -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.Network import 
org.apache.spark.rpc.{IsolatedThreadSafeRpcEndpoint, RpcCallContext, RpcEnv} import org.apache.spark.scheduler._ @@ -159,7 +160,8 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) // Because Executor will sleep several seconds before sending the first "Heartbeat", this // case rarely happens. However, if it really happens, log it and ask the executor to // register itself again. - logWarning(s"Dropping $heartbeat because TaskScheduler is not ready yet") + logWarning(log"Dropping ${MDC(HEARTBEAT, heartbeat)} " + + log"because TaskScheduler is not ready yet") context.reply(HeartbeatResponse(reregisterBlockManager)) } } @@ -210,8 +212,10 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) val now = clock.getTimeMillis() for ((executorId, lastSeenMs) <- executorLastSeen) { if (now - lastSeenMs > executorTimeoutMs) { - logWarning(s"Removing executor $executorId with no recent heartbeats: " + - s"${now - lastSeenMs} ms exceeds timeout $executorTimeoutMs ms") + logWarning(log"Removing executor ${MDC(EXECUTOR_ID, executorId)} " + + log"with no recent heartbeats: " + + log"${MDC(TIME_UNITS, now - lastSeenMs)} ms exceeds timeout " + + log"${MDC(EXECUTOR_TIMEOUT, executorTimeoutMs)} ms") // Asynchronously kill the executor to avoid blocking the current thread killExecutorThread.submit(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 48569eb713793..a660bccd2e68f 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -34,8 +34,8 @@ import org.apache.commons.io.output.{ByteArrayOutputStream => ApacheByteArrayOut import org.roaringbitmap.RoaringBitmap import org.apache.spark.broadcast.{Broadcast, BroadcastManager} -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.{Logging, MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.io.CompressionCodec import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} @@ -44,7 +44,6 @@ import org.apache.spark.shuffle.MetadataFetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockId, ShuffleMergedBlockId} import org.apache.spark.util._ import org.apache.spark.util.ArrayImplicits._ -import org.apache.spark.util.collection.OpenHashMap import org.apache.spark.util.io.{ChunkedByteBuffer, ChunkedByteBufferOutputStream} /** @@ -153,17 +152,22 @@ private class ShuffleStatus( /** * Mapping from a mapId to the mapIndex, this is required to reduce the searching overhead within * the function updateMapOutput(mapId, bmAddress). + * + * Exposed for testing. */ - private[this] val mapIdToMapIndex = new OpenHashMap[Long, Int]() + private[spark] val mapIdToMapIndex = new HashMap[Long, Int]() /** * Register a map output. If there is already a registered location for the map output then it * will be replaced by the new location. 
*/ def addMapOutput(mapIndex: Int, status: MapStatus): Unit = withWriteLock { - if (mapStatuses(mapIndex) == null) { + val currentMapStatus = mapStatuses(mapIndex) + if (currentMapStatus == null) { _numAvailableMapOutputs += 1 invalidateSerializedMapOutputStatusCache() + } else { + mapIdToMapIndex.remove(currentMapStatus.mapId) } mapStatuses(mapIndex) = status mapIdToMapIndex(status.mapId) = mapIndex @@ -188,26 +192,30 @@ private class ShuffleStatus( val mapStatusOpt = mapIndex.map(mapStatuses(_)).flatMap(Option(_)) mapStatusOpt match { case Some(mapStatus) => - logInfo(s"Updating map output for ${mapId} to ${bmAddress}") + logInfo(log"Updating map output for ${MDC(MAP_ID, mapId)}" + + log" to ${MDC(BLOCK_MANAGER_ID, bmAddress)}") mapStatus.updateLocation(bmAddress) invalidateSerializedMapOutputStatusCache() case None => - if (mapIndex.map(mapStatusesDeleted).exists(_.mapId == mapId)) { - val index = mapIndex.get + val index = mapStatusesDeleted.indexWhere(x => x != null && x.mapId == mapId) + if (index >= 0 && mapStatuses(index) == null) { val mapStatus = mapStatusesDeleted(index) mapStatus.updateLocation(bmAddress) mapStatuses(index) = mapStatus _numAvailableMapOutputs += 1 invalidateSerializedMapOutputStatusCache() mapStatusesDeleted(index) = null - logInfo(s"Recover ${mapStatus.mapId} ${mapStatus.location}") + logInfo(log"Recover ${MDC(MAP_ID, mapStatus.mapId)}" + + log" ${MDC(BLOCK_MANAGER_ID, mapStatus.location)}") } else { - logWarning(s"Asked to update map output ${mapId} for untracked map status.") + logWarning(log"Asked to update map output ${MDC(MAP_ID, mapId)} " + + log"for untracked map status.") } } } catch { case e: java.lang.NullPointerException => - logWarning(s"Unable to update map output for ${mapId}, status removed in-flight") + logWarning(log"Unable to update map output for ${MDC(MAP_ID, mapId)}, " + + log"status removed in-flight") } } @@ -218,9 +226,11 @@ private class ShuffleStatus( */ def removeMapOutput(mapIndex: Int, bmAddress: BlockManagerId): Unit = withWriteLock { logDebug(s"Removing existing map output ${mapIndex} ${bmAddress}") - if (mapStatuses(mapIndex) != null && mapStatuses(mapIndex).location == bmAddress) { + val currentMapStatus = mapStatuses(mapIndex) + if (currentMapStatus != null && currentMapStatus.location == bmAddress) { _numAvailableMapOutputs -= 1 - mapStatusesDeleted(mapIndex) = mapStatuses(mapIndex) + mapIdToMapIndex.remove(currentMapStatus.mapId) + mapStatusesDeleted(mapIndex) = currentMapStatus mapStatuses(mapIndex) = null invalidateSerializedMapOutputStatusCache() } @@ -286,9 +296,11 @@ private class ShuffleStatus( */ def removeOutputsByFilter(f: BlockManagerId => Boolean): Unit = withWriteLock { for (mapIndex <- mapStatuses.indices) { - if (mapStatuses(mapIndex) != null && f(mapStatuses(mapIndex).location)) { + val currentMapStatus = mapStatuses(mapIndex) + if (currentMapStatus != null && f(currentMapStatus.location)) { _numAvailableMapOutputs -= 1 - mapStatusesDeleted(mapIndex) = mapStatuses(mapIndex) + mapIdToMapIndex.remove(currentMapStatus.mapId) + mapStatusesDeleted(mapIndex) = currentMapStatus mapStatuses(mapIndex) = null invalidateSerializedMapOutputStatusCache() } @@ -488,20 +500,24 @@ private[spark] class MapOutputTrackerMasterEndpoint( logDebug("init") // force eager creation of logger + private def logInfoMsg(msg: MessageWithContext, shuffleId: Int, context: RpcCallContext): Unit = { + val hostPort = context.senderAddress.hostPort + logInfo(log"Asked to send " + + msg + + log" locations for shuffle ${MDC(SHUFFLE_ID, shuffleId)} 
to ${MDC(HOST_PORT, hostPort)}") + } + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GetMapOutputStatuses(shuffleId: Int) => - val hostPort = context.senderAddress.hostPort - logInfo(s"Asked to send map output locations for shuffle $shuffleId to $hostPort") + logInfoMsg(log"map output", shuffleId, context) tracker.post(GetMapOutputMessage(shuffleId, context)) case GetMapAndMergeResultStatuses(shuffleId: Int) => - val hostPort = context.senderAddress.hostPort - logInfo(s"Asked to send map/merge result locations for shuffle $shuffleId to $hostPort") + logInfoMsg(log"map/merge result", shuffleId, context) tracker.post(GetMapAndMergeOutputMessage(shuffleId, context)) case GetShufflePushMergerLocations(shuffleId: Int) => - logInfo(s"Asked to send shuffle push merger locations for shuffle" + - s" $shuffleId to ${context.senderAddress.hostPort}") + logInfoMsg(log"shuffle push merger", shuffleId, context) tracker.post(GetShufflePushMergersMessage(shuffleId, context)) case StopMapOutputTracker => @@ -815,7 +831,8 @@ private[spark] class MapOutputTrackerMaster( case Some(shuffleStatus) => shuffleStatus.updateMapOutput(mapId, bmAddress) case None if shuffleMigrationEnabled => - logWarning(s"Asked to update map output for unknown shuffle ${shuffleId}") + logWarning(log"Asked to update map output for unknown shuffle " + + log"${MDC(SHUFFLE_ID, shuffleId)}") case None => logError(log"Asked to update map output for unknown shuffle ${MDC(SHUFFLE_ID, shuffleId)}") } @@ -1419,13 +1436,15 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr val mergeOutputStatuses = mergeStatuses.get(shuffleId).orNull if (mapOutputStatuses == null || mergeOutputStatuses == null) { - logInfo("Don't have map/merge outputs for shuffle " + shuffleId + ", fetching them") + logInfo(log"Don't have map/merge outputs for" + + log" shuffle ${MDC(SHUFFLE_ID, shuffleId)}, fetching them") val startTimeNs = System.nanoTime() fetchingLock.withLock(shuffleId) { var fetchedMapStatuses = mapStatuses.get(shuffleId).orNull var fetchedMergeStatuses = mergeStatuses.get(shuffleId).orNull if (fetchedMapStatuses == null || fetchedMergeStatuses == null) { - logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) + logInfo(log"Doing the fetch; tracker endpoint = " + + log"${MDC(RPC_ENDPOINT_REF, trackerEndpoint)}") val fetchedBytes = askTracker[(Array[Byte], Array[Byte])](GetMapAndMergeResultStatuses(shuffleId)) try { @@ -1453,12 +1472,14 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr } else { val statuses = mapStatuses.get(shuffleId).orNull if (statuses == null) { - logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") + logInfo(log"Don't have map outputs for shuffle ${MDC(SHUFFLE_ID, shuffleId)}," + + log" fetching them") val startTimeNs = System.nanoTime() fetchingLock.withLock(shuffleId) { var fetchedStatuses = mapStatuses.get(shuffleId).orNull if (fetchedStatuses == null) { - logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) + logInfo(log"Doing the fetch; tracker endpoint =" + + log" ${MDC(RPC_ENDPOINT_REF, trackerEndpoint)}") val fetchedBytes = askTracker[Array[Byte]](GetMapOutputStatuses(shuffleId)) try { fetchedStatuses = @@ -1497,7 +1518,7 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr def updateEpoch(newEpoch: Long): Unit = { epochLock.synchronized { if (newEpoch > epoch) { - logInfo("Updating epoch to " + newEpoch + " and clearing cache") + 
logInfo(log"Updating epoch to ${MDC(EPOCH, newEpoch)} and clearing cache") epoch = newEpoch mapStatuses.clear() mergeStatuses.clear() @@ -1558,7 +1579,9 @@ private[spark] object MapOutputTracker extends Logging { oos.close() } val outArr = out.toByteArray - logInfo("Broadcast outputstatuses size = " + outArr.length + ", actual size = " + arrSize) + logInfo(log"Broadcast outputstatuses size = " + + log"${MDC(BROADCAST_OUTPUT_STATUS_SIZE, outArr.length)}," + + log" actual size = ${MDC(BROADCAST_OUTPUT_STATUS_SIZE, arrSize)}") (outArr, bcast) } else { (chunkedByteBuf.toArray, null) @@ -1591,8 +1614,10 @@ private[spark] object MapOutputTracker extends Logging { try { // deserialize the Broadcast, pull .value array out of it, and then deserialize that val bcast = deserializeObject(in).asInstanceOf[Broadcast[Array[Array[Byte]]]] - logInfo("Broadcast outputstatuses size = " + bytes.length + - ", actual size = " + bcast.value.foldLeft(0L)(_ + _.length)) + val actualSize = bcast.value.foldLeft(0L)(_ + _.length) + logInfo(log"Broadcast outputstatuses size =" + + log" ${MDC(BROADCAST_OUTPUT_STATUS_SIZE, bytes.length)}" + + log", actual size = ${MDC(BROADCAST_OUTPUT_STATUS_SIZE, actualSize)}") val bcastIn = new ChunkedByteBuffer(bcast.value.map(ByteBuffer.wrap)).toInputStream() // Important - ignore the DIRECT tag ! Start from offset 1 bcastIn.skip(1) diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index ae39e2e183e4a..357e71cdf4457 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -19,6 +19,7 @@ package org.apache.spark import java.io.{IOException, ObjectInputStream, ObjectOutputStream} +import scala.collection.immutable.ArraySeq import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.log10 @@ -149,7 +150,9 @@ private[spark] class KeyGroupedPartitioner( override val numPartitions: Int) extends Partitioner { override def getPartition(key: Any): Int = { val keys = key.asInstanceOf[Seq[Any]] - valueMap.getOrElseUpdate(keys, Utils.nonNegativeMod(keys.hashCode, numPartitions)) + val normalizedKeys = ArraySeq.from(keys) + valueMap.getOrElseUpdate(normalizedKeys, + Utils.nonNegativeMod(normalizedKeys.hashCode, numPartitions)) } } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index f1ef36bbf19c2..cfb514913694b 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -25,7 +25,8 @@ import scala.jdk.CollectionConverters._ import org.apache.avro.{Schema, SchemaNormalization} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.internal.config.History._ import org.apache.spark.internal.config.Kryo._ @@ -507,11 +508,11 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria // Used by Yarn in 1.1 and before sys.props.get("spark.driver.libraryPath").foreach { value => val warning = - s""" - |spark.driver.libraryPath was detected (set to '$value'). + log""" + |spark.driver.libraryPath was detected (set to '${MDC(LogKeys.CONFIG, value)}'). |This is deprecated in Spark 1.2+. 
| - |Please instead use: ${DRIVER_LIBRARY_PATH.key} + |Please instead use: ${MDC(LogKeys.CONFIG2, DRIVER_LIBRARY_PATH.key)} """.stripMargin logWarning(warning) } @@ -554,9 +555,13 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria val executorCores = get(EXECUTOR_CORES) val leftCores = totalCores % executorCores if (leftCores != 0) { - logWarning(s"Total executor cores: ${totalCores} is not " + - s"divisible by cores per executor: ${executorCores}, " + - s"the left cores: ${leftCores} will not be allocated") + logWarning(log"Total executor cores: " + + log"${MDC(LogKeys.NUM_EXECUTOR_CORES_TOTAL, totalCores)} " + + log"is not divisible by cores per executor: " + + log"${MDC(LogKeys.NUM_EXECUTOR_CORES, executorCores)}, " + + log"the left cores: " + + log"${MDC(LogKeys.NUM_EXECUTOR_CORES_REMAINING, leftCores)} " + + log"will not be allocated") } } @@ -640,7 +645,11 @@ private[spark] object SparkConf extends Logging { DeprecatedConfig("spark.blacklist.killBlacklistedExecutors", "3.1.0", "Please use spark.excludeOnFailure.killExcludedExecutors"), DeprecatedConfig("spark.yarn.blacklist.executor.launch.blacklisting.enabled", "3.1.0", - "Please use spark.yarn.executor.launch.excludeOnFailure.enabled") + "Please use spark.yarn.executor.launch.excludeOnFailure.enabled"), + DeprecatedConfig("spark.network.remoteReadNioBufferConversion", "3.5.2", + "Please open a JIRA ticket to report it if you need to use this configuration."), + DeprecatedConfig("spark.shuffle.unsafe.file.output.buffer", "4.0.0", + "Please use spark.shuffle.localDisk.file.output.buffer") ) Map(configs.map { cfg => (cfg.key -> cfg) } : _*) @@ -772,15 +781,20 @@ private[spark] object SparkConf extends Logging { def logDeprecationWarning(key: String): Unit = { deprecatedConfigs.get(key).foreach { cfg => logWarning( - s"The configuration key '$key' has been deprecated as of Spark ${cfg.version} and " + - s"may be removed in the future. ${cfg.deprecationMessage}") + log"The configuration key '${MDC(LogKeys.CONFIG, key)}' has been deprecated " + + log"as of Spark ${MDC(LogKeys.CONFIG_VERSION, cfg.version)} and " + + log"may be removed in the future. " + + log"${MDC(LogKeys.CONFIG_DEPRECATION_MESSAGE, cfg.deprecationMessage)}") return } allAlternatives.get(key).foreach { case (newKey, cfg) => logWarning( - s"The configuration key '$key' has been deprecated as of Spark ${cfg.version} and " + - s"may be removed in the future. Please use the new key '$newKey' instead.") + log"The configuration key '${MDC(LogKeys.CONFIG, key)}' " + + log"has been deprecated as of " + + log"Spark ${MDC(LogKeys.CONFIG_VERSION, cfg.version)} and " + + log"may be removed in the future. 
Please use the new key " + + log"'${MDC(LogKeys.CONFIG_KEY_UPDATED, newKey)}' instead.") return } } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 9d908cd8713ce..76138640dd2ae 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -28,14 +28,13 @@ import scala.collection.concurrent.{Map => ScalaConcurrentMap} import scala.collection.immutable import scala.collection.mutable.HashMap import scala.jdk.CollectionConverters._ -import scala.language.implicitConversions import scala.reflect.{classTag, ClassTag} import scala.util.control.NonFatal import com.google.common.collect.MapMaker import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} +import org.apache.hadoop.io.{BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} @@ -47,7 +46,7 @@ import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.executor.{Executor, ExecutorMetrics, ExecutorMetricsSource} import org.apache.spark.input.{FixedLengthBinaryInputFormat, PortableDataStream, StreamInputFormat, WholeTextFileInputFormat} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Tests._ import org.apache.spark.internal.config.UI._ @@ -88,6 +87,8 @@ class SparkContext(config: SparkConf) extends Logging { // The call site where this SparkContext was constructed. private val creationSite: CallSite = Utils.getCallSite() + private var stopSite: Option[CallSite] = None + if (!config.get(EXECUTOR_ALLOW_SPARK_CONTEXT)) { // In order to prevent SparkContext from being created in executors. 
SparkContext.assertOnDriver() @@ -117,6 +118,10 @@ class SparkContext(config: SparkConf) extends Logging { | |${creationSite.longForm} | + |And it was stopped at: + | + |${stopSite.getOrElse(CallSite.empty).longForm} + | |The currently active SparkContext was created at: | |$activeCreationSite @@ -194,10 +199,11 @@ class SparkContext(config: SparkConf) extends Logging { this(master, appName, sparkHome, jars, Map()) // log out Spark Version in Spark driver log - logInfo(s"Running Spark version $SPARK_VERSION") - logInfo(s"OS info ${System.getProperty("os.name")}, ${System.getProperty("os.version")}, " + - s"${System.getProperty("os.arch")}") - logInfo(s"Java version ${System.getProperty("java.version")}") + logInfo(log"Running Spark version ${MDC(LogKeys.SPARK_VERSION, SPARK_VERSION)}") + logInfo(log"OS info ${MDC(LogKeys.OS_NAME, System.getProperty("os.name"))}," + + log" ${MDC(LogKeys.OS_VERSION, System.getProperty("os.version"))}, " + + log"${MDC(LogKeys.OS_ARCH, System.getProperty("os.arch"))}") + logInfo(log"Java version ${MDC(LogKeys.JAVA_VERSION, System.getProperty("java.version"))}") /* ------------------------------------------------------------------------------------- * | Private variables. These variables keep the internal state of the context, and are | @@ -281,12 +287,7 @@ class SparkContext(config: SparkConf) extends Logging { conf: SparkConf, isLocal: Boolean, listenerBus: LiveListenerBus): SparkEnv = { - SparkEnv.createDriverEnv( - conf, - isLocal, - listenerBus, - SparkContext.numDriverCores(master, conf), - this) + SparkEnv.createDriverEnv(conf, isLocal, listenerBus, SparkContext.numDriverCores(master, conf)) } private[spark] def env: SparkEnv = _env @@ -420,7 +421,7 @@ class SparkContext(config: SparkConf) extends Logging { } // HADOOP-19097 Set fs.s3a.connection.establish.timeout to 30s // We can remove this after Apache Hadoop 3.4.1 releases - conf.setIfMissing("spark.hadoop.fs.s3a.connection.establish.timeout", "30s") + conf.setIfMissing("spark.hadoop.fs.s3a.connection.establish.timeout", "30000") // This should be set as early as possible. SparkContext.fillMissingMagicCommitterConfsIfNeeded(_conf) @@ -434,7 +435,7 @@ class SparkContext(config: SparkConf) extends Logging { logResourceInfo(SPARK_DRIVER_PREFIX, _resources) // log out spark.app.name in the Spark driver logs - logInfo(s"Submitted application: $appName") + logInfo(log"Submitted application: ${MDC(LogKeys.APP_NAME, appName)}") // System property spark.yarn.app.id must be set if user code ran by AM on a YARN cluster if (master == "yarn" && deployMode == "cluster" && !_conf.contains("spark.yarn.app.id")) { @@ -443,7 +444,7 @@ class SparkContext(config: SparkConf) extends Logging { } if (_conf.getBoolean("spark.logConf", false)) { - logInfo("Spark configuration:\n" + _conf.toDebugString) + logInfo(log"Spark configuration:\n${MDC(LogKeys.CONFIG, _conf.toDebugString)}") } // Set Spark driver host and port system properties. 
This explicitly sets the configuration @@ -595,6 +596,8 @@ class SparkContext(config: SparkConf) extends Logging { .foreach(logLevel => _schedulerBackend.updateExecutorsLogLevel(logLevel)) } + _conf.get(CHECKPOINT_DIR).foreach(setCheckpointDir) + val _executorMetricsSource = if (_conf.get(METRICS_EXECUTORMETRICS_SOURCE_ENABLED)) { Some(new ExecutorMetricsSource) @@ -742,15 +745,15 @@ class SparkContext(config: SparkConf) extends Logging { case Some(endpointRef) => Some(endpointRef.askSync[Array[ThreadStackTrace]](TriggerThreadDump)) case None => - logWarning(s"Executor $executorId might already have stopped and " + - "can not request thread dump from it.") + logWarning(log"Executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} " + + log"might already have stopped and can not request thread dump from it.") None } } } catch { case e: Exception => logError( - log"Exception getting thread dump from executor ${MDC(LogKey.EXECUTOR_ID, executorId)}", + log"Exception getting thread dump from executor ${MDC(LogKeys.EXECUTOR_ID, executorId)}", e) None } @@ -774,8 +777,8 @@ class SparkContext(config: SparkConf) extends Logging { case Some(endpointRef) => Some(endpointRef.askSync[Array[String]](TriggerHeapHistogram)) case None => - logWarning(s"Executor $executorId might already have stopped and " + - "can not request heap histogram from it.") + logWarning(log"Executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} " + + log"might already have stopped and can not request heap histogram from it.") None } } @@ -783,7 +786,7 @@ class SparkContext(config: SparkConf) extends Logging { case e: Exception => logError( log"Exception getting heap histogram from " + - log"executor ${MDC(LogKey.EXECUTOR_ID, executorId)}", e) + log"executor ${MDC(LogKeys.EXECUTOR_ID, executorId)}", e) None } } @@ -1699,7 +1702,8 @@ class SparkContext(config: SparkConf) extends Logging { "Can not directly broadcast RDDs; instead, call collect() and broadcast the result.") val bc = env.broadcastManager.newBroadcast[T](value, isLocal, serializedOnly) val callSite = getCallSite() - logInfo("Created broadcast " + bc.id + " from " + callSite.shortForm) + logInfo(log"Created broadcast ${MDC(LogKeys.BROADCAST_ID, bc.id)}" + + log" from ${MDC(LogKeys.CALL_SITE_SHORT_FORM, callSite.shortForm)}") cleaner.foreach(_.registerBroadcastForCleanup(bc)) bc } @@ -1780,8 +1784,9 @@ class SparkContext(config: SparkConf) extends Logging { val schemeCorrectedURI = uri.getScheme match { case null => new File(path).getCanonicalFile.toURI case "local" => - logWarning(s"File with 'local' scheme $path is not supported to add to file server, " + - s"since it is already available on every node.") + logWarning(log"File with 'local' scheme ${MDC(LogKeys.PATH, path)} " + + log"is not supported to add to file server, " + + log"since it is already available on every node.") return case _ => uri } @@ -1827,7 +1832,8 @@ class SparkContext(config: SparkConf) extends Logging { addedFiles .getOrElseUpdate(jobArtifactUUID, new ConcurrentHashMap[String, Long]().asScala) .putIfAbsent(key, timestamp).isEmpty) { - logInfo(s"Added file $path at $key with timestamp $timestamp") + logInfo(log"Added file ${MDC(LogKeys.PATH, path)} at ${MDC(LogKeys.KEY, key)} with" + + log" timestamp ${MDC(LogKeys.TIMESTAMP, timestamp)}") // Fetch the file locally so that closures which are run on the driver can still use the // SparkFiles API to access files. 
Utils.fetchFile(uri.toString, root, conf, hadoopConfiguration, timestamp, useCache = false) @@ -1839,7 +1845,8 @@ class SparkContext(config: SparkConf) extends Logging { .putIfAbsent( Utils.getUriBuilder(new URI(key)).fragment(uri.getFragment).build().toString, timestamp).isEmpty) { - logInfo(s"Added archive $path at $key with timestamp $timestamp") + logInfo(log"Added archive ${MDC(LogKeys.PATH, path)} at ${MDC(LogKeys.KEY, key)}" + + log" with timestamp ${MDC(LogKeys.TIMESTAMP, timestamp)}") // If the scheme is file, use URI to simply copy instead of downloading. val uriToUse = if (!isLocal && scheme == "file") uri else new URI(key) val uriToDownload = Utils.getUriBuilder(uriToUse).fragment(null).build() @@ -1849,13 +1856,16 @@ class SparkContext(config: SparkConf) extends Logging { root, if (uri.getFragment != null) uri.getFragment else source.getName) logInfo( - s"Unpacking an archive $path from ${source.getAbsolutePath} to ${dest.getAbsolutePath}") + log"Unpacking an archive ${MDC(LogKeys.PATH, path)}" + + log" from ${MDC(LogKeys.SOURCE_PATH, source.getAbsolutePath)}" + + log" to ${MDC(LogKeys.DESTINATION_PATH, dest.getAbsolutePath)}") Utils.deleteRecursively(dest) Utils.unpack(source, dest) postEnvironmentUpdate() } else { - logWarning(s"The path $path has been added already. Overwriting of added paths " + - "is not supported in the current version.") + logWarning(log"The path ${MDC(LogKeys.PATH, path)} " + + log"has been added already. Overwriting of added paths " + + log"is not supported in the current version.") } } @@ -2145,7 +2155,7 @@ class SparkContext(config: SparkConf) extends Logging { Seq(env.rpcEnv.fileServer.addJar(file)) } catch { case NonFatal(e) => - logError(log"Failed to add ${MDC(LogKey.PATH, path)} to Spark environment", e) + logError(log"Failed to add ${MDC(LogKeys.PATH, path)} to Spark environment", e) Nil } } @@ -2166,7 +2176,7 @@ class SparkContext(config: SparkConf) extends Logging { Seq(path) } catch { case NonFatal(e) => - logError(log"Failed to add ${MDC(LogKey.PATH, path)} to Spark environment", e) + logError(log"Failed to add ${MDC(LogKeys.PATH, path)} to Spark environment", e) Nil } } else { @@ -2209,14 +2219,21 @@ class SparkContext(config: SparkConf) extends Logging { .getOrElseUpdate(jobArtifactUUID, new ConcurrentHashMap[String, Long]().asScala) .putIfAbsent(_, timestamp).isEmpty) if (added.nonEmpty) { - val jarMessage = if (scheme != "ivy") "JAR" else "dependency jars of Ivy URI" - logInfo(s"Added $jarMessage $path at ${added.mkString(",")} with timestamp $timestamp") + val jarMessage = if (scheme != "ivy") { + log"Added JAR" + } else { + log"Added dependency jars of Ivy URI" + } + logInfo(jarMessage + log" ${MDC(LogKeys.PATH, path)}" + + log" at ${MDC(LogKeys.ADDED_JARS, added.mkString(","))}" + + log" with timestamp ${MDC(LogKeys.TIMESTAMP, timestamp)}") postEnvironmentUpdate() } if (existed.nonEmpty) { val jarMessage = if (scheme != "ivy") "JAR" else "dependency jars of Ivy URI" - logWarning(s"The $jarMessage $path at ${existed.mkString(",")} has been added already." + - " Overwriting of added jar is not supported in the current version.") + logWarning(log"The ${MDC(LogKeys.JAR_MESSAGE, jarMessage)} ${MDC(LogKeys.PATH, path)} " + + log"at ${MDC(LogKeys.EXISTING_PATH, existed.mkString(","))} has been added already." 
+ + log" Overwriting of added jar is not supported in the current version.") } } } @@ -2265,7 +2282,9 @@ class SparkContext(config: SparkConf) extends Logging { * @param exitCode Specified exit code that will passed to scheduler backend in client mode. */ def stop(exitCode: Int): Unit = { - logInfo(s"SparkContext is stopping with exitCode $exitCode.") + stopSite = Some(getCallSite()) + logInfo(log"SparkContext is stopping with exitCode ${MDC(LogKeys.EXIT_CODE, exitCode)}" + + log" from ${MDC(LogKeys.STOP_SITE_SHORT_FORM, stopSite.get.shortForm)}.") if (LiveListenerBus.withinListenerThread.value) { throw new SparkException(s"Cannot stop SparkContext within listener bus thread.") } @@ -2302,6 +2321,11 @@ class SparkContext(config: SparkConf) extends Logging { } _dagScheduler = null } + // In case there are still events being posted during the shutdown of plugins, + // invoke the shutdown of each plugin before the listenerBus is stopped. + Utils.tryLogNonFatalError { + _plugins.foreach(_.shutdown()) + } if (_listenerBusStarted) { Utils.tryLogNonFatalError { listenerBus.stop() @@ -2313,9 +2337,6 @@ class SparkContext(config: SparkConf) extends Logging { env.metricsSystem.report() } } - Utils.tryLogNonFatalError { - _plugins.foreach(_.shutdown()) - } Utils.tryLogNonFatalError { FallbackStorage.cleanUp(_conf, _hadoopConfiguration) } @@ -2429,9 +2450,10 @@ class SparkContext(config: SparkConf) extends Logging { } val callSite = getCallSite() val cleanedFunc = clean(func) - logInfo("Starting job: " + callSite.shortForm) + logInfo(log"Starting job: ${MDC(LogKeys.CALL_SITE_SHORT_FORM, callSite.shortForm)}") if (conf.getBoolean("spark.logLineage", false)) { - logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString) + logInfo(log"RDD's recursive dependencies:\n" + + log"${MDC(LogKeys.RDD_DEBUG_STRING, rdd.toDebugString)}") } dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get) progressBar.foreach(_.finishAll()) @@ -2550,13 +2572,14 @@ class SparkContext(config: SparkConf) extends Logging { timeout: Long): PartialResult[R] = { assertNotStopped() val callSite = getCallSite() - logInfo("Starting job: " + callSite.shortForm) - val start = System.nanoTime + logInfo(log"Starting job: ${MDC(LogKeys.CALL_SITE_SHORT_FORM, callSite.shortForm)}") + val start = System.currentTimeMillis() val cleanedFunc = clean(func) val result = dagScheduler.runApproximateJob(rdd, cleanedFunc, evaluator, callSite, timeout, localProperties.get) logInfo( - "Job finished: " + callSite.shortForm + ", took " + (System.nanoTime - start) / 1e9 + " s") + log"Job finished: ${MDC(LogKeys.CALL_SITE_SHORT_FORM, callSite.shortForm)}," + + log" took ${MDC(LogKeys.TOTAL_TIME, System.currentTimeMillis() - start)}ms") result } @@ -2736,9 +2759,9 @@ class SparkContext(config: SparkConf) extends Logging { // its own local file system, which is incorrect because the checkpoint files // are actually on the executor machines. if (!isLocal && Utils.nonLocalPaths(directory).isEmpty) { - logWarning("Spark is not running in local mode, therefore the checkpoint directory " + - s"must not be on the local filesystem. Directory '$directory' " + - "appears to be on the local filesystem.") + logWarning(log"Spark is not running in local mode, therefore the checkpoint directory " + + log"must not be on the local filesystem. 
Directory '${MDC(LogKeys.PATH, directory)}' " + + log"appears to be on the local filesystem.") } checkpointDir = Option(directory).map { dir => @@ -2784,7 +2807,8 @@ class SparkContext(config: SparkConf) extends Logging { val listeners = Utils.loadExtensions(classOf[SparkListenerInterface], classNames, conf) listeners.foreach { listener => listenerBus.addToSharedQueue(listener) - logInfo(s"Registered listener ${listener.getClass().getName()}") + logInfo(log"Registered listener" + + log"${MDC(LogKeys.CLASS_NAME, listener.getClass().getName())}") } } } catch { @@ -2899,10 +2923,11 @@ object SparkContext extends Logging { // its creationSite field being null: val otherContextCreationSite = Option(otherContext.creationSite).map(_.longForm).getOrElse("unknown location") - val warnMsg = "Another SparkContext is being constructed (or threw an exception in its" + - " constructor). This may indicate an error, since only one SparkContext should be" + - " running in this JVM (see SPARK-2243)." + - s" The other SparkContext was created at:\n$otherContextCreationSite" + val warnMsg = log"Another SparkContext is being constructed (or threw an exception in its" + + log" constructor). This may indicate an error, since only one SparkContext should be" + + log" running in this JVM (see SPARK-2243)." + + log" The other SparkContext was created at:\n" + + log"${MDC(LogKeys.CREATION_SITE, otherContextCreationSite)}" logWarning(warnMsg) } } @@ -3039,14 +3064,6 @@ object SparkContext extends Logging { } } - private implicit def arrayToArrayWritable[T <: Writable : ClassTag](arr: Iterable[T]) - : ArrayWritable = { - def anyToWritable[U <: Writable](u: U): Writable = u - - new ArrayWritable(classTag[T].runtimeClass.asInstanceOf[Class[Writable]], - arr.map(x => anyToWritable(x)).toArray) - } - /** * Find the JAR from which a given class was loaded, to make it easy for users to pass * their JARs to SparkContext. 
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 50d0358004d40..de2d215562b9f 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -32,7 +32,8 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.python.{PythonWorker, PythonWorkerFactory} import org.apache.spark.broadcast.BroadcastManager import org.apache.spark.executor.ExecutorBackend -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.memory.{MemoryManager, UnifiedMemoryManager} import org.apache.spark.metrics.{MetricsSystem, MetricsSystemInstances} @@ -130,7 +131,8 @@ class SparkEnv ( Utils.deleteRecursively(new File(path)) } catch { case e: Exception => - logWarning(s"Exception while deleting Spark temp dir: $path", e) + logWarning(log"Exception while deleting Spark temp dir: " + + log"${MDC(LogKeys.PATH, path)}", e) } case None => // We just need to delete tmp dir created by driver, so do nothing on executor } @@ -142,7 +144,7 @@ class SparkEnv ( workerModule: String, daemonModule: String, envVars: Map[String, String], - useDaemon: Boolean): (PythonWorker, Option[Long]) = { + useDaemon: Boolean): (PythonWorker, Option[Int]) = { synchronized { val key = PythonWorkersKey(pythonExec, workerModule, daemonModule, envVars) val workerFactory = pythonWorkers.getOrElseUpdate(key, new PythonWorkerFactory( @@ -161,7 +163,7 @@ class SparkEnv ( pythonExec: String, workerModule: String, envVars: Map[String, String], - useDaemon: Boolean): (PythonWorker, Option[Long]) = { + useDaemon: Boolean): (PythonWorker, Option[Int]) = { createPythonWorker( pythonExec, workerModule, PythonWorkerFactory.defaultDaemonModule, envVars, useDaemon) } @@ -170,7 +172,7 @@ class SparkEnv ( pythonExec: String, workerModule: String, daemonModule: String, - envVars: Map[String, String]): (PythonWorker, Option[Long]) = { + envVars: Map[String, String]): (PythonWorker, Option[Int]) = { val useDaemon = conf.get(Python.PYTHON_USE_DAEMON) createPythonWorker( pythonExec, workerModule, daemonModule, envVars, useDaemon) @@ -256,7 +258,6 @@ object SparkEnv extends Logging { isLocal: Boolean, listenerBus: LiveListenerBus, numCores: Int, - sparkContext: SparkContext, mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { assert(conf.contains(DRIVER_HOST_ADDRESS), s"${DRIVER_HOST_ADDRESS.key} is not set on the driver!") @@ -279,7 +280,6 @@ object SparkEnv extends Logging { numCores, ioEncryptionKey, listenerBus = listenerBus, - Option(sparkContext), mockOutputCommitCoordinator = mockOutputCommitCoordinator ) } @@ -315,7 +315,6 @@ object SparkEnv extends Logging { /** * Helper method to create a SparkEnv for a driver or an executor. 
*/ - // scalastyle:off argcount private def create( conf: SparkConf, executorId: String, @@ -326,9 +325,7 @@ object SparkEnv extends Logging { numUsableCores: Int, ioEncryptionKey: Option[Array[Byte]], listenerBus: LiveListenerBus = null, - sc: Option[SparkContext] = None, mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { - // scalastyle:on argcount val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER @@ -471,12 +468,7 @@ object SparkEnv extends Logging { } val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse { - if (isDriver) { - new OutputCommitCoordinator(conf, isDriver, sc) - } else { - new OutputCommitCoordinator(conf, isDriver) - } - + new OutputCommitCoordinator(conf, isDriver) } val outputCommitCoordinatorRef = registerOrLookupEndpoint("OutputCommitCoordinator", new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator)) diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index e433cc10ae731..8167952d6b87f 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -25,7 +25,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.LISTENER +import org.apache.spark.internal.LogKeys.LISTENER import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala index fb0584b458463..7737822f2af2b 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala @@ -24,7 +24,7 @@ import java.nio.file.Files import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CLASS_NAME, PATH} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, PATH} /** * Process that starts a Py4J server on an ephemeral port. 
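Callers of the SparkEnv API above now receive the worker pid as Option[Int] rather than Option[Long]. A small sketch of consuming the changed signature (a sketch only: createPythonWorker is internal to Spark, and the argument values are placeholders):

// Assumes code living inside the org.apache.spark package with a live SparkEnv.
val env = SparkEnv.get
val (worker, pid: Option[Int]) = env.createPythonWorker(
  "python3", "pyspark.worker", Map.empty[String, String], useDaemon = true)
// The pid fits in an Int because the worker/daemon protocol exchanges it as a 32-bit
// value (see the readInt/writeInt calls in PythonWorkerFactory further down).
pid.foreach(p => require(p >= 0, s"unexpected negative worker pid: $p"))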
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala
index 4f7c5bc0b0c05..5e2b5553f3dca 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala
@@ -26,7 +26,7 @@ import org.apache.hadoop.io._
 import org.apache.spark.SparkException
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.internal.{Logging, MDC}
-import org.apache.spark.internal.LogKey.CLASS_NAME
+import org.apache.spark.internal.LogKeys.CLASS_NAME
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.{SerializableConfiguration, Utils}
 
@@ -45,7 +45,7 @@ private[python] object Converter extends Logging {
     converterClass.map { cc =>
       Try {
         val c = Utils.classForName[Converter[T, U]](cc).getConstructor().newInstance()
-        logInfo(s"Loaded converter: $cc")
+        logInfo(log"Loaded converter: ${MDC(CLASS_NAME, cc)}")
         c
       } match {
         case Success(c) => c
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 5aa080b5fb291..d643983ef5dfe 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -37,7 +37,8 @@ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
 import org.apache.spark.api.python.PythonFunction.PythonAccumulator
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.input.PortableDataStream
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKeys.{HOST, PORT}
 import org.apache.spark.internal.config.BUFFER_SIZE
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.rdd.RDD
@@ -733,7 +734,8 @@ private[spark] class PythonAccumulatorV2(
   private def openSocket(): Socket = synchronized {
     if (socket == null || socket.isClosed) {
       socket = new Socket(serverHost, serverPort)
-      logInfo(s"Connected to AccumulatorServer at host: $serverHost port: $serverPort")
+      logInfo(log"Connected to AccumulatorServer at host: ${MDC(HOST, serverHost)}" +
+        log" port: ${MDC(PORT, serverPort)}")
       // send the secret just for the initial authentication when opening a new connection
       socket.getOutputStream.write(secretToken.getBytes(StandardCharsets.UTF_8))
     }
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
index 17cb0c5a55ddf..b2571ffddc577 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
@@ -31,7 +31,8 @@ import scala.util.control.NonFatal
 
 import org.apache.spark._
 import org.apache.spark.api.python.PythonFunction.PythonAccumulator
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKeys.TASK_NAME
 import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES, Python}
 import org.apache.spark.internal.config.Python._
 import org.apache.spark.rdd.InputFileBlockHolder
@@ -88,7 +89,7 @@ private object BasePythonRunner {
 
   private lazy val faultHandlerLogDir = Utils.createTempDir(namePrefix = "faulthandler")
 
-  private def faultHandlerLogPath(pid: Long): Path = {
+  private def faultHandlerLogPath(pid: Int): Path = {
     new File(faultHandlerLogDir,
pid.toString).toPath } } @@ -204,7 +205,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( envVars.put("SPARK_JOB_ARTIFACT_UUID", jobArtifactUUID.getOrElse("default")) - val (worker: PythonWorker, pid: Option[Long]) = env.createPythonWorker( + val (worker: PythonWorker, pid: Option[Int]) = env.createPythonWorker( pythonExec, workerModule, daemonModule, envVars.asScala.toMap) // Whether is the worker released into idle pool or closed. When any codes try to release or // close a worker, they should use `releasedOrClosed.compareAndSet` to flip the state to make @@ -257,7 +258,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( startTime: Long, env: SparkEnv, worker: PythonWorker, - pid: Option[Long], + pid: Option[Int], releasedOrClosed: AtomicBoolean, context: TaskContext): Iterator[OUT] @@ -465,7 +466,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( startTime: Long, env: SparkEnv, worker: PythonWorker, - pid: Option[Long], + pid: Option[Int], releasedOrClosed: AtomicBoolean, context: TaskContext) extends Iterator[OUT] { @@ -592,7 +593,8 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( // Mimic the task name used in `Executor` to help the user find out the task to blame. val taskName = s"${context.partitionId()}.${context.attemptNumber()} " + s"in stage ${context.stageId()} (TID ${context.taskAttemptId()})" - logWarning(s"Incomplete task $taskName interrupted: Attempting to kill Python Worker") + logWarning(log"Incomplete task ${MDC(TASK_NAME, taskName)} " + + log"interrupted: Attempting to kill Python Worker") env.destroyPythonWorker( pythonExec, workerModule, daemonModule, envVars.asScala.toMap, worker) } catch { @@ -842,7 +844,7 @@ private[spark] class PythonRunner( startTime: Long, env: SparkEnv, worker: PythonWorker, - pid: Option[Long], + pid: Option[Int], releasedOrClosed: AtomicBoolean, context: TaskContext): Iterator[Array[Byte]] = { new ReaderIterator( diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 26c790a124470..045ed0e4c01cb 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -28,7 +28,8 @@ import scala.sys.process.Process import org.apache.spark.{SparkContext, SparkEnv} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{PATH, PYTHON_PACKAGES, PYTHON_VERSION} import org.apache.spark.util.ArrayImplicits.SparkArrayOps import org.apache.spark.util.Utils @@ -122,11 +123,11 @@ private[spark] object PythonUtils extends Logging { PythonUtils.sparkPythonPath, sys.env.getOrElse("PYTHONPATH", "")) val environment = Map("PYTHONPATH" -> pythonPath) - logInfo(s"Python path $pythonPath") + logInfo(log"Python path ${MDC(PATH, pythonPath)}") val processPythonVer = Process(pythonVersionCMD, None, environment.toSeq: _*) val output = runCommand(processPythonVer) - logInfo(s"Python version: ${output.getOrElse("Unable to determine")}") + logInfo(log"Python version: ${MDC(PYTHON_VERSION, output.getOrElse("Unable to determine"))}") val pythonCode = """ @@ -146,7 +147,8 @@ private[spark] object PythonUtils extends Logging { def formatOutput(output: String): String = { output.replaceAll("\\s+", ", ") } - listOfPackages.foreach(x => logInfo(s"List of Python packages :- ${formatOutput(x)}")) + 
listOfPackages.foreach(x => logInfo(log"List of Python packages :-" + + log" ${MDC(PYTHON_PACKAGES, formatOutput(x))}")) } } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala index eb740b72987c8..3221a4900f6ad 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala @@ -30,7 +30,8 @@ import scala.jdk.CollectionConverters._ import org.apache.spark._ import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.security.SocketAuthHelper import org.apache.spark.util.{RedirectThread, Utils} @@ -92,7 +93,7 @@ private[spark] class PythonWorkerFactory( envVars.getOrElse("PYTHONPATH", ""), sys.env.getOrElse("PYTHONPATH", "")) - def create(): (PythonWorker, Option[Long]) = { + def create(): (PythonWorker, Option[Int]) = { if (useDaemon) { self.synchronized { // Pull from idle workers until we one that is alive, otherwise create a new one. @@ -102,12 +103,13 @@ private[spark] class PythonWorkerFactory( if (workerHandle.isAlive()) { try { worker.selectionKey.interestOps(SelectionKey.OP_READ | SelectionKey.OP_WRITE) - return (worker, Some(workerHandle.pid())) + return (worker, Some(workerHandle.pid().toInt)) } catch { case c: CancelledKeyException => /* pass */ } } - logWarning(s"Worker ${worker} process from idle queue is dead, discarding.") + logWarning(log"Worker ${MDC(WORKER, worker)} " + + log"process from idle queue is dead, discarding.") stopWorker(worker) } } @@ -122,9 +124,9 @@ private[spark] class PythonWorkerFactory( * processes itself to avoid the high cost of forking from Java. This currently only works * on UNIX-based systems. */ - private def createThroughDaemon(): (PythonWorker, Option[Long]) = { + private def createThroughDaemon(): (PythonWorker, Option[Int]) = { - def createWorker(): (PythonWorker, Option[Long]) = { + def createWorker(): (PythonWorker, Option[Int]) = { val socketChannel = SocketChannel.open(new InetSocketAddress(daemonHost, daemonPort)) // These calls are blocking. val pid = new DataInputStream(Channels.newInputStream(socketChannel)).readInt() @@ -165,7 +167,7 @@ private[spark] class PythonWorkerFactory( /** * Launch a worker by executing worker.py (by default) directly and telling it to connect to us. 
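The pid handshake referenced throughout these worker-factory changes is a plain 32-bit read off the daemon socket, which is what makes Option[Int] sufficient. A reduced sketch of that read (error handling trimmed; not the actual factory code):

import java.io.DataInputStream
import java.nio.channels.{Channels, SocketChannel}

// Reads the pid the Python daemon/worker writes back after connecting; a negative value
// signals a failed launch, matching the check in createSimpleWorker below.
def readWorkerPid(socketChannel: SocketChannel): Int = {
  val pid = new DataInputStream(Channels.newInputStream(socketChannel)).readInt()
  if (pid < 0) {
    throw new IllegalStateException("Python failed to launch worker with code " + pid)
  }
  pid
}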
*/ - private[spark] def createSimpleWorker(blockingMode: Boolean): (PythonWorker, Option[Long]) = { + private[spark] def createSimpleWorker(blockingMode: Boolean): (PythonWorker, Option[Int]) = { var serverSocketChannel: ServerSocketChannel = null try { serverSocketChannel = ServerSocketChannel.open() @@ -209,7 +211,8 @@ private[spark] class PythonWorkerFactory( "Timed out while waiting for the Python worker to connect back") } authHelper.authClient(socketChannel.socket()) - val pid = workerProcess.toHandle.pid() + // TODO: When we drop JDK 8, we can just use workerProcess.pid() + val pid = new DataInputStream(Channels.newInputStream(socketChannel)).readInt() if (pid < 0) { throw new IllegalStateException("Python failed to launch worker with code " + pid) } @@ -405,7 +408,7 @@ private[spark] class PythonWorkerFactory( daemonWorkers.get(worker).foreach { processHandle => // tell daemon to kill worker by pid val output = new DataOutputStream(daemon.getOutputStream) - output.writeLong(processHandle.pid()) + output.writeInt(processHandle.pid().toInt) output.flush() daemon.getOutputStream.flush() } diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 6a46b611019fa..0fe57dd0bb0ae 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -28,7 +28,8 @@ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.SparkException import org.apache.spark.api.java.JavaRDD -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.RDD import org.apache.spark.util.ArrayImplicits._ @@ -143,22 +144,26 @@ private[spark] object SerDeUtil extends Logging { } (kt, vt) match { case (Failure(kf), Failure(vf)) => - logWarning(s""" - |Failed to pickle Java object as key: ${t._1.getClass.getSimpleName}, falling back - |to 'toString'. Error: ${kf.getMessage}""".stripMargin) - logWarning(s""" - |Failed to pickle Java object as value: ${t._2.getClass.getSimpleName}, falling back - |to 'toString'. Error: ${vf.getMessage}""".stripMargin) + logWarning(log""" + |Failed to pickle Java object as key: + |${MDC(CLASS_NAME, t._1.getClass.getSimpleName)}, falling back + |to 'toString'. Error: ${MDC(ERROR, kf.getMessage)}""".stripMargin) + logWarning(log""" + |Failed to pickle Java object as value: + |${MDC(CLASS_NAME, t._2.getClass.getSimpleName)}, falling back + |to 'toString'. Error: ${MDC(ERROR, vf.getMessage)}""".stripMargin) (true, true) case (Failure(kf), _) => - logWarning(s""" - |Failed to pickle Java object as key: ${t._1.getClass.getSimpleName}, falling back - |to 'toString'. Error: ${kf.getMessage}""".stripMargin) + logWarning(log""" + |Failed to pickle Java object as key: + |${MDC(CLASS_NAME, t._1.getClass.getSimpleName)}, falling back + |to 'toString'. Error: ${MDC(ERROR, kf.getMessage)}""".stripMargin) (true, false) case (_, Failure(vf)) => - logWarning(s""" - |Failed to pickle Java object as value: ${t._2.getClass.getSimpleName}, falling back - |to 'toString'. Error: ${vf.getMessage}""".stripMargin) + logWarning(log""" + |Failed to pickle Java object as value: + |${MDC(CLASS_NAME, t._2.getClass.getSimpleName)}, falling back + |to 'toString'. 
Error: ${MDC(ERROR, vf.getMessage)}""".stripMargin) (false, true) case _ => (false, false) diff --git a/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala index de01a706b3f6c..0ff2b79ab6623 100644 --- a/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/StreamingPythonRunner.scala @@ -21,8 +21,9 @@ import java.io.{BufferedInputStream, BufferedOutputStream, DataInputStream, Data import scala.jdk.CollectionConverters._ -import org.apache.spark.SparkEnv -import org.apache.spark.internal.Logging +import org.apache.spark.{SparkEnv, SparkPythonException} +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{PYTHON_EXEC, PYTHON_WORKER_MODULE, PYTHON_WORKER_RESPONSE, SESSION_ID} import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.internal.config.Python.PYTHON_AUTH_SOCKET_TIMEOUT @@ -58,7 +59,8 @@ private[spark] class StreamingPythonRunner( * to be used with the functions. */ def init(): (DataOutputStream, DataInputStream) = { - logInfo(s"Initializing Python runner (session: $sessionId, pythonExec: $pythonExec)") + logInfo(log"Initializing Python runner (session: ${MDC(SESSION_ID, sessionId)}," + + log" pythonExec: ${MDC(PYTHON_EXEC, pythonExec)})") val env = SparkEnv.get val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") @@ -91,16 +93,34 @@ private[spark] class StreamingPythonRunner( new BufferedInputStream(pythonWorker.get.channel.socket().getInputStream, bufferSize)) val resFromPython = dataIn.readInt() - logInfo(s"Runner initialization succeeded (returned $resFromPython).") + if (resFromPython != 0) { + val errMessage = PythonWorkerUtils.readUTF(dataIn) + throw streamingPythonRunnerInitializationFailure(resFromPython, errMessage) + } + logInfo(log"Runner initialization succeeded (returned" + + log" ${MDC(PYTHON_WORKER_RESPONSE, resFromPython)}).") (dataOut, dataIn) } + def streamingPythonRunnerInitializationFailure(resFromPython: Int, errMessage: String): + StreamingPythonRunnerInitializationException = { + new StreamingPythonRunnerInitializationException(resFromPython, errMessage) + } + + class StreamingPythonRunnerInitializationException(resFromPython: Int, errMessage: String) + extends SparkPythonException( + errorClass = "STREAMING_PYTHON_RUNNER_INITIALIZATION_FAILURE", + messageParameters = Map( + "resFromPython" -> resFromPython.toString, + "msg" -> errMessage)) + /** * Stops the Python worker. 
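With the check above, init() now fails fast on a non-zero response instead of proceeding with broken streams. A hedged sketch of how a caller might handle the new failure (the wrapper class and chosen log key are illustrative, and StreamingPythonRunner is a Spark-internal API):

import org.apache.spark.SparkPythonException
import org.apache.spark.api.python.StreamingPythonRunner
import org.apache.spark.internal.{Logging, LogKeys, MDC}

// Hypothetical wrapper; only the error handling around init() mirrors the change above.
class RunnerBootstrap(runner: StreamingPythonRunner) extends Logging {
  def start(): Unit = {
    try {
      val (dataOut, dataIn) = runner.init()
      // ... drive the Python worker through dataOut/dataIn ...
    } catch {
      case e: SparkPythonException =>
        // Covers STREAMING_PYTHON_RUNNER_INITIALIZATION_FAILURE raised by init().
        logError(log"Python runner failed to initialize: " +
          log"${MDC(LogKeys.ERROR, e.getMessage)}", e)
        throw e
    }
  }
}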
*/ def stop(): Unit = { - logInfo(s"Stopping streaming runner for sessionId: $sessionId, module: $workerModule.") + logInfo(log"Stopping streaming runner for sessionId: ${MDC(SESSION_ID, sessionId)}," + + log" module: ${MDC(PYTHON_WORKER_MODULE, workerModule)}.") try { pythonWorkerFactory.foreach { factory => diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index 1a05c8f35b7fb..c3d01ec47458e 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -27,7 +27,7 @@ import io.netty.handler.timeout.ReadTimeoutException import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.api.r.SerDe._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{METHOD_NAME, OBJECT_ID} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.R._ import org.apache.spark.util.{ThreadUtils, Utils} import org.apache.spark.util.ArrayImplicits._ @@ -155,10 +155,11 @@ private[r] class RBackendHandler(server: RBackend) args) if (index.isEmpty) { - logWarning(s"cannot find matching method ${cls}.$methodName. " - + s"Candidates are:") + logWarning(log"cannot find matching method " + + log"${MDC(CLASS_NAME, cls)}.${MDC(METHOD_NAME, methodName)}. Candidates are:") selectedMethods.foreach { method => - logWarning(s"$methodName(${method.getParameterTypes.mkString(",")})") + logWarning(log"${MDC(METHOD_NAME, methodName)}(" + + log"${MDC(METHOD_PARAM_TYPES, method.getParameterTypes.mkString(","))})") } throw new Exception(s"No matched method found for $cls.$methodName") } @@ -176,10 +177,11 @@ private[r] class RBackendHandler(server: RBackend) args) if (index.isEmpty) { - logWarning(s"cannot find matching constructor for ${cls}. " - + s"Candidates are:") + logWarning(log"cannot find matching constructor for ${MDC(CLASS_NAME, cls)}. 
" + + log"Candidates are:") ctors.foreach { ctor => - logWarning(s"$cls(${ctor.getParameterTypes.mkString(",")})") + logWarning(log"${MDC(CLASS_NAME, cls)}(" + + log"${MDC(METHOD_PARAM_TYPES, ctor.getParameterTypes.mkString(","))})") } throw new Exception(s"No matched constructor found for $cls") } diff --git a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala index 445b7d4d7aa06..3adb540a7ad18 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala @@ -22,7 +22,7 @@ import java.io.Serializable import scala.reflect.ClassTag import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.util.Utils /** @@ -106,7 +106,8 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Lo assertValid() _isValid = false _destroySite = Utils.getCallSite().shortForm - logInfo("Destroying %s (from %s)".format(toString, _destroySite)) + logInfo(log"Destroying ${MDC(LogKeys.BROADCAST, toString)} " + + log"(from ${MDC(LogKeys.CALL_SITE_SHORT_FORM, _destroySite)})") doDestroy(blocking) } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index b6ba9bbf29f30..0c7ec5c1a98a7 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -28,7 +28,7 @@ import scala.util.Random import org.apache.spark._ import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.BROADCAST_ID +import org.apache.spark.internal.LogKeys._ import org.apache.spark.io.CompressionCodec import org.apache.spark.serializer.Serializer import org.apache.spark.storage._ @@ -278,11 +278,12 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long, serializedO } case None => val estimatedTotalSize = Utils.bytesToString(numBlocks.toLong * blockSize) - logInfo(s"Started reading broadcast variable $id with $numBlocks pieces " + - s"(estimated total size $estimatedTotalSize)") + logInfo(log"Started reading broadcast variable ${MDC(BROADCAST_ID, id)} with ${MDC(NUM_BROADCAST_BLOCK, numBlocks)} pieces " + + log"(estimated total size ${MDC(NUM_BYTES, estimatedTotalSize)})") val startTimeNs = System.nanoTime() val blocks = readBlocks() - logInfo(s"Reading broadcast variable $id took ${Utils.getUsedTimeNs(startTimeNs)}") + logInfo(log"Reading broadcast variable ${MDC(BROADCAST_ID, id)}" + + log" took ${MDC(TOTAL_TIME, Utils.getUsedTimeNs(startTimeNs))}") try { val obj = TorrentBroadcast.unBlockifyObject[T]( diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index d38f94fd1ac26..226a6dcd36a16 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -32,7 +32,7 @@ import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.master.{DriverState, Master} import org.apache.spark.deploy.master.DriverState.DriverState import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.{DRIVER_ID, ERROR, HOST_PORT} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.Network.RPC_ASK_TIMEOUT import 
org.apache.spark.resource.ResourceUtils import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} @@ -135,7 +135,8 @@ private class ClientEndpoint( masterEndpoint.ask[T](message).onComplete { case Success(v) => self.send(v) case Failure(e) => - logWarning(s"Error sending messages to master $masterEndpoint", e) + logWarning(log"Error sending messages to master " + + log"${MDC(MASTER_URL, masterEndpoint)}", e) }(forwardMessageExecutionContext) } } @@ -163,11 +164,12 @@ private class ClientEndpoint( // logs again when waitAppCompletion is set to true if (!driverStatusReported) { driverStatusReported = true - logInfo(s"State of $submittedDriverID is ${state.get}") + logInfo(log"State of ${MDC(DRIVER_ID, submittedDriverID)}" + + log" is ${MDC(DRIVER_STATE, state.get)}") // Worker node, if present (workerId, workerHostPort, state) match { case (Some(id), Some(hostPort), Some(DriverState.RUNNING)) => - logInfo(s"Driver running on $hostPort ($id)") + logInfo(log"Driver running on ${MDC(HOST, hostPort)} (${MDC(WORKER_ID, id)})") case _ => } } @@ -180,17 +182,18 @@ private class ClientEndpoint( state.get match { case DriverState.FINISHED | DriverState.FAILED | DriverState.ERROR | DriverState.KILLED => - logInfo(s"State of driver $submittedDriverID is ${state.get}, " + - s"exiting spark-submit JVM.") + logInfo(log"State of driver ${MDC(DRIVER_ID, submittedDriverID)}" + + log" is ${MDC(DRIVER_STATE, state.get)}, exiting spark-submit JVM.") System.exit(0) case _ => if (!waitAppCompletion) { - logInfo(s"spark-submit not configured to wait for completion, " + - s"exiting spark-submit JVM.") + logInfo("spark-submit not configured to wait for completion, " + + " exiting spark-submit JVM.") System.exit(0) } else { - logDebug(s"State of driver $submittedDriverID is ${state.get}, " + - s"continue monitoring driver status.") + logDebug(log"State of driver ${MDC(DRIVER_ID, submittedDriverID)}" + + log" is ${MDC(DRIVER_STATE, state.get)}, " + + log"continue monitoring driver status.") } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/DriverTimeoutPlugin.scala b/core/src/main/scala/org/apache/spark/deploy/DriverTimeoutPlugin.scala index 9b141d6075721..736c23556ec15 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DriverTimeoutPlugin.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DriverTimeoutPlugin.scala @@ -23,7 +23,8 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.DRIVER_TIMEOUT import org.apache.spark.util.{SparkExitCode, ThreadUtils} @@ -48,8 +49,9 @@ class DriverTimeoutDriverPlugin extends DriverPlugin with Logging { logWarning("Disabled with the timeout value 0.") } else { val task: Runnable = () => { - logWarning(s"Terminate Driver JVM because it runs after $timeout minute" + - (if (timeout == 1) "" else "s")) + logWarning(log"Terminate Driver JVM because it runs after " + + log"${MDC(TIME_UNITS, timeout)} minute" + + (if (timeout == 1) log"" else log"s")) // We cannot use 'SparkContext.stop' because SparkContext might be in abnormal situation. 
System.exit(SparkExitCode.DRIVER_TIMEOUT) } diff --git a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala index a56fbd5a644ae..851fb453fd092 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala @@ -23,7 +23,8 @@ import java.util.concurrent.CountDownLatch import scala.jdk.CollectionConverters._ import org.apache.spark.{SecurityManager, SparkConf} -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys.{AUTH_ENABLED, PORT, SHUFFLE_DB_BACKEND_KEY, SHUFFLE_DB_BACKEND_NAME} import org.apache.spark.metrics.{MetricsSystem, MetricsSystemInstances} import org.apache.spark.network.TransportContext import org.apache.spark.network.crypto.AuthServerBootstrap @@ -70,8 +71,8 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana if (localDirs.length >= 1) { new File(localDirs.find(new File(_, dbName).exists()).getOrElse(localDirs(0)), dbName) } else { - logWarning(s"'spark.local.dir' should be set first when we use db in " + - s"ExternalShuffleService. Note that this only affects standalone mode.") + logWarning("'spark.local.dir' should be set first when we use db in " + + "ExternalShuffleService. Note that this only affects standalone mode.") null } } @@ -86,8 +87,8 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana if (sparkConf.get(config.SHUFFLE_SERVICE_DB_ENABLED) && enabled) { val shuffleDBName = sparkConf.get(config.SHUFFLE_SERVICE_DB_BACKEND) val dbBackend = DBBackend.byName(shuffleDBName) - logInfo(s"Use ${dbBackend.name()} as the implementation of " + - s"${config.SHUFFLE_SERVICE_DB_BACKEND.key}") + logInfo(log"Use ${MDC(SHUFFLE_DB_BACKEND_NAME, dbBackend.name())} as the implementation of " + + log"${MDC(SHUFFLE_DB_BACKEND_KEY, config.SHUFFLE_SERVICE_DB_BACKEND.key)}") new ExternalBlockHandler(conf, findRegisteredExecutorsDBFile(dbBackend.fileName(registeredExecutorsDB))) } else { @@ -106,7 +107,8 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana def start(): Unit = { require(server == null, "Shuffle server already started") val authEnabled = securityManager.isAuthenticationEnabled() - logInfo(s"Starting shuffle service on port $port (auth enabled = $authEnabled)") + logInfo(log"Starting shuffle service on port ${MDC(PORT, port)}" + + log" (auth enabled = ${MDC(AUTH_ENABLED, authEnabled)})") val bootstraps: Seq[TransportServerBootstrap] = if (authEnabled) { Seq(new AuthServerBootstrap(transportConf, securityManager)) diff --git a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala index 9c57269b28f47..263b1a233b808 100644 --- a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala +++ b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala @@ -24,7 +24,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils @@ -51,7 +51,8 @@ class LocalSparkCluster private ( private val workerDirs = 
ArrayBuffer[String]() def start(): Array[String] = { - logInfo("Starting a local Spark cluster with " + numWorkers + " workers.") + logInfo(log"Starting a local Spark cluster with " + + log"${MDC(LogKeys.NUM_WORKERS, numWorkers)} workers.") // Disable REST server on Master in this mode unless otherwise specified val _conf = conf.clone() diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala index 1a1a680c7faf5..5d996381a485e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -27,13 +27,15 @@ import scala.jdk.CollectionConverters._ import com.google.common.io.{ByteStreams, Files} import org.apache.spark.api.r.RUtils -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{LogEntry, Logging, MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.{RedirectThread, Utils} private[deploy] object RPackageUtils extends Logging { /** The key in the MANIFEST.mf that we look for, in case a jar contains R code. */ private final val hasRPackage = "Spark-HasRPackage" + private final val hasRPackageMDC = MDC(CONFIG, hasRPackage) /** Base of the shell command used in order to install R packages. */ private final val baseInstallCmd = Seq("R", "CMD", "INSTALL", "-l") @@ -42,11 +44,11 @@ private[deploy] object RPackageUtils extends Logging { private final val RJarEntries = "R/pkg" /** Documentation on how the R source file layout should be in the jar. */ - private[deploy] final val RJarDoc = - s"""In order for Spark to build R packages that are parts of Spark Packages, there are a few + private[deploy] final val RJarDoc: MessageWithContext = + log"""In order for Spark to build R packages that are parts of Spark Packages, there are a few |requirements. The R source code must be shipped in a jar, with additional Java/Scala |classes. The jar must be in the following format: - | 1- The Manifest (META-INF/MANIFEST.mf) must contain the key-value: $hasRPackage: true + | 1- The Manifest (META-INF/MANIFEST.mf) must contain the key-value: $hasRPackageMDC: true | 2- The standard R package layout must be preserved under R/pkg/ inside the jar. More | information on the standard R package layout can be found in: | http://cran.r-project.org/doc/contrib/Leisch-CreatingPackages.pdf @@ -61,18 +63,17 @@ private[deploy] object RPackageUtils extends Logging { |R/pkg/R/myRcode.R |org/ |org/apache/ - |... - """.stripMargin.trim + |...""".stripMargin /** Internal method for logging. We log to a printStream in tests, for debugging purposes. 
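RJarDoc above becomes a reusable MessageWithContext built once with the log interpolator, with the manifest key carried as an MDC placeholder. A small sketch of that idiom outside this file (the object name, key choice, and text are illustrative):

import org.apache.spark.internal.{Logging, MDC, MessageWithContext}
import org.apache.spark.internal.LogKeys.CONFIG

object ManifestDoc extends Logging {
  // Built once; the MDC value is rendered into the text and kept as a structured field.
  private val requiredKey = MDC(CONFIG, "Spark-HasRPackage")
  val doc: MessageWithContext =
    log"The jar manifest must contain the key-value: $requiredKey: true"

  def remind(): Unit = logWarning(doc)
}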
*/ private def print( - msg: String, + msg: LogEntry, printStream: PrintStream, level: Level = Level.FINE, e: Throwable = null): Unit = { if (printStream != null) { // scalastyle:off println - printStream.println(msg) + printStream.println(msg.message) // scalastyle:on println if (e != null) { e.printStackTrace(printStream) @@ -112,7 +113,7 @@ private[deploy] object RPackageUtils extends Logging { val pathToPkg = Seq(dir, "R", "pkg").mkString(File.separator) val installCmd = baseInstallCmd ++ Seq(libDir, pathToPkg) if (verbose) { - print(s"Building R package with the command: $installCmd", printStream) + print(log"Building R package with the command: ${MDC(COMMAND, installCmd)}", printStream) } try { val builder = new ProcessBuilder(installCmd.asJava) @@ -131,7 +132,7 @@ private[deploy] object RPackageUtils extends Logging { process.waitFor() == 0 } catch { case e: Throwable => - print("Failed to build R package.", printStream, Level.SEVERE, e) + print(log"Failed to build R package.", printStream, Level.SEVERE, e) false } } @@ -150,7 +151,7 @@ private[deploy] object RPackageUtils extends Logging { if (entry.isDirectory) { val dir = new File(tempDir, entryPath) if (verbose) { - print(s"Creating directory: $dir", printStream) + print(log"Creating directory: ${MDC(PATH, dir)}", printStream) } dir.mkdirs } else { @@ -159,7 +160,7 @@ private[deploy] object RPackageUtils extends Logging { Files.createParentDirs(outPath) val outStream = new FileOutputStream(outPath) if (verbose) { - print(s"Extracting $entry to $outPath", printStream) + print(log"Extracting ${MDC(JAR_ENTRY, entry)} to ${MDC(PATH, outPath)}", printStream) } Utils.copyStream(inStream, outStream, closeStreams = true) } @@ -181,32 +182,34 @@ private[deploy] object RPackageUtils extends Logging { val jar = new JarFile(file) Utils.tryWithSafeFinally { if (checkManifestForR(jar)) { - print(s"$file contains R source code. Now installing package.", printStream, Level.INFO) + print(log"${MDC(PATH, file)} contains R source code. Now installing package.", + printStream, Level.INFO) val rSource = extractRFolder(jar, printStream, verbose) if (RUtils.rPackages.isEmpty) { RUtils.rPackages = Some(Utils.createTempDir().getAbsolutePath) } try { if (!rPackageBuilder(rSource, printStream, verbose, RUtils.rPackages.get)) { - print(s"ERROR: Failed to build R package in $file.", printStream) + print(log"ERROR: Failed to build R package in ${MDC(PATH, file)}.", printStream) print(RJarDoc, printStream) } } finally { // clean up if (!rSource.delete()) { - logWarning(s"Error deleting ${rSource.getPath()}") + logWarning(log"Error deleting ${MDC(PATH, rSource.getPath())}") } } } else { if (verbose) { - print(s"$file doesn't contain R source code, skipping...", printStream) + print(log"${MDC(PATH, file)} doesn't contain R source code, skipping...", printStream) } } } { jar.close() } } else { - print(s"WARN: $file resolved as dependency, but not found.", printStream, Level.WARNING) + print(log"WARN: ${MDC(PATH, file)} resolved as dependency, but not found.", + printStream, Level.WARNING) } } } @@ -234,7 +237,7 @@ private[deploy] object RPackageUtils extends Logging { // create a zip file from scratch, do not append to existing file. 
val zipFile = new File(dir, name) if (!zipFile.delete()) { - logWarning(s"Error deleting ${zipFile.getPath()}") + logWarning(log"Error deleting ${MDC(PATH, zipFile.getPath())}") } val zipOutputStream = new ZipOutputStream(new FileOutputStream(zipFile, false)) try { diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 2edd80db2637f..ca932ef5dc05c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -37,7 +37,7 @@ import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils @@ -142,8 +142,9 @@ private[spark] class SparkHadoopUtil extends Logging { if (!new File(keytabFilename).exists()) { throw new SparkException(s"Keytab file: ${keytabFilename} does not exist") } else { - logInfo("Attempting to login to Kerberos " + - s"using principal: ${principalName} and keytab: ${keytabFilename}") + logInfo(log"Attempting to login to Kerberos using principal: " + + log"${MDC(LogKeys.PRINCIPAL, principalName)} and keytab: " + + log"${MDC(LogKeys.KEYTAB, keytabFilename)}") UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index c0df74f8d0cc6..7bb945ab9f147 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -39,8 +39,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.spark._ import org.apache.spark.api.r.RUtils import org.apache.spark.deploy.rest._ -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.{LogEntry, Logging, LogKeys, MDC} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.launcher.SparkLauncher @@ -64,22 +63,39 @@ private[deploy] object SparkSubmitAction extends Enumeration { */ private[spark] class SparkSubmit extends Logging { + override protected def logName: String = classOf[SparkSubmit].getName + import DependencyUtils._ import SparkSubmit._ def doSubmit(args: Array[String]): Unit = { + val appArgs = parseArguments(args) + val sparkConf = appArgs.toSparkConf() + + // For interpreters, structured logging is disabled by default to avoid generating mixed + // plain text and structured logs on the same console. + if (isShell(appArgs.primaryResource) || isSqlShell(appArgs.mainClass)) { + Logging.disableStructuredLogging() + } else { + // For non-shell applications, enable structured logging if it's not explicitly disabled + // via the configuration `spark.log.structuredLogging.enabled`. + if (sparkConf.getBoolean(STRUCTURED_LOGGING_ENABLED.key, defaultValue = true)) { + Logging.enableStructuredLogging() + } else { + Logging.disableStructuredLogging() + } + } // Initialize logging if it hasn't been done yet. Keep track of whether logging needs to // be reset before the application starts. 
val uninitLog = initializeLogIfNecessary(true, silent = true) - val appArgs = parseArguments(args) if (appArgs.verbose) { logInfo(appArgs.toString) } appArgs.action match { - case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog) - case SparkSubmitAction.KILL => kill(appArgs) - case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs) + case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog, sparkConf) + case SparkSubmitAction.KILL => kill(appArgs, sparkConf) + case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs, sparkConf) case SparkSubmitAction.PRINT_VERSION => printVersion() } } @@ -91,12 +107,11 @@ private[spark] class SparkSubmit extends Logging { /** * Kill an existing submission. */ - private def kill(args: SparkSubmitArguments): Unit = { + private def kill(args: SparkSubmitArguments, sparkConf: SparkConf): Unit = { if (RestSubmissionClient.supportsRestClient(args.master)) { new RestSubmissionClient(args.master) .killSubmission(args.submissionToKill) } else { - val sparkConf = args.toSparkConf() sparkConf.set("spark.master", args.master) SparkSubmitUtils .getSubmitOperations(args.master) @@ -107,12 +122,11 @@ private[spark] class SparkSubmit extends Logging { /** * Request the status of an existing submission. */ - private def requestStatus(args: SparkSubmitArguments): Unit = { + private def requestStatus(args: SparkSubmitArguments, sparkConf: SparkConf): Unit = { if (RestSubmissionClient.supportsRestClient(args.master)) { new RestSubmissionClient(args.master) .requestSubmissionStatus(args.submissionToRequestStatusFor) } else { - val sparkConf = args.toSparkConf() sparkConf.set("spark.master", args.master) SparkSubmitUtils .getSubmitOperations(args.master) @@ -129,12 +143,14 @@ private[spark] class SparkSubmit extends Logging { /___/ .__/\_,_/_/ /_/\_\ version %s /_/ """.format(SPARK_VERSION)) - logInfo("Using Scala %s, %s, %s".format( - Properties.versionString, Properties.javaVmName, Properties.javaVersion)) - logInfo(s"Branch $SPARK_BRANCH") - logInfo(s"Compiled by user $SPARK_BUILD_USER on $SPARK_BUILD_DATE") - logInfo(s"Revision $SPARK_REVISION") - logInfo(s"Url $SPARK_REPO_URL") + logInfo(log"Using Scala ${MDC(LogKeys.SCALA_VERSION, Properties.versionString)}," + + log" ${MDC(LogKeys.JAVA_VM_NAME, Properties.javaVmName)}," + + log" ${MDC(LogKeys.JAVA_VERSION, Properties.javaVersion)}") + logInfo(log"Branch ${MDC(LogKeys.SPARK_BRANCH, SPARK_BRANCH)}") + logInfo(log"Compiled by user ${MDC(LogKeys.SPARK_BUILD_USER, SPARK_BUILD_USER)} on" + + log" ${MDC(LogKeys.SPARK_BUILD_DATE, SPARK_BUILD_DATE)}") + logInfo(log"Revision ${MDC(LogKeys.SPARK_REVISION, SPARK_REVISION)}") + logInfo(log"Url ${MDC(LogKeys.SPARK_REPO_URL, SPARK_REPO_URL)}") logInfo("Type --help for more information.") } @@ -143,7 +159,7 @@ private[spark] class SparkSubmit extends Logging { * in a doAs when --proxy-user is specified. */ @tailrec - private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = { + private def submit(args: SparkSubmitArguments, uninitLog: Boolean, sparkConf: SparkConf): Unit = { def doRunMain(): Unit = { if (args.proxyUser != null) { @@ -152,7 +168,7 @@ private[spark] class SparkSubmit extends Logging { // is done in client mode. val isKubernetesClusterModeDriver = args.master.startsWith("k8s") && "client".equals(args.deployMode) && - args.toSparkConf().getBoolean("spark.kubernetes.submitInDriver", false) + sparkConf.getBoolean("spark.kubernetes.submitInDriver", false) if (isKubernetesClusterModeDriver) { logInfo("Running driver with proxy user. 
Cluster manager: Kubernetes") SparkHadoopUtil.get.runAsSparkUser(() => runMain(args, uninitLog)) @@ -196,10 +212,10 @@ private[spark] class SparkSubmit extends Logging { } catch { // Fail over to use the legacy submission gateway case e: SubmitRestConnectionException => - logWarning(s"Master endpoint ${args.master} was not a REST server. " + - "Falling back to legacy submission gateway instead.") + logWarning(log"Master endpoint ${MDC(LogKeys.MASTER_URL, args.master)} " + + log"was not a REST server. Falling back to legacy submission gateway instead.") args.useRest = false - submit(args, false) + submit(args, false, sparkConf) } // In all other modes, just run the main class as prepared } else { @@ -229,11 +245,6 @@ private[spark] class SparkSubmit extends Logging { val childClasspath = new ArrayBuffer[String]() val sparkConf = args.toSparkConf() if (sparkConf.contains("spark.local.connect")) sparkConf.remove("spark.remote") - if (sparkConf.getBoolean(STRUCTURED_LOGGING_ENABLED.key, defaultValue = true)) { - Logging.enableStructuredLogging() - } else { - Logging.disableStructuredLogging() - } var childMainClass = "" // Set the cluster manager @@ -430,7 +441,9 @@ private[spark] class SparkSubmit extends Logging { workingDirectory, if (resolvedUri.getFragment != null) resolvedUri.getFragment else source.getName) .getCanonicalFile - logInfo(s"Files $resolvedUri from $source to $dest") + logInfo(log"Files ${MDC(LogKeys.URI, resolvedUri)}" + + log" from ${MDC(LogKeys.SOURCE_PATH, source)}" + + log" to ${MDC(LogKeys.DESTINATION_PATH, dest)}") Utils.deleteRecursively(dest) if (isArchive) { Utils.unpack(source, dest) @@ -897,9 +910,11 @@ private[spark] class SparkSubmit extends Logging { if (childClasspath.nonEmpty && isCustomClasspathInClusterModeDisallowed) { childClasspath.clear() - logWarning(s"Ignore classpath ${childClasspath.mkString(", ")} with proxy user specified " + - s"in Cluster mode when ${ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE.key} is " + - s"disabled") + logWarning(log"Ignore classpath " + + log"${MDC(LogKeys.CLASS_PATH, childClasspath.mkString(", "))} " + + log"with proxy user specified in Cluster mode when " + + log"${MDC(LogKeys.CONFIG, ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE.key)} is " + + log"disabled") } (childArgs.toSeq, childClasspath.toSeq, sparkConf, childMainClass) @@ -911,7 +926,7 @@ private[spark] class SparkSubmit extends Logging { private def setRMPrincipal(sparkConf: SparkConf): Unit = { val shortUserName = UserGroupInformation.getCurrentUser.getShortUserName val key = s"spark.hadoop.${YarnConfiguration.RM_PRINCIPAL}" - logInfo(s"Setting ${key} to ${shortUserName}") + logInfo(log"Setting ${MDC(LogKeys.KEY, key)} to ${MDC(LogKeys.SHORT_USER_NAME, shortUserName)}") sparkConf.set(key, shortUserName) } @@ -948,11 +963,12 @@ private[spark] class SparkSubmit extends Logging { } if (args.verbose) { - logInfo(s"Main class:\n$childMainClass") - logInfo(s"Arguments:\n${childArgs.mkString("\n")}") + logInfo(log"Main class:\n${MDC(LogKeys.CLASS_NAME, childMainClass)}") + logInfo(log"Arguments:\n${MDC(LogKeys.ARGS, childArgs.mkString("\n"))}") // sysProps may contain sensitive information, so redact before printing - logInfo(s"Spark config:\n${Utils.redact(sparkConf.getAll.toMap).sorted.mkString("\n")}") - logInfo(s"Classpath elements:\n${childClasspath.mkString("\n")}") + logInfo(log"Spark config:\n" + + log"${MDC(LogKeys.CONFIG, Utils.redact(sparkConf.getAll.toMap).sorted.mkString("\n"))}") + logInfo(log"Classpath 
elements:\n${MDC(LogKeys.CLASS_PATHS, childClasspath.mkString("\n"))}") logInfo("\n") } assert(!(args.deployMode == "cluster" && args.proxyUser != null && childClasspath.nonEmpty) || @@ -970,20 +986,20 @@ private[spark] class SparkSubmit extends Logging { mainClass = Utils.classForName(childMainClass) } catch { case e: ClassNotFoundException => - logError(log"Failed to load class ${MDC(CLASS_NAME, childMainClass)}.") + logError(log"Failed to load class ${MDC(LogKeys.CLASS_NAME, childMainClass)}.") if (childMainClass.contains("thriftserver")) { - logInfo(s"Failed to load main class $childMainClass.") + logInfo(log"Failed to load main class ${MDC(LogKeys.CLASS_NAME, childMainClass)}.") logInfo("You need to build Spark with -Phive and -Phive-thriftserver.") } else if (childMainClass.contains("org.apache.spark.sql.connect")) { - logInfo(s"Failed to load main class $childMainClass.") + logInfo(log"Failed to load main class ${MDC(LogKeys.CLASS_NAME, childMainClass)}.") // TODO(SPARK-42375): Should point out the user-facing page here instead. logInfo("You need to specify Spark Connect jars with --jars or --packages.") } throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS) case e: NoClassDefFoundError => - logError(log"Failed to load ${MDC(CLASS_NAME, childMainClass)}", e) + logError(log"Failed to load ${MDC(LogKeys.CLASS_NAME, childMainClass)}", e) if (e.getMessage.contains("org/apache/hadoop/hive")) { - logInfo(s"Failed to load hive class.") + logInfo("Failed to load hive class.") logInfo("You need to build Spark with -Phive and -Phive-thriftserver.") } throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS) @@ -1082,18 +1098,32 @@ object SparkSubmit extends CommandLineUtils with Logging { new SparkSubmitArguments(args.toImmutableArraySeq) { override protected def logInfo(msg: => String): Unit = self.logInfo(msg) + override protected def logInfo(entry: LogEntry): Unit = self.logInfo(entry) + override protected def logWarning(msg: => String): Unit = self.logWarning(msg) + override protected def logWarning(entry: LogEntry): Unit = self.logWarning(entry) + override protected def logError(msg: => String): Unit = self.logError(msg) + + override protected def logError(entry: LogEntry): Unit = self.logError(entry) } } override protected def logInfo(msg: => String): Unit = printMessage(msg) + override protected def logInfo(entry: LogEntry): Unit = printMessage(entry.message) + override protected def logWarning(msg: => String): Unit = printMessage(s"Warning: $msg") + override protected def logWarning(entry: LogEntry): Unit = + printMessage(s"Warning: ${entry.message}") + override protected def logError(msg: => String): Unit = printMessage(s"Error: $msg") + override protected def logError(entry: LogEntry): Unit = + printMessage(s"Error: ${entry.message}") + override def doSubmit(args: Array[String]): Unit = { try { super.doSubmit(args) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 534c14000614d..32dd2f81bbc82 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -28,7 +28,8 @@ import scala.util.Try import org.apache.spark.{SparkConf, SparkException, SparkUserAppException} import org.apache.spark.deploy.SparkSubmitAction._ -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import 
org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.DYN_ALLOCATION_ENABLED import org.apache.spark.launcher.SparkSubmitArgumentsParser import org.apache.spark.network.util.JavaUtils @@ -49,6 +50,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S var executorCores: String = null var totalExecutorCores: String = null var propertiesFile: String = null + private var loadSparkDefaults: Boolean = false var driverMemory: String = null var driverExtraClassPath: String = null var driverExtraLibraryPath: String = null @@ -77,7 +79,6 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S var principal: String = null var keytab: String = null private var dynamicAllocationEnabled: Boolean = false - // Standalone cluster mode only var supervise: Boolean = false var driverCores: String = null @@ -85,26 +86,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S var submissionToRequestStatusFor: String = null var useRest: Boolean = false // used internally - /** Default properties present in the currently defined defaults file. */ - lazy val defaultSparkProperties: HashMap[String, String] = { - val defaultProperties = new HashMap[String, String]() - if (verbose) { - logInfo(s"Using properties file: $propertiesFile") - } - Option(propertiesFile).foreach { filename => - val properties = Utils.getPropertiesFromFile(filename) - properties.foreach { case (k, v) => - defaultProperties(k) = v - } - // Property files may contain sensitive information, so redact before printing - if (verbose) { - Utils.redact(properties).foreach { case (k, v) => - logInfo(s"Adding default property: $k=$v") - } - } - } - defaultProperties - } + override protected def logName: String = classOf[SparkSubmitArguments].getName // Set parameters from command line arguments parse(args.asJava) @@ -120,18 +102,44 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S validateArguments() + /** + * Load properties from the file with the given path into `sparkProperties`. + * No-op if the file path is null + */ + private def loadPropertiesFromFile(filePath: String): Unit = { + if (filePath != null) { + if (verbose) { + logInfo(log"Using properties file: ${MDC(PATH, filePath)}") + } + val properties = Utils.getPropertiesFromFile(filePath) + properties.foreach { case (k, v) => + if (!sparkProperties.contains(k)) { + sparkProperties(k) = v + } + } + // Property files may contain sensitive information, so redact before printing + if (verbose) { + Utils.redact(properties).foreach { case (k, v) => + logInfo(log"Adding default property: ${MDC(KEY, k)}=${MDC(VALUE, v)}") + } + } + } + } + /** * Merge values from the default properties file with those specified through --conf. * When this is called, `sparkProperties` is already filled with configs from the latter. 
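The helper above deliberately never overwrites keys that are already present, which is what gives --conf precedence over any file it later loads. A toy sketch of that rule on its own (names are illustrative):

import scala.collection.mutable

// Only fills gaps: keys already set (e.g. from --conf) keep their values.
def fillMissing(target: mutable.HashMap[String, String], fromFile: Map[String, String]): Unit = {
  fromFile.foreach { case (k, v) =>
    if (!target.contains(k)) {
      target(k) = v
    }
  }
}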
*/ private def mergeDefaultSparkProperties(): Unit = { - // Use common defaults file, if not specified by user - propertiesFile = Option(propertiesFile).getOrElse(Utils.getDefaultPropertiesFile(env)) - // Honor --conf before the defaults file - defaultSparkProperties.foreach { case (k, v) => - if (!sparkProperties.contains(k)) { - sparkProperties(k) = v - } + // Honor --conf before the specified properties file and defaults file + loadPropertiesFromFile(propertiesFile) + + // Also load properties from `spark-defaults.conf` if they do not exist in the properties file + // and --conf list when: + // - no input properties file is specified + // - input properties file is specified, but `--load-spark-defaults` flag is set + if (propertiesFile == null || loadSparkDefaults) { + loadPropertiesFromFile(Utils.getDefaultPropertiesFile(env)) } } @@ -142,7 +150,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S sparkProperties.keys.foreach { k => if (!k.startsWith("spark.")) { sparkProperties -= k - logWarning(s"Ignoring non-Spark config property: $k") + logWarning(log"Ignoring non-Spark config property: ${MDC(CONFIG, k)}") } } } @@ -389,6 +397,9 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S case PROPERTIES_FILE => propertiesFile = value + case LOAD_SPARK_DEFAULTS => + loadSparkDefaults = true + case KILL_SUBMISSION => submissionToKill = value if (action != null) { @@ -489,7 +500,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S private def printUsageAndExit(exitCode: Int, unknownParam: Any = null): Unit = { if (unknownParam != null) { - logInfo("Unknown/unsupported param " + unknownParam) + logInfo(log"Unknown/unsupported param ${MDC(UNKNOWN_PARAM, unknownParam)}") } val command = sys.env.getOrElse("_SPARK_CMD_USAGE", """Usage: spark-submit [options] [app arguments] @@ -532,6 +543,10 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | --conf, -c PROP=VALUE Arbitrary Spark configuration property. | --properties-file FILE Path to a file from which to load extra properties. If not | specified, this will look for conf/spark-defaults.conf. + | --load-spark-defaults Whether to load properties from conf/spark-defaults.conf, + | even if --properties-file is specified. Configurations + | specified in --properties-file will take precedence over + | those in conf/spark-defaults.conf. | | --driver-memory MEM Memory for driver (e.g. 1000M, 2G) (Default: ${mem_mb}M). | --driver-java-options Extra Java options to pass to the driver. 
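Summarizing the resolution order implemented by mergeDefaultSparkProperties and documented in the usage text above (a condensed restatement of the code in this hunk, not additional behavior):

// 1. --conf values are already in sparkProperties and always win.
// 2. --properties-file FILE fills in any keys still missing.
// 3. conf/spark-defaults.conf is consulted only when no properties file was given,
//    or when --load-spark-defaults is passed.
loadPropertiesFromFile(propertiesFile)
if (propertiesFile == null || loadSparkDefaults) {
  loadPropertiesFromFile(Utils.getDefaultPropertiesFile(env))
}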
@@ -592,7 +607,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S logInfo(getSqlShellOptions()) } - throw new SparkUserAppException(exitCode) + throw SparkUserAppException(exitCode) } /** diff --git a/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala b/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala index d317d6449f293..7b98461b01acf 100644 --- a/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala @@ -28,7 +28,7 @@ import org.json4s.jackson.JsonMethods.{compact, render} import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.COMPONENT +import org.apache.spark.internal.LogKeys.COMPONENT import org.apache.spark.resource.{ResourceAllocation, ResourceID, ResourceInformation, ResourceRequirement} import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/deploy/Utils.scala b/core/src/main/scala/org/apache/spark/deploy/Utils.scala index 4d2546cb808c0..b3d871d75e6c7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Utils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Utils.scala @@ -23,7 +23,7 @@ import jakarta.servlet.http.HttpServletRequest import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{LOG_TYPE, PATH} +import org.apache.spark.internal.LogKeys.{LOG_TYPE, PATH} import org.apache.spark.ui.JettyUtils.createServletHandler import org.apache.spark.ui.WebUI import org.apache.spark.util.Utils.{getFileLength, offsetBytes} diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala index b0ee6018970ab..b34e5c408c3be 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala @@ -29,7 +29,8 @@ import org.apache.spark.SparkConf import org.apache.spark.deploy.{ApplicationDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.master.Master -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc._ import org.apache.spark.scheduler.ExecutorDecommissionInfo @@ -104,12 +105,14 @@ private[spark] class StandaloneAppClient( if (registered.get) { return } - logInfo("Connecting to master " + masterAddress.toSparkURL + "...") + logInfo( + log"Connecting to master ${MDC(LogKeys.MASTER_URL, masterAddress.toSparkURL)}...") val masterRef = rpcEnv.setupEndpointRef(masterAddress, Master.ENDPOINT_NAME) masterRef.send(RegisterApplication(appDescription, self)) } catch { case ie: InterruptedException => // Cancelled - case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e) + case NonFatal(e) => logWarning(log"Failed to connect to master " + + log"${MDC(MASTER_URL, masterAddress)}", e) } }) } @@ -146,7 +149,8 @@ private[spark] class StandaloneAppClient( private def sendToMaster(message: Any): Unit = { master match { case Some(masterRef) => masterRef.send(message) - case None => logWarning(s"Drop $message because has not yet connected to master") + case None 
=> logWarning( + log"Drop ${MDC(MESSAGE, message)} because has not yet connected to master") } } @@ -172,14 +176,16 @@ private[spark] class StandaloneAppClient( case ExecutorAdded(id: Int, workerId: String, hostPort: String, cores: Int, memory: Int) => val fullId = s"$appId/$id" - logInfo("Executor added: %s on %s (%s) with %d core(s)".format(fullId, workerId, hostPort, - cores)) + logInfo(log"Executor added: ${MDC(LogKeys.EXECUTOR_ID, fullId)} on " + + log"${MDC(LogKeys.WORKER_ID, workerId)} (${MDC(LogKeys.HOST_PORT, hostPort)}) " + + log"with ${MDC(LogKeys.NUM_CORES, cores)} core(s)") listener.executorAdded(fullId, workerId, hostPort, cores, memory) case ExecutorUpdated(id, state, message, exitStatus, workerHost) => val fullId = s"$appId/$id" val messageText = message.map(s => " (" + s + ")").getOrElse("") - logInfo("Executor updated: %s is now %s%s".format(fullId, state, messageText)) + logInfo(log"Executor updated: ${MDC(LogKeys.EXECUTOR_ID, fullId)} is now " + + log"${MDC(LogKeys.EXECUTOR_STATE, state)}${MDC(LogKeys.MESSAGE, messageText)}") if (ExecutorState.isFinished(state)) { listener.executorRemoved(fullId, message.getOrElse(""), exitStatus, workerHost) } else if (state == ExecutorState.DECOMMISSIONED) { @@ -188,11 +194,13 @@ private[spark] class StandaloneAppClient( } case WorkerRemoved(id, host, message) => - logInfo("Master removed worker %s: %s".format(id, message)) + logInfo(log"Master removed worker ${MDC(LogKeys.WORKER_ID, id)}: " + + log"${MDC(LogKeys.MESSAGE, message)}") listener.workerRemoved(id, host, message) case MasterChanged(masterRef, masterWebUiUrl) => - logInfo("Master has changed, new master is at " + masterRef.address.toSparkURL) + logInfo(log"Master has changed, new master is at " + + log"${MDC(LogKeys.MASTER_URL, masterRef.address.toSparkURL)}") master = Some(masterRef) alreadyDisconnected = false masterRef.send(MasterChangeAcknowledged(appId.get)) @@ -237,14 +245,16 @@ private[spark] class StandaloneAppClient( override def onDisconnected(address: RpcAddress): Unit = { if (master.exists(_.address == address)) { - logWarning(s"Connection to $address failed; waiting for master to reconnect...") + logWarning( + log"Connection to ${MDC(MASTER_URL, address)} failed; waiting for master to reconnect...") markDisconnected() } } override def onNetworkError(cause: Throwable, address: RpcAddress): Unit = { if (isPossibleMaster(address)) { - logWarning(s"Could not connect to $address: $cause") + logWarning(log"Could not connect to ${MDC(MASTER_URL, address)}: " + + log"${MDC(ERROR, cause)}") } } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala index 662746cf0c782..6e0fe69f3bfb6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala @@ -28,7 +28,8 @@ import jakarta.servlet.{DispatcherType, Filter, FilterChain, FilterConfig, Servl import jakarta.servlet.http.{HttpServletRequest, HttpServletResponse} import org.eclipse.jetty.servlet.FilterHolder -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.metrics.source.Source import org.apache.spark.ui.SparkUI import org.apache.spark.util.Clock @@ -170,19 +171,19 @@ private[history] class ApplicationCache( */ @throws[NoSuchElementException] private def loadApplicationEntry(appId: String, attemptId: 
Option[String]): CacheEntry = { - lazy val application = s"$appId/${attemptId.mkString}" - logDebug(s"Loading application Entry $application") + lazy val application = log"${MDC(APP_ID, appId)}/${MDC(APP_ATTEMPT_ID, attemptId.mkString)}" + logDebug(log"Loading application Entry " + application) metrics.loadCount.inc() val loadedUI = time(metrics.loadTimer) { metrics.lookupCount.inc() operations.getAppUI(appId, attemptId) match { case Some(loadedUI) => - logDebug(s"Loaded application $application") + logDebug(log"Loaded application " + application) loadedUI case None => metrics.lookupFailureCount.inc() // guava's cache logs via java.util log, so is of limited use. Hence: our own message - logInfo(s"Failed to load application attempt $application") + logInfo(log"Failed to load application attempt " + application) throw new NoSuchElementException(s"no application with application Id '$appId'" + attemptId.map { id => s" attemptId '$id'" }.getOrElse(" and no attempt Id")) } @@ -197,7 +198,7 @@ private[history] class ApplicationCache( new CacheEntry(loadedUI, completed) } catch { case e: Exception => - logWarning(s"Failed to initialize application UI for $application", e) + logWarning(log"Failed to initialize application UI for ${MDC(APP_ID, application)}", e) operations.detachSparkUI(appId, attemptId, loadedUI.ui) throw e } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala index 59e52e6494987..05f109831499b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala @@ -24,7 +24,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.deploy.history.EventFilter.FilterStatistics import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{LINE, LINE_NUM, PATH} +import org.apache.spark.internal.LogKeys.{LINE, LINE_NUM, PATH} import org.apache.spark.scheduler._ import org.apache.spark.util.{JsonProtocol, Utils} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala index 07a873ac704dc..f7889e8b54edf 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.spark.SparkConf import org.apache.spark.deploy.history.EventFilter.FilterStatistics -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys import org.apache.spark.scheduler.ReplayListenerBus import org.apache.spark.util.Utils @@ -160,7 +161,8 @@ class EventLogFileCompactor( } logWriter.stop() val duration = System.currentTimeMillis() - startTime - logInfo(s"Finished rewriting eventLog files to ${logWriter.logPath} took $duration ms.") + logInfo(log"Finished rewriting eventLog files to ${MDC(LogKeys.PATH, logWriter.logPath)}" + + log" took ${MDC(LogKeys.TOTAL_TIME, duration)} ms.") logWriter.logPath } @@ -174,7 +176,7 @@ class EventLogFileCompactor( case _: IOException => } if (!deleted) { - logWarning(s"Failed to remove ${file.getPath} / skip removing.") + logWarning(log"Failed to remove ${MDC(LogKeys.PATH, file.getPath)} / skip removing.") } } } diff --git 
a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala index e7eb05c853671..f3bb6d5af3358 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala @@ -21,14 +21,16 @@ import java.io._ import java.net.URI import java.nio.charset.StandardCharsets -import org.apache.commons.compress.utils.CountingOutputStream +import org.apache.commons.io.output.CountingOutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, FSDataOutputStream, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.io.CompressionCodec import org.apache.spark.util.Utils @@ -82,7 +84,7 @@ abstract class EventLogFileWriter( protected def initLogFile(path: Path)(fnSetupWriter: OutputStream => PrintWriter): Unit = { if (shouldOverwrite && fileSystem.delete(path, true)) { - logWarning(s"Event log $path already exists. Overwriting...") + logWarning(log"Event log ${MDC(LogKeys.PATH, path)} already exists. Overwriting...") } val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme @@ -105,7 +107,7 @@ abstract class EventLogFileWriter( .getOrElse(dstream) val bstream = new BufferedOutputStream(cstream, outputBufferSize) fileSystem.setPermission(path, EventLogFileWriter.LOG_FILE_PERMISSIONS) - logInfo(s"Logging events to $path") + logInfo(log"Logging events to ${MDC(PATH, path)}") writer = Some(fnSetupWriter(bstream)) } catch { case e: Exception => @@ -131,9 +133,10 @@ abstract class EventLogFileWriter( protected def renameFile(src: Path, dest: Path, overwrite: Boolean): Unit = { if (fileSystem.exists(dest)) { if (overwrite) { - logWarning(s"Event log $dest already exists. Overwriting...") + logWarning(log"Event log ${MDC(EVENT_LOG_DESTINATION, dest)} already exists. 
" + + log"Overwriting...") if (!fileSystem.delete(dest, true)) { - logWarning(s"Error deleting $dest") + logWarning(log"Error deleting ${MDC(EVENT_LOG_DESTINATION, dest)}") } } else { throw new IOException(s"Target log file already exists ($dest)") @@ -327,7 +330,7 @@ class RollingEventLogFilesWriter( override def writeEvent(eventJson: String, flushLogger: Boolean = false): Unit = { writer.foreach { w => - val currentLen = countingOutputStream.get.getBytesWritten + val currentLen = countingOutputStream.get.getByteCount if (currentLen + eventJson.length > eventFileMaxLength) { rollEventLogFile() } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 98cbd7b3eba82..95b23c0f894f8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -38,11 +38,13 @@ import org.apache.hadoop.security.AccessControlException import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.History._ import org.apache.spark.internal.config.Status._ import org.apache.spark.internal.config.Tests.IS_TESTING +import org.apache.spark.internal.config.UI import org.apache.spark.internal.config.UI._ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.ReplayListenerBus._ @@ -106,9 +108,12 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) private val historyUiAclsEnable = conf.get(History.HISTORY_SERVER_UI_ACLS_ENABLE) private val historyUiAdminAcls = conf.get(History.HISTORY_SERVER_UI_ADMIN_ACLS) private val historyUiAdminAclsGroups = conf.get(History.HISTORY_SERVER_UI_ADMIN_ACLS_GROUPS) - logInfo(s"History server ui acls " + (if (historyUiAclsEnable) "enabled" else "disabled") + - "; users with admin permissions: " + historyUiAdminAcls.mkString(",") + - "; groups with admin permissions: " + historyUiAdminAclsGroups.mkString(",")) + logInfo(log"History server ui acls" + + log" ${MDC(ACL_ENABLED, if (historyUiAclsEnable) "enabled" else "disabled")}" + + log"; users with admin permissions:" + + log" ${MDC(LogKeys.ADMIN_ACLS, historyUiAdminAcls.mkString(","))}" + + log"; groups with admin permissions:" + + log" ${MDC(ADMIN_ACL_GROUPS, historyUiAdminAclsGroups.mkString(","))}") private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) // Visible for testing @@ -481,8 +486,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) true } catch { case e: IllegalArgumentException => - logInfo("Exception in getting modificationTime of " - + reader.rootPath.getName + ". " + e.toString) + logInfo(log"Exception in getting modificationTime of" + + log" ${MDC(PATH, reader.rootPath.getName)}. 
${MDC(EXCEPTION, e.toString)}") false } } @@ -549,7 +554,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) try { if (conf.get(CLEANER_ENABLED) && reader.modificationTime < clock.getTimeMillis() - conf.get(MAX_LOG_AGE_S) * 1000) { - logInfo(s"Deleting expired event log ${reader.rootPath.toString}") + logInfo(log"Deleting expired event log ${MDC(PATH, reader.rootPath.toString)}") deleteLog(fs, reader.rootPath) // If the LogInfo read had succeeded, but the ApplicationInafoWrapper // read failure and throw the exception, we should also cleanup the log @@ -569,12 +574,13 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case _: FileNotFoundException => false case _: NoSuchElementException => false case NonFatal(e) => - logWarning(s"Error while reading new log ${reader.rootPath}", e) + logWarning(log"Error while reading new log " + + log"${MDC(PATH, reader.rootPath)}", e) false } case NonFatal(e) => - logWarning(s"Error while filtering log ${reader.rootPath}", e) + logWarning(log"Error while filtering log ${MDC(PATH, reader.rootPath)}", e) false } } @@ -729,7 +735,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) throw e case e: AccessControlException => // We don't have read permissions on the log file - logWarning(s"Unable to read log $rootPath", e) + logWarning(log"Unable to read log ${MDC(PATH, rootPath)}", e) markInaccessible(rootPath) // SPARK-28157 We should remove this inaccessible entry from the KVStore // to handle permission-only changes with the same file sizes later. @@ -744,8 +750,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // Do nothing, the application completed during processing, the final event log file // will be processed by next around. 
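Throughout these files the s"..." interpolations are rewritten to the log"..." form, with every dynamic value wrapped in MDC(<LogKey>, value) so the rendered message also carries a structured key/value pair. The toy interpolator below sketches the idea; ToyMDC, ToyLogEntry and toyLog are made-up names for illustration, not Spark's org.apache.spark.internal API.

  object ToyStructuredLogging {
    final case class ToyMDC(key: String, value: Any)
    final case class ToyLogEntry(message: String, context: Map[String, String]) {
      // Mirrors how log"..." + log"..." concatenates both text and context.
      def +(other: ToyLogEntry): ToyLogEntry =
        ToyLogEntry(message + other.message, context ++ other.context)
    }

    implicit class ToyLogInterpolator(sc: StringContext) {
      def toyLog(args: Any*): ToyLogEntry = {
        val rendered = sc.s(args.map {
          case ToyMDC(_, v) => v
          case other => other
        }: _*)
        val ctx = args.collect { case ToyMDC(k, v) => k -> String.valueOf(v) }.toMap
        ToyLogEntry(rendered, ctx)
      }
    }

    def main(args: Array[String]): Unit = {
      val path = "/tmp/spark-events/app-123"
      val entry = toyLog"Deleting expired event log ${ToyMDC("PATH", path)}"
      println(entry.message)  // plain text, as a console appender would print it
      println(entry.context)  // Map(PATH -> /tmp/spark-events/app-123) for structured output
    }
  }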
} else { - logWarning(s"In-progress event log file does not exist: ${reader.rootPath}, " + - s"neither does the final event log file: $finalFilePath.") + logWarning(log"In-progress event log file does not exist: " + + log"${MDC(PATH, reader.rootPath)}, " + + log"neither does the final event log file: ${MDC(FINAL_PATH, finalFilePath)}.") } case e: Exception => logError("Exception while merging application listings", e) @@ -798,7 +805,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val listener = new AppListingListener(reader, clock, shouldHalt) bus.addListener(listener) - logInfo(s"Parsing $logPath for listing data...") + logInfo(log"Parsing ${MDC(PATH, logPath)} for listing data...") val logFiles = reader.listEventLogFiles parseAppEventLogs(logFiles, bus, !appCompleted, eventsFilter) @@ -826,7 +833,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) Utils.tryWithResource(EventLogFileReader.openEventLog(lastFile.getPath, fs)) { in => val target = lastFile.getLen - reparseChunkSize if (target > 0) { - logInfo(s"Looking for end event; skipping $target bytes from $logPath...") + logInfo(log"Looking for end event; skipping ${MDC(NUM_BYTES, target)} bytes" + + log" from ${MDC(PATH, logPath)}...") var skipped = 0L while (skipped < target) { skipped += in.skip(target - skipped) @@ -845,7 +853,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } - logInfo(s"Finished parsing $logPath") + logInfo(log"Finished parsing ${MDC(PATH, logPath)}") listener.applicationInfo match { case Some(app) if !lookForEndEvent || app.attempts.head.info.completed => @@ -880,7 +888,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // In this case, the attempt is still not marked as finished but was expected to. This can // mean the end event is before the configured threshold, so call the method again to // re-parse the whole log. 
- logInfo(s"Reparsing $logPath since end event was not found.") + logInfo(log"Reparsing ${MDC(PATH, logPath)} since end event was not found.") doMergeApplicationListingInternal(reader, scanTime, enableOptimizations = false, lastEvaluatedForCompaction) @@ -919,7 +927,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case e: InterruptedException => throw e case e: AccessControlException => - logWarning(s"Insufficient permission while compacting log for $rootPath", e) + logWarning(log"Insufficient permission while compacting log for ${MDC(PATH, rootPath)}", e) case e: Exception => logError(log"Exception while compacting log for ${MDC(PATH, rootPath)}", e) } finally { @@ -949,7 +957,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val log = listing.read(classOf[LogInfo], logPath) if (log.lastProcessed <= maxTime && log.appId.isEmpty) { - logInfo(s"Deleting invalid / corrupt event log ${log.logPath}") + logInfo(log"Deleting invalid / corrupt event log ${MDC(PATH, log.logPath)}") deleteLog(fs, new Path(log.logPath)) listing.delete(classOf[LogInfo], log.logPath) } @@ -991,7 +999,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .first(maxTime), Int.MaxValue) { l => l.logType == null || l.logType == LogType.EventLogs } stale.filterNot(isProcessing).foreach { log => if (log.appId.isEmpty) { - logInfo(s"Deleting invalid / corrupt event log ${log.logPath}") + logInfo(log"Deleting invalid / corrupt event log ${MDC(PATH, log.logPath)}") deleteLog(fs, new Path(log.logPath)) listing.delete(classOf[LogInfo], log.logPath) } @@ -1002,7 +1010,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val num = KVUtils.size(listing.view(classOf[LogInfo]).index("lastProcessed")) var count = num - maxNum if (count > 0) { - logInfo(s"Try to delete $count old event logs to keep $maxNum logs in total.") + logInfo(log"Try to delete ${MDC(NUM_FILES, count)} old event logs" + + log" to keep ${MDC(MAX_NUM_FILES, maxNum)} logs in total.") KVUtils.foreach(listing.view(classOf[ApplicationInfoWrapper]).index("oldestAttempt")) { app => if (count > 0) { // Applications may have multiple attempts, some of which may not be completed yet. 
@@ -1011,7 +1020,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } if (count > 0) { - logWarning(s"Fail to clean up according to MAX_LOG_NUM policy ($maxNum).") + logWarning(log"Fail to clean up according to MAX_LOG_NUM policy " + + log"(${MDC(MAX_NUM_LOG_POLICY, maxNum)}).") } } @@ -1030,7 +1040,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) var countDeleted = 0 toDelete.foreach { attempt => - logInfo(s"Deleting expired event log for ${attempt.logPath}") + logInfo(log"Deleting expired event log for ${MDC(PATH, attempt.logPath)}") val logPath = new Path(logDir, attempt.logPath) listing.delete(classOf[LogInfo], logPath.toString()) cleanAppData(app.id, attempt.info.attemptId, logPath.toString()) @@ -1078,7 +1088,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) false } if (deleteFile) { - logInfo(s"Deleting expired driver log for: $logFileStr") + logInfo(log"Deleting expired driver log for: ${MDC(PATH, logFileStr)}") listing.delete(classOf[LogInfo], logFileStr) deleteLog(driverLogFs, f.getPath()) } @@ -1091,7 +1101,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .reverse() .first(maxTime), Int.MaxValue) { l => l.logType != null && l.logType == LogType.DriverLogs } stale.filterNot(isProcessing).foreach { log => - logInfo(s"Deleting invalid driver log ${log.logPath}") + logInfo(log"Deleting invalid driver log ${MDC(PATH, log.logPath)}") listing.delete(classOf[LogInfo], log.logPath) deleteLog(driverLogFs, new Path(log.logPath)) } @@ -1120,10 +1130,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) try { val eventLogFiles = reader.listEventLogFiles - logInfo(s"Parsing ${reader.rootPath} to re-build UI...") + logInfo(log"Parsing ${MDC(PATH, reader.rootPath)} to re-build UI...") parseAppEventLogs(eventLogFiles, replayBus, !reader.completed) trackingStore.close(false) - logInfo(s"Finished parsing ${reader.rootPath}") + logInfo(log"Finished parsing ${MDC(PATH, reader.rootPath)}") } catch { case e: Exception => Utils.tryLogNonFatalError { @@ -1224,7 +1234,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) return KVUtils.open(path, metadata, conf, live = false) } catch { case e: Exception => - logInfo(s"Failed to open existing store for $appId/${attempt.info.attemptId}.", e) + logInfo(log"Failed to open existing store for" + + log" ${MDC(APP_ID, appId)}/${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}.", e) dm.release(appId, attempt.info.attemptId, delete = true) } } @@ -1240,11 +1251,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case e: RuntimeException if e.getMessage != null && e.getMessage.contains("Not enough memory to create hybrid") => // Handle exception from `HistoryServerMemoryManager.lease`. - logInfo(s"Failed to create HybridStore for $appId/${attempt.info.attemptId}." + - s" Using $hybridStoreDiskBackend. " + e.getMessage) + logInfo(log"Failed to create HybridStore for" + + log" ${MDC(APP_ID, appId)}/${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}." + + log" Using ${MDC(LogKeys.HYBRID_STORE_DISK_BACKEND, hybridStoreDiskBackend)}." + + log" ${MDC(EXCEPTION, e.getMessage)}") case e: Exception => - logInfo(s"Failed to create HybridStore for $appId/${attempt.info.attemptId}." + - s" Using $hybridStoreDiskBackend.", e) + logInfo(log"Failed to create HybridStore for" + + log" ${MDC(APP_ID, appId)}/${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}." 
+ + log" Using ${MDC(LogKeys.HYBRID_STORE_DISK_BACKEND, hybridStoreDiskBackend)}.", e) } } @@ -1275,8 +1289,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case _: IOException if !retried => // compaction may touch the file(s) which app rebuild wants to read // compaction wouldn't run in short interval, so try again... - logWarning(s"Exception occurred while rebuilding log path ${attempt.logPath} - " + - "trying again...") + logWarning(log"Exception occurred while rebuilding log path " + + log"${MDC(PATH, attempt.logPath)} - " + + log"trying again...") store.close() memoryManager.release(appId, attempt.info.attemptId) retried = true @@ -1290,20 +1305,23 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // Create a disk-base KVStore and start a background thread to dump data to it var lease: dm.Lease = null try { - logInfo(s"Leasing disk manager space for app $appId / ${attempt.info.attemptId}...") + logInfo(log"Leasing disk manager space for app" + + log" ${MDC(APP_ID, appId)} / ${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}...") lease = dm.lease(reader.totalSize, reader.compressionCodec.isDefined) val diskStore = KVUtils.open(lease.tmpPath, metadata, conf, live = false) hybridStore.setDiskStore(diskStore) hybridStore.switchToDiskStore(new HybridStore.SwitchToDiskStoreListener { override def onSwitchToDiskStoreSuccess(): Unit = { - logInfo(s"Completely switched to diskStore for app $appId / ${attempt.info.attemptId}.") + logInfo(log"Completely switched to diskStore for app" + + log" ${MDC(APP_ID, appId)} / ${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}.") diskStore.close() val newStorePath = lease.commit(appId, attempt.info.attemptId) hybridStore.setDiskStore(KVUtils.open(newStorePath, metadata, conf, live = false)) memoryManager.release(appId, attempt.info.attemptId) } override def onSwitchToDiskStoreFail(e: Exception): Unit = { - logWarning(s"Failed to switch to diskStore for app $appId / ${attempt.info.attemptId}", e) + logWarning(log"Failed to switch to diskStore for app ${MDC(APP_ID, appId)} / " + + log"${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}", e) diskStore.close() lease.rollback() } @@ -1332,7 +1350,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val reader = EventLogFileReader(fs, new Path(logDir, attempt.logPath), attempt.lastIndex) val isCompressed = reader.compressionCodec.isDefined - logInfo(s"Leasing disk manager space for app $appId / ${attempt.info.attemptId}...") + logInfo(log"Leasing disk manager space for app" + + log" ${MDC(APP_ID, appId)} / ${MDC(LogKeys.APP_ATTEMPT_ID, attempt.info.attemptId)}...") val lease = dm.lease(reader.totalSize, isCompressed) try { Utils.tryWithResource(KVUtils.open(lease.tmpPath, metadata, conf, live = false)) { store => @@ -1343,7 +1362,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case _: IOException if !retried => // compaction may touch the file(s) which app rebuild wants to read // compaction wouldn't run in short interval, so try again... 
- logWarning(s"Exception occurred while rebuilding app $appId - trying again...") + logWarning(log"Exception occurred while rebuilding app ${MDC(APP_ID, appId)} - " + + log"trying again...") lease.rollback() retried = true @@ -1370,8 +1390,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case _: IOException if !retried => // compaction may touch the file(s) which app rebuild wants to read // compaction wouldn't run in short interval, so try again... - logWarning(s"Exception occurred while rebuilding log path ${attempt.logPath} - " + - "trying again...") + logWarning(log"Exception occurred while rebuilding log path " + + log"${MDC(LogKeys.PATH, attempt.logPath)} - trying again...") retried = true case e: Exception => @@ -1401,7 +1421,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) deleted = fs.delete(log, true) } catch { case _: AccessControlException => - logInfo(s"No permission to delete $log, ignoring.") + logInfo(log"No permission to delete ${MDC(PATH, log)}, ignoring.") case ioe: IOException => logError(log"IOException in cleaning ${MDC(PATH, log)}", ioe) } @@ -1553,7 +1573,7 @@ private[history] class AppListingListener( val allProperties = event.environmentDetails("Spark Properties").toMap attempt.viewAcls = emptyStringToNone(allProperties.get(UI_VIEW_ACLS.key)) - attempt.adminAcls = emptyStringToNone(allProperties.get(ADMIN_ACLS.key)) + attempt.adminAcls = emptyStringToNone(allProperties.get(UI.ADMIN_ACLS.key)) attempt.viewAclsGroups = emptyStringToNone(allProperties.get(UI_VIEW_ACLS_GROUPS.key)) attempt.adminAclsGroups = emptyStringToNone(allProperties.get(ADMIN_ACLS_GROUPS.key)) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 7362634d5b09e..6e559dc4492ea 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -28,7 +28,8 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.Utils.addRenderLogHandler -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.History import org.apache.spark.internal.config.UI._ @@ -115,9 +116,9 @@ class HistoryServer( // requested, and the proper data should be served at that point. // Also, make sure that the redirect url contains the query string present in the request. val redirect = if (shouldAppendAttemptId) { - req.getRequestURI.stripSuffix("/") + "/" + attemptId.get + req.getRequestURI.stripSuffix("/") + "/" + attemptId.get + "/" } else { - req.getRequestURI + req.getRequestURI.stripSuffix("/") + "/" } val query = Option(req.getQueryString).map("?" 
+ _).getOrElse("") res.sendRedirect(res.encodeRedirectURL(redirect + query)) @@ -301,7 +302,6 @@ object HistoryServer extends Logging { val securityManager = createSecurityManager(conf) val providerName = conf.get(History.PROVIDER) - .getOrElse(classOf[FsHistoryProvider].getName()) val provider = Utils.classForName[ApplicationHistoryProvider](providerName) .getConstructor(classOf[SparkConf]) .newInstance(conf) @@ -332,8 +332,8 @@ object HistoryServer extends Logging { } if (config.get(ACLS_ENABLE)) { - logInfo(s"${ACLS_ENABLE.key} is configured, " + - s"clearing it and only using ${History.HISTORY_SERVER_UI_ACLS_ENABLE.key}") + logInfo(log"${MDC(KEY, ACLS_ENABLE.key)} is configured, " + + log"clearing it and only using ${MDC(KEY2, History.HISTORY_SERVER_UI_ACLS_ENABLE.key)}") config.set(ACLS_ENABLE, false) } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala index 01cc59e1d2e6e..2fdf7a473a298 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala @@ -21,6 +21,7 @@ import scala.annotation.tailrec import org.apache.spark.SparkConf import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.{ConfigEntry, History} import org.apache.spark.util.Utils /** @@ -44,47 +45,62 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin case Nil => - case _ => - printUsageAndExit(1) + case other => + val errorMsg = s"Unrecognized options: ${other.mkString(" ")}\n" + printUsageAndExit(1, errorMsg) } } - // This mutates the SparkConf, so all accesses to it must be made after this line - Utils.loadDefaultSparkProperties(conf, propertiesFile) + // This mutates the SparkConf, so all accesses to it must be made after this line + Utils.loadDefaultSparkProperties(conf, propertiesFile) - private def printUsageAndExit(exitCode: Int): Unit = { - // scalastyle:off println - System.err.println( - """ - |Usage: HistoryServer [options] - | - |Options: - | --properties-file FILE Path to a custom Spark properties file. - | Default is conf/spark-defaults.conf. - | - |Configuration options can be set by setting the corresponding JVM system property. - |History Server options are always available; additional options depend on the provider. 
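The redirect change in HistoryServer above makes both branches end with a trailing slash, so relative links on the application page resolve under the attempt rather than its parent. A simplified sketch of the normalization (hypothetical helper name, using an Option where the real code checks shouldAppendAttemptId):

  object RedirectSketch {
    // Both branches of the patched code reduce to: strip any trailing slash, then add exactly one.
    def normalize(requestUri: String, attemptId: Option[String]): String =
      attemptId match {
        case Some(id) => requestUri.stripSuffix("/") + "/" + id + "/"
        case None     => requestUri.stripSuffix("/") + "/"
      }

    def main(args: Array[String]): Unit = {
      println(normalize("/history/app-123", Some("1")))   // /history/app-123/1/
      println(normalize("/history/app-123/", None))       // /history/app-123/
      // With the trailing slash, a relative href such as "jobs/" resolves to
      // /history/app-123/1/jobs/ instead of /history/app-123/jobs/.
    }
  }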
- | - |History Server options: - | - | spark.history.ui.port Port where server will listen for connections - | (default 18080) - | spark.history.acls.enable Whether to enable view acls for all applications - | (default false) - | spark.history.provider Name of history provider class (defaults to - | file system-based provider) - | spark.history.retainedApplications Max number of application UIs to keep loaded in memory - | (default 50) - |FsHistoryProvider options: - | - | spark.history.fs.logDirectory Directory where app logs are stored - | (default: file:/tmp/spark-events) - | spark.history.fs.update.interval How often to reload log data from storage - | (in seconds, default: 10) - |""".stripMargin) - // scalastyle:on println + // scalastyle:off line.size.limit println + private def printUsageAndExit(exitCode: Int, error: String = ""): Unit = { + val configs = History.getClass.getDeclaredFields + .filter(f => classOf[ConfigEntry[_]].isAssignableFrom(f.getType)) + .map { f => + f.setAccessible(true) + f.get(History).asInstanceOf[ConfigEntry[_]] + } + val maxConfigLength = configs.map(_.key.length).max + val sb = new StringBuilder( + s""" + |${error}Usage: HistoryServer [options] + | + |Options: + | ${"--properties-file FILE".padTo(maxConfigLength, ' ')} Path to a custom Spark properties file. + | ${"".padTo(maxConfigLength, ' ')} Default is conf/spark-defaults.conf. + | + |Configuration options can be set by setting the corresponding JVM system property. + |History Server options are always available; additional options depend on the provider. + | + |""".stripMargin) + + def printConfigs(configs: Array[ConfigEntry[_]]): Unit = { + configs.sortBy(_.key).foreach { conf => + sb.append(" ").append(conf.key.padTo(maxConfigLength, ' ')) + var currentDocLen = 0 + val intention = "\n" + " " * (maxConfigLength + 2) + conf.doc.split("\\s+").foreach { word => + if (currentDocLen + word.length > 60) { + sb.append(intention).append(" ").append(word) + currentDocLen = word.length + 1 + } else { + sb.append(" ").append(word) + currentDocLen += word.length + 1 + } + } + sb.append(intention).append(" (Default: ").append(conf.defaultValueString).append(")\n") + } + } + val (common, fs) = configs.partition(!_.key.startsWith("spark.history.fs.")) + sb.append("History Server options:\n") + printConfigs(common) + sb.append("FsHistoryProvider options:\n") + printConfigs(fs) + System.err.println(sb.toString()) + // scalastyle:on line.size.limit println System.exit(exitCode) } - } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala index a84e1b1819542..122ed299242f5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala @@ -25,7 +25,9 @@ import scala.collection.mutable.{HashMap, ListBuffer} import org.apache.commons.io.FileUtils import org.apache.spark.SparkConf -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.config.History import org.apache.spark.internal.config.History._ import org.apache.spark.internal.config.History.HybridStoreDiskBackend.ROCKSDB import org.apache.spark.status.KVUtils @@ -57,7 +59,7 @@ private class HistoryServerDiskManager( throw new IllegalArgumentException(s"Failed to create app directory ($appStoreDir).") } private 
val extension = - if (conf.get(HYBRID_STORE_DISK_BACKEND) == ROCKSDB.toString) ".rdb" else ".ldb" + if (conf.get(History.HYBRID_STORE_DISK_BACKEND) == ROCKSDB.toString) ".rdb" else ".ldb" private val tmpStoreDir = new File(path, "temp") if (!tmpStoreDir.isDirectory() && !tmpStoreDir.mkdir()) { @@ -99,9 +101,9 @@ private class HistoryServerDiskManager( } } - logInfo("Initialized disk manager: " + - s"current usage = ${Utils.bytesToString(currentUsage.get())}, " + - s"max usage = ${Utils.bytesToString(maxUsage)}") + logInfo(log"Initialized disk manager:" + + log" current usage = ${MDC(NUM_BYTES_CURRENT, Utils.bytesToString(currentUsage.get()))}," + + log" max usage = ${MDC(NUM_BYTES_MAX, Utils.bytesToString(maxUsage))}") } /** @@ -126,8 +128,9 @@ private class HistoryServerDiskManager( updateUsage(needed) val current = currentUsage.get() if (current > maxUsage) { - logInfo(s"Lease of ${Utils.bytesToString(needed)} may cause usage to exceed max " + - s"(${Utils.bytesToString(current)} > ${Utils.bytesToString(maxUsage)})") + logInfo(log"Lease of ${MDC(NUM_BYTES, Utils.bytesToString(needed))} may cause" + + log" usage to exceed max (${MDC(NUM_BYTES_CURRENT, Utils.bytesToString(current))}" + + log" > ${MDC(NUM_BYTES_MAX, Utils.bytesToString(maxUsage))})") } new Lease(tmp, needed) @@ -237,16 +240,19 @@ private class HistoryServerDiskManager( if (evicted.nonEmpty) { val freed = evicted.map { info => - logInfo(s"Deleting store for ${info.appId}/${info.attemptId}.") + logInfo(log"Deleting store for" + + log" ${MDC(APP_ID, info.appId)}/${MDC(APP_ATTEMPT_ID, info.attemptId)}.") deleteStore(new File(info.path)) updateUsage(-info.size, committed = true) info.size }.sum - logInfo(s"Deleted ${evicted.size} store(s) to free ${Utils.bytesToString(freed)} " + - s"(target = ${Utils.bytesToString(size)}).") + logInfo(log"Deleted ${MDC(NUM_BYTES_EVICTED, evicted.size)} store(s)" + + log" to free ${MDC(NUM_BYTES_TO_FREE, Utils.bytesToString(freed))}" + + log" (target = ${MDC(NUM_BYTES, Utils.bytesToString(size))}).") } else { - logWarning(s"Unable to free any space to make room for ${Utils.bytesToString(size)}.") + logWarning(log"Unable to free any space to make room for " + + log"${MDC(NUM_BYTES, Utils.bytesToString(size))}.") } } } @@ -312,8 +318,9 @@ private class HistoryServerDiskManager( if (committedUsage.get() > maxUsage) { val current = Utils.bytesToString(committedUsage.get()) val max = Utils.bytesToString(maxUsage) - logWarning(s"Commit of application $appId / $attemptId causes maximum disk usage to be " + - s"exceeded ($current > $max)") + logWarning(log"Commit of application ${MDC(APP_ID, appId)} / " + + log"${MDC(APP_ATTEMPT_ID, attemptId)} causes maximum disk usage to be " + + log"exceeded (${MDC(NUM_BYTES, current)} > ${MDC(NUM_BYTES_MAX, max)})") } updateApplicationStoreInfo(appId, attemptId, newSize) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerMemoryManager.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerMemoryManager.scala index b95f1ed24f376..6e3dbb1170998 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerMemoryManager.scala @@ -22,7 +22,8 @@ import java.util.concurrent.atomic.AtomicLong import scala.collection.mutable.HashMap import org.apache.spark.SparkConf -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import 
org.apache.spark.internal.config.History._ import org.apache.spark.io.CompressionCodec import org.apache.spark.util.Utils @@ -39,9 +40,9 @@ private class HistoryServerMemoryManager( private[history] val active = new HashMap[(String, Option[String]), Long]() def initialize(): Unit = { - logInfo("Initialized memory manager: " + - s"current usage = ${Utils.bytesToString(currentUsage.get())}, " + - s"max usage = ${Utils.bytesToString(maxUsage)}") + logInfo(log"Initialized memory manager: " + + log"current usage = ${MDC(NUM_BYTES_CURRENT, Utils.bytesToString(currentUsage.get()))}, " + + log"max usage = ${MDC(NUM_BYTES_MAX, Utils.bytesToString(maxUsage))}") } def lease( @@ -58,8 +59,8 @@ private class HistoryServerMemoryManager( active(appId -> attemptId) = memoryUsage } currentUsage.addAndGet(memoryUsage) - logInfo(s"Leasing ${Utils.bytesToString(memoryUsage)} memory usage for " + - s"app $appId / $attemptId") + logInfo(log"Leasing ${MDC(NUM_BYTES, Utils.bytesToString(memoryUsage))} memory usage for " + + log"app ${MDC(APP_ID, appId)} / ${MDC(APP_ATTEMPT_ID, attemptId)}") } def release(appId: String, attemptId: Option[String]): Unit = { @@ -68,8 +69,8 @@ private class HistoryServerMemoryManager( memoryUsage match { case Some(m) => currentUsage.addAndGet(-m) - logInfo(s"Released ${Utils.bytesToString(m)} memory usage for " + - s"app $appId / $attemptId") + logInfo(log"Released ${MDC(NUM_BYTES, Utils.bytesToString(m))} memory usage for " + + log"app ${MDC(APP_ID, appId)} / ${MDC(APP_ATTEMPT_ID, attemptId)}") case None => } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala index fb067f10c5a4f..4332544e4491c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala @@ -22,7 +22,8 @@ import java.nio.file.{FileAlreadyExistsException, Files, Paths} import scala.reflect.ClassTag -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.io.CompressionCodec import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.ArrayImplicits._ @@ -56,7 +57,7 @@ private[master] class FileSystemPersistenceEngine( override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { - logWarning(s"Error deleting ${f.getPath()}") + logWarning(log"Error deleting ${MDC(PATH, f.getPath())}") } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 2834434256358..84e67cba33a9f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -33,7 +33,7 @@ import org.apache.spark.deploy.master.MasterMessages._ import org.apache.spark.deploy.master.ui.MasterWebUI import org.apache.spark.deploy.rest.StandaloneRestServer import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{APP_DESC, APP_ID, EXECUTOR_ID, RETRY_COUNT} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Deploy._ import org.apache.spark.internal.config.Deploy.WorkerSelectionPolicy._ @@ -42,7 +42,7 @@ import 
org.apache.spark.internal.config.Worker._ import org.apache.spark.metrics.{MetricsSystem, MetricsSystemInstances} import org.apache.spark.resource.{ResourceInformation, ResourceProfile, ResourceRequirement, ResourceUtils} import org.apache.spark.rpc._ -import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, Serializer} +import org.apache.spark.serializer.{JavaSerializer, Serializer} import org.apache.spark.util.{SparkUncaughtExceptionHandler, ThreadUtils, Utils} import org.apache.spark.util.ArrayImplicits._ @@ -143,8 +143,9 @@ private[deploy] class Master( } override def onStart(): Unit = { - logInfo("Starting Spark master at " + masterUrl) - logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") + logInfo(log"Starting Spark master at ${MDC(LogKeys.MASTER_URL, masterUrl)}") + logInfo(log"Running Spark version" + + log" ${MDC(LogKeys.SPARK_VERSION, org.apache.spark.SPARK_VERSION)}") webUi = new MasterWebUI(this, webUiPort) webUi.bind() masterWebUiUrl = webUi.webUrl @@ -157,8 +158,8 @@ private[deploy] class Master( masterWebUiUrl = uiReverseProxyUrl.get + "/" } webUi.addProxy() - logInfo(s"Spark Master is acting as a reverse proxy. Master, Workers and " + - s"Applications UIs are available at $masterWebUiUrl") + logInfo(log"Spark Master is acting as a reverse proxy. Master, Workers and " + + log"Applications UIs are available at ${MDC(LogKeys.WEB_URL, masterWebUiUrl)}") } checkForWorkerTimeOutTask = forwardMessageThread.scheduleAtFixedRate( () => Utils.tryLogNonFatalError { self.send(CheckForWorkerTimeOut) }, @@ -179,10 +180,7 @@ private[deploy] class Master( masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler) applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler) - val serializer = RecoverySerializer.withName(conf.get(RECOVERY_SERIALIZER)) match { - case RecoverySerializer.JAVA => new JavaSerializer(conf) - case RecoverySerializer.KRYO => new KryoSerializer(conf) - } + val serializer = new JavaSerializer(conf) val (persistenceEngine_, leaderElectionAgent_) = recoveryMode match { case "ZOOKEEPER" => logInfo("Persisting recovery state to ZooKeeper") @@ -245,7 +243,7 @@ private[deploy] class Master( } else { RecoveryState.RECOVERING } - logInfo("I have been elected leader! New state: " + state) + logInfo(log"I have been elected leader! New state: ${MDC(LogKeys.RECOVERY_STATE, state)}") if (state == RecoveryState.RECOVERING) { if (beginRecovery(storedApps, storedDrivers, storedWorkers)) { recoveryCompletionTask = forwardMessageThread.schedule(new Runnable { @@ -294,10 +292,11 @@ private[deploy] class Master( if (state == RecoveryState.STANDBY) { // ignore, don't send response } else { - logInfo("Registering app " + description.name) + logInfo(log"Registering app ${MDC(LogKeys.APP_NAME, description.name)}") val app = createApplication(description, driver) registerApplication(app) - logInfo("Registered app " + description.name + " with ID " + app.id) + logInfo(log"Registered app ${MDC(LogKeys.APP_NAME, description.name)} with" + + log" ID ${MDC(LogKeys.APP_ID, app.id)}") persistenceEngine.addApplication(app) driver.send(RegisteredApplication(app.id, self)) schedule() @@ -317,22 +316,23 @@ private[deploy] class Master( workerInfo.lastHeartbeat = System.currentTimeMillis() case None => if (workers.map(_.id).contains(workerId)) { - logWarning(s"Got heartbeat from unregistered worker $workerId." + - " Asking it to re-register.") + logWarning(log"Got heartbeat from unregistered worker " + + log"${MDC(LogKeys.WORKER_ID, workerId)}. 
Asking it to re-register.") worker.send(ReconnectWorker(masterUrl)) } else { - logWarning(s"Got heartbeat from unregistered worker $workerId." + - " This worker was never registered, so ignoring the heartbeat.") + logWarning(log"Got heartbeat from unregistered worker " + + log"${MDC(LogKeys.WORKER_ID, workerId)}. " + + log"This worker was never registered, so ignoring the heartbeat.") } } case MasterChangeAcknowledged(appId) => idToApp.get(appId) match { case Some(app) => - logInfo("Application has been re-registered: " + appId) + logInfo(log"Application has been re-registered: ${MDC(LogKeys.APP_ID, appId)}") app.state = ApplicationState.WAITING case None => - logWarning("Master change ack from unknown app: " + appId) + logWarning(log"Master change ack from unknown app: ${MDC(LogKeys.APP_ID, appId)}") } if (canCompleteRecovery) { completeRecovery() } @@ -340,7 +340,7 @@ private[deploy] class Master( case WorkerSchedulerStateResponse(workerId, execResponses, driverResponses) => idToWorker.get(workerId) match { case Some(worker) => - logInfo("Worker has been re-registered: " + workerId) + logInfo(log"Worker has been re-registered: ${MDC(LogKeys.WORKER_ID, workerId)}") worker.state = WorkerState.ALIVE val validExecutors = execResponses.filter( @@ -366,7 +366,7 @@ private[deploy] class Master( } } case None => - logWarning("Scheduler state from unknown worker: " + workerId) + logWarning(log"Scheduler state from unknown worker: ${MDC(LogKeys.WORKER_ID, workerId)}") } if (canCompleteRecovery) { completeRecovery() } @@ -392,11 +392,12 @@ private[deploy] class Master( } } case None => - logWarning("Worker state from unknown worker: " + workerId) + logWarning(log"Worker state from unknown worker: ${MDC(LogKeys.WORKER_ID, workerId)}") } case UnregisterApplication(applicationId) => - logInfo(s"Received unregister request from application $applicationId") + logInfo(log"Received unregister request from application" + + log" ${MDC(LogKeys.APP_ID, applicationId)}") idToApp.get(applicationId).foreach(finishApplication) case CheckForWorkerTimeOut => @@ -411,7 +412,7 @@ private[deploy] class Master( "Can only accept driver submissions in ALIVE state." context.reply(SubmitDriverResponse(self, false, None, msg)) } else { - logInfo("Driver submitted " + description.command.mainClass) + logInfo(log"Driver submitted ${MDC(LogKeys.CLASS_NAME, description.command.mainClass)}") val driver = createDriver(description) persistenceEngine.addDriver(driver) waitingDrivers += driver @@ -431,7 +432,7 @@ private[deploy] class Master( s"Can only kill drivers in ALIVE state." 
context.reply(KillDriverResponse(self, driverId, success = false, msg)) } else { - logInfo("Asked to kill driver " + driverId) + logInfo(log"Asked to kill driver ${MDC(LogKeys.DRIVER_ID, driverId)}") val driver = drivers.find(_.id == driverId) driver match { case Some(d) => @@ -447,12 +448,13 @@ private[deploy] class Master( } } // TODO: It would be nice for this to be a synchronous response - val msg = s"Kill request for $driverId submitted" + val msg = log"Kill request for ${MDC(LogKeys.DRIVER_ID, driverId)} submitted" logInfo(msg) - context.reply(KillDriverResponse(self, driverId, success = true, msg)) + context.reply(KillDriverResponse(self, driverId, success = true, msg.message)) case None => val msg = s"Driver $driverId has already finished or does not exist" - logWarning(msg) + logWarning(log"Driver ${MDC(LogKeys.DRIVER_ID, driverId)} " + + log"has already finished or does not exist") context.reply(KillDriverResponse(self, driverId, success = false, msg)) } } @@ -477,7 +479,7 @@ private[deploy] class Master( w.endpoint.send(KillDriver(driverId)) } } - logInfo(s"Kill request for $driverId submitted") + logInfo(log"Kill request for ${MDC(LogKeys.DRIVER_ID, driverId)} submitted") } context.reply(KillAllDriversResponse(self, true, "Kill request for all drivers submitted")) } @@ -485,7 +487,8 @@ private[deploy] class Master( case RequestClearCompletedDriversAndApps => val numDrivers = completedDrivers.length val numApps = completedApps.length - logInfo(s"Asked to clear $numDrivers completed drivers and $numApps completed apps.") + logInfo(log"Asked to clear ${MDC(LogKeys.NUM_DRIVERS, numDrivers)} completed drivers and" + + log" ${MDC(LogKeys.NUM_APPS, numApps)} completed apps.") completedDrivers.clear() completedApps.clear() context.reply(true) @@ -550,7 +553,8 @@ private[deploy] class Master( if (ExecutorState.isFinished(state)) { // Remove this executor from the worker and app - logInfo(s"Removing executor ${exec.fullId} because it is $state") + logInfo(log"Removing executor ${MDC(LogKeys.EXECUTOR_ID, exec.fullId)}" + + log" because it is ${MDC(LogKeys.EXECUTOR_STATE, state)}") // If an application has already finished, preserve its // state to display its information properly on the UI if (!appInfo.isFinished) { @@ -569,23 +573,24 @@ private[deploy] class Master( && maxExecutorRetries >= 0) { // < 0 disables this application-killing path val execs = appInfo.executors.values if (!execs.exists(_.state == ExecutorState.RUNNING)) { - logError(log"Application ${MDC(APP_DESC, appInfo.desc.name)} " + - log"with ID ${MDC(APP_ID, appInfo.id)} " + - log"failed ${MDC(RETRY_COUNT, appInfo.retryCount)} times; removing it") + logError(log"Application ${MDC(LogKeys.APP_DESC, appInfo.desc.name)} " + + log"with ID ${MDC(LogKeys.APP_ID, appInfo.id)} " + + log"failed ${MDC(LogKeys.NUM_RETRY, appInfo.retryCount)} times; removing it") removeApplication(appInfo, ApplicationState.FAILED) } } } schedule() case None => - logWarning(s"Got status update for unknown executor $appId/$execId") + logWarning(log"Got status update for unknown executor ${MDC(LogKeys.APP_ID, appId)}" + + log"/${MDC(LogKeys.EXECUTOR_ID, execId)}") } context.reply(true) } override def onDisconnected(address: RpcAddress): Unit = { // The disconnected client could've been either a worker or an app; remove whichever it was - logInfo(s"$address got disassociated, removing it.") + logInfo(log"${MDC(LogKeys.RPC_ADDRESS, address)} got disassociated, removing it.") addressToWorker.get(address).foreach(removeWorker(_, s"${address} got 
disassociated")) addressToApp.get(address).foreach(finishApplication) if (state == RecoveryState.RECOVERING && canCompleteRecovery) { completeRecovery() } @@ -595,19 +600,20 @@ private[deploy] class Master( workers.count(_.state == WorkerState.UNKNOWN) == 0 && apps.count(_.state == ApplicationState.UNKNOWN) == 0 - private var recoveryStartTimeNs = 0L + private var recoveryStartTimeMs = 0L private def beginRecovery(storedApps: Seq[ApplicationInfo], storedDrivers: Seq[DriverInfo], storedWorkers: Seq[WorkerInfo]): Boolean = { - recoveryStartTimeNs = System.nanoTime() + recoveryStartTimeMs = System.currentTimeMillis() for (app <- storedApps) { - logInfo("Trying to recover app: " + app.id) + logInfo(log"Trying to recover app: ${MDC(LogKeys.APP_ID, app.id)}") try { registerApplication(app) app.state = ApplicationState.UNKNOWN app.driver.send(MasterChanged(self, masterWebUiUrl)) } catch { - case e: Exception => logInfo("App " + app.id + " had exception on reconnect") + case e: Exception => logInfo(log"App ${MDC(LogKeys.APP_ID, app.id)}" + + log" had exception on reconnect") } } @@ -618,13 +624,14 @@ private[deploy] class Master( } for (worker <- storedWorkers) { - logInfo("Trying to recover worker: " + worker.id) + logInfo(log"Trying to recover worker: ${MDC(LogKeys.WORKER_ID, worker.id)}") try { registerWorker(worker) worker.state = WorkerState.UNKNOWN worker.endpoint.send(MasterChanged(self, masterWebUiUrl)) } catch { - case e: Exception => logInfo("Worker " + worker.id + " had exception on reconnect") + case e: Exception => logInfo(log"Worker ${MDC(LogKeys.WORKER_ID, worker.id)}" + + log" had exception on reconnect") } } @@ -652,20 +659,23 @@ private[deploy] class Master( // Reschedule drivers which were not claimed by any workers drivers.filter(_.worker.isEmpty).foreach { d => - logWarning(s"Driver ${d.id} was not found after master recovery") + logWarning(log"Driver ${MDC(LogKeys.DRIVER_ID, d.id)} " + + log"was not found after master recovery") if (d.desc.supervise) { - logWarning(s"Re-launching ${d.id}") + logWarning(log"Re-launching ${MDC(LogKeys.DRIVER_ID, d.id)}") relaunchDriver(d) } else { removeDriver(d.id, DriverState.ERROR, None) - logWarning(s"Did not re-launch ${d.id} because it was not supervised") + logWarning(log"Did not re-launch " + + log"${MDC(LogKeys.DRIVER_ID, d.id)} because it was not supervised") } } state = RecoveryState.ALIVE schedule() - val timeTakenNs = System.nanoTime() - recoveryStartTimeNs - logInfo(f"Recovery complete in ${timeTakenNs / 1000000000d}%.3fs - resuming operations!") + val timeTakenMs = System.currentTimeMillis() - recoveryStartTimeMs + logInfo(log"Recovery complete in ${MDC(LogKeys.TOTAL_TIME, timeTakenMs)} ms" + + log" - resuming operations!") } private[master] def handleRegisterWorker( @@ -678,13 +688,15 @@ private[deploy] class Master( workerWebUiUrl: String, masterAddress: RpcAddress, resources: Map[String, ResourceInformation]): Unit = { - logInfo("Registering worker %s:%d with %d cores, %s RAM".format( - workerHost, workerPort, cores, Utils.megabytesToString(memory))) + logInfo(log"Registering worker" + + log" ${MDC(LogKeys.WORKER_HOST, workerHost)}:${MDC(LogKeys.WORKER_PORT, workerPort)}" + + log" with ${MDC(LogKeys.NUM_CORES, cores)} cores," + + log" ${MDC(LogKeys.MEMORY_SIZE, Utils.megabytesToString(memory))} RAM") if (state == RecoveryState.STANDBY) { workerRef.send(MasterInStandby) } else if (idToWorker.contains(id)) { if (idToWorker(id).state == WorkerState.UNKNOWN) { - logInfo("Worker has been re-registered: " + id) + logInfo(log"Worker 
has been re-registered: ${MDC(LogKeys.WORKER_ID, id)}") idToWorker(id).state = WorkerState.ALIVE } workerRef.send(RegisteredWorker(self, masterWebUiUrl, masterAddress, true)) @@ -699,8 +711,8 @@ private[deploy] class Master( schedule() } else { val workerAddress = worker.endpoint.address - logWarning("Worker registration failed. Attempted to re-register worker at same " + - "address: " + workerAddress) + logWarning(log"Worker registration failed. Attempted to re-register worker at same " + + log"address: ${MDC(LogKeys.WORKER_URL, workerAddress)}") workerRef.send(RegisterWorkerFailed("Attempted to re-register worker at same address: " + workerAddress)) } @@ -822,7 +834,8 @@ private[deploy] class Master( // first. for (app <- waitingApps) { for (rpId <- app.getRequestedRPIds()) { - logInfo(s"Start scheduling for app ${app.id} with rpId: $rpId") + logInfo(log"Start scheduling for app ${MDC(LogKeys.APP_ID, app.id)} with" + + log" rpId: ${MDC(LogKeys.RESOURCE_PROFILE_ID, rpId)}") val resourceDesc = app.getResourceDescriptionForRpId(rpId) val coresPerExecutor = resourceDesc.coresPerExecutor.getOrElse(1) @@ -836,12 +849,13 @@ private[deploy] class Master( case CORES_FREE_DESC => aliveWorkers.sortBy(w => (w.coresFree, w.id)).reverse case MEMORY_FREE_ASC => aliveWorkers.sortBy(w => (w.memoryFree, w.id)) case MEMORY_FREE_DESC => aliveWorkers.sortBy(w => (w.memoryFree, w.id)).reverse - case WORKER_ID => aliveWorkers.sortBy(_.id) + case WorkerSelectionPolicy.WORKER_ID => aliveWorkers.sortBy(_.id) } val appMayHang = waitingApps.length == 1 && waitingApps.head.executors.isEmpty && usableWorkers.isEmpty if (appMayHang) { - logWarning(s"App ${app.id} requires more resource than any of Workers could have.") + logWarning(log"App ${MDC(LogKeys.APP_ID, app.id)} requires more resource " + + log"than any of Workers could have.") } val assignedCores = scheduleExecutorsOnWorkers(app, rpId, resourceDesc, usableWorkers, spreadOutApps) @@ -956,7 +970,8 @@ private[deploy] class Master( curPos = (curPos + 1) % numWorkersAlive } if (!launched && isClusterIdle) { - logWarning(s"Driver ${driver.id} requires more resource than any of Workers could have.") + logWarning(log"Driver ${MDC(LogKeys.DRIVER_ID, driver.id)} " + + log"requires more resource than any of Workers could have.") } } } else { @@ -970,8 +985,8 @@ private[deploy] class Master( launchDriver(worker, driver) waitingDrivers -= driver case _ => - logWarning( - s"Driver ${driver.id} requires more resource than any of Workers could have.") + logWarning(log"Driver ${MDC(LogKeys.DRIVER_ID, driver.id)} " + + log"requires more resource than any of Workers could have.") } } } @@ -980,7 +995,8 @@ private[deploy] class Master( } private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = { - logInfo("Launching executor " + exec.fullId + " on worker " + worker.id) + logInfo(log"Launching executor ${MDC(LogKeys.EXECUTOR_ID, exec.fullId)}" + + log" on worker ${MDC(LogKeys.WORKER_ID, worker.id)}") worker.addExecutor(exec) worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id, exec.rpId, exec.application.desc, exec.cores, exec.memory, exec.resources)) @@ -1005,7 +1021,8 @@ private[deploy] class Master( // The old worker must thus be dead, so we will remove it and accept the new worker. 
removeWorker(oldWorker, "Worker replaced by a new worker with same address") } else { - logInfo("Attempted to re-register worker at same address: " + workerAddress) + logInfo(log"Attempted to re-register worker at same address:" + + log" ${MDC(LogKeys.RPC_ADDRESS, workerAddress)}") return false } } @@ -1032,7 +1049,8 @@ private[deploy] class Master( .values val workersToRemoveHostPorts = workersToRemove.map(_.hostPort) - logInfo(s"Decommissioning the workers with host:ports ${workersToRemoveHostPorts}") + logInfo(log"Decommissioning the workers with host:ports" + + log" ${MDC(LogKeys.HOST_PORT, workersToRemoveHostPorts)}") // The workers are removed async to avoid blocking the receive loop for the entire batch self.send(DecommissionWorkers(workersToRemove.map(_.id).toSeq)) @@ -1043,7 +1061,8 @@ private[deploy] class Master( private def decommissionWorker(worker: WorkerInfo): Unit = { if (worker.state != WorkerState.DECOMMISSIONED) { - logInfo("Decommissioning worker %s on %s:%d".format(worker.id, worker.host, worker.port)) + logInfo(log"Decommissioning worker ${MDC(LogKeys.WORKER_ID, worker.id)}" + + log" on ${MDC(LogKeys.WORKER_HOST, worker.host)}:${MDC(LogKeys.WORKER_PORT, worker.port)}") worker.setState(WorkerState.DECOMMISSIONED) for (exec <- worker.executors.values) { logInfo("Telling app of decommission executors") @@ -1060,19 +1079,21 @@ private[deploy] class Master( // On recovery do not add a decommissioned executor persistenceEngine.removeWorker(worker) } else { - logWarning("Skipping decommissioning worker %s on %s:%d as worker is already decommissioned". - format(worker.id, worker.host, worker.port)) + logWarning(log"Skipping decommissioning worker ${MDC(LogKeys.WORKER_ID, worker.id)} " + + log"on ${MDC(LogKeys.WORKER_HOST, worker.host)}:" + + log"${MDC(LogKeys.WORKER_PORT, worker.port)} as worker is already decommissioned") } } private def removeWorker(worker: WorkerInfo, msg: String): Unit = { - logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port) + logInfo(log"Removing worker ${MDC(LogKeys.WORKER_ID, worker.id)} on" + + log" ${MDC(LogKeys.WORKER_HOST, worker.host)}:${MDC(LogKeys.WORKER_PORT, worker.port)}") worker.setState(WorkerState.DEAD) idToWorker -= worker.id addressToWorker -= worker.endpoint.address for (exec <- worker.executors.values) { - logInfo("Telling app of lost executor: " + exec.id) + logInfo(log"Telling app of lost executor: ${MDC(LogKeys.EXECUTOR_ID, exec.id)}") exec.application.driver.send(ExecutorUpdated( exec.id, ExecutorState.LOST, Some(s"worker lost: $msg"), None, Some(worker.host))) exec.state = ExecutorState.LOST @@ -1080,14 +1101,15 @@ private[deploy] class Master( } for (driver <- worker.drivers.values) { if (driver.desc.supervise) { - logInfo(s"Re-launching ${driver.id}") + logInfo(log"Re-launching ${MDC(LogKeys.DRIVER_ID, driver.id)}") relaunchDriver(driver) } else { - logInfo(s"Not re-launching ${driver.id} because it was not supervised") + logInfo(log"Not re-launching ${MDC(LogKeys.DRIVER_ID, driver.id)}" + + log" because it was not supervised") removeDriver(driver.id, DriverState.ERROR, None) } } - logInfo(s"Telling app of lost worker: " + worker.id) + logInfo(log"Telling app of lost worker: ${MDC(LogKeys.WORKER_ID, worker.id)}") apps.filterNot(completedApps.contains(_)).foreach { app => app.driver.send(WorkerRemoved(worker.id, worker.host, msg)) } @@ -1127,7 +1149,8 @@ private[deploy] class Master( private[master] def registerApplication(app: ApplicationInfo): Unit = { val appAddress = app.driver.address if 
(addressToApp.contains(appAddress)) { - logInfo("Attempted to re-register application at same address: " + appAddress) + logInfo(log"Attempted to re-register application at same" + + log" address: ${MDC(LogKeys.RPC_ADDRESS, appAddress)}") return } @@ -1145,7 +1168,7 @@ private[deploy] class Master( def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = { if (apps.contains(app)) { - logInfo("Removing app " + app.id) + logInfo(log"Removing app ${MDC(LogKeys.APP_ID, app.id)}") apps -= app idToApp -= app.id endpointToApp -= app.driver @@ -1192,13 +1215,15 @@ private[deploy] class Master( resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Boolean = { idToApp.get(appId) match { case Some(appInfo) => - logInfo(s"Application $appId requested executors: ${resourceProfileToTotalExecs}.") + logInfo(log"Application ${MDC(LogKeys.APP_ID, appId)} requested executors:" + + log" ${MDC(LogKeys.RESOURCE_PROFILE_TO_TOTAL_EXECS, resourceProfileToTotalExecs)}.") appInfo.requestExecutors(resourceProfileToTotalExecs) schedule() true case None => - logWarning(s"Unknown application $appId requested executors:" + - s" ${resourceProfileToTotalExecs}.") + logWarning(log"Unknown application " + + log"${MDC(LogKeys.APP_ID, appId)} requested executors:" + + log" ${MDC(LogKeys.RESOURCE_PROFILE_TO_TOTAL_EXECS, resourceProfileToTotalExecs)}.") false } } @@ -1215,7 +1240,8 @@ private[deploy] class Master( private def handleKillExecutors(appId: String, executorIds: Seq[Int]): Boolean = { idToApp.get(appId) match { case Some(appInfo) => - logInfo(s"Application $appId requests to kill executors: " + executorIds.mkString(", ")) + logInfo(log"Application ${MDC(LogKeys.APP_ID, appId)} requests to kill" + + log" executors: ${MDC(LogKeys.EXECUTOR_IDS, executorIds.mkString(", "))}") val (known, unknown) = executorIds.partition(appInfo.executors.contains) known.foreach { executorId => val desc = appInfo.executors(executorId) @@ -1223,13 +1249,15 @@ private[deploy] class Master( killExecutor(desc) } if (unknown.nonEmpty) { - logWarning(s"Application $appId attempted to kill non-existent executors: " - + unknown.mkString(", ")) + logWarning(log"Application ${MDC(LogKeys.APP_ID, appId)} attempted to kill " + + log"non-existent executors: " + + log"${MDC(LogKeys.EXECUTOR_IDS, unknown.mkString(", "))}") } schedule() true case None => - logWarning(s"Unregistered application $appId requested us to kill executors!") + logWarning(log"Unregistered application ${MDC(LogKeys.APP_ID, appId)} " + + log"requested us to kill executors!") false } } @@ -1248,7 +1276,8 @@ private[deploy] class Master( } catch { case e: NumberFormatException => // scalastyle:off line.size.limit - logError(log"Encountered executor with a non-integer ID: ${MDC(EXECUTOR_ID, executorId)}. Ignoring") + logError(log"Encountered executor with a non-integer ID: " + + log"${MDC(LogKeys.EXECUTOR_ID, executorId)}. 
Ignoring") // scalastyle:on None } @@ -1283,8 +1312,8 @@ private[deploy] class Master( for (worker <- toRemove) { if (worker.state != WorkerState.DEAD) { val workerTimeoutSecs = TimeUnit.MILLISECONDS.toSeconds(workerTimeoutMs) - logWarning("Removing %s because we got no heartbeat in %d seconds".format( - worker.id, workerTimeoutSecs)) + logWarning(log"Removing ${MDC(LogKeys.WORKER_ID, worker.id)} because we got no heartbeat " + + log"in ${MDC(LogKeys.TIME_UNITS, workerTimeoutMs)} ms") removeWorker(worker, s"Not receiving heartbeat for $workerTimeoutSecs seconds") } else { if (worker.lastHeartbeat < currentTime - ((reaperIterations + 1) * workerTimeoutMs)) { @@ -1308,7 +1337,7 @@ private[deploy] class Master( } private def launchDriver(worker: WorkerInfo, driver: DriverInfo): Unit = { - logInfo("Launching driver " + driver.id + " on worker " + worker.id) + logInfo(log"Launching driver ${MDC(LogKeys.DRIVER_ID, driver.id)} on worker ${MDC(LogKeys.WORKER_ID, worker.id)}") worker.addDriver(driver) driver.worker = Some(worker) worker.endpoint.send(LaunchDriver(driver.id, driver.desc, driver.resources)) @@ -1321,7 +1350,8 @@ private[deploy] class Master( exception: Option[Exception]): Unit = { drivers.find(d => d.id == driverId) match { case Some(driver) => - logInfo(s"Removing driver: $driverId ($finalState)") + logInfo(log"Removing driver: ${MDC(LogKeys.DRIVER_ID, driverId)}" + + log" (${MDC(LogKeys.DRIVER_STATE, finalState)})") drivers -= driver if (completedDrivers.size >= retainedDrivers) { val toRemove = math.max(retainedDrivers / 10, 1) @@ -1334,7 +1364,7 @@ private[deploy] class Master( driver.worker.foreach(w => w.removeDriver(driver)) schedule() case None => - logWarning(s"Asked to remove unknown driver: $driverId") + logWarning(log"Asked to remove unknown driver: ${MDC(LogKeys.DRIVER_ID, driverId)}") } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala b/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala index 106acc9a79446..964b115865aef 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.Deploy.{RECOVERY_COMPRESSION_CODEC, RECOVERY_DIRECTORY} import org.apache.spark.io.CompressionCodec import org.apache.spark.serializer.Serializer @@ -57,7 +57,7 @@ private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: val recoveryDir = conf.get(RECOVERY_DIRECTORY) def createPersistenceEngine(): PersistenceEngine = { - logInfo("Persisting recovery state to directory: " + recoveryDir) + logInfo(log"Persisting recovery state to directory: ${MDC(LogKeys.PATH, recoveryDir)}") val codec = conf.get(RECOVERY_COMPRESSION_CODEC).map(c => CompressionCodec.createCodec(conf, c)) new FileSystemPersistenceEngine(recoveryDir, serializer, codec) } @@ -76,7 +76,8 @@ private[master] class RocksDBRecoveryModeFactory(conf: SparkConf, serializer: Se def createPersistenceEngine(): PersistenceEngine = { val recoveryDir = conf.get(RECOVERY_DIRECTORY) - logInfo("Persisting recovery state to directory: " + recoveryDir) + logInfo(log"Persisting recovery state to directory: " + + log"${MDC(LogKeys.PATH, recoveryDir)}") new 
RocksDBPersistenceEngine(recoveryDir, serializer) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/EnvironmentPage.scala new file mode 100644 index 0000000000000..190e821524ba0 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/EnvironmentPage.scala @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.master.ui + +import scala.xml.Node + +import jakarta.servlet.http.HttpServletRequest + +import org.apache.spark.{SparkConf, SparkEnv} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.ui._ +import org.apache.spark.util.Utils + +private[ui] class EnvironmentPage( + parent: MasterWebUI, + conf: SparkConf) extends WebUIPage("Environment") { + + def render(request: HttpServletRequest): Seq[Node] = { + val details = SparkEnv.environmentDetails(conf, SparkHadoopUtil.get.newConfiguration(conf), + "", Seq.empty, Seq.empty, Seq.empty, Map.empty) + val jvmInformation = details("JVM Information").sorted + val sparkProperties = Utils.redact(conf, details("Spark Properties")).sorted + val hadoopProperties = Utils.redact(conf, details("Hadoop Properties")).sorted + val systemProperties = Utils.redact(conf, details("System Properties")).sorted + val metricsProperties = Utils.redact(conf, details("Metrics Properties")).sorted + val classpathEntries = details("Classpath Entries").sorted + + val runtimeInformationTable = UIUtils.listingTable(propertyHeader, propertyRow, + jvmInformation, fixedWidth = true, headerClasses = headerClasses) + val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, + sparkProperties, fixedWidth = true, headerClasses = headerClasses) + val hadoopPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, + hadoopProperties, fixedWidth = true, headerClasses = headerClasses) + val systemPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, + systemProperties, fixedWidth = true, headerClasses = headerClasses) + val metricsPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, + metricsProperties, fixedWidth = true, headerClasses = headerClasses) + val classpathEntriesTable = UIUtils.listingTable(classPathHeader, classPathRow, + classpathEntries, fixedWidth = true, headerClasses = headerClasses) + + val content = + + + +

+      <span>
+        <h4>Runtime Information</h4>
+        {runtimeInformationTable}
+
+        <h4>Spark Properties</h4>
+        {sparkPropertiesTable}
+
+        <h4>Hadoop Properties</h4>
+        {hadoopPropertiesTable}
+
+        <h4>System Properties</h4>
+        {systemPropertiesTable}
+
+        <h4>Metrics Properties</h4>
+        {metricsPropertiesTable}
+
+        <h4>Classpath Entries</h4>
+        {classpathEntriesTable}
+      </span>
+    UIUtils.basicSparkPage(request, content, "Environment")
+  }
+
+  private def propertyHeader = Seq("Name", "Value")
+  private def classPathHeader = Seq("Resource", "Source")
+  private def headerClasses = Seq("sorttable_alpha", "sorttable_alpha")
+  private def headerClassesNoSortValues = Seq("sorttable_numeric", "sorttable_nosort")
+
+  private def jvmRowDataPre(kv: (String, String)) =
+    <tr><td>{kv._1}</td><td><pre>{kv._2}</pre></td></tr>
+  private def propertyRow(kv: (String, String)) = <tr><td>{kv._1}</td><td>{kv._2}</td></tr>
+  private def classPathRow(data: (String, String)) = <tr><td>{data._1}</td><td>{data._2}</td></tr>
+}
+
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala
index d07f299d52ba2..1248b1c368e71 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala
@@ -169,8 +169,9 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
          {state.completedDrivers.count(_.state == DriverState.ERROR)} Error,
          {state.completedDrivers.count(_.state == DriverState.RELAUNCHING)} Relaunching)
-          <li><strong>Status:</strong>
-            {state.status}</li>
+          <li><strong>Status:</strong> {state.status}
+            (<a href="environment/">Environment</a>,
+            <a href="logPage/?logType=out">Log</a>)</li>
  • diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala index da3c91956689e..6c7a8f582d915 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala @@ -24,7 +24,8 @@ import jakarta.servlet.http.{HttpServlet, HttpServletRequest, HttpServletRespons import org.apache.spark.deploy.DeployMessages.{DecommissionWorkersOnHosts, MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.Utils.addRenderLogHandler import org.apache.spark.deploy.master.Master -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{HOSTS, NUM_REMOVED_WORKERS} import org.apache.spark.internal.config.DECOMMISSION_ENABLED import org.apache.spark.internal.config.UI.MASTER_UI_DECOMMISSION_ALLOW_MODE import org.apache.spark.internal.config.UI.UI_KILL_ENABLED @@ -54,6 +55,11 @@ class MasterWebUI( val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(new LogPage(this)) + val envPage = new EnvironmentPage(this, master.conf) + attachPage(envPage) + this.attachHandler(createServletHandler("/environment", + (request: HttpServletRequest) => envPage.render(request), + master.conf)) attachPage(masterPage) addStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR) addRenderLogHandler(this, master.conf) @@ -73,7 +79,8 @@ class MasterWebUI( } else { val removedWorkers = masterEndpointRef.askSync[Integer]( DecommissionWorkersOnHosts(hostnames)) - logInfo(s"Decommissioning of hosts $hostnames decommissioned $removedWorkers workers") + logInfo(log"Decommissioning of hosts ${MDC(HOSTS, hostnames)}" + + log" decommissioned ${MDC(NUM_REMOVED_WORKERS, removedWorkers)} workers") if (removedWorkers > 0) { resp.setStatus(HttpServletResponse.SC_OK) } else if (removedWorkers == 0) { diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala index 9107b2f5528c1..4fb95033cecef 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala @@ -34,7 +34,7 @@ import jakarta.servlet.http.HttpServletResponse import org.apache.spark.{SPARK_VERSION => sparkVersion, SparkConf, SparkException} import org.apache.spark.deploy.SparkApplication import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CLASS_NAME, ERROR, SUBMISSION_ID} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.Utils /** @@ -79,7 +79,7 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { * it to the user. Otherwise, report the error message provided by the server. */ def createSubmission(request: CreateSubmissionRequest): SubmitRestProtocolResponse = { - logInfo(s"Submitting a request to launch an application in $master.") + logInfo(log"Submitting a request to launch an application in ${MDC(MASTER_URL, master)}.") var handled: Boolean = false var response: SubmitRestProtocolResponse = null for (m <- masters if !handled) { @@ -109,7 +109,9 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { /** Request that the server kill the specified submission. 
*/ def killSubmission(submissionId: String): SubmitRestProtocolResponse = { - logInfo(s"Submitting a request to kill submission $submissionId in $master.") + logInfo(log"Submitting a request to kill submission " + + log"${MDC(SUBMISSION_ID, submissionId)} in " + + log"${MDC(MASTER_URL, master)}.") var handled: Boolean = false var response: SubmitRestProtocolResponse = null for (m <- masters if !handled) { @@ -138,7 +140,7 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { /** Request that the server kill all submissions. */ def killAllSubmissions(): SubmitRestProtocolResponse = { - logInfo(s"Submitting a request to kill all submissions in $master.") + logInfo(log"Submitting a request to kill all submissions in ${MDC(MASTER_URL, master)}.") var handled: Boolean = false var response: SubmitRestProtocolResponse = null for (m <- masters if !handled) { @@ -167,7 +169,7 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { /** Request that the server clears all submissions and applications. */ def clear(): SubmitRestProtocolResponse = { - logInfo(s"Submitting a request to clear $master.") + logInfo(log"Submitting a request to clear ${MDC(MASTER_URL, master)}.") var handled: Boolean = false var response: SubmitRestProtocolResponse = null for (m <- masters if !handled) { @@ -196,7 +198,7 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { /** Check the readiness of Master. */ def readyz(): SubmitRestProtocolResponse = { - logInfo(s"Submitting a request to check the status of $master.") + logInfo(log"Submitting a request to check the status of ${MDC(MASTER_URL, master)}.") var handled: Boolean = false var response: SubmitRestProtocolResponse = new ErrorResponse for (m <- masters if !handled) { @@ -227,7 +229,9 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { def requestSubmissionStatus( submissionId: String, quiet: Boolean = false): SubmitRestProtocolResponse = { - logInfo(s"Submitting a request for the status of submission $submissionId in $master.") + logInfo(log"Submitting a request for the status of submission " + + log"${MDC(SUBMISSION_ID, submissionId)} in " + + log"${MDC(MASTER_URL, master)}.") var handled: Boolean = false var response: SubmitRestProtocolResponse = null @@ -440,7 +444,8 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { if (submitResponse.success) { val submissionId = submitResponse.submissionId if (submissionId != null) { - logInfo(s"Submission successfully created as $submissionId. Polling submission state...") + logInfo(log"Submission successfully created as ${MDC(SUBMISSION_ID, submissionId)}. 
" + + log"Polling submission state...") pollSubmissionStatus(submissionId) } else { // should never happen @@ -470,13 +475,17 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { val exception = Option(statusResponse.message) // Log driver state, if present driverState match { - case Some(state) => logInfo(s"State of driver $submissionId is now $state.") + case Some(state) => + logInfo(log"State of driver ${MDC(SUBMISSION_ID, submissionId)} is now " + + log"${MDC(DRIVER_STATE, state)}.") case _ => logError(log"State of driver ${MDC(SUBMISSION_ID, submissionId)} was not found!") } // Log worker node, if present (workerId, workerHostPort) match { - case (Some(id), Some(hp)) => logInfo(s"Driver is running on worker $id at $hp.") + case (Some(id), Some(hp)) => + logInfo( + log"Driver is running on worker ${MDC(WORKER_ID, id)} at ${MDC(HOST_PORT, hp)}.") case _ => } // Log exception stack trace, if present @@ -490,7 +499,8 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { /** Log the response sent by the server in the REST application submission protocol. */ private def handleRestResponse(response: SubmitRestProtocolResponse): Unit = { - logInfo(s"Server responded with ${response.messageType}:\n${response.toJson}") + logInfo(log"Server responded with ${MDC(CLASS_NAME, response.messageType)}:\n" + + log"${MDC(RESULT, response.toJson)}") } /** Log an appropriate error if the response sent by the server is not of the expected type. */ @@ -509,7 +519,7 @@ private[spark] class RestSubmissionClient(master: String) extends Logging { */ private def handleConnectionException(masterUrl: String): Boolean = { if (!lostMasters.contains(masterUrl)) { - logWarning(s"Unable to connect to server ${masterUrl}.") + logWarning(log"Unable to connect to server ${MDC(MASTER_URL, masterUrl)}.") lostMasters += masterUrl } lostMasters.size >= masters.length diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala index bb91c7e7f4a22..8e534828e7778 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala @@ -28,7 +28,8 @@ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SPARK_VERSION => sparkVersion, SparkConf} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.Utils /** @@ -76,7 +77,8 @@ private[spark] abstract class RestSubmissionServer( def start(): Int = { val (server, boundPort) = Utils.startServiceOnPort[Server](requestedPort, doStart, masterConf) _server = Some(server) - logInfo(s"Started REST server for submitting applications on $host with port $boundPort") + logInfo(log"Started REST server for submitting applications on ${MDC(HOST, host)}" + + log" with port ${MDC(PORT, boundPort)}") boundPort } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala index 1b2e41bc0a2e2..3a262a0d19fb5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.security.token.{Token, TokenIdentifier} 
import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.SERVICE_NAME +import org.apache.spark.internal.LogKeys._ import org.apache.spark.security.HadoopDelegationTokenProvider import org.apache.spark.util.Utils @@ -50,7 +50,7 @@ private[security] class HBaseDelegationTokenProvider logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] - logInfo(s"Get token from HBase: ${token.toString}") + logInfo(log"Get token from HBase: ${MDC(TOKEN, token.toString)}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => @@ -94,7 +94,7 @@ private[security] class HBaseDelegationTokenProvider logDebug("Attempting to fetch HBase security token.") val token = obtainTokenMethod.invoke(null, hbaseConnection) .asInstanceOf[Token[_ <: TokenIdentifier]] - logInfo(s"Get token from HBase: ${token.toString}") + logInfo(log"Get token from HBase: ${MDC(TOKEN, token.toString)}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala index 54a24927ded4e..de517acbf8c5b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala @@ -31,7 +31,8 @@ import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.UpdateDelegationTokens @@ -182,7 +183,7 @@ private[spark] class HadoopDelegationTokenManager( private def scheduleRenewal(delay: Long): Unit = { val _delay = math.max(0, delay) - logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(_delay)}.") + logInfo(log"Scheduling renewal in ${MDC(LogKeys.TIME_UNITS, UIUtils.formatDuration(_delay))}.") val renewalTask = new Runnable() { override def run(): Unit = { @@ -211,8 +212,9 @@ private[spark] class HadoopDelegationTokenManager( null case e: Exception => val delay = TimeUnit.SECONDS.toMillis(sparkConf.get(CREDENTIALS_RENEWAL_RETRY_WAIT)) - logWarning(s"Failed to update tokens, will try again in ${UIUtils.formatDuration(delay)}!" + - " If this happens too often tasks will fail.", e) + logWarning(log"Failed to update tokens, will try again in " + + log"${MDC(LogKeys.TIME_UNITS, UIUtils.formatDuration(delay))}!" 
+ + log" If this happens too often tasks will fail.", e) scheduleRenewal(delay) null } @@ -234,8 +236,10 @@ private[spark] class HadoopDelegationTokenManager( val now = System.currentTimeMillis val ratio = sparkConf.get(CREDENTIALS_RENEWAL_INTERVAL_RATIO) val delay = (ratio * (nextRenewal - now)).toLong - logInfo(s"Calculated delay on renewal is $delay, based on next renewal $nextRenewal " + - s"and the ratio $ratio, and current time $now") + logInfo(log"Calculated delay on renewal is ${MDC(LogKeys.DELAY, delay)}," + + log" based on next renewal ${MDC(LogKeys.NEXT_RENEWAL_TIME, nextRenewal)}" + + log" and the ratio ${MDC(LogKeys.CREDENTIALS_RENEWAL_INTERVAL_RATIO, ratio)}," + + log" and current time ${MDC(LogKeys.CURRENT_TIME, now)}") scheduleRenewal(delay) creds } @@ -244,13 +248,13 @@ private[spark] class HadoopDelegationTokenManager( private def doLogin(): UserGroupInformation = { if (principal != null) { - logInfo(s"Attempting to login to KDC using principal: $principal") + logInfo(log"Attempting to login to KDC using principal: ${MDC(LogKeys.PRINCIPAL, principal)}") require(new File(keytab).isFile(), s"Cannot find keytab at $keytab.") val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab) logInfo("Successfully logged into KDC.") ugi } else if (!SparkHadoopUtil.get.isProxyUser(UserGroupInformation.getCurrentUser())) { - logInfo(s"Attempting to load user's ticket cache.") + logInfo("Attempting to load user's ticket cache.") val ccache = sparkConf.getenv("KRB5CCNAME") val user = Option(sparkConf.getenv("KRB5PRINCIPAL")).getOrElse( UserGroupInformation.getCurrentUser().getUserName()) @@ -296,7 +300,8 @@ private[spark] object HadoopDelegationTokenManager extends Logging { deprecatedProviderEnabledConfigs.foreach { pattern => val deprecatedKey = pattern.format(serviceName) if (sparkConf.contains(deprecatedKey)) { - logWarning(s"${deprecatedKey} is deprecated. Please use ${key} instead.") + logWarning(log"${MDC(LogKeys.DEPRECATED_KEY, deprecatedKey)} is deprecated. 
" + + log"Please use ${MDC(LogKeys.CONFIG, key)} instead.") } } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala index 8eb45238b4772..b47f9e5a43afc 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.security.HadoopDelegationTokenProvider import org.apache.spark.util.Utils @@ -78,7 +79,7 @@ private[deploy] class HadoopFSDelegationTokenProvider nextRenewalDate } catch { case NonFatal(e) => - logWarning(s"Failed to get token from service $serviceName", e) + logWarning(log"Failed to get token from service ${MDC(SERVICE_NAME, serviceName)}", e) None } } @@ -116,10 +117,11 @@ private[deploy] class HadoopFSDelegationTokenProvider filesystems.foreach { fs => if (fsToExclude.contains(fs.getUri.getHost)) { // YARN RM skips renewing token with empty renewer - logInfo(s"getting token for: $fs with empty renewer to skip renewal") + logInfo(log"getting token for: ${MDC(FILE_SYSTEM, fs)} with empty renewer to skip renewal") Utils.tryLogNonFatalError { fs.addDelegationTokens("", creds) } } else { - logInfo(s"getting token for: $fs with renewer $renewer") + logInfo(log"getting token for: ${MDC(FILE_SYSTEM, fs)} with" + + log" renewer ${MDC(TOKEN_RENEWER, renewer)}") Utils.tryLogNonFatalError { fs.addDelegationTokens(renewer, creds) } } } @@ -146,7 +148,8 @@ private[deploy] class HadoopFSDelegationTokenProvider val identifier = token.decodeIdentifier().asInstanceOf[AbstractDelegationTokenIdentifier] val tokenKind = token.getKind.toString val interval = newExpiration - getIssueDate(tokenKind, identifier) - logInfo(s"Renewal interval is $interval for token $tokenKind") + logInfo(log"Renewal interval is ${MDC(TOTAL_TIME, interval)} for" + + log" token ${MDC(TOKEN_KIND, tokenKind)}") interval }.toOption } @@ -157,17 +160,20 @@ private[deploy] class HadoopFSDelegationTokenProvider val now = System.currentTimeMillis() val issueDate = identifier.getIssueDate if (issueDate > now) { - logWarning(s"Token $kind has set up issue date later than current time. (provided: " + - s"$issueDate / current timestamp: $now) Please make sure clocks are in sync between " + - "machines. If the issue is not a clock mismatch, consult token implementor to check " + - "whether issue date is valid.") + logWarning(log"Token ${MDC(TOKEN_KIND, kind)} has set up issue date later than " + + log"current time (provided: " + + log"${MDC(ISSUE_DATE, issueDate)} / current timestamp: ${MDC(CURRENT_TIME, now)}). " + + log"Please make sure clocks are in sync between " + + log"machines. If the issue is not a clock mismatch, consult token implementor to check " + + log"whether issue date is valid.") issueDate } else if (issueDate > 0L) { issueDate } else { - logWarning(s"Token $kind has not set up issue date properly. (provided: $issueDate) " + - s"Using current timestamp ($now) as issue date instead. 
Consult token implementor to fix " + - "the behavior.") + logWarning(log"Token ${MDC(TOKEN_KIND, kind)} has not set up issue date properly " + + log"(provided: ${MDC(ISSUE_DATE, issueDate)}). " + + log"Using current timestamp (${MDC(CURRENT_TIME, now)} as issue date instead. " + + log"Consult token implementor to fix the behavior.") now } } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala index d1190ca46c2a8..a3e7276fc83e1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala @@ -24,7 +24,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.{SecurityManager, SSLOptions} import org.apache.spark.deploy.Command -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.launcher.WorkerCommandBuilder import org.apache.spark.util.Utils @@ -120,7 +120,8 @@ object CommandUtils extends Logging { Utils.copyStream(in, out, true) } catch { case e: IOException => - logInfo("Redirection to " + file + " closed: " + e.getMessage) + logInfo(log"Redirection to ${MDC(LogKeys.FILE_NAME, file)} closed: " + + log"${MDC(LogKeys.ERROR, e.getMessage)}") } } }.start() diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index e7fca402a8870..bb96ecb38a640 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -31,7 +31,8 @@ import org.apache.spark.deploy.DeployMessages.DriverStateChanged import org.apache.spark.deploy.StandaloneResourceUtils.prepareResourcesFile import org.apache.spark.deploy.master.DriverState import org.apache.spark.deploy.master.DriverState.DriverState -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{DRIVER_RESOURCES_FILE, SPARK_DRIVER_PREFIX} import org.apache.spark.internal.config.UI.UI_REVERSE_PROXY import org.apache.spark.internal.config.Worker.WORKER_DRIVER_TERMINATE_TIMEOUT @@ -91,7 +92,7 @@ private[deploy] class DriverRunner( var shutdownHook: AnyRef = null try { shutdownHook = ShutdownHookManager.addShutdownHook { () => - logInfo(s"Worker shutting down, killing driver $driverId") + logInfo(log"Worker shutting down, killing driver ${MDC(DRIVER_ID, driverId)}") kill() } @@ -131,8 +132,8 @@ private[deploy] class DriverRunner( process.foreach { p => val exitCode = Utils.terminateProcess(p, driverTerminateTimeoutMs) if (exitCode.isEmpty) { - logWarning("Failed to terminate driver process: " + p + - ". This process will likely be orphaned.") + logWarning(log"Failed to terminate driver process: ${MDC(PROCESS, p)} " + + log". 
This process will likely be orphaned.") } } } @@ -158,7 +159,8 @@ private[deploy] class DriverRunner( val jarFileName = new URI(driverDesc.jarUrl).getPath.split("/").last val localJarFile = new File(driverDir, jarFileName) if (!localJarFile.exists()) { // May already exist if running multiple workers on one node - logInfo(s"Copying user jar ${driverDesc.jarUrl} to $localJarFile") + logInfo(log"Copying user jar ${MDC(JAR_URL, driverDesc.jarUrl)}" + + log" to ${MDC(FILE_NAME, localJarFile)}") Utils.fetchFile( driverDesc.jarUrl, driverDir, @@ -232,7 +234,7 @@ private[deploy] class DriverRunner( val redactedCommand = Utils.redactCommandLineArgs(conf, command.command) .mkString("\"", "\" \"", "\"") while (keepTrying) { - logInfo("Launch Command: " + redactedCommand) + logInfo(log"Launch Command: ${MDC(COMMAND, redactedCommand)}") synchronized { if (killed) { return exitCode } @@ -249,7 +251,8 @@ private[deploy] class DriverRunner( if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000L) { waitSeconds = 1 } - logInfo(s"Command exited with status $exitCode, re-launching after $waitSeconds s.") + logInfo(log"Command exited with status ${MDC(EXIT_CODE, exitCode)}," + + log" re-launching after ${MDC(TIME_UNITS, waitSeconds)} s.") sleeper.sleep(waitSeconds) waitSeconds = waitSeconds * 2 // exponential back-off } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index 9176897163846..4f42088903464 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -21,7 +21,8 @@ import java.io.File import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys.RPC_ADDRESS import org.apache.spark.rpc.RpcEnv import org.apache.spark.util._ @@ -43,7 +44,7 @@ object DriverWrapper extends Logging { val host: String = Utils.localHostName() val port: Int = sys.props.getOrElse(config.DRIVER_PORT.key, "0").toInt val rpcEnv = RpcEnv.create("Driver", host, port, conf, new SecurityManager(conf)) - logInfo(s"Driver address: ${rpcEnv.address}") + logInfo(log"Driver address: ${MDC(RPC_ADDRESS, rpcEnv.address)}") rpcEnv.setupEndpoint("workerWatcher", new WorkerWatcher(rpcEnv, workerUrl)) val currentLoader = Thread.currentThread.getContextClassLoader diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 5547593a28f5e..8d0fb7a54f72a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -28,7 +28,8 @@ import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.{ApplicationDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages.ExecutorStateChanged import org.apache.spark.deploy.StandaloneResourceUtils.prepareResourcesFile -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.SPARK_EXECUTOR_PREFIX import org.apache.spark.internal.config.UI._ import org.apache.spark.resource.ResourceInformation @@ -87,7 +88,7 @@ private[deploy] class 
ExecutorRunner( if (state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING) { state = ExecutorState.FAILED } - killProcess(Some("Worker shutting down")) } + killProcess("Worker shutting down") } } /** @@ -95,7 +96,7 @@ private[deploy] class ExecutorRunner( * * @param message the exception message which caused the executor's death */ - private def killProcess(message: Option[String]): Unit = { + private def killProcess(message: String): Unit = { var exitCode: Option[Int] = None if (process != null) { logInfo("Killing process!") @@ -107,14 +108,14 @@ private[deploy] class ExecutorRunner( } exitCode = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS) if (exitCode.isEmpty) { - logWarning("Failed to terminate process: " + process + - ". This process will likely be orphaned.") + logWarning(log"Failed to terminate process: ${MDC(PROCESS, process)}" + + log". This process will likely be orphaned.") } } try { - worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode)) + worker.send(ExecutorStateChanged(appId, execId, state, Some(message), exitCode)) } catch { - case e: IllegalStateException => logWarning(e.getMessage(), e) + case e: IllegalStateException => logWarning(log"${MDC(ERROR, e.getMessage())}", e) } } @@ -162,7 +163,7 @@ private[deploy] class ExecutorRunner( val command = builder.command() val redactedCommand = Utils.redactCommandLineArgs(conf, command.asScala.toSeq) .mkString("\"", "\" \"", "\"") - logInfo(s"Launch command: $redactedCommand") + logInfo(log"Launch command: ${MDC(COMMAND, redactedCommand)}") builder.directory(executorDir) builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator)) @@ -203,13 +204,13 @@ private[deploy] class ExecutorRunner( worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))) } catch { case interrupted: InterruptedException => - logInfo("Runner thread for executor " + fullId + " interrupted") + logInfo(log"Runner thread for executor ${MDC(EXECUTOR_ID, fullId)} interrupted") state = ExecutorState.KILLED - killProcess(None) + killProcess(s"Runner thread for executor $fullId interrupted") case e: Exception => logError("Error running executor", e) state = ExecutorState.FAILED - killProcess(Some(e.toString)) + killProcess(s"Error running executor: $e") } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 0659c26fd15b6..7ff7974ab59f6 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -38,7 +38,7 @@ import org.apache.spark.deploy.StandaloneResourceUtils._ import org.apache.spark.deploy.master.{DriverState, Master} import org.apache.spark.deploy.worker.ui.WorkerWebUI import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.internal.config.UI._ import org.apache.spark.internal.config.Worker._ @@ -73,9 +73,9 @@ private[deploy] class Worker( // If worker decommissioning is enabled register a handler on the configured signal to shutdown. 
if (conf.get(config.DECOMMISSION_ENABLED)) { val signal = conf.get(config.Worker.WORKER_DECOMMISSION_SIGNAL) - logInfo(s"Registering SIG$signal handler to trigger decommissioning.") - SignalUtils.register(signal, s"Failed to register SIG$signal handler - " + - "disabling worker decommission feature.") { + logInfo(log"Registering SIG${MDC(SIGNAL, signal)} handler to trigger decommissioning.") + SignalUtils.register(signal, log"Failed to register SIG${MDC(SIGNAL, signal)} " + + log"handler - disabling worker decommission feature.") { self.send(WorkerDecommissionSigReceived) true } @@ -105,8 +105,12 @@ private[deploy] class Worker( private val INITIAL_REGISTRATION_RETRIES = conf.get(WORKER_INITIAL_REGISTRATION_RETRIES) private val TOTAL_REGISTRATION_RETRIES = conf.get(WORKER_MAX_REGISTRATION_RETRIES) if (INITIAL_REGISTRATION_RETRIES > TOTAL_REGISTRATION_RETRIES) { - logInfo(s"${WORKER_INITIAL_REGISTRATION_RETRIES.key} ($INITIAL_REGISTRATION_RETRIES) is " + - s"capped by ${WORKER_MAX_REGISTRATION_RETRIES.key} ($TOTAL_REGISTRATION_RETRIES)") + logInfo( + log"${MDC(CONFIG, WORKER_INITIAL_REGISTRATION_RETRIES.key)} " + + log"(${MDC(VALUE, INITIAL_REGISTRATION_RETRIES)}) is capped by " + + log"${MDC(CONFIG2, WORKER_MAX_REGISTRATION_RETRIES.key)} " + + log"(${MDC(MAX_ATTEMPTS, TOTAL_REGISTRATION_RETRIES)})" + ) } private val FUZZ_MULTIPLIER_INTERVAL_LOWER_BOUND = 0.500 private val REGISTRATION_RETRY_FUZZ_MULTIPLIER = { @@ -235,10 +239,11 @@ private[deploy] class Worker( override def onStart(): Unit = { assert(!registered) - logInfo("Starting Spark worker %s:%d with %d cores, %s RAM".format( - host, port, cores, Utils.megabytesToString(memory))) - logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") - logInfo("Spark home: " + sparkHome) + logInfo(log"Starting Spark worker ${MDC(HOST, host)}:${MDC(PORT, port)} " + + log"with ${MDC(NUM_CORES, cores)} cores, " + + log"${MDC(MEMORY_SIZE, Utils.megabytesToString(memory))} RAM") + logInfo(log"Running Spark version ${MDC(SPARK_VERSION, org.apache.spark.SPARK_VERSION)}") + logInfo(log"Spark home: ${MDC(PATH, sparkHome)}") createWorkDir() startExternalShuffleService() setupWorkerResources() @@ -299,8 +304,9 @@ private[deploy] class Worker( master = Some(masterRef) connected = true if (reverseProxy) { - logInfo("WorkerWebUI is available at %s/proxy/%s".format( - activeMasterWebUiUrl.stripSuffix("/"), workerId)) + logInfo( + log"WorkerWebUI is available at ${MDC(WEB_URL, activeMasterWebUiUrl.stripSuffix("/"))}" + + log"/proxy/${MDC(WORKER_ID, workerId)}") // if reverseProxyUrl is not set, then we continue to generate relative URLs // starting with "/" throughout the UI and do not use activeMasterWebUiUrl val proxyUrl = conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/") @@ -317,12 +323,13 @@ private[deploy] class Worker( registerMasterThreadPool.submit(new Runnable { override def run(): Unit = { try { - logInfo("Connecting to master " + masterAddress + "...") + logInfo(log"Connecting to master ${MDC(MASTER_URL, masterAddress)}...") val masterEndpoint = rpcEnv.setupEndpointRef(masterAddress, Master.ENDPOINT_NAME) sendRegisterMessageToMaster(masterEndpoint) } catch { case ie: InterruptedException => // Cancelled - case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e) + case NonFatal(e) => logWarning( + log"Failed to connect to master ${MDC(MASTER_URL, masterAddress)}", e) } } }) @@ -340,7 +347,8 @@ private[deploy] class Worker( if (registered) { cancelLastRegistrationRetry() } else if (connectionAttemptCount <= 
TOTAL_REGISTRATION_RETRIES) { - logInfo(s"Retrying connection to master (attempt # $connectionAttemptCount)") + logInfo(log"Retrying connection to master (attempt # " + + log"${MDC(NUM_ATTEMPT, connectionAttemptCount)})") /** * Re-register with the active master this worker has been communicating with. If there * is none, then it means this worker is still bootstrapping and hasn't established a @@ -374,12 +382,14 @@ private[deploy] class Worker( registerMasterFutures = Array(registerMasterThreadPool.submit(new Runnable { override def run(): Unit = { try { - logInfo("Connecting to master " + masterAddress + "...") + logInfo(log"Connecting to master ${MDC(MASTER_URL, masterAddress)}...") val masterEndpoint = rpcEnv.setupEndpointRef(masterAddress, Master.ENDPOINT_NAME) sendRegisterMessageToMaster(masterEndpoint) } catch { case ie: InterruptedException => // Cancelled - case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e) + case NonFatal(e) => + logWarning(log"Failed to connect to master " + + log"${MDC(MASTER_URL, masterAddress)}", e) } } })) @@ -475,10 +485,11 @@ private[deploy] class Worker( // e.g. Master disconnect(maybe due to network drop) and recover immediately, see // SPARK-23191 for more details. if (duplicate) { - logWarning(s"Duplicate registration at master $preferredMasterAddress") + logWarning(log"Duplicate registration at master " + + log"${MDC(MASTER_URL, preferredMasterAddress)}") } - logInfo(s"Successfully registered with master $preferredMasterAddress") + logInfo(log"Successfully registered with master ${MDC(MASTER_URL, preferredMasterAddress)}") registered = true changeMaster(masterRef, masterWebUiUrl, masterAddress) forwardMessageScheduler.scheduleAtFixedRate( @@ -486,7 +497,8 @@ private[deploy] class Worker( 0, HEARTBEAT_MILLIS, TimeUnit.MILLISECONDS) if (CLEANUP_ENABLED) { logInfo( - s"Worker cleanup enabled; old application directories will be deleted in: $workDir") + log"Worker cleanup enabled; old application directories will be deleted in: " + + log"${MDC(PATH, workDir)}") forwardMessageScheduler.scheduleAtFixedRate( () => Utils.tryLogNonFatalError { self.send(WorkDirCleanup) }, CLEANUP_INTERVAL_MILLIS, CLEANUP_INTERVAL_MILLIS, TimeUnit.MILLISECONDS) @@ -534,7 +546,7 @@ private[deploy] class Worker( dir.isDirectory && !isAppStillRunning && !Utils.doesDirectoryContainAnyNewFiles(dir, APP_DATA_RETENTION_SECONDS) }.foreach { dir => - logInfo(s"Removing directory: ${dir.getPath}") + logInfo(log"Removing directory: ${MDC(PATH, dir.getPath)}") Utils.deleteRecursively(dir) // Remove some registeredExecutors information of DB in external shuffle service when @@ -557,7 +569,8 @@ private[deploy] class Worker( } case MasterChanged(masterRef, masterWebUiUrl) => - logInfo("Master has changed, new master is at " + masterRef.address.toSparkURL) + logInfo(log"Master has changed, new master is at " + + log"${MDC(MASTER_URL, masterRef.address.toSparkURL)}") changeMaster(masterRef, masterWebUiUrl, masterRef.address) val executorResponses = executors.values.map { e => @@ -570,17 +583,20 @@ private[deploy] class Worker( workerId, executorResponses.toList, driverResponses.toSeq)) case ReconnectWorker(masterUrl) => - logInfo(s"Master with url $masterUrl requested this worker to reconnect.") + logInfo( + log"Master with url ${MDC(MASTER_URL, masterUrl)} requested this worker to reconnect.") registerWithMaster() case LaunchExecutor(masterUrl, appId, execId, rpId, appDesc, cores_, memory_, resources_) => if (masterUrl != activeMasterUrl) { - logWarning("Invalid 
Master (" + masterUrl + ") attempted to launch executor.") + logWarning(log"Invalid Master (${MDC(MASTER_URL, masterUrl)}) " + + log"attempted to launch executor.") } else if (decommissioned) { logWarning("Asked to launch an executor while decommissioned. Not launching executor.") } else { try { - logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name)) + logInfo(log"Asked to launch executor ${MDC(APP_ID, appId)}/${MDC(EXECUTOR_ID, execId)}" + + log" for ${MDC(APP_DESC, appDesc.name)}") // Create the executor's working directory val executorDir = new File(workDir, appId + "/" + execId) @@ -600,7 +616,7 @@ private[deploy] class Worker( Some(appDir.getAbsolutePath()) } catch { case e: IOException => - logWarning(s"${e.getMessage}. Ignoring this directory.") + logWarning(log"${MDC(ERROR, e.getMessage)}. Ignoring this directory.") None } }.toImmutableArraySeq @@ -639,8 +655,8 @@ private[deploy] class Worker( } catch { case e: Exception => logError( - log"Failed to launch executor ${MDC(APP_ID, appId)}/${MDC(EXECUTOR_ID, execId)} " + - log"for ${MDC(APP_DESC, appDesc.name)}.", e) + log"Failed to launch executor ${MDC(APP_ID, appId)}/" + + log"${MDC(EXECUTOR_ID, execId)} for ${MDC(APP_DESC, appDesc.name)}.", e) if (executors.contains(appId + "/" + execId)) { executors(appId + "/" + execId).kill() executors -= appId + "/" + execId @@ -655,20 +671,21 @@ private[deploy] class Worker( case KillExecutor(masterUrl, appId, execId) => if (masterUrl != activeMasterUrl) { - logWarning("Invalid Master (" + masterUrl + ") attempted to kill executor " + execId) + logWarning(log"Invalid Master (${MDC(MASTER_URL, masterUrl)}) " + + log"attempted to kill executor ${MDC(EXECUTOR_ID, execId)}") } else { val fullId = appId + "/" + execId executors.get(fullId) match { case Some(executor) => - logInfo("Asked to kill executor " + fullId) + logInfo(log"Asked to kill executor ${MDC(EXECUTOR_ID, fullId)}") executor.kill() case None => - logInfo("Asked to kill unknown executor " + fullId) + logInfo(log"Asked to kill unknown executor ${MDC(EXECUTOR_ID, fullId)}") } } case LaunchDriver(driverId, driverDesc, resources_) => - logInfo(s"Asked to launch driver $driverId") + logInfo(log"Asked to launch driver ${MDC(DRIVER_ID, driverId)}") val driver = new DriverRunner( conf, driverId, @@ -688,7 +705,7 @@ private[deploy] class Worker( addResourcesUsed(resources_) case KillDriver(driverId) => - logInfo(s"Asked to kill driver $driverId") + logInfo(log"Asked to kill driver ${MDC(DRIVER_ID, driverId)}") drivers.get(driverId) match { case Some(runner) => runner.kill() @@ -728,7 +745,7 @@ private[deploy] class Worker( override def onDisconnected(remoteAddress: RpcAddress): Unit = { if (master.exists(_.address == remoteAddress) || masterAddressToConnect.contains(remoteAddress)) { - logInfo(s"$remoteAddress Disassociated !") + logInfo(log"${MDC(REMOTE_ADDRESS, remoteAddress)} Disassociated !") masterDisconnected() } } @@ -746,7 +763,7 @@ private[deploy] class Worker( try { appDirectories.remove(id).foreach { dirList => concurrent.Future { - logInfo(s"Cleaning up local directories for application $id") + logInfo(log"Cleaning up local directories for application ${MDC(APP_ID, id)}") dirList.foreach { dir => Utils.deleteRecursively(new File(dir)) } @@ -771,7 +788,8 @@ private[deploy] class Worker( case Some(masterRef) => masterRef.send(message) case None => logWarning( - s"Dropping $message because the connection to master has not yet been established") + log"Dropping ${MDC(MESSAGE, message)} " + + log"because 
the connection to master has not yet been established") } } @@ -821,7 +839,8 @@ private[deploy] class Worker( case None => logWarning( - s"Dropping $newState because the connection to master has not yet been established") + log"Dropping ${MDC(NEW_STATE, newState)} " + + log"because the connection to master has not yet been established") } } @@ -865,11 +884,11 @@ private[deploy] class Worker( private[deploy] def decommissionSelf(): Unit = { if (conf.get(config.DECOMMISSION_ENABLED) && !decommissioned) { decommissioned = true - logInfo(s"Decommission worker $workerId.") + logInfo(log"Decommission worker ${MDC(WORKER_ID, workerId)}.") } else if (decommissioned) { - logWarning(s"Worker $workerId already started decommissioning.") + logWarning(log"Worker ${MDC(WORKER_ID, workerId)} already started decommissioning.") } else { - logWarning(s"Receive decommission request, but decommission feature is disabled.") + logWarning("Receive decommission request, but decommission feature is disabled.") } } @@ -879,18 +898,20 @@ private[deploy] class Worker( val state = driverStateChanged.state state match { case DriverState.ERROR => - logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}") + logWarning(log"Driver ${MDC(DRIVER_ID, driverId)} " + + log"failed with unrecoverable exception: ${MDC(ERROR, exception.get)}") case DriverState.FAILED => - logWarning(s"Driver $driverId exited with failure") + logWarning(log"Driver ${MDC(DRIVER_ID, driverId)} exited with failure") case DriverState.FINISHED => registrationRetryTimer match { case Some(_) => - logWarning(s"Driver $driverId exited successfully while master is disconnected.") + logWarning(log"Driver ${MDC(DRIVER_ID, driverId)} " + + log"exited successfully while master is disconnected.") case _ => - logInfo(s"Driver $driverId exited successfully") + logInfo(log"Driver ${MDC(DRIVER_ID, driverId)} exited successfully") } case DriverState.KILLED => - logInfo(s"Driver $driverId was killed by user") + logInfo(log"Driver ${MDC(DRIVER_ID, driverId)} was killed by user") case _ => logDebug(s"Driver $driverId changed state to $state") } @@ -910,13 +931,22 @@ private[deploy] class Worker( if (ExecutorState.isFinished(state)) { val appId = executorStateChanged.appId val fullId = appId + "/" + executorStateChanged.execId - val message = executorStateChanged.message - val exitStatus = executorStateChanged.exitStatus + val message = executorStateChanged.message match { + case Some(msg) => + log" message ${MDC(MESSAGE, msg)}" + case None => + log"" + } + val exitStatus = executorStateChanged.exitStatus match { + case Some(status) => + log" exitStatus ${MDC(EXIT_CODE, status)}" + case None => + log"" + } executors.get(fullId) match { case Some(executor) => - logInfo("Executor " + fullId + " finished with state " + state + - message.map(" message " + _).getOrElse("") + - exitStatus.map(" exitStatus " + _).getOrElse("")) + logInfo(log"Executor ${MDC(EXECUTOR_ID, fullId)} finished with state " + + log"${MDC(EXECUTOR_STATE, state)}" + message + exitStatus) executors -= fullId finishedExecutors(fullId) = executor trimFinishedExecutorsIfNecessary() @@ -928,9 +958,8 @@ private[deploy] class Worker( shuffleService.executorRemoved(executorStateChanged.execId.toString, appId) } case None => - logInfo("Unknown Executor " + fullId + " finished with state " + state + - message.map(" message " + _).getOrElse("") + - exitStatus.map(" exitStatus " + _).getOrElse("")) + logInfo(log"Unknown Executor ${MDC(EXECUTOR_ID, fullId)} finished with state " + + 
log"${MDC(EXECUTOR_STATE, state)}" + message + exitStatus) } maybeCleanupApplication(appId) } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala index 2488994112742..bd07a0ade523d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala @@ -19,8 +19,8 @@ package org.apache.spark.deploy.worker import java.util.concurrent.atomic.AtomicBoolean -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.WORKER_URL +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.WORKER_URL import org.apache.spark.rpc._ /** @@ -35,7 +35,7 @@ private[spark] class WorkerWatcher( isChildProcessStopping: AtomicBoolean = new AtomicBoolean(false)) extends RpcEndpoint with Logging { - logInfo(s"Connecting to worker $workerUrl") + logInfo(log"Connecting to worker ${MDC(WORKER_URL, workerUrl)}") if (!isTesting) { rpcEnv.asyncSetupEndpointRefByURI(workerUrl) } @@ -64,12 +64,12 @@ private[spark] class WorkerWatcher( } override def receive: PartialFunction[Any, Unit] = { - case e => logWarning(s"Received unexpected message: $e") + case e => logWarning(log"Received unexpected message: ${MDC(LogKeys.ERROR, e)}") } override def onConnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { - logInfo(s"Successfully connected to $workerUrl") + logInfo(log"Successfully connected to ${MDC(WORKER_URL, workerUrl)}") } } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala index 006a388e98b5b..defce5acc6168 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala @@ -24,7 +24,7 @@ import scala.xml.{Node, Unparsed} import jakarta.servlet.http.HttpServletRequest import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{LOG_TYPE, PATH} +import org.apache.spark.internal.LogKeys.{LOG_TYPE, PATH} import org.apache.spark.ui.{UIUtils, WebUIPage} import org.apache.spark.util.Utils import org.apache.spark.util.logging.RollingFileAppender diff --git a/core/src/main/scala/org/apache/spark/errors/SparkCoreErrors.scala b/core/src/main/scala/org/apache/spark/errors/SparkCoreErrors.scala index a131f8233b0df..8a790291b4e72 100644 --- a/core/src/main/scala/org/apache/spark/errors/SparkCoreErrors.scala +++ b/core/src/main/scala/org/apache/spark/errors/SparkCoreErrors.scala @@ -501,6 +501,15 @@ private[spark] object SparkCoreErrors { "configVal" -> toConfVal(FALLBACK_COMPRESSION_CODEC))) } + def tooManyArrayElementsError(numElements: Long, maxRoundedArrayLength: Int): Throwable = { + new SparkIllegalArgumentException( + errorClass = "COLLECTION_SIZE_LIMIT_EXCEEDED.INITIALIZE", + messageParameters = Map( + "numberOfElements" -> numElements.toString, + "maxRoundedArrayLength" -> maxRoundedArrayLength.toString) + ) + } + private def quoteByDefault(elem: String): String = { "\"" + elem + "\"" } diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 8488333ec3ceb..1b1053a7013e0 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ 
b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -32,7 +32,7 @@ import org.apache.spark.TaskState.TaskState import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.worker.WorkerWatcher import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CONFIG, CONFIG2, REASON} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.util.NettyUtils @@ -74,12 +74,14 @@ private[spark] class CoarseGrainedExecutorBackend( override def onStart(): Unit = { if (env.conf.get(DECOMMISSION_ENABLED)) { val signal = env.conf.get(EXECUTOR_DECOMMISSION_SIGNAL) - logInfo(s"Registering SIG$signal handler to trigger decommissioning.") - SignalUtils.register(signal, s"Failed to register SIG$signal handler - disabling" + - s" executor decommission feature.") (self.askSync[Boolean](ExecutorDecommissionSigReceived)) + logInfo(log"Registering SIG${MDC(LogKeys.SIGNAL, signal)}" + + log" handler to trigger decommissioning.") + SignalUtils.register(signal, log"Failed to register SIG${MDC(LogKeys.SIGNAL, signal)} " + + log"handler - disabling executor decommission feature.")( + self.askSync[Boolean](ExecutorDecommissionSigReceived)) } - logInfo("Connecting to driver: " + driverUrl) + logInfo(log"Connecting to driver: ${MDC(LogKeys.URL, driverUrl)}" ) try { val securityManager = new SecurityManager(env.conf) val shuffleClientTransportConf = SparkTransportConf.fromSparkConf( @@ -181,7 +183,7 @@ private[spark] class CoarseGrainedExecutorBackend( exitExecutor(1, "Received LaunchTask command but executor was null") } else { val taskDesc = TaskDescription.decode(data.value) - logInfo("Got assigned task " + taskDesc.taskId) + logInfo(log"Got assigned task ${MDC(LogKeys.TASK_ID, taskDesc.taskId)}") executor.launchTask(this, taskDesc) } @@ -218,7 +220,7 @@ private[spark] class CoarseGrainedExecutorBackend( }.start() case UpdateDelegationTokens(tokenBytes) => - logInfo(s"Received tokens of ${tokenBytes.length} bytes") + logInfo(log"Received tokens of ${MDC(LogKeys.NUM_BYTES, tokenBytes.length)} bytes") SparkHadoopUtil.get.addDelegationTokens(tokenBytes, env.conf) case DecommissionExecutor => @@ -251,12 +253,14 @@ private[spark] class CoarseGrainedExecutorBackend( override def onDisconnected(remoteAddress: RpcAddress): Unit = { if (stopping.get()) { - logInfo(s"Driver from $remoteAddress disconnected during shutdown") + logInfo(log"Driver from ${MDC(LogKeys.RPC_ADDRESS, remoteAddress)}" + + log" disconnected during shutdown") } else if (driver.exists(_.address == remoteAddress)) { exitExecutor(1, s"Driver $remoteAddress disassociated! 
Shutting down.", null, notifyDriver = false) } else { - logWarning(s"An unknown ($remoteAddress) driver disconnected.") + logWarning(log"An unknown (${MDC(LogKeys.REMOTE_ADDRESS, remoteAddress)} " + + log"driver disconnected.") } } @@ -269,7 +273,8 @@ private[spark] class CoarseGrainedExecutorBackend( } driver match { case Some(driverRef) => driverRef.send(msg) - case None => logWarning(s"Drop $msg because has not yet connected to driver") + case None => + logWarning(log"Drop ${MDC(LogKeys.MESSAGE, msg)} because has not yet connected to driver") } } @@ -283,7 +288,7 @@ private[spark] class CoarseGrainedExecutorBackend( throwable: Throwable = null, notifyDriver: Boolean = true) = { if (stopping.compareAndSet(false, true)) { - val message = log"Executor self-exiting due to : ${MDC(REASON, reason)}" + val message = log"Executor self-exiting due to : ${MDC(LogKeys.REASON, reason)}" if (throwable != null) { logError(message, throwable) } else { @@ -305,14 +310,14 @@ private[spark] class CoarseGrainedExecutorBackend( private def decommissionSelf(): Unit = { if (!env.conf.get(DECOMMISSION_ENABLED)) { - logWarning(s"Receive decommission request, but decommission feature is disabled.") + logWarning("Receive decommission request, but decommission feature is disabled.") return } else if (decommissioned) { - logWarning(s"Executor $executorId already started decommissioning.") + logWarning(log"Executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} " + + log"already started decommissioning.") return } - val msg = s"Decommission executor $executorId." - logInfo(msg) + logInfo(log"Decommission executor ${MDC(LogKeys.EXECUTOR_ID, executorId)}.") try { decommissioned = true val migrationEnabled = env.conf.get(STORAGE_DECOMMISSION_ENABLED) && @@ -322,8 +327,8 @@ private[spark] class CoarseGrainedExecutorBackend( env.blockManager.decommissionBlockManager() } else if (env.conf.get(STORAGE_DECOMMISSION_ENABLED)) { logError(log"Storage decommissioning attempted but neither " + - log"${MDC(CONFIG, STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED.key)} or " + - log"${MDC(CONFIG2, STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED.key)} is enabled ") + log"${MDC(LogKeys.CONFIG, STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED.key)} or " + + log"${MDC(LogKeys.CONFIG2, STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED.key)} is enabled ") } if (executor != null) { executor.decommission() @@ -365,7 +370,8 @@ private[spark] class CoarseGrainedExecutorBackend( exitExecutor(0, ExecutorLossMessage.decommissionFinished, notifyDriver = true) } } else { - logInfo(s"Blocked from shutdown by ${executor.numRunningTasks} running tasks") + logInfo(log"Blocked from shutdown by" + + log" ${MDC(LogKeys.NUM_TASKS, executor.numRunningTasks)} running tasks") } Thread.sleep(sleep_time) } diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index a7657cd78cd9b..586a8a7db28a3 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -40,9 +40,10 @@ import org.slf4j.MDC import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.{Logging, MDC => LogMDC} -import org.apache.spark.internal.LogKey.{CLASS_NAME, ERROR, MAX_ATTEMPTS, TASK_ID, TASK_NAME, TIMEOUT} +import org.apache.spark.internal.{Logging, LogKeys, MDC => LogMDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ +import 
org.apache.spark.internal.config.{EXECUTOR_USER_CLASS_PATH_FIRST => EXECUTOR_USER_CLASS_PATH_FIRST_CONFIG} import org.apache.spark.internal.plugin.PluginContainer import org.apache.spark.memory.{SparkOutOfMemoryError, TaskMemoryManager} import org.apache.spark.metrics.source.JVMCPUSource @@ -81,10 +82,12 @@ private[spark] class Executor( resources: immutable.Map[String, ResourceInformation]) extends Logging { - logInfo(s"Starting executor ID $executorId on host $executorHostname") - logInfo(s"OS info ${System.getProperty("os.name")}, ${System.getProperty("os.version")}, " + - s"${System.getProperty("os.arch")}") - logInfo(s"Java version ${System.getProperty("java.version")}") + logInfo(log"Starting executor ID ${LogMDC(LogKeys.EXECUTOR_ID, executorId)}" + + log" on host ${LogMDC(HOST, executorHostname)}") + logInfo(log"OS info ${LogMDC(OS_NAME, System.getProperty("os.name"))}," + + log" ${LogMDC(OS_VERSION, System.getProperty("os.version"))}, " + + log"${LogMDC(OS_ARCH, System.getProperty("os.arch"))}") + logInfo(log"Java version ${LogMDC(JAVA_VERSION, System.getProperty("java.version"))}") private val executorShutdown = new AtomicBoolean(false) val stopHookReference = ShutdownHookManager.addShutdownHook( @@ -95,6 +98,13 @@ private[spark] class Executor( private[executor] val conf = env.conf + // SPARK-48131: Unify MDC key mdc.taskName and task_name in Spark 4.0 release. + private[executor] val taskNameMDCKey = if (conf.get(LEGACY_TASK_NAME_MDC_ENABLED)) { + "mdc.taskName" + } else { + LogKeys.TASK_NAME.name + } + // SPARK-40235: updateDependencies() uses a ReentrantLock instead of the `synchronized` keyword // so that tasks can exit quickly if they are interrupted while waiting on another task to // finish downloading dependencies. @@ -162,7 +172,7 @@ private[spark] class Executor( } // Whether to load classes in user jars before those in Spark jars - private val userClassPathFirst = conf.get(EXECUTOR_USER_CLASS_PATH_FIRST) + private val userClassPathFirst = conf.get(EXECUTOR_USER_CLASS_PATH_FIRST_CONFIG) // Whether to monitor killed / interrupted tasks private val taskReaperEnabled = conf.get(TASK_REAPER_ENABLED) @@ -212,7 +222,7 @@ private[spark] class Executor( if (sessionBasedRoot.isDirectory && sessionBasedRoot.exists()) { Utils.deleteRecursively(sessionBasedRoot) } - logInfo(s"Session evicted: ${state.sessionUUID}") + logInfo(log"Session evicted: ${LogMDC(SESSION_ID, state.sessionUUID)}") } }) .build[String, IsolatedSessionState] @@ -494,7 +504,8 @@ private[spark] class Executor( @volatile var task: Task[Any] = _ def kill(interruptThread: Boolean, reason: String): Unit = { - logInfo(s"Executor is trying to kill $taskName, reason: $reason") + logInfo(log"Executor is trying to kill ${LogMDC(TASK_NAME, taskName)}," + + log" reason: ${LogMDC(REASON, reason)}") reasonIfKilled = Some(reason) if (task != null) { synchronized { @@ -565,7 +576,7 @@ private[spark] class Executor( } else 0L Thread.currentThread.setContextClassLoader(isolatedSession.replClassLoader) val ser = env.closureSerializer.newInstance() - logInfo(s"Running $taskName") + logInfo(log"Running ${LogMDC(TASK_NAME, taskName)}") execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER) var taskStartTimeNs: Long = 0 var taskStartCpu: Long = 0 @@ -638,9 +649,10 @@ private[spark] class Executor( val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory() if (freedMemory > 0 && !threwException) { - val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, $taskName" + val errMsg = log"Managed memory 
leak detected; size = " + + log"${LogMDC(NUM_BYTES, freedMemory)} bytes, ${LogMDC(TASK_NAME, taskName)}" if (conf.get(UNSAFE_EXCEPTION_ON_MEMORY_LEAK)) { - throw SparkException.internalError(errMsg, category = "EXECUTOR") + throw SparkException.internalError(errMsg.message, category = "EXECUTOR") } else { logWarning(errMsg) } @@ -648,10 +660,11 @@ private[spark] class Executor( if (releasedLocks.nonEmpty && !threwException) { val errMsg = - s"${releasedLocks.size} block locks were not released by $taskName\n" + - releasedLocks.mkString("[", ", ", "]") + log"${LogMDC(NUM_RELEASED_LOCKS, releasedLocks.size)} block locks" + + log" were not released by ${LogMDC(TASK_NAME, taskName)}\n" + + log" ${LogMDC(RELEASED_LOCKS, releasedLocks.mkString("[", ", ", "]"))})" if (conf.get(STORAGE_EXCEPTION_PIN_LEAK)) { - throw SparkException.internalError(errMsg, category = "EXECUTOR") + throw SparkException.internalError(errMsg.message, category = "EXECUTOR") } else { logInfo(errMsg) } @@ -727,9 +740,11 @@ private[spark] class Executor( // directSend = sending directly back to the driver val serializedResult: ByteBuffer = { if (maxResultSize > 0 && resultSize > maxResultSize) { - logWarning(s"Finished $taskName. Result is larger than maxResultSize " + - s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " + - s"dropping it.") + logWarning(log"Finished ${LogMDC(TASK_NAME, taskName)}. " + + log"Result is larger than maxResultSize " + + log"(${LogMDC(RESULT_SIZE_BYTES, Utils.bytesToString(resultSize))} > " + + log"${LogMDC(RESULT_SIZE_BYTES_MAX, Utils.bytesToString(maxResultSize))}), " + + log"dropping it.") ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize)) } else if (resultSize > maxDirectResultSize) { val blockId = TaskResultBlockId(taskId) @@ -737,10 +752,12 @@ private[spark] class Executor( blockId, serializedDirectResult, StorageLevel.MEMORY_AND_DISK_SER) - logInfo(s"Finished $taskName. $resultSize bytes result sent via BlockManager)") + logInfo(log"Finished ${LogMDC(TASK_NAME, taskName)}." + + log" ${LogMDC(NUM_BYTES, resultSize)} bytes result sent via BlockManager)") ser.serialize(new IndirectTaskResult[Any](blockId, resultSize)) } else { - logInfo(s"Finished $taskName. $resultSize bytes result sent to driver") + logInfo(log"Finished ${LogMDC(TASK_NAME, taskName)}." 
+ + log" ${LogMDC(NUM_BYTES, resultSize)} bytes result sent to driver") // toByteBuffer is safe here, guarded by maxDirectResultSize serializedDirectResult.toByteBuffer } @@ -752,7 +769,8 @@ private[spark] class Executor( execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult) } catch { case t: TaskKilledException => - logInfo(s"Executor killed $taskName, reason: ${t.reason}") + logInfo(log"Executor killed ${LogMDC(TASK_NAME, taskName)}," + + log" reason: ${LogMDC(REASON, t.reason)}") val (accums, accUpdates) = collectAccumulatorsAndResetStatusOnFailure(taskStartTimeNs) // Here and below, put task metric peaks in an immutable.ArraySeq to expose them as an @@ -765,7 +783,8 @@ private[spark] class Executor( case _: InterruptedException | NonFatal(_) if task != null && task.reasonIfKilled.isDefined => val killReason = task.reasonIfKilled.getOrElse("unknown reason") - logInfo(s"Executor interrupted and killed $taskName, reason: $killReason") + logInfo(log"Executor interrupted and killed ${LogMDC(TASK_NAME, taskName)}," + + log" reason: ${LogMDC(REASON, killReason)}") val (accums, accUpdates) = collectAccumulatorsAndResetStatusOnFailure(taskStartTimeNs) val metricPeaks = metricsPoller.getTaskMetricPeaks(taskId).toImmutableArraySeq @@ -778,11 +797,12 @@ private[spark] class Executor( if (!t.isInstanceOf[FetchFailedException]) { // there was a fetch failure in the task, but some user code wrapped that exception // and threw something else. Regardless, we treat it as a fetch failure. - val fetchFailedCls = classOf[FetchFailedException].getName - logWarning(s"$taskName encountered a ${fetchFailedCls} and " + - s"failed, but the ${fetchFailedCls} was hidden by another " + - s"exception. Spark is handling this like a fetch failure and ignoring the " + - s"other exception: $t") + logWarning(log"${LogMDC(TASK_NAME, taskName)} encountered a " + + log"${LogMDC(CLASS_NAME, classOf[FetchFailedException].getName)} " + + log"and failed, but the " + + log"${LogMDC(CLASS_NAME, classOf[FetchFailedException].getName)} " + + log"was hidden by another exception. 
Spark is handling this like a fetch failure " + + log"and ignoring the other exception: ${LogMDC(ERROR, t)}") } setTaskFinishedAndClearInterruptStatus() plugins.foreach(_.onTaskFailed(reason)) @@ -910,7 +930,7 @@ private[spark] class Executor( try { mdc.foreach { case (key, value) => MDC.put(key, value) } // avoid overriding the takName by the user - MDC.put("mdc.taskName", taskName) + MDC.put(taskNameMDCKey, taskName) } catch { case _: NoSuchFieldError => logInfo("MDC is not supported.") } @@ -919,7 +939,7 @@ private[spark] class Executor( private def cleanMDCForTask(taskName: String, mdc: Seq[(String, String)]): Unit = { try { mdc.foreach { case (key, _) => MDC.remove(key) } - MDC.remove("mdc.taskName") + MDC.remove(taskNameMDCKey) } catch { case _: NoSuchFieldError => logInfo("MDC is not supported.") } @@ -994,12 +1014,14 @@ private[spark] class Executor( finished = true } else { val elapsedTimeMs = TimeUnit.NANOSECONDS.toMillis(elapsedTimeNs) - logWarning(s"Killed task $taskId is still running after $elapsedTimeMs ms") + logWarning(log"Killed task ${LogMDC(TASK_ID, taskId)} " + + log"is still running after ${LogMDC(TIME_UNITS, elapsedTimeMs)} ms") if (takeThreadDump) { try { taskRunner.theadDump().foreach { thread => if (thread.threadName == taskRunner.threadName) { - logWarning(s"Thread dump from task $taskId:\n${thread.toString}") + logWarning(log"Thread dump from task ${LogMDC(TASK_ID, taskId)}:\n" + + log"${LogMDC(THREAD, thread.toString)}") } } } catch { @@ -1019,9 +1041,8 @@ private[spark] class Executor( } else { // In non-local-mode, the exception thrown here will bubble up to the uncaught exception // handler and cause the executor JVM to exit. - throw SparkException.internalError( - s"Killing executor JVM because killed task $taskId could not be stopped within " + - s"$killTimeoutMs ms.", category = "EXECUTOR") + throw new KilledByTaskReaperException(s"Killing executor JVM because killed task " + + s"$taskId could not be stopped within $killTimeoutMs ms.") } } } finally { @@ -1065,8 +1086,10 @@ private[spark] class Executor( private def createClassLoader(urls: Array[URL], useStub: Boolean): MutableURLClassLoader = { logInfo( - s"Starting executor with user classpath (userClassPathFirst = $userClassPathFirst): " + - urls.mkString("'", ",", "'") + log"Starting executor with user classpath" + + log" (userClassPathFirst =" + + log" ${LogMDC(LogKeys.EXECUTOR_USER_CLASS_PATH_FIRST, userClassPathFirst)}): " + + log"${LogMDC(URLS, urls.mkString("'", ",", "'"))}" ) if (useStub) { @@ -1110,12 +1133,13 @@ private[spark] class Executor( sessionUUID: String): ClassLoader = { val classUri = sessionClassUri.getOrElse(conf.get("spark.repl.class.uri", null)) val classLoader = if (classUri != null) { - logInfo("Using REPL class URI: " + classUri) + logInfo(log"Using REPL class URI: ${LogMDC(LogKeys.URI, classUri)}") new ExecutorClassLoader(conf, env, classUri, parent, userClassPathFirst) } else { parent } - logInfo(s"Created or updated repl class loader $classLoader for $sessionUUID.") + logInfo(log"Created or updated repl class loader ${LogMDC(CLASS_LOADER, classLoader)}" + + log" for ${LogMDC(SESSION_ID, sessionUUID)}.") classLoader } @@ -1150,14 +1174,16 @@ private[spark] class Executor( // Fetch missing dependencies for ((name, timestamp) <- newFiles if state.currentFiles.getOrElse(name, -1L) < timestamp) { - logInfo(s"Fetching $name with timestamp $timestamp") + logInfo(log"Fetching ${LogMDC(FILE_NAME, name)} with" + + log" timestamp ${LogMDC(TIMESTAMP, timestamp)}") // Fetch file with 
useCache mode, close cache for local mode. Utils.fetchFile(name, root, conf, hadoopConf, timestamp, useCache = !isLocal) state.currentFiles(name) = timestamp } for ((name, timestamp) <- newArchives if state.currentArchives.getOrElse(name, -1L) < timestamp) { - logInfo(s"Fetching $name with timestamp $timestamp") + logInfo(log"Fetching ${LogMDC(ARCHIVE_NAME, name)} with" + + log" timestamp ${LogMDC(TIMESTAMP, timestamp)}") val sourceURI = new URI(name) val uriToDownload = Utils.getUriBuilder(sourceURI).fragment(null).build() val source = Utils.fetchFile(uriToDownload.toString, Utils.createTempDir(), conf, @@ -1166,7 +1192,9 @@ private[spark] class Executor( root, if (sourceURI.getFragment != null) sourceURI.getFragment else source.getName) logInfo( - s"Unpacking an archive $name from ${source.getAbsolutePath} to ${dest.getAbsolutePath}") + log"Unpacking an archive ${LogMDC(ARCHIVE_NAME, name)}" + + log" from ${LogMDC(SOURCE_PATH, source.getAbsolutePath)}" + + log" to ${LogMDC(DESTINATION_PATH, dest.getAbsolutePath)}") Utils.deleteRecursively(dest) Utils.unpack(source, dest) state.currentArchives(name) = timestamp @@ -1177,7 +1205,8 @@ private[spark] class Executor( .orElse(state.currentJars.get(localName)) .getOrElse(-1L) if (currentTimeStamp < timestamp) { - logInfo(s"Fetching $name with timestamp $timestamp") + logInfo(log"Fetching ${LogMDC(JAR_URL, name)} with" + + log" timestamp ${LogMDC(TIMESTAMP, timestamp)}") // Fetch file with useCache mode, close cache for local mode. Utils.fetchFile(name, root, conf, hadoopConf, timestamp, useCache = !isLocal) @@ -1185,7 +1214,8 @@ private[spark] class Executor( // Add it to our class loader val url = new File(root, localName).toURI.toURL if (!state.urlClassLoader.getURLs().contains(url)) { - logInfo(s"Adding $url to class loader ${state.sessionUUID}") + logInfo(log"Adding ${LogMDC(LogKeys.URL, url)} to" + + log" class loader ${LogMDC(UUID, state.sessionUUID)}") state.urlClassLoader.addURL(url) if (isStubbingEnabledForState(state.sessionUUID)) { renewClassLoader = true @@ -1259,7 +1289,7 @@ private[spark] class Executor( if (runner != null) { runner.theadDump() } else { - logWarning(s"Failed to dump thread for task $taskId") + logWarning(log"Failed to dump thread for task ${LogMDC(TASK_ID, taskId)}") None } } @@ -1297,3 +1327,5 @@ private[spark] object Executor { } } } + +class KilledByTaskReaperException(message: String) extends SparkException(message) diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorClassLoader.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorClassLoader.scala index c7047ddd278b2..48d7f150ad9bd 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorClassLoader.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorClassLoader.scala @@ -30,7 +30,7 @@ import org.apache.xbean.asm9.Opcodes._ import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.{Logging, LogKey, MDC} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.util.ParentClassLoader /** @@ -183,8 +183,8 @@ class ExecutorClassLoader( None case e: Exception => // Something bad happened while checking if the class exists - logError(log"Failed to check existence of class ${MDC(LogKey.CLASS_NAME, name)} " + - log"on REPL class server at ${MDC(LogKey.URI, uri)}", e) + logError(log"Failed to check existence of class ${MDC(LogKeys.CLASS_NAME, name)} " + + log"on REPL class server at ${MDC(LogKeys.URI, uri)}", e) if 
(userClassPathFirst) { // Allow to try to load from "parentLoader" None diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala index 99858f785600d..5300598ef53eb 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala @@ -45,6 +45,10 @@ object ExecutorExitCode { */ val HEARTBEAT_FAILURE = 56 + /** The default uncaught exception handler was reached and the exception was thrown by + * TaskReaper. */ + val KILLED_BY_TASK_REAPER = 57 + def explainExitCode(exitCode: Int): String = { exitCode match { case UNCAUGHT_EXCEPTION => "Uncaught exception" @@ -59,6 +63,8 @@ object ExecutorExitCode { "ExternalBlockStore failed to create a local temporary directory." case HEARTBEAT_FAILURE => "Unable to send heartbeats to driver." + case KILLED_BY_TASK_REAPER => + "Executor killed by TaskReaper." case _ => "Unknown executor exit code (" + exitCode + ")" + ( if (exitCode > 128) { diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorLogUrlHandler.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorLogUrlHandler.scala index 0ddeef8e9a82d..2202489509fc4 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorLogUrlHandler.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorLogUrlHandler.scala @@ -21,7 +21,8 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.util.matching.Regex -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys private[spark] class ExecutorLogUrlHandler(logUrlPattern: Option[String]) extends Logging { import ExecutorLogUrlHandler._ @@ -82,8 +83,10 @@ private[spark] class ExecutorLogUrlHandler(logUrlPattern: Option[String]) extend allPatterns: Set[String], allAttributes: Set[String]): Unit = { if (informedForMissingAttributes.compareAndSet(false, true)) { - logInfo(s"Fail to renew executor log urls: $reason. Required: $allPatterns / " + - s"available: $allAttributes. Falling back to show app's original log urls.") + logInfo(log"Fail to renew executor log urls: ${MDC(LogKeys.REASON, reason)}." + + log" Required: ${MDC(LogKeys.REGEX, allPatterns)} / " + + log"available: ${MDC(LogKeys.ATTRIBUTE_MAP, allAttributes)}." 
+ + log" Falling back to show app's original log urls.") } } } diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index dbfd02b7d3b34..263de0121f7c7 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -328,16 +328,19 @@ private[spark] object TaskMetrics extends Logging { */ def fromAccumulators(accums: Seq[AccumulatorV2[_, _]]): TaskMetrics = { val tm = new TaskMetrics + val externalAccums = new java.util.ArrayList[AccumulatorV2[Any, Any]]() for (acc <- accums) { val name = acc.name + val tmpAcc = acc.asInstanceOf[AccumulatorV2[Any, Any]] if (name.isDefined && tm.nameToAccums.contains(name.get)) { val tmAcc = tm.nameToAccums(name.get).asInstanceOf[AccumulatorV2[Any, Any]] tmAcc.metadata = acc.metadata - tmAcc.merge(acc.asInstanceOf[AccumulatorV2[Any, Any]]) + tmAcc.merge(tmpAcc) } else { - tm._externalAccums.add(acc) + externalAccums.add(tmpAcc) } } + tm._externalAccums.addAll(externalAccums) tm } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index 1f19e9444d383..f50cc0f88842a 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -94,7 +94,7 @@ private[spark] class TypedConfigBuilder[T]( import ConfigHelpers._ def this(parent: ConfigBuilder, converter: String => T) = { - this(parent, converter, Option(_).map(_.toString).orNull) + this(parent, converter, { v: T => v.toString }) } /** Apply a transformation to the user-provided values of the config entry. */ @@ -157,6 +157,7 @@ private[spark] class TypedConfigBuilder[T]( /** Creates a [[ConfigEntry]] that has a default value. */ def createWithDefault(default: T): ConfigEntry[T] = { + assert(default != null, "Use createOptional.") // Treat "String" as a special case, so that both createWithDefault and createWithDefaultString // behave the same w.r.t. variable expansion of default values. default match { diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala index a295ef06a6376..17d3329e6b494 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala @@ -89,14 +89,14 @@ private[spark] abstract class ConfigEntry[T] ( def defaultValueString: String protected def readString(reader: ConfigReader): Option[String] = { - val values = Seq( - prependedKey.flatMap(reader.get(_)), - alternatives.foldLeft(reader.get(key))((res, nextKey) => res.orElse(reader.get(nextKey))) - ).flatten - if (values.nonEmpty) { - Some(values.mkString(prependSeparator)) - } else { - None + // SPARK-48678: performance optimization: this code could be expressed more succinctly + // using flatten and mkString, but doing so adds lots of Scala collections perf. overhead. 
+ val maybePrependedValue: Option[String] = prependedKey.flatMap(reader.get) + val maybeValue: Option[String] = alternatives + .foldLeft(reader.get(key))((res, nextKey) => res.orElse(reader.get(nextKey))) + (maybePrependedValue, maybeValue) match { + case (Some(prependedValue), Some(value)) => Some(s"$prependedValue$prependSeparator$value") + case _ => maybeValue.orElse(maybePrependedValue) } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala index c1ab22150d024..8824d196489a8 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigReader.scala @@ -84,7 +84,9 @@ private[spark] class ConfigReader(conf: ConfigProvider) { def substitute(input: String): String = substitute(input, Set()) private def substitute(input: String, usedRefs: Set[String]): String = { - if (input != null) { + // SPARK-48678: performance optimization: skip the costly regex processing + // if the string cannot possibly contain a variable reference: + if (input != null && input.contains("${")) { ConfigReader.REF_RE.replaceAllIn(input, { m => val prefix = m.group(1) val name = m.group(2) diff --git a/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala b/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala index b09fbd7a5bb28..0c2db21905d1f 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala @@ -26,20 +26,6 @@ private[spark] object Deploy { .stringConf .createWithDefault("NONE") - object RecoverySerializer extends Enumeration { - val JAVA, KRYO = Value - } - - val RECOVERY_SERIALIZER = ConfigBuilder("spark.deploy.recoverySerializer") - .doc("Serializer for writing/reading objects to/from persistence engines; " + - "JAVA or KRYO. Java serializer has been the default mode since Spark 0.8.1." + - "KRYO serializer is a new fast and compact mode from Spark 4.0.0.") - .version("4.0.0") - .stringConf - .transform(_.toUpperCase(Locale.ROOT)) - .checkValues(RecoverySerializer.values.map(_.toString)) - .createWithDefault(RecoverySerializer.JAVA.toString) - val RECOVERY_COMPRESSION_CODEC = ConfigBuilder("spark.deploy.recoveryCompressionCodec") .doc("A compression codec for persistence engines. none (default), lz4, lzf, snappy, and " + "zstd. 
Currently, only FILESYSTEM mode supports this configuration.") diff --git a/core/src/main/scala/org/apache/spark/internal/config/History.scala b/core/src/main/scala/org/apache/spark/internal/config/History.scala index 2306856f9331e..64a8681ca2954 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/History.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/History.scala @@ -28,16 +28,19 @@ private[spark] object History { val HISTORY_LOG_DIR = ConfigBuilder("spark.history.fs.logDirectory") .version("1.1.0") + .doc("Directory where app logs are stored") .stringConf .createWithDefault(DEFAULT_LOG_DIR) val SAFEMODE_CHECK_INTERVAL_S = ConfigBuilder("spark.history.fs.safemodeCheck.interval") .version("1.6.0") + .doc("Interval between HDFS safemode checks for the event log directory") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("5s") val UPDATE_INTERVAL_S = ConfigBuilder("spark.history.fs.update.interval") .version("1.4.0") + .doc("How often(in seconds) to reload log data from storage") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("10s") @@ -53,16 +56,21 @@ private[spark] object History { val CLEANER_ENABLED = ConfigBuilder("spark.history.fs.cleaner.enabled") .version("1.4.0") + .doc("Whether the History Server should periodically clean up event logs from storage") .booleanConf .createWithDefault(false) val CLEANER_INTERVAL_S = ConfigBuilder("spark.history.fs.cleaner.interval") .version("1.4.0") + .doc("When spark.history.fs.cleaner.enabled=true, specifies how often the filesystem " + + "job history cleaner checks for files to delete.") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("1d") val MAX_LOG_AGE_S = ConfigBuilder("spark.history.fs.cleaner.maxAge") .version("1.4.0") + .doc("When spark.history.fs.cleaner.enabled=true, history files older than this will be " + + "deleted when the filesystem history cleaner runs.") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("7d") @@ -96,6 +104,8 @@ private[spark] object History { val MAX_LOCAL_DISK_USAGE = ConfigBuilder("spark.history.store.maxDiskUsage") .version("2.3.0") + .doc("Maximum disk usage for the local directory where the cache application history " + + "information are stored.") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("10g") @@ -145,60 +155,90 @@ private[spark] object History { val DRIVER_LOG_CLEANER_ENABLED = ConfigBuilder("spark.history.fs.driverlog.cleaner.enabled") .version("3.0.0") + .doc("Specifies whether the History Server should periodically clean up driver logs from " + + "storage.") .fallbackConf(CLEANER_ENABLED) - val DRIVER_LOG_CLEANER_INTERVAL = ConfigBuilder("spark.history.fs.driverlog.cleaner.interval") - .version("3.0.0") - .fallbackConf(CLEANER_INTERVAL_S) - val MAX_DRIVER_LOG_AGE_S = ConfigBuilder("spark.history.fs.driverlog.cleaner.maxAge") .version("3.0.0") + .doc(s"When ${DRIVER_LOG_CLEANER_ENABLED.key}=true, driver log files older than this will be " + + s"deleted when the driver log cleaner runs.") .fallbackConf(MAX_LOG_AGE_S) + val DRIVER_LOG_CLEANER_INTERVAL = ConfigBuilder("spark.history.fs.driverlog.cleaner.interval") + .version("3.0.0") + .doc(s" When ${DRIVER_LOG_CLEANER_ENABLED.key}=true, specifies how often the filesystem " + + s"driver log cleaner checks for files to delete. 
Files are only deleted if they are older " + + s"than ${MAX_DRIVER_LOG_AGE_S.key}.") + .fallbackConf(CLEANER_INTERVAL_S) + val HISTORY_SERVER_UI_ACLS_ENABLE = ConfigBuilder("spark.history.ui.acls.enable") .version("1.0.1") + .doc("Specifies whether ACLs should be checked to authorize users viewing the applications " + + "in the history server. If enabled, access control checks are performed regardless of " + + "what the individual applications had set for spark.ui.acls.enable. The application owner " + + "will always have authorization to view their own application and any users specified via " + + "spark.ui.view.acls and groups specified via spark.ui.view.acls.groups when the " + + "application was run will also have authorization to view that application. If disabled, " + + "no access control checks are made for any application UIs available through the history " + + "server.") .booleanConf .createWithDefault(false) val HISTORY_SERVER_UI_ADMIN_ACLS = ConfigBuilder("spark.history.ui.admin.acls") .version("2.1.1") + .doc("Comma separated list of users that have view access to all the Spark applications in " + + "history server.") .stringConf .toSequence .createWithDefault(Nil) val HISTORY_SERVER_UI_ADMIN_ACLS_GROUPS = ConfigBuilder("spark.history.ui.admin.acls.groups") .version("2.1.1") + .doc("Comma separated list of groups that have view access to all the Spark applications " + + "in history server.") .stringConf .toSequence .createWithDefault(Nil) val NUM_REPLAY_THREADS = ConfigBuilder("spark.history.fs.numReplayThreads") .version("2.0.0") + .doc("Number of threads that will be used by history server to process event logs.") .intConf .createWithDefaultFunction(() => Math.ceil(Runtime.getRuntime.availableProcessors() / 4f).toInt) val RETAINED_APPLICATIONS = ConfigBuilder("spark.history.retainedApplications") .version("1.0.0") + .doc("The number of applications to retain UI data for in the cache. If this cap is " + + "exceeded, then the oldest applications will be removed from the cache. If an application " + + "is not in the cache, it will have to be loaded from disk if it is accessed from the UI.") .intConf .createWithDefault(50) val PROVIDER = ConfigBuilder("spark.history.provider") .version("1.1.0") + .doc("Name of the class implementing the application history backend.") .stringConf - .createOptional + .createWithDefault("org.apache.spark.deploy.history.FsHistoryProvider") val KERBEROS_ENABLED = ConfigBuilder("spark.history.kerberos.enabled") .version("1.0.1") + .doc("Indicates whether the history server should use kerberos to login. 
This is required " + + "if the history server is accessing HDFS files on a secure Hadoop cluster.") .booleanConf .createWithDefault(false) val KERBEROS_PRINCIPAL = ConfigBuilder("spark.history.kerberos.principal") .version("1.0.1") + .doc(s"When ${KERBEROS_ENABLED.key}=true, specifies kerberos principal name for " + + s" the History Server.") .stringConf .createOptional val KERBEROS_KEYTAB = ConfigBuilder("spark.history.kerberos.keytab") .version("1.0.1") + .doc(s"When ${KERBEROS_ENABLED.key}=true, specifies location of the kerberos keytab file " + + s"for the History Server.") .stringConf .createOptional diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index b2cbb6f6deb69..9fcd9ba529c16 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -143,14 +143,25 @@ package object config { private[spark] val STRUCTURED_LOGGING_ENABLED = ConfigBuilder("spark.log.structuredLogging.enabled") - .doc("When true, the default log4j output format is structured JSON lines, and there will " + - "be Mapped Diagnostic Context (MDC) from Spark added to the logs. This is useful for log " + - "aggregation and analysis tools. When false, the default log4j output will be plain " + - "text and no MDC from Spark will be set.") + .doc("When true, Spark logs are output as structured JSON lines with added Spark " + + "Mapped Diagnostic Context (MDC), facilitating easier integration with log aggregation " + + "and analysis tools. When false, logs are plain text without MDC. This configuration " + + "does not apply to interactive environments such as spark-shell, spark-sql, and " + + "PySpark shell.") .version("4.0.0") .booleanConf .createWithDefault(true) + private[spark] val LEGACY_TASK_NAME_MDC_ENABLED = + ConfigBuilder("spark.log.legacyTaskNameMdc.enabled") + .doc("When true, the MDC (Mapped Diagnostic Context) key `mdc.taskName` will be set in the " + + "log output, which is the behavior of Spark version 3.1 through Spark 3.5 releases. " + + "When false, the logging framework will use `task_name` as the MDC key, " + + "aligning it with the naming convention of newer MDC keys introduced in Spark 4.0 release.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + private[spark] val DRIVER_LOG_LOCAL_DIR = ConfigBuilder("spark.driver.log.localDir") .doc("Specifies a local directory to write driver logs and enable Driver Log UI Tab.") @@ -1306,6 +1317,15 @@ package object config { s" be less than or equal to ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}.") .createWithDefault(64 * 1024 * 1024) + private[spark] val CHECKPOINT_DIR = + ConfigBuilder("spark.checkpoint.dir") + .doc( + "Set the default directory for checkpointing. It can be overwritten by " + + "SparkContext.setCheckpointDir.") + .version("4.0.0") + .stringConf + .createOptional + private[spark] val CHECKPOINT_COMPRESS = ConfigBuilder("spark.checkpoint.compress") .doc("Whether to compress RDD checkpoints. Generally a good idea. Compression will use " + @@ -1443,8 +1463,7 @@ package object config { private[spark] val SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.shuffle.unsafe.file.output.buffer") - .doc("The file system for this buffer size after each partition " + - "is written in unsafe shuffle writer. 
In KiB unless otherwise specified.") + .doc("(Deprecated since Spark 4.0, please use 'spark.shuffle.localDisk.file.output.buffer'.)") .version("2.3.0") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 1024, @@ -1452,6 +1471,13 @@ package object config { s" ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 1024}.") .createWithDefaultString("32k") + private[spark] val SHUFFLE_LOCAL_DISK_FILE_OUTPUT_BUFFER_SIZE = + ConfigBuilder("spark.shuffle.localDisk.file.output.buffer") + .doc("The file system for this buffer size after each partition " + + "is written in all local disk shuffle writers. In KiB unless otherwise specified.") + .version("4.0.0") + .fallbackConf(SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE) + private[spark] val SHUFFLE_DISK_WRITE_BUFFER_SIZE = ConfigBuilder("spark.shuffle.spill.diskWriteBufferSize") .doc("The buffer size, in bytes, to use when writing the sorted records to an on-disk file.") @@ -2011,6 +2037,13 @@ package object config { .intConf .createWithDefault(1) + private[spark] val IO_COMPRESSION_LZF_PARALLEL = + ConfigBuilder("spark.io.compression.lzf.parallel.enabled") + .doc("When true, LZF compression will use multiple threads to compress data in parallel.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + private[spark] val IO_WARNING_LARGEFILETHRESHOLD = ConfigBuilder("spark.io.warning.largeFileThreshold") .internal() @@ -2403,7 +2436,7 @@ package object config { s"count ${STAGE_MAX_CONSECUTIVE_ATTEMPTS.key}") .version("3.4.0") .booleanConf - .createWithDefault(false) + .createWithDefault(true) private[spark] val SCHEDULER_MAX_RETAINED_REMOVED_EXECUTORS = ConfigBuilder("spark.scheduler.maxRetainedRemovedDecommissionExecutors") diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index af0aa41518766..44f8d7cd63635 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -20,6 +20,9 @@ package org.apache.spark.internal.io import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.MDC + /** * An [[FileCommitProtocol]] implementation backed by an underlying Hadoop OutputCommitter * (from the old mapred API). 
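The config hunks above introduce spark.shuffle.localDisk.file.output.buffer as a fallbackConf over the now-deprecated spark.shuffle.unsafe.file.output.buffer, and add spark.io.compression.lzf.parallel.enabled for multi-threaded LZF compression. A minimal sketch of setting these from user code; the key names are taken from the entries above, while the values are arbitrary illustrations rather than recommendations:

import org.apache.spark.SparkConf

// Keys come from the config entries added in the hunks above; values are illustrative only.
val conf = new SparkConf()
  .set("spark.shuffle.localDisk.file.output.buffer", "64k")   // new key; the deprecated unsafe.* key is still honored via fallbackConf
  .set("spark.io.compression.codec", "lzf")
  .set("spark.io.compression.lzf.parallel.enabled", "true")   // opt in to parallel LZF compression (PLZFOutputStream)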
@@ -32,7 +35,8 @@ class HadoopMapRedCommitProtocol(jobId: String, path: String) override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] val committer = config.getOutputCommitter - logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") + logInfo(log"Using output committer class" + + log" ${MDC(LogKeys.CLASS_NAME, committer.getClass.getCanonicalName)}") committer } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 3a24da98ecc24..f245d2d4e4074 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -29,7 +29,8 @@ import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.mapred.SparkHadoopMapRedUtil /** @@ -252,7 +253,7 @@ class HadoopMapReduceCommitProtocol( committer.abortJob(jobContext, JobStatus.State.FAILED) } catch { case e: IOException => - logWarning(s"Exception while aborting ${jobContext.getJobID}", e) + logWarning(log"Exception while aborting ${MDC(JOB_ID, jobContext.getJobID)}", e) } try { if (hasValidPath) { @@ -261,7 +262,7 @@ class HadoopMapReduceCommitProtocol( } } catch { case e: IOException => - logWarning(s"Exception while aborting ${jobContext.getJobID}", e) + logWarning(log"Exception while aborting ${MDC(JOB_ID, jobContext.getJobID)}", e) } } @@ -292,7 +293,8 @@ class HadoopMapReduceCommitProtocol( committer.abortTask(taskContext) } catch { case e: IOException => - logWarning(s"Exception while aborting ${taskContext.getTaskAttemptID}", e) + logWarning(log"Exception while aborting " + + log"${MDC(TASK_ATTEMPT_ID, taskContext.getTaskAttemptID)}", e) } // best effort cleanup of other staged files try { @@ -302,7 +304,8 @@ class HadoopMapReduceCommitProtocol( } } catch { case e: IOException => - logWarning(s"Exception while aborting ${taskContext.getTaskAttemptID}", e) + logWarning(log"Exception while aborting " + + log"${MDC(TASK_ATTEMPT_ID, taskContext.getTaskAttemptID)}", e) } } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala index 95ea814042d35..db961b3c42f4c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala @@ -33,7 +33,7 @@ import org.apache.hadoop.mapreduce.task.{TaskAttemptContextImpl => NewTaskAttemp import org.apache.spark.{SerializableWritable, SparkConf, SparkException, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{JOB_ID, TASK_ATTEMPT_ID} +import org.apache.spark.internal.LogKeys.{DURATION, JOB_ID, TASK_ATTEMPT_ID} import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.util.{SerializableConfiguration, SerializableJobConf, Utils} @@ -98,10 +98,11 @@ object SparkHadoopWriter extends Logging { iterator = iter) 
}) - logInfo(s"Start to commit write Job ${jobContext.getJobID}.") + logInfo(log"Start to commit write Job ${MDC(JOB_ID, jobContext.getJobID)}.") val (_, duration) = Utils .timeTakenMs { committer.commitJob(jobContext, ret.toImmutableArraySeq) } - logInfo(s"Write Job ${jobContext.getJobID} committed. Elapsed time: $duration ms.") + logInfo(log"Write Job ${MDC(JOB_ID, jobContext.getJobID)} committed." + + log" Elapsed time: ${MDC(DURATION, duration)} ms.") } catch { case cause: Throwable => logError(log"Aborting job ${MDC(JOB_ID, jobContext.getJobID)}.", cause) diff --git a/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala b/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala index 261e016ce9bf0..a0c07bd75f885 100644 --- a/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala +++ b/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala @@ -22,7 +22,7 @@ import scala.util.{Either, Left, Right} import org.apache.spark.{SparkContext, SparkEnv, TaskFailedReason} import org.apache.spark.api.plugin._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceInformation import org.apache.spark.util.Utils @@ -56,7 +56,7 @@ private class DriverPluginContainer( sc.conf.set(s"${PluginContainer.EXTRA_CONF_PREFIX}$name.$k", v) } } - logInfo(s"Initialized driver component for plugin $name.") + logInfo(log"Initialized driver component for plugin ${MDC(LogKeys.CLASS_NAME, name)}.") Some((p.getClass().getName(), driverPlugin, ctx)) } else { None @@ -83,7 +83,7 @@ private class DriverPluginContainer( plugin.shutdown() } catch { case t: Throwable => - logInfo(s"Exception while shutting down plugin $name.", t) + logInfo(log"Exception while shutting down plugin ${MDC(LogKeys.CLASS_NAME, name)}.", t) } } } @@ -125,7 +125,7 @@ private class ExecutorPluginContainer( executorPlugin.init(ctx, extraConf) ctx.registerMetrics() - logInfo(s"Initialized executor component for plugin $name.") + logInfo(log"Initialized executor component for plugin ${MDC(LogKeys.CLASS_NAME, name)}.") Some(p.getClass().getName() -> executorPlugin) } else { None @@ -144,7 +144,7 @@ private class ExecutorPluginContainer( plugin.shutdown() } catch { case t: Throwable => - logInfo(s"Exception while shutting down plugin $name.", t) + logInfo(log"Exception while shutting down plugin ${MDC(LogKeys.CLASS_NAME, name)}.", t) } } } @@ -155,7 +155,8 @@ private class ExecutorPluginContainer( plugin.onTaskStart() } catch { case t: Throwable => - logInfo(s"Exception while calling onTaskStart on plugin $name.", t) + logInfo(log"Exception while calling onTaskStart on" + + log" plugin ${MDC(LogKeys.CLASS_NAME, name)}.", t) } } } @@ -166,7 +167,8 @@ private class ExecutorPluginContainer( plugin.onTaskSucceeded() } catch { case t: Throwable => - logInfo(s"Exception while calling onTaskSucceeded on plugin $name.", t) + logInfo(log"Exception while calling onTaskSucceeded on" + + log" plugin ${MDC(LogKeys.CLASS_NAME, name)}.", t) } } } @@ -177,7 +179,8 @@ private class ExecutorPluginContainer( plugin.onTaskFailed(failureReason) } catch { case t: Throwable => - logInfo(s"Exception while calling onTaskFailed on plugin $name.", t) + logInfo(log"Exception while calling onTaskFailed on" + + log" plugin ${MDC(LogKeys.CLASS_NAME, name)}.", t) } } } diff --git a/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala 
b/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala index bc45aefa560ed..6ff918979c9ed 100644 --- a/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala @@ -18,7 +18,8 @@ package org.apache.spark.internal.plugin import org.apache.spark.api.plugin.DriverPlugin -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rpc.{IsolatedThreadSafeRpcEndpoint, RpcCallContext, RpcEnv} case class PluginMessage(pluginName: String, message: AnyRef) @@ -36,13 +37,15 @@ private class PluginEndpoint( val reply = plugin.receive(message) if (reply != null) { logWarning( - s"Plugin $pluginName returned reply for one-way message of type " + - s"${message.getClass().getName()}.") + log"Plugin ${MDC(PLUGIN_NAME, pluginName)} " + + log"returned reply for one-way message of type " + + log"${MDC(CLASS_NAME, message.getClass().getName())}.") } } catch { case e: Exception => - logWarning(s"Error in plugin $pluginName when handling message of type " + - s"${message.getClass().getName()}.", e) + logWarning(log"Error in plugin ${MDC(PLUGIN_NAME, pluginName)} " + + log"when handling message of type " + + log"${MDC(CLASS_NAME, message.getClass().getName())}.", e) } case None => diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 07e694b6c5b03..233228a9c6d4c 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -22,6 +22,7 @@ import java.util.Locale import com.github.luben.zstd.{NoPool, RecyclingBufferPool, ZstdInputStreamNoFinalizer, ZstdOutputStreamNoFinalizer} import com.ning.compress.lzf.{LZFInputStream, LZFOutputStream} +import com.ning.compress.lzf.parallel.PLZFOutputStream import net.jpountz.lz4.{LZ4BlockInputStream, LZ4BlockOutputStream, LZ4Factory} import net.jpountz.xxhash.XXHashFactory import org.xerial.snappy.{Snappy, SnappyInputStream, SnappyOutputStream} @@ -100,8 +101,9 @@ private[spark] object CompressionCodec { * If it is already a short name, just return it. 
*/ def getShortName(codecName: String): String = { - if (shortCompressionCodecNames.contains(codecName)) { - codecName + val lowercasedCodec = codecName.toLowerCase(Locale.ROOT) + if (shortCompressionCodecNames.contains(lowercasedCodec)) { + lowercasedCodec } else { shortCompressionCodecNames .collectFirst { case (k, v) if v == codecName => k } @@ -170,9 +172,14 @@ class LZ4CompressionCodec(conf: SparkConf) extends CompressionCodec { */ @DeveloperApi class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec { + private val parallelCompression = conf.get(IO_COMPRESSION_LZF_PARALLEL) override def compressedOutputStream(s: OutputStream): OutputStream = { - new LZFOutputStream(s).setFinishBlockOnFlush(true) + if (parallelCompression) { + new PLZFOutputStream(s) + } else { + new LZFOutputStream(s).setFinishBlockOnFlush(true) + } } override def compressedInputStream(s: InputStream): InputStream = new LZFInputStream(s) diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index c68999f34079d..0aaa222e6195e 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -25,7 +25,7 @@ import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.TASK_ATTEMPT_ID +import org.apache.spark.internal.LogKeys.{TASK_ATTEMPT_ID, TOTAL_TIME} import org.apache.spark.util.Utils object SparkHadoopMapRedUtil extends Logging { @@ -50,7 +50,8 @@ object SparkHadoopMapRedUtil extends Logging { def performCommit(): Unit = { try { val (_, timeCost) = Utils.timeTakenMs(committer.commitTask(mrTaskContext)) - logInfo(s"$mrTaskAttemptID: Committed. Elapsed time: $timeCost ms.") + logInfo(log"${MDC(TASK_ATTEMPT_ID, mrTaskAttemptID)}: Committed." 
+ + log" Elapsed time: ${MDC(TOTAL_TIME, timeCost)} ms.") } catch { case cause: IOException => logError( @@ -80,12 +81,13 @@ object SparkHadoopMapRedUtil extends Logging { if (canCommit) { performCommit() } else { - val message = - s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" + val message = log"${MDC(TASK_ATTEMPT_ID, mrTaskAttemptID)}: Not committed because" + + log" the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) - throw new CommitDeniedException(message, ctx.stageId(), splitId, ctx.attemptNumber()) + throw new CommitDeniedException(message.message, ctx.stageId(), splitId, + ctx.attemptNumber()) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination @@ -93,7 +95,8 @@ object SparkHadoopMapRedUtil extends Logging { } } else { // Some other attempt committed the output, so we do nothing and signal success - logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") + logInfo(log"No need to commit output of task because needsTaskCommit=false:" + + log" ${MDC(TASK_ATTEMPT_ID, mrTaskAttemptID)}") } } } diff --git a/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala b/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala index 4cffbb2a5701c..7098961d1649a 100644 --- a/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala +++ b/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala @@ -21,7 +21,8 @@ import javax.annotation.concurrent.GuardedBy import scala.collection.mutable -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ /** * Implements policies and bookkeeping for sharing an adjustable-sized pool of memory between tasks. 
@@ -136,7 +137,8 @@ private[memory] class ExecutionMemoryPool( // if we can't give it this much now, wait for other tasks to free up memory // (this happens if older tasks allocated lots of memory before N grew) if (toGrant < numBytes && curMem + toGrant < minMemoryPerTask) { - logInfo(s"TID $taskAttemptId waiting for at least 1/2N of $poolName pool to be free") + logInfo(log"TID ${MDC(TASK_ATTEMPT_ID, taskAttemptId)} waiting for at least 1/2N of" + + log" ${MDC(POOL_NAME, poolName)} pool to be free") lock.wait() } else { memoryForTask(taskAttemptId) += toGrant @@ -153,8 +155,9 @@ private[memory] class ExecutionMemoryPool( val curMem = memoryForTask.getOrElse(taskAttemptId, 0L) val memoryToFree = if (curMem < numBytes) { logWarning( - s"Internal error: release called on $numBytes bytes but task only has $curMem bytes " + - s"of memory from the $poolName pool") + log"Internal error: release called on ${MDC(NUM_BYTES, numBytes)} " + + log"bytes but task only has ${MDC(CURRENT_MEMORY_SIZE, curMem)} bytes " + + log"of memory from the ${MDC(MEMORY_POOL_NAME, poolName)} pool") curMem } else { numBytes diff --git a/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala b/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala index 0abdca99de1b2..24fcb5b17f388 100644 --- a/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala +++ b/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala @@ -20,7 +20,8 @@ package org.apache.spark.memory import javax.annotation.concurrent.GuardedBy import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.storage.BlockId import org.apache.spark.storage.memory.MemoryStore @@ -104,8 +105,8 @@ private[memory] class StorageMemoryPool( def releaseMemory(size: Long): Unit = lock.synchronized { if (size > _memoryUsed) { - logWarning(s"Attempted to release $size bytes of storage " + - s"memory when we only have ${_memoryUsed} bytes") + logWarning(log"Attempted to release ${MDC(NUM_BYTES, size)} bytes of storage " + + log"memory when we only have ${MDC(NUM_BYTES_USED, _memoryUsed)} bytes") _memoryUsed = 0 } else { _memoryUsed -= size diff --git a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala index 73805c11e0371..d4ec6ed8495af 100644 --- a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala @@ -18,7 +18,8 @@ package org.apache.spark.memory import org.apache.spark.{SparkConf, SparkIllegalArgumentException} -import org.apache.spark.internal.config +import org.apache.spark.internal.{config, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.Tests._ import org.apache.spark.storage.BlockId @@ -166,8 +167,9 @@ private[spark] class UnifiedMemoryManager( } if (numBytes > maxMemory) { // Fail fast if the block simply won't fit - logInfo(s"Will not store $blockId as the required space ($numBytes bytes) exceeds our " + - s"memory limit ($maxMemory bytes)") + logInfo(log"Will not store ${MDC(BLOCK_ID, blockId)} as the required space" + + log" (${MDC(NUM_BYTES, numBytes)} bytes) exceeds our" + + log" memory limit (${MDC(NUM_BYTES_MAX, maxMemory)} bytes)") return false } if (numBytes > storagePool.memoryFree) { diff --git 
a/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala b/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala index 50b7ddcb13ae1..965468ac2418f 100644 --- a/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala +++ b/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala @@ -24,7 +24,8 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.SparkEnv import org.apache.spark.executor.ProcfsMetricsGetter -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.memory.MemoryManager /** @@ -157,10 +158,13 @@ case object GarbageCollectionMetrics extends ExecutorMetricType with Logging { } else if (!nonBuiltInCollectors.contains(mxBean.getName)) { nonBuiltInCollectors = mxBean.getName +: nonBuiltInCollectors // log it when first seen - logWarning(s"To enable non-built-in garbage collector(s) " + - s"$nonBuiltInCollectors, users should configure it(them) to " + - s"${config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS.key} or " + - s"${config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS.key}") + val youngGenerationGc = MDC(YOUNG_GENERATION_GC, + config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS.key) + val oldGenerationGc = MDC(OLD_GENERATION_GC, + config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS.key) + logWarning(log"To enable non-built-in garbage collector(s) " + + log"${MDC(NON_BUILT_IN_CONNECTORS, nonBuiltInCollectors)}, " + + log"users should configure it(them) to $youngGenerationGc or $oldGenerationGc") } else { // do nothing } diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala index 12df40c3476a0..a845feeb67ff0 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala @@ -26,7 +26,7 @@ import scala.util.matching.Regex import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.internal.config.METRICS_CONF import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 555083bb65d24..709ce0060e150 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -27,7 +27,7 @@ import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.metrics.sink.{MetricsServlet, PrometheusServlet, Sink} import org.apache.spark.metrics.source.{Source, StaticSources} @@ -141,12 +141,13 @@ private[spark] class MetricsSystem private ( // Only Driver and Executor set spark.app.id and spark.executor.id. // Other instance types, e.g. Master and Worker, are not related to a specific application. 
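Most hunks in this part of the change are mechanical applications of one pattern: replace an s-interpolated message with the Spark-internal log interpolator and wrap every dynamic value in MDC(<LogKey>, value), so the value is attached as a structured field as well as appearing inline. A usage sketch of that call shape, mirroring the imports and LogKeys seen in the surrounding hunks (for example the MetricsSystem one just below); the class itself is invented for the example.

import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys.DEFAULT_NAME

// Invented class, shown only to isolate the call shape this PR applies everywhere.
class DefaultNameReporter extends Logging {
  def warnDefaultName(defaultName: String): Unit = {
    // before: logWarning(s"Using default name $defaultName for source because ...")
    logWarning(log"Using default name ${MDC(DEFAULT_NAME, defaultName)} " +
      log"for source because spark.executor.id is not set.")
  }
}

When the same text also feeds an exception, the diff builds a single log entry and passes its .message to the exception constructor, as the ResourceUtils hunks further down show.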
if (metricsNamespace.isEmpty) { - logWarning(s"Using default name $defaultName for source because neither " + - s"${METRICS_NAMESPACE.key} nor spark.app.id is set.") + logWarning(log"Using default name ${MDC(LogKeys.DEFAULT_NAME, defaultName)} " + + log"for source because neither " + + log"${MDC(LogKeys.CONFIG, METRICS_NAMESPACE.key)} nor spark.app.id is set.") } if (executorId.isEmpty) { - logWarning(s"Using default name $defaultName for source because spark.executor.id is " + - s"not set.") + logWarning(log"Using default name ${MDC(LogKeys.DEFAULT_NAME, defaultName)} " + + log"for source because spark.executor.id is not set.") } defaultName } @@ -189,7 +190,8 @@ private[spark] class MetricsSystem private ( registerSource(source) } catch { case e: Exception => - logError(log"Source class ${MDC(CLASS_NAME, classPath)} cannot be instantiated", e) + logError(log"Source class ${MDC(LogKeys.CLASS_NAME, classPath)} " + + log"cannot be instantiated", e) } } } @@ -229,7 +231,8 @@ private[spark] class MetricsSystem private ( } } catch { case e: Exception => - logError(log"Sink class ${MDC(CLASS_NAME, classPath)} cannot be instantiated") + logError(log"Sink class ${MDC(LogKeys.CLASS_NAME, classPath)} " + + log"cannot be instantiated") throw e } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala index c506b86b4563b..30b10d64882ac 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala @@ -22,7 +22,8 @@ import java.util.concurrent.TimeUnit import com.codahale.metrics.{Metric, MetricFilter, MetricRegistry} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PREFIX import org.apache.spark.metrics.MetricsSystem private[spark] object StatsdSink { @@ -69,7 +70,7 @@ private[spark] class StatsdSink( override def start(): Unit = { reporter.start(pollPeriod, pollUnit) - logInfo(s"StatsdSink started with prefix: '$prefix'") + logInfo(log"StatsdSink started with prefix: '${MDC(PREFIX, prefix)}'") } override def stop(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index aa0da153f7fa3..a922eb336c28f 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -23,7 +23,8 @@ import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.NioManagedBuffer import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithID, TransportClient} @@ -55,17 +56,21 @@ class NettyBlockRpcServer( BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) } catch { case e: IllegalArgumentException if e.getMessage.startsWith("Unknown message type") => - logWarning(s"This could be a corrupted RPC message (capacity: ${rpcMessage.capacity()}) " + - s"from ${client.getSocketAddress}. 
Please use `spark.authenticate.*` configurations " + - "in case of security incidents.") + logWarning(log"This could be a corrupted RPC message (capacity: " + + log"${MDC(RPC_MESSAGE_CAPACITY, rpcMessage.capacity())}) " + + log"from ${MDC(SOCKET_ADDRESS, client.getSocketAddress)}. " + + log"Please use `spark.authenticate.*` configurations " + + log"in case of security incidents.") throw e case _: IndexOutOfBoundsException | _: NegativeArraySizeException => // Netty may throw non-'IOException's for corrupted buffers. In this case, // we ignore the entire message with warnings because we cannot trust any contents. - logWarning(s"Ignored a corrupted RPC message (capacity: ${rpcMessage.capacity()}) " + - s"from ${client.getSocketAddress}. Please use `spark.authenticate.*` configurations " + - "in case of security incidents.") + logWarning(log"Ignored a corrupted RPC message (capacity: " + + log"${MDC(RPC_MESSAGE_CAPACITY, rpcMessage.capacity())}) " + + log"from ${MDC(SOCKET_ADDRESS, client.getSocketAddress)}. " + + log"Please use `spark.authenticate.*` configurations " + + log"in case of security incidents.") return } logTrace(s"Received request: $message") diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index 6b785a07c7f43..7ceb50db5966a 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -30,7 +30,7 @@ import com.codahale.metrics.{Metric, MetricSet} import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.ExecutorDeadException -import org.apache.spark.internal.config +import org.apache.spark.internal.{config, LogKeys, MDC} import org.apache.spark.network._ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClientBootstrap} @@ -85,9 +85,11 @@ private[spark] class NettyBlockTransferService( appId = conf.getAppId if (hostName.equals(bindAddress)) { - logger.info(s"Server created on $hostName:${server.getPort}") + logger.info("Server created on {}:{}", + MDC(LogKeys.HOST, hostName), MDC(LogKeys.PORT, server.getPort)) } else { - logger.info(s"Server created on $hostName $bindAddress:${server.getPort}") + logger.info("Server created on {} {}:{}", MDC(LogKeys.HOST, hostName), + MDC(LogKeys.BIND_ADDRESS, bindAddress), MDC(LogKeys.PORT, server.getPort)) } } @@ -193,7 +195,11 @@ private[spark] class NettyBlockTransferService( } override def onFailure(e: Throwable): Unit = { - logger.error(s"Error while uploading $blockId${if (asStream) " as stream" else ""}", e) + if (asStream) { + logger.error(s"Error while uploading {} as stream", e, MDC.of(LogKeys.BLOCK_ID, blockId)) + } else { + logger.error(s"Error while uploading {}", e, MDC.of(LogKeys.BLOCK_ID, blockId)) + } result.failure(e) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala index fddd35b657479..850c07f460b75 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala @@ -22,6 +22,7 @@ import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark._ +import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.util.Utils private[spark] @@ -57,7 +58,11 @@ 
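The hunk that follows guards CartesianRDD.getPartitions against Int overflow: the cross-product size is computed as a Long and rejected before the Array is allocated if it exceeds Int.MaxValue. A stand-alone illustration of the same check; the object and the require-based error handling are invented, whereas the diff throws SparkCoreErrors.tooManyArrayElementsError.

object CrossProductSizeCheck {
  def checkedNumPartitions(numPartitionsRdd1: Int, numPartitionsRdd2: Int): Int = {
    // Multiply in Long space first; Int * Int would silently wrap around.
    val total: Long = numPartitionsRdd1.toLong * numPartitionsRdd2
    require(total <= Int.MaxValue, s"too many partitions: $total exceeds ${Int.MaxValue}")
    total.toInt
  }

  def main(args: Array[String]): Unit = {
    println(checkedNumPartitions(1000, 1000))      // 1000000
    // checkedNumPartitions(100000, 100000) fails: 10,000,000,000 does not fit in an Int
  }
}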
class CartesianRDD[T: ClassTag, U: ClassTag]( override def getPartitions: Array[Partition] = { // create the cross product split - val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length) + val partitionNum: Long = numPartitionsInRdd2.toLong * rdd1.partitions.length + if (partitionNum > Int.MaxValue) { + throw SparkCoreErrors.tooManyArrayElementsError(partitionNum, Int.MaxValue) + } + val array = new Array[Partition](partitionNum.toInt) for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) { val idx = s1.index * numPartitionsInRdd2 + s2.index array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index) diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 908ce1b233c57..545eafe7a4449 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -37,7 +37,8 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation} @@ -239,11 +240,13 @@ class HadoopRDD[K, V]( if (fileSplit.getLength > conf.get(IO_WARNING_LARGEFILETHRESHOLD)) { val codecFactory = new CompressionCodecFactory(jobConf) if (Utils.isFileSplittable(path, codecFactory)) { - logWarning(s"Loading one large file ${path.toString} with only one partition, " + - s"we can increase partition numbers for improving performance.") + logWarning(log"Loading one large file ${MDC(PATH, path.toString)} " + + log"with only one partition, " + + log"we can increase partition numbers for improving performance.") } else { - logWarning(s"Loading one large unsplittable file ${path.toString} with only one " + - s"partition, because the file is compressed by unsplittable compression codec.") + logWarning(log"Loading one large unsplittable file ${MDC(PATH, path.toString)} " + + log"with only one " + + log"partition, because the file is compressed by unsplittable compression codec.") } } } @@ -254,8 +257,8 @@ class HadoopRDD[K, V]( array } catch { case e: InvalidInputException if ignoreMissingFiles => - logWarning(s"${jobConf.get(FileInputFormat.INPUT_DIR)} doesn't exist and no" + - s" partitions returned from this path.", e) + logWarning(log"${MDC(PATH, jobConf.get(FileInputFormat.INPUT_DIR))} " + + log"doesn't exist and no partitions returned from this path.", e) Array.empty[Partition] case e: IOException if e.getMessage.startsWith("Not a file:") => val path = e.getMessage.split(":").map(_.trim).apply(2) @@ -267,7 +270,8 @@ class HadoopRDD[K, V]( val iter = new NextIterator[(K, V)] { private val split = theSplit.asInstanceOf[HadoopPartition] - logInfo("Input split: " + split.inputSplit) + logInfo(log"Task (TID ${MDC(TASK_ID, context.taskAttemptId())}) input split: " + + log"${MDC(INPUT_SPLIT, split.inputSplit)}") private val jobConf = getJobConf() private val inputMetrics = context.taskMetrics().inputMetrics @@ -310,13 +314,14 @@ class HadoopRDD[K, V]( inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL) } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: 
${split.inputSplit}", e) + logWarning(log"Skipped missing file: ${MDC(PATH, split.inputSplit)}", e) finished = true null // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e case e: IOException if ignoreCorruptFiles => - logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) + logWarning(log"Skipped the rest content in the corrupted file: " + + log"${MDC(PATH, split.inputSplit)}", e) finished = true null } @@ -336,12 +341,13 @@ class HadoopRDD[K, V]( finished = !reader.next(key, value) } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: ${split.inputSplit}", e) + logWarning(log"Skipped missing file: ${MDC(PATH, split.inputSplit)}", e) finished = true // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e case e: IOException if ignoreCorruptFiles => - logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) + logWarning(log"Skipped the rest content in the corrupted file: " + + log"${MDC(PATH, split.inputSplit)}", e) finished = true } if (!finished) { diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala index c41255491e976..8c10bcbc25a86 100644 --- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala @@ -25,7 +25,8 @@ import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.api.java.function.{Function => JFunction} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.NextIterator private[spark] class JdbcPartition(idx: Int, val lower: Long, val upper: Long) extends Partition { @@ -93,7 +94,7 @@ class JdbcRDD[T: ClassTag]( stmt.setFetchSize(100) } - logInfo(s"statement fetch size set to: ${stmt.getFetchSize}") + logInfo(log"statement fetch size set to: ${MDC(FETCH_SIZE, stmt.getFetchSize)}") stmt.setLong(1, part.lower) stmt.setLong(2, part.upper) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 7db8531e4a59a..bf539320b5985 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -37,7 +37,8 @@ import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD import org.apache.spark.storage.StorageLevel @@ -168,11 +169,13 @@ class NewHadoopRDD[K, V]( if (fileSplit.getLength > conf.get(IO_WARNING_LARGEFILETHRESHOLD)) { val codecFactory = new CompressionCodecFactory(_conf) if (Utils.isFileSplittable(path, codecFactory)) { - logWarning(s"Loading one large file ${path.toString} with only one partition, " + - s"we can increase partition numbers for improving performance.") + logWarning(log"Loading one large file 
${MDC(PATH, path.toString)} " + + log"with only one partition, " + + log"we can increase partition numbers for improving performance.") } else { - logWarning(s"Loading one large unsplittable file ${path.toString} with only one " + - s"partition, because the file is compressed by unsplittable compression codec.") + logWarning(log"Loading one large unsplittable file ${MDC(PATH, path.toString)} " + + log"with only one " + + log"partition, because the file is compressed by unsplittable compression codec.") } } } @@ -185,8 +188,8 @@ class NewHadoopRDD[K, V]( result } catch { case e: InvalidInputException if ignoreMissingFiles => - logWarning(s"${_conf.get(FileInputFormat.INPUT_DIR)} doesn't exist and no" + - s" partitions returned from this path.", e) + logWarning(log"${MDC(PATH, _conf.get(FileInputFormat.INPUT_DIR))} " + + log"doesn't exist and no partitions returned from this path.", e) Array.empty[Partition] } } @@ -194,7 +197,8 @@ class NewHadoopRDD[K, V]( override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = { val iter = new Iterator[(K, V)] { private val split = theSplit.asInstanceOf[NewHadoopPartition] - logInfo("Input split: " + split.serializableHadoopSplit) + logInfo(log"Task (TID ${MDC(TASK_ID, context.taskAttemptId())}) input split: " + + log"${MDC(INPUT_SPLIT, split.serializableHadoopSplit)}") private val conf = getConf private val inputMetrics = context.taskMetrics().inputMetrics @@ -244,14 +248,15 @@ class NewHadoopRDD[K, V]( _reader } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: ${split.serializableHadoopSplit}", e) + logWarning(log"Skipped missing file: ${MDC(PATH, split.serializableHadoopSplit)}", e) finished = true null // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e case e: IOException if ignoreCorruptFiles => logWarning( - s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", + log"Skipped the rest content in the corrupted file: " + + log"${MDC(PATH, split.serializableHadoopSplit)}", e) finished = true null @@ -273,13 +278,14 @@ class NewHadoopRDD[K, V]( finished = !reader.nextKeyValue } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: ${split.serializableHadoopSplit}", e) + logWarning(log"Skipped missing file: ${MDC(PATH, split.serializableHadoopSplit)}", e) finished = true // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e case e: IOException if ignoreCorruptFiles => logWarning( - s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", + log"Skipped the rest content in the corrupted file: " + + log"${MDC(PATH, split.serializableHadoopSplit)}", e) finished = true } diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index f5a731d134eaf..c0966dd5ede14 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -35,7 +35,8 @@ import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob, OutputFormat => NewO import org.apache.spark._ import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import 
org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.SPECULATION_ENABLED import org.apache.spark.internal.io._ import org.apache.spark.partial.{BoundedDouble, PartialResult} @@ -1051,10 +1052,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val outputCommitterClass = hadoopConf.get("mapred.output.committer.class", "") if (speculationEnabled && outputCommitterClass.contains("Direct")) { val warningMessage = - s"$outputCommitterClass may be an output committer that writes data directly to " + - "the final location. Because speculation is enabled, this output committer may " + - "cause data loss (see the case in SPARK-10063). If possible, please use an output " + - "committer that does not have this behavior (e.g. FileOutputCommitter)." + log"${MDC(CLASS_NAME, outputCommitterClass)} " + + log"may be an output committer that writes data directly to " + + log"the final location. Because speculation is enabled, this output committer may " + + log"cause data loss (see the case in SPARK-10063). If possible, please use an output " + + log"committer that does not have this behavior (e.g. FileOutputCommitter)." logWarning(warningMessage) } diff --git a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala index 127bdf6d91812..a806b72766c6f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala @@ -34,7 +34,7 @@ import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.LogKey.{COMMAND, ERROR, PATH} +import org.apache.spark.internal.LogKeys.{COMMAND, ERROR, PATH} import org.apache.spark.internal.MDC import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index a48eaa253ad1d..ac93abf3fe7a0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -36,7 +36,8 @@ import org.apache.spark.Partitioner._ import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.RDD_LIMIT_SCALE_UP_FACTOR import org.apache.spark.partial.BoundedDouble @@ -210,7 +211,7 @@ abstract class RDD[T: ClassTag]( * @return This RDD. */ def unpersist(blocking: Boolean = false): this.type = { - logInfo(s"Removing RDD $id from persistence list") + logInfo(log"Removing RDD ${MDC(RDD_ID, id)} from persistence list") sc.unpersistRDD(id, blocking) storageLevel = StorageLevel.NONE this @@ -643,7 +644,8 @@ abstract class RDD[T: ClassTag]( // this shouldn't happen often because we use a big multiplier for the initial size var numIters = 0 while (samples.length < num) { - logWarning(s"Needed to re-sample due to insufficient sample size. Repeat #$numIters") + logWarning(log"Needed to re-sample due to insufficient sample size. 
" + + log"Repeat #${MDC(NUM_ITERATIONS, numIters)}") samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() numIters += 1 } diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index 863bcd5b12d35..cc777659f541e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -29,7 +29,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{BUFFER_SIZE, CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME, CHECKPOINT_COMPRESS} import org.apache.spark.io.CompressionCodec import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -172,7 +173,7 @@ private[spark] object ReliableCheckpointRDD extends Logging { val checkpointDurationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - checkpointStartTimeNs) - logInfo(s"Checkpointing took $checkpointDurationMs ms.") + logInfo(log"Checkpointing took ${MDC(TOTAL_TIME, checkpointDurationMs)} ms.") val newRDD = new ReliableCheckpointRDD[T]( sc, checkpointDirPath.toString, originalRDD.partitioner) @@ -219,7 +220,7 @@ private[spark] object ReliableCheckpointRDD extends Logging { } (catchBlock = { val deleted = fs.delete(tempOutputPath, false) if (!deleted) { - logInfo(s"Failed to delete tempOutputPath $tempOutputPath.") + logInfo(log"Failed to delete tempOutputPath ${MDC(TEMP_OUTPUT_PATH, tempOutputPath)}.") } }, finallyBlock = { serializeStream.close() @@ -227,14 +228,15 @@ private[spark] object ReliableCheckpointRDD extends Logging { if (!fs.rename(tempOutputPath, finalOutputPath)) { if (!fs.exists(finalOutputPath)) { - logInfo(s"Deleting tempOutputPath $tempOutputPath") + logInfo(log"Deleting tempOutputPath ${MDC(TEMP_OUTPUT_PATH, tempOutputPath)}") fs.delete(tempOutputPath, false) throw SparkCoreErrors.checkpointFailedToSaveError(ctx.attemptNumber(), finalOutputPath) } else { // Some other copy of this task must've finished before us and renamed it - logInfo(s"Final output path $finalOutputPath already exists; not overwriting it") + logInfo(log"Final output path" + + log" ${MDC(FINAL_OUTPUT_PATH, finalOutputPath)} already exists; not overwriting it") if (!fs.delete(tempOutputPath, false)) { - logWarning(s"Error deleting ${tempOutputPath}") + logWarning(log"Error deleting ${MDC(PATH, tempOutputPath)}") } } } @@ -261,7 +263,8 @@ private[spark] object ReliableCheckpointRDD extends Logging { logDebug(s"Written partitioner to $partitionerFilePath") } catch { case NonFatal(e) => - logWarning(s"Error writing partitioner $partitioner to $checkpointDirPath") + logWarning(log"Error writing partitioner ${MDC(PARTITIONER, partitioner)} to " + + log"${MDC(PATH, checkpointDirPath)}") } } @@ -298,8 +301,8 @@ private[spark] object ReliableCheckpointRDD extends Logging { logDebug("No partitioner file", e) None case NonFatal(e) => - logWarning(s"Error reading partitioner from $checkpointDirPath, " + - s"partitioner will not be recovered which may lead to performance loss", e) + logWarning(log"Error reading partitioner from ${MDC(PATH, checkpointDirPath)}, " + + log"partitioner will not be recovered which may lead to performance loss", e) None } } diff --git 
a/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala index 0d1bc1425161e..b468a38fcf229 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala @@ -23,7 +23,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark._ import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{NEW_RDD_ID, RDD_CHECKPOINT_DIR, RDD_ID} import org.apache.spark.internal.config.CLEANER_REFERENCE_TRACKING_CLEAN_CHECKPOINTS /** @@ -66,7 +67,8 @@ private[spark] class ReliableRDDCheckpointData[T: ClassTag](@transient private v } } - logInfo(s"Done checkpointing RDD ${rdd.id} to $cpDir, new parent is RDD ${newRDD.id}") + logInfo(log"Done checkpointing RDD ${MDC(RDD_ID, rdd.id)}" + + log" to ${MDC(RDD_CHECKPOINT_DIR, cpDir)}, new parent is RDD ${MDC(NEW_RDD_ID, newRDD.id)}") newRDD } diff --git a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala index 2f6ff0acdf024..118660ef69476 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala @@ -23,7 +23,7 @@ import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapred.SequenceFileOutputFormat -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} /** * Extra functions available on RDDs of (key, value) pairs to create a Hadoop SequenceFile, @@ -58,8 +58,9 @@ class SequenceFileRDDFunctions[K: IsWritable: ClassTag, V: IsWritable: ClassTag] val convertKey = self.keyClass != _keyWritableClass val convertValue = self.valueClass != _valueWritableClass - logInfo("Saving as sequence file of type " + - s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})" ) + logInfo(log"Saving as sequence file of type " + + log"(${MDC(LogKeys.KEY, _keyWritableClass.getSimpleName)}," + + log"${MDC(LogKeys.VALUE, _valueWritableClass.getSimpleName)})") val format = classOf[SequenceFileOutputFormat[Writable, Writable]] val jobConf = new JobConf(self.context.hadoopConfiguration) if (!convertKey && !convertValue) { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala b/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala index d861e91771673..51de7e2b9ac70 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala @@ -23,7 +23,8 @@ import java.util.Optional import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.resource.ResourceDiscoveryPlugin -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys import org.apache.spark.util.Utils.executeAndGetOutput /** @@ -44,7 +45,8 @@ class ResourceDiscoveryScriptPlugin extends ResourceDiscoveryPlugin with Logging val resourceName = request.id.resourceName val result = if (script.isPresent) { val scriptFile = new File(script.get) - logInfo(s"Discovering resources for 
$resourceName with script: $scriptFile") + logInfo(log"Discovering resources for ${MDC(LogKeys.RESOURCE_NAME, resourceName)}" + + log" with script: ${MDC(LogKeys.PATH, scriptFile)}") // check that script exists and try to execute if (scriptFile.exists()) { val output = executeAndGetOutput(Seq(script.get), new File(".")) diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala index e95dbe973691a..7dcde35de2518 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala @@ -26,7 +26,8 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkException} import org.apache.spark.annotation.{Evolving, Since} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Python.PYSPARK_EXECUTOR_MEMORY import org.apache.spark.util.Utils @@ -221,8 +222,8 @@ class ResourceProfile( } taskResourcesToCheck -= rName } else { - logWarning(s"The executor resource config for resource: $rName was specified but " + - "no corresponding task resource request was specified.") + logWarning(log"The executor resource config for resource: ${MDC(RESOURCE_NAME, rName)} " + + log"was specified but no corresponding task resource request was specified.") } } if (taskResourcesToCheck.nonEmpty) { @@ -231,7 +232,7 @@ class ResourceProfile( } val limiting = if (taskLimit == -1) "cpu" else s"$limitingResource at $taskLimit tasks per executor" - logInfo(s"Limiting resource is $limiting") + logInfo(log"Limiting resource is ${MDC(RESOURCE, limiting)}") _executorResourceSlotsPerAddr = Some(numPartsPerResourceMap.toMap) _maxTasksPerExecutor = if (taskLimit == -1) Some(1) else Some(taskLimit) _limitingResource = Some(limitingResource) @@ -373,9 +374,9 @@ object ResourceProfile extends Logging { val defProf = new ResourceProfile(executorResources, taskResources) defProf.setToDefaultProfile() defaultProfile = Some(defProf) - logInfo("Default ResourceProfile created, executor resources: " + - s"${defProf.executorResources}, task resources: " + - s"${defProf.taskResources}") + logInfo(log"Default ResourceProfile created, executor resources: " + + log"${MDC(EXECUTOR_RESOURCES, defProf.executorResources)}, task resources: " + + log"${MDC(TASK_RESOURCES, defProf.taskResources)}") defProf } } diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala index 580a5b7bb07ac..6a6b5067f70f2 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala @@ -23,7 +23,8 @@ import scala.collection.mutable.HashMap import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.Evolving -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config.Tests._ import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerResourceProfileAdded} import org.apache.spark.util.Utils @@ -140,7 +141,7 @@ private[spark] class ResourceProfileManager(sparkConf: SparkConf, if (putNewProfile) { // force the computation of 
maxTasks and limitingResource now so we don't have cost later rp.limitingResource(sparkConf) - logInfo(s"Added ResourceProfile id: ${rp.id}") + logInfo(log"Added ResourceProfile id: ${MDC(LogKeys.RESOURCE_PROFILE_ID, rp.id)}") listenerBus.post(SparkListenerResourceProfileAdded(rp)) } } diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 8718ce8ea0833..78c45cdc75418 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -28,7 +28,8 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.resource.ResourceDiscoveryPlugin -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{EXECUTOR_CORES, RESOURCES_DISCOVERY_PLUGIN, SPARK_TASK_PREFIX} import org.apache.spark.internal.config.Tests.RESOURCES_WARNING_TESTING import org.apache.spark.util.ArrayImplicits._ @@ -454,13 +455,15 @@ private[spark] object ResourceUtils extends Logging { if (limitingResource.nonEmpty && !limitingResource.equals(ResourceProfile.CPUS)) { if ((taskCpus * maxTaskPerExec) < cores) { val resourceNumSlots = Math.floor(cores/taskCpus).toInt - val message = s"The configuration of cores (exec = ${cores} " + - s"task = ${taskCpus}, runnable tasks = ${resourceNumSlots}) will " + - s"result in wasted resources due to resource ${limitingResource} limiting the " + - s"number of runnable tasks per executor to: ${maxTaskPerExec}. Please adjust " + - "your configuration." + val message = log"The configuration of cores (exec = ${MDC(NUM_CORES, cores)} " + + log"task = ${MDC(NUM_TASK_CPUS, taskCpus)}, runnable tasks = " + + log"${MDC(NUM_RESOURCE_SLOTS, resourceNumSlots)}) will " + + log"result in wasted resources due to resource ${MDC(RESOURCE, limitingResource)} " + + log"limiting the number of runnable tasks per executor to: " + + log"${MDC(NUM_TASKS, maxTaskPerExec)}. Please adjust " + + log"your configuration." if (sparkConf.get(RESOURCES_WARNING_TESTING)) { - throw new SparkException(message) + throw new SparkException(message.message) } else { logWarning(message) } @@ -476,14 +479,16 @@ private[spark] object ResourceUtils extends Logging { val origTaskAmount = treq.amount val taskReqStr = s"${origTaskAmount}/${numParts}" val resourceNumSlots = (execAmount * numParts / taskAmount).toInt - val message = s"The configuration of resource: ${treq.resourceName} " + - s"(exec = ${execAmount}, task = ${taskReqStr}, " + - s"runnable tasks = ${resourceNumSlots}) will " + - s"result in wasted resources due to resource ${limitingResource} limiting the " + - s"number of runnable tasks per executor to: ${maxTaskPerExec}. Please adjust " + - "your configuration." + val message = log"The configuration of resource: " + + log"${MDC(RESOURCE_NAME, treq.resourceName)} " + + log"(exec = ${MDC(EXEC_AMOUNT, execAmount)}, " + + log"task = ${MDC(TASK_REQUIREMENTS, taskReqStr)}, " + + log"runnable tasks = ${MDC(NUM_RESOURCE_SLOTS, resourceNumSlots)}) will " + + log"result in wasted resources due to resource ${MDC(RESOURCE, limitingResource)} " + + log"limiting the number of runnable tasks per executor to: " + + log"${MDC(NUM_TASKS, maxTaskPerExec)}. Please adjust your configuration." 
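To make the warning assembled above concrete, here is the slot arithmetic it reports, with invented numbers: 16 cores at 1 CPU per task would allow 16 tasks, but 4 GPUs at 1 GPU per task cap the executor at 4 runnable tasks, leaving 12 cores idle. Just below, RESOURCES_WARNING_TESTING decides whether that message becomes an exception or only a logWarning.

object WastedResourceSlots {
  // Simplified form of the slot computation: how many tasks one resource allows per executor.
  def slots(execAmount: Long, amountPerTask: Double): Int =
    math.floor(execAmount / amountPerTask).toInt

  def main(args: Array[String]): Unit = {
    val cpuSlots = slots(execAmount = 16, amountPerTask = 1.0) // 16
    val gpuSlots = slots(execAmount = 4, amountPerTask = 1.0)  // 4, the limiting resource
    val runnableTasks = math.min(cpuSlots, gpuSlots)
    println(s"runnable tasks per executor = $runnableTasks, idle cores = ${cpuSlots - runnableTasks}")
  }
}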
if (sparkConf.get(RESOURCES_WARNING_TESTING)) { - throw new SparkException(message) + throw new SparkException(message.message) } else { logWarning(message) } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala index 0e35842fece92..8acfef38659c0 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala @@ -25,7 +25,8 @@ import scala.jdk.CollectionConverters._ import scala.util.control.NonFatal import org.apache.spark.{SparkEnv, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.network.client.RpcResponseCallback import org.apache.spark.rpc._ @@ -123,7 +124,8 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte val name = iter.next postMessage(name, message, (e) => { e match { case e: RpcEnvStoppedException => logDebug(s"Message $message dropped. ${e.getMessage}") - case e: Throwable => logWarning(s"Message $message dropped. ${e.getMessage}") + case e: Throwable => + logWarning(log"Message ${MDC(MESSAGE, message)} dropped. ${MDC(ERROR, e.getMessage)}") }} )} } @@ -154,7 +156,8 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte // cluster in spark shell. case re: RpcEnvStoppedException => logDebug(s"Message $message dropped. ${re.getMessage}") case e if SparkEnv.get.isStopped => - logWarning(s"Message $message dropped due to sparkEnv is stopped. ${e.getMessage}") + logWarning(log"Message ${MDC(MESSAGE, message)} dropped due to sparkEnv " + + log"is stopped. ${MDC(ERROR, e.getMessage)}") case e => throw e }) } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala index b503c5a0f8089..0de67a65593b1 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala @@ -23,7 +23,7 @@ import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.END_POINT +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rpc.{RpcAddress, RpcEndpoint, ThreadSafeRpcEndpoint} @@ -194,7 +194,8 @@ private[netty] class Inbox(val endpointName: String, val endpoint: RpcEndpoint) * Exposed for testing. 
*/ protected def onDrop(message: InboxMessage): Unit = { - logWarning(s"Drop $message because endpoint $endpointName is stopped") + logWarning(log"Drop ${MDC(MESSAGE, message)} " + + log"because endpoint ${MDC(END_POINT, endpointName)} is stopped") } /** diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala b/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala index 2d94ed5d05e1c..2fd1c6d7fe71e 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala @@ -23,7 +23,7 @@ import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.ERROR +import org.apache.spark.internal.LogKeys.ERROR import org.apache.spark.internal.config.EXECUTOR_ID import org.apache.spark.internal.config.Network._ import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcEndpoint} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index 7909f2327cdf7..c2688610fe8b1 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -30,7 +30,8 @@ import scala.util.{DynamicVariable, Failure, Success, Try} import scala.util.control.NonFatal import org.apache.spark.{SecurityManager, SparkConf, SparkContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.EXECUTOR_ID import org.apache.spark.internal.config.Network._ import org.apache.spark.network.TransportContext @@ -216,7 +217,7 @@ private[netty] class NettyRpcEnv( if (!promise.tryFailure(e)) { e match { case e : RpcEnvStoppedException => logDebug(s"Ignored failure: $e") - case _ => logWarning(s"Ignored failure: $e") + case _ => logWarning(log"Ignored failure: ${MDC(ERROR, e)}") } } } @@ -225,7 +226,7 @@ private[netty] class NettyRpcEnv( case RpcFailure(e) => onFailure(e) case rpcReply => if (!promise.trySuccess(rpcReply)) { - logWarning(s"Ignored message: $reply") + logWarning(log"Ignored message: ${MDC(MESSAGE, reply)}") } } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala index 205e6e966866f..b212a818ffc49 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala @@ -46,7 +46,7 @@ private[netty] case class OneWayOutboxMessage(content: ByteBuffer) extends Outbo override def onFailure(e: Throwable): Unit = { e match { case e1: RpcEnvStoppedException => logDebug(e1.getMessage) - case e1: Throwable => logWarning(s"Failed to send one-way RPC.", e1) + case e1: Throwable => logWarning(log"Failed to send one-way RPC.", e1) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala index 271fc9ac92ba6..16e9211b54851 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala @@ -24,7 +24,7 @@ import com.codahale.metrics.{Gauge, Timer} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.EVENT_QUEUE +import 
org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.util.Utils @@ -187,8 +187,9 @@ private class AsyncEventQueue( if (lastReportTimestamp.compareAndSet(lastReportTime, curTime)) { val previous = new java.util.Date(lastReportTime) lastDroppedEventsCounter = droppedEventsCount - logWarning(s"Dropped $droppedCountIncreased events from $name since " + - s"${if (lastReportTime == 0) "the application started" else s"$previous"}.") + logWarning(log"Dropped ${MDC(NUM_EVENTS, droppedCountIncreased)} events from " + + log"${MDC(EVENT_NAME, name)} since " + + (if (lastReportTime == 0) log"the application started" else log"${MDC(TIME, previous)}")) } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 41cbd795b7e5e..f50e8bd25fec8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -37,7 +37,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.{ACCUMULATOR_ID, CLASS_NAME, JOB_ID, PARTITION_ID, STAGE_ID, TASK_ID} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{LEGACY_ABORT_STAGE_AFTER_KILL_TASKS, RDD_CACHE_VISIBILITY_TRACKING_ENABLED} import org.apache.spark.internal.config.Tests.TEST_NO_STAGE_RETRY import org.apache.spark.network.shuffle.{BlockStoreClient, MergeFinalizerListener} @@ -535,8 +535,9 @@ private[spark] class DAGScheduler( if (!mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) { // Kind of ugly: need to register RDDs with the cache and map output tracker here // since we can't do it in the RDD constructor because # of partitions is unknown - logInfo(s"Registering RDD ${rdd.id} (${rdd.getCreationSite}) as input to " + - s"shuffle ${shuffleDep.shuffleId}") + logInfo(log"Registering RDD ${MDC(RDD_ID, rdd.id)} " + + log"(${MDC(CREATION_SITE, rdd.getCreationSite)}) as input to " + + log"shuffle ${MDC(SHUFFLE_ID, shuffleDep.shuffleId)}") mapOutputTracker.registerShuffle(shuffleDep.shuffleId, rdd.partitions.length, shuffleDep.partitioner.numPartitions) } @@ -1097,7 +1098,7 @@ private[spark] class DAGScheduler( * Cancel a job that is running or waiting in the queue. 
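The cancellation entry points instrumented in the methods that follow are normally reached through SparkContext; a usage sketch, assuming a Spark 3.5-or-later session (the application name and the group/tag strings are invented for the example).

import org.apache.spark.sql.SparkSession

object CancelJobsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("cancel-demo").getOrCreate()
    val sc = spark.sparkContext

    // Group-based cancellation: every job submitted from this thread carries the group id.
    sc.setJobGroup("nightly-etl", "nightly ETL jobs", interruptOnCancel = true)
    // ... submit actions ...
    sc.cancelJobGroup("nightly-etl")   // reaches DAGScheduler.cancelJobGroup below

    // Tag-based cancellation, independent of job groups.
    sc.addJobTag("ad-hoc")
    // ... submit actions ...
    sc.cancelJobsWithTag("ad-hoc")     // reaches DAGScheduler.cancelJobsWithTag below

    spark.stop()
  }
}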
*/ def cancelJob(jobId: Int, reason: Option[String]): Unit = { - logInfo("Asked to cancel job " + jobId) + logInfo(log"Asked to cancel job ${MDC(JOB_ID, jobId)}") eventProcessLoop.post(JobCancelled(jobId, reason)) } @@ -1106,7 +1107,8 @@ private[spark] class DAGScheduler( * @param cancelFutureJobs if true, future submitted jobs in this job group will be cancelled */ def cancelJobGroup(groupId: String, cancelFutureJobs: Boolean = false): Unit = { - logInfo(s"Asked to cancel job group $groupId with cancelFutureJobs=$cancelFutureJobs") + logInfo(log"Asked to cancel job group ${MDC(GROUP_ID, groupId)} with " + + log"cancelFutureJobs=${MDC(CANCEL_FUTURE_JOBS, cancelFutureJobs)}") eventProcessLoop.post(JobGroupCancelled(groupId, cancelFutureJobs)) } @@ -1115,7 +1117,7 @@ private[spark] class DAGScheduler( */ def cancelJobsWithTag(tag: String): Unit = { SparkContext.throwIfInvalidTag(tag) - logInfo(s"Asked to cancel jobs with tag $tag") + logInfo(log"Asked to cancel jobs with tag ${MDC(TAG, tag)}") eventProcessLoop.post(JobTagCancelled(tag)) } @@ -1209,7 +1211,7 @@ private[spark] class DAGScheduler( // If cancelFutureJobs is true, store the cancelled job group id into internal states. // When a job belonging to this job group is submitted, skip running it. if (cancelFutureJobs) { - logInfo(s"Add job group $groupId into cancelled job groups") + logInfo(log"Add job group ${MDC(GROUP_ID, groupId)} into cancelled job groups") cancelledJobGroups.add(groupId) } @@ -1221,7 +1223,8 @@ private[spark] class DAGScheduler( } } if (activeInGroup.isEmpty && !cancelFutureJobs) { - logWarning(s"Failed to cancel job group $groupId. Cannot find active jobs for it.") + logWarning(log"Failed to cancel job group ${MDC(GROUP_ID, groupId)}. " + + log"Cannot find active jobs for it.") } val jobIds = activeInGroup.map(_.jobId) jobIds.foreach(handleJobCancellation(_, @@ -1313,7 +1316,7 @@ private[spark] class DAGScheduler( if (jobGroupIdOpt.exists(cancelledJobGroups.contains(_))) { listener.jobFailed( SparkCoreErrors.sparkJobCancelledAsPartOfJobGroupError(jobId, jobGroupIdOpt.get)) - logInfo(s"Skip running a job that belongs to the cancelled job group ${jobGroupIdOpt.get}.") + logInfo(log"Skip running a job that belongs to the cancelled job group ${MDC(GROUP_ID, jobGroupIdOpt.get)}") return } @@ -1328,9 +1331,11 @@ private[spark] class DAGScheduler( val numCheckFailures = barrierJobIdToNumTasksCheckFailures.compute(jobId, (_: Int, value: Int) => value + 1) - logWarning(s"Barrier stage in job $jobId requires ${e.requiredConcurrentTasks} slots, " + - s"but only ${e.maxConcurrentTasks} are available. " + - s"Will retry up to ${maxFailureNumTasksCheck - numCheckFailures + 1} more times") + logWarning(log"Barrier stage in job ${MDC(JOB_ID, jobId)} " + + log"requires ${MDC(NUM_SLOTS, e.requiredConcurrentTasks)} slots, " + + log"but only ${MDC(MAX_SLOTS, e.maxConcurrentTasks)} are available. 
" + + log"Will retry up to ${MDC(NUM_RETRIES, maxFailureNumTasksCheck - numCheckFailures + 1)} " + + log"more times") if (numCheckFailures <= maxFailureNumTasksCheck) { messageScheduler.schedule( @@ -1350,7 +1355,7 @@ private[spark] class DAGScheduler( } case e: Exception => - logWarning("Creating new stage failed due to exception - job: " + jobId, e) + logWarning(log"Creating new stage failed due to exception - job: ${MDC(JOB_ID, jobId)}", e) listener.jobFailed(e) return } @@ -1359,11 +1364,13 @@ private[spark] class DAGScheduler( val job = new ActiveJob(jobId, finalStage, callSite, listener, artifacts, properties) clearCacheLocs() - logInfo("Got job %s (%s) with %d output partitions".format( - job.jobId, callSite.shortForm, partitions.length)) - logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") - logInfo("Parents of final stage: " + finalStage.parents) - logInfo("Missing parents: " + getMissingParentStages(finalStage)) + logInfo( + log"Got job ${MDC(JOB_ID, job.jobId)} (${MDC(CALL_SITE_SHORT_FORM, callSite.shortForm)}) " + + log"with ${MDC(NUM_PARTITIONS, partitions.length)} output partitions") + logInfo(log"Final stage: ${MDC(STAGE_ID, finalStage)} " + + log"(${MDC(STAGE_NAME, finalStage.name)})") + logInfo(log"Parents of final stage: ${MDC(STAGE_ID, finalStage.parents)}") + logInfo(log"Missing parents: ${MDC(MISSING_PARENT_STAGES, getMissingParentStages(finalStage))}") val jobSubmissionTime = clock.getTimeMillis() jobIdToActiveJob(jobId) = job @@ -1393,18 +1400,20 @@ private[spark] class DAGScheduler( finalStage = getOrCreateShuffleMapStage(dependency, jobId) } catch { case e: Exception => - logWarning("Creating new stage failed due to exception - job: " + jobId, e) + logWarning(log"Creating new stage failed due to exception - job: ${MDC(JOB_ID, jobId)}", e) listener.jobFailed(e) return } val job = new ActiveJob(jobId, finalStage, callSite, listener, artifacts, properties) clearCacheLocs() - logInfo("Got map stage job %s (%s) with %d output partitions".format( - jobId, callSite.shortForm, dependency.rdd.partitions.length)) - logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") - logInfo("Parents of final stage: " + finalStage.parents) - logInfo("Missing parents: " + getMissingParentStages(finalStage)) + logInfo(log"Got map stage job ${MDC(JOB_ID, jobId)} " + + log"(${MDC(CALL_SITE_SHORT_FORM, callSite.shortForm)}) with " + + log"${MDC(NUM_PARTITIONS, dependency.rdd.partitions.length)} output partitions") + logInfo(log"Final stage: ${MDC(STAGE_ID, finalStage)} " + + log"(${MDC(STAGE_NAME, finalStage.name)})") + logInfo(log"Parents of final stage: ${MDC(PARENT_STAGES, finalStage.parents.toString)}") + logInfo(log"Missing parents: ${MDC(MISSING_PARENT_STAGES, getMissingParentStages(finalStage))}") val jobSubmissionTime = clock.getTimeMillis() jobIdToActiveJob(jobId) = job @@ -1441,7 +1450,8 @@ private[spark] class DAGScheduler( val missing = getMissingParentStages(stage).sortBy(_.id) logDebug("missing: " + missing) if (missing.isEmpty) { - logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents") + logInfo(log"Submitting ${MDC(STAGE_ID, stage)} (${MDC(RDD_ID, stage.rdd)}), " + + log"which has no missing parents") submitMissingTasks(stage, jobId.get) } else { for (parent <- missing) { @@ -1492,13 +1502,16 @@ private[spark] class DAGScheduler( val shuffleId = stage.shuffleDep.shuffleId val shuffleMergeId = stage.shuffleDep.shuffleMergeId if (stage.shuffleDep.shuffleMergeEnabled) { - logInfo(s"Shuffle merge enabled before 
starting the stage for $stage with shuffle" + - s" $shuffleId and shuffle merge $shuffleMergeId with" + - s" ${stage.shuffleDep.getMergerLocs.size} merger locations") + logInfo(log"Shuffle merge enabled before starting the stage for ${MDC(STAGE_ID, stage)}" + + log" with shuffle ${MDC(SHUFFLE_ID, shuffleId)} and shuffle merge" + + log" ${MDC(SHUFFLE_MERGE_ID, shuffleMergeId)} with" + + log" ${MDC(NUM_MERGER_LOCATIONS, stage.shuffleDep.getMergerLocs.size.toString)} merger locations") } else { - logInfo(s"Shuffle merge disabled for $stage with shuffle $shuffleId" + - s" and shuffle merge $shuffleMergeId, but can get enabled later adaptively" + - s" once enough mergers are available") + logInfo(log"Shuffle merge disabled for ${MDC(STAGE_ID, stage)} with " + + log"shuffle ${MDC(SHUFFLE_ID, shuffleId)} and " + + log"shuffle merge ${MDC(SHUFFLE_MERGE_ID, shuffleMergeId)}, " + + log"but can get enabled later adaptively once enough " + + log"mergers are available") } } @@ -1555,8 +1568,8 @@ private[spark] class DAGScheduler( // merger locations but the corresponding shuffle map stage did not complete // successfully, we would still enable push for its retry. s.shuffleDep.setShuffleMergeAllowed(false) - logInfo(s"Push-based shuffle disabled for $stage (${stage.name}) since it" + - " is already shuffle merge finalized") + logInfo(log"Push-based shuffle disabled for ${MDC(STAGE_ID, stage)} " + + log"(${MDC(STAGE_NAME, stage.name)}) since it is already shuffle merge finalized") } } case s: ResultStage => @@ -1622,8 +1635,8 @@ private[spark] class DAGScheduler( } if (taskBinaryBytes.length > TaskSetManager.TASK_SIZE_TO_WARN_KIB * 1024) { - logWarning(s"Broadcasting large task binary with size " + - s"${Utils.bytesToString(taskBinaryBytes.length)}") + logWarning(log"Broadcasting large task binary with size " + + log"${MDC(NUM_BYTES, Utils.bytesToString(taskBinaryBytes.length))}") } taskBinary = sc.broadcast(taskBinaryBytes) } catch { @@ -1678,8 +1691,9 @@ private[spark] class DAGScheduler( } if (tasks.nonEmpty) { - logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " + - s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})") + logInfo(log"Submitting ${MDC(NUM_TASKS, tasks.size)} missing tasks from " + + log"${MDC(STAGE_ID, stage)} (${MDC(RDD_ID, stage.rdd)}) (first 15 tasks are " + + log"for partitions ${MDC(PARTITION_IDS, tasks.take(15).map(_.partitionId))})") val shuffleId = stage match { case s: ShuffleMapStage => Some(s.shuffleDep.shuffleId) case _: ResultStage => None @@ -1748,9 +1762,10 @@ private[spark] class DAGScheduler( case Some(accum) => accum.getClass.getName case None => "Unknown class" } - logError( - log"Failed to update accumulator ${MDC(ACCUMULATOR_ID, id)} (${MDC(CLASS_NAME, accumClassName)}) " + - log"for task ${MDC(PARTITION_ID, task.partitionId)}", e) + logError( + log"Failed to update accumulator ${MDC(ACCUMULATOR_ID, id)} " + + log"(${MDC(CLASS_NAME, accumClassName)}) for task " + + log"${MDC(PARTITION_ID, task.partitionId)}", e) } } } @@ -1791,8 +1806,10 @@ private[spark] class DAGScheduler( shouldInterruptThread.toBoolean } catch { case e: IllegalArgumentException => - logWarning(s"${SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL} in Job ${job.jobId} " + - s"is invalid: $shouldInterruptThread. Using 'false' instead", e) + logWarning(log"${MDC(CONFIG, SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL)} " + + log"in Job ${MDC(JOB_ID, job.jobId)} " + + log"is invalid: ${MDC(CONFIG2, shouldInterruptThread)}. 
" + + log"Using 'false' instead", e) false } } @@ -1921,8 +1938,8 @@ private[spark] class DAGScheduler( try { // killAllTaskAttempts will fail if a SchedulerBackend does not implement // killTask. - logInfo(s"Job ${job.jobId} is finished. Cancelling potential speculative " + - "or zombie tasks for this job") + logInfo(log"Job ${MDC(JOB_ID, job.jobId)} is finished. Cancelling " + + log"potential speculative or zombie tasks for this job") // ResultStage is only used by this job. It's safe to kill speculative or // zombie tasks in this stage. taskScheduler.killAllTaskAttempts( @@ -1931,7 +1948,8 @@ private[spark] class DAGScheduler( reason = "Stage finished") } catch { case e: UnsupportedOperationException => - logWarning(s"Could not cancel tasks for stage $stageId", e) + logWarning(log"Could not cancel tasks " + + log"for stage ${MDC(STAGE_ID, stageId)}", e) } listenerBus.post( SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded)) @@ -1948,7 +1966,7 @@ private[spark] class DAGScheduler( } } case None => - logInfo("Ignoring result from " + rt + " because its job has finished") + logInfo(log"Ignoring result from ${MDC(RESULT, rt)} because its job has finished") } case smt: ShuffleMapTask => @@ -1963,7 +1981,8 @@ private[spark] class DAGScheduler( logDebug("ShuffleMapTask finished on " + execId) if (executorFailureEpoch.contains(execId) && smt.epoch <= executorFailureEpoch(execId)) { - logInfo(s"Ignoring possibly bogus $smt completion from executor $execId") + logInfo(log"Ignoring possibly bogus ${MDC(STAGE_ID, smt)} completion from " + + log"executor ${MDC(EXECUTOR_ID, execId)}") } else { // The epoch of the task is acceptable (i.e., the task was launched after the most // recent failure we're aware of for the executor), so mark the task's output as @@ -1972,7 +1991,7 @@ private[spark] class DAGScheduler( shuffleStage.shuffleDep.shuffleId, smt.partitionId, status) } } else { - logInfo(s"Ignoring $smt completion from an older attempt of indeterminate stage") + logInfo(log"Ignoring ${MDC(TASK_NAME, smt)} completion from an older attempt of indeterminate stage") } if (runningStages.contains(shuffleStage) && shuffleStage.pendingPartitions.isEmpty) { @@ -1990,17 +2009,22 @@ private[spark] class DAGScheduler( val mapStage = shuffleIdToMapStage(shuffleId) if (failedStage.latestInfo.attemptNumber() != task.stageAttemptId) { - logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" + - s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + - s"(attempt ${failedStage.latestInfo.attemptNumber()}) running") + logInfo(log"Ignoring fetch failure from " + + log"${MDC(TASK_ID, task)} as it's from " + + log"${MDC(STAGE_ID, failedStage)} attempt " + + log"${MDC(STAGE_ATTEMPT, task.stageAttemptId)} and there is a more recent attempt for " + + log"that stage (attempt " + + log"${MDC(NUM_ATTEMPT, failedStage.latestInfo.attemptNumber())}) running") } else { val ignoreStageFailure = ignoreDecommissionFetchFailure && isExecutorDecommissioningOrDecommissioned(taskScheduler, bmAddress) if (ignoreStageFailure) { - logInfo(s"Ignoring fetch failure from $task of $failedStage attempt " + - s"${task.stageAttemptId} when count ${config.STAGE_MAX_CONSECUTIVE_ATTEMPTS.key} " + - s"as executor ${bmAddress.executorId} is decommissioned and " + - s" ${config.STAGE_IGNORE_DECOMMISSION_FETCH_FAILURE.key}=true") + logInfo(log"Ignoring fetch failure from ${MDC(TASK_NAME, task)} of " + + log"${MDC(STAGE, failedStage)} attempt " + + log"${MDC(STAGE_ATTEMPT, 
task.stageAttemptId)} when count " + + log"${MDC(MAX_ATTEMPTS, config.STAGE_MAX_CONSECUTIVE_ATTEMPTS.key)} " + + log"as executor ${MDC(EXECUTOR_ID, bmAddress.executorId)} is decommissioned and " + + log"${MDC(CONFIG, config.STAGE_IGNORE_DECOMMISSION_FETCH_FAILURE.key)}=true") } else { failedStage.failedAttemptIds.add(task.stageAttemptId) } @@ -2013,8 +2037,10 @@ private[spark] class DAGScheduler( // multiple tasks running concurrently on different executors). In that case, it is // possible the fetch failure has already been handled by the scheduler. if (runningStages.contains(failedStage)) { - logInfo(s"Marking $failedStage (${failedStage.name}) as failed " + - s"due to a fetch failure from $mapStage (${mapStage.name})") + logInfo(log"Marking ${MDC(FAILED_STAGE, failedStage)} " + + log"(${MDC(FAILED_STAGE_NAME, failedStage.name)}) as failed " + + log"due to a fetch failure from ${MDC(STAGE, mapStage)} " + + log"(${MDC(STAGE_NAME, mapStage.name)})") markStageAsFinished(failedStage, errorMessage = Some(failureMessage), willRetry = !shouldAbortStage) } else { @@ -2142,9 +2168,9 @@ private[spark] class DAGScheduler( case _ => } - logInfo(s"The shuffle map stage $mapStage with indeterminate output was failed, " + - s"we will roll back and rerun below stages which include itself and all its " + - s"indeterminate child stages: $rollingBackStages") + logInfo(log"The shuffle map stage ${MDC(SHUFFLE_ID, mapStage)} with indeterminate output was failed, " + + log"we will roll back and rerun below stages which include itself and all its " + + log"indeterminate child stages: ${MDC(STAGES, rollingBackStages)}") } // We expect one executor failure to trigger many FetchFailures in rapid succession, @@ -2156,9 +2182,9 @@ private[spark] class DAGScheduler( // producing a resubmit for each failed stage makes debugging and logging a little // simpler while not producing an overwhelming number of scheduler events. logInfo( - s"Resubmitting $mapStage (${mapStage.name}) and " + - s"$failedStage (${failedStage.name}) due to fetch failure" - ) + log"Resubmitting ${MDC(STAGE, mapStage)} " + + log"(${MDC(STAGE_NAME, mapStage.name)}) and ${MDC(FAILED_STAGE, failedStage)} " + + log"(${MDC(FAILED_STAGE_NAME, failedStage.name)}) due to fetch failure") messageScheduler.schedule( new Runnable { override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) @@ -2217,12 +2243,13 @@ private[spark] class DAGScheduler( // Always fail the current stage and retry all the tasks when a barrier task fail. 
val failedStage = stageIdToStage(task.stageId) if (failedStage.latestInfo.attemptNumber() != task.stageAttemptId) { - logInfo(s"Ignoring task failure from $task as it's from $failedStage attempt" + - s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + - s"(attempt ${failedStage.latestInfo.attemptNumber()}) running") + logInfo(log"Ignoring task failure from ${MDC(TASK_NAME, task)} as it's from " + + log"${MDC(FAILED_STAGE, failedStage)} attempt ${MDC(STAGE_ATTEMPT, task.stageAttemptId)} " + + log"and there is a more recent attempt for that stage (attempt " + + log"${MDC(NUM_ATTEMPT, failedStage.latestInfo.attemptNumber())}) running") } else { - logInfo(s"Marking $failedStage (${failedStage.name}) as failed due to a barrier task " + - "failed.") + logInfo(log"Marking ${MDC(STAGE_ID, failedStage.id)} (${MDC(STAGE_NAME, failedStage.name)}) " + + log"as failed due to a barrier task failed.") val message = s"Stage failed because barrier task $task finished unsuccessfully.\n" + failure.toErrorString try { @@ -2236,7 +2263,7 @@ private[spark] class DAGScheduler( case e: UnsupportedOperationException => // Cannot continue with barrier stage if failed to cancel zombie barrier tasks. // TODO SPARK-24877 leave the zombie tasks and ignore their completion events. - logWarning(s"Could not kill all tasks for stage $stageId", e) + logWarning(log"Could not kill all tasks for stage ${MDC(STAGE_ID, stageId)}", e) abortStage(failedStage, "Could not kill zombie barrier tasks for stage " + s"$failedStage (${failedStage.name})", Some(e)) } @@ -2277,8 +2304,8 @@ private[spark] class DAGScheduler( val noResubmitEnqueued = !failedStages.contains(failedStage) failedStages += failedStage if (noResubmitEnqueued) { - logInfo(s"Resubmitting $failedStage (${failedStage.name}) due to barrier stage " + - "failure.") + logInfo(log"Resubmitting ${MDC(FAILED_STAGE, failedStage)} " + + log"(${MDC(FAILED_STAGE_NAME, failedStage.name)}) due to barrier stage failure.") messageScheduler.schedule(new Runnable { override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) @@ -2355,8 +2382,8 @@ private[spark] class DAGScheduler( // delay should be 0 and registerMergeResults should be true. assert(delay == 0 && registerMergeResults) if (task.getDelay(TimeUnit.NANOSECONDS) > 0 && task.cancel(false)) { - logInfo(s"$stage (${stage.name}) scheduled for finalizing shuffle merge immediately " + - s"after cancelling previously scheduled task.") + logInfo(log"${MDC(STAGE, stage)} (${MDC(STAGE_NAME, stage.name)}) scheduled " + + log"for finalizing shuffle merge immediately after cancelling previously scheduled task.") shuffleDep.setFinalizeTask( shuffleMergeFinalizeScheduler.schedule( new Runnable { @@ -2367,13 +2394,15 @@ private[spark] class DAGScheduler( ) ) } else { - logInfo(s"$stage (${stage.name}) existing scheduled task for finalizing shuffle merge" + - s"would either be in-progress or finished. No need to schedule shuffle merge" + - s" finalization again.") + logInfo( + log"${MDC(STAGE, stage)} (${MDC(STAGE_NAME, stage.name)}) existing scheduled task " + + log"for finalizing shuffle merge would either be in-progress or finished. " + + log"No need to schedule shuffle merge finalization again.") } case None => // If no previous finalization task is scheduled, schedule the finalization task. 
- logInfo(s"$stage (${stage.name}) scheduled for finalizing shuffle merge in $delay s") + logInfo(log"${MDC(STAGE, stage)} (${MDC(STAGE_NAME, stage.name)}) scheduled for " + + log"finalizing shuffle merge in ${MDC(DELAY, delay * 1000L)} ms") shuffleDep.setFinalizeTask( shuffleMergeFinalizeScheduler.schedule( new Runnable { @@ -2402,8 +2431,9 @@ private[spark] class DAGScheduler( private[scheduler] def finalizeShuffleMerge( stage: ShuffleMapStage, registerMergeResults: Boolean = true): Unit = { - logInfo(s"$stage (${stage.name}) finalizing the shuffle merge with registering merge " + - s"results set to $registerMergeResults") + logInfo( + log"${MDC(STAGE, stage)} (${MDC(STAGE_NAME, stage.name)}) finalizing the shuffle merge with" + + log" registering merge results set to ${MDC(REGISTER_MERGE_RESULTS, registerMergeResults)}") val shuffleId = stage.shuffleDep.shuffleId val shuffleMergeId = stage.shuffleDep.shuffleMergeId val numMergers = stage.shuffleDep.getMergerLocs.length @@ -2451,8 +2481,9 @@ private[spark] class DAGScheduler( } override def onShuffleMergeFailure(e: Throwable): Unit = { - logWarning(s"Exception encountered when trying to finalize shuffle " + - s"merge on ${shuffleServiceLoc.host} for shuffle $shuffleId", e) + logWarning(log"Exception encountered when trying to finalize shuffle " + + log"merge on ${MDC(HOST_PORT, shuffleServiceLoc.host)} " + + log"for shuffle ${MDC(SHUFFLE_ID, shuffleId)}", e) // Do not fail the future as this would cause dag scheduler to prematurely // give up on waiting for merge results from the remaining shuffle services // if one fails @@ -2472,8 +2503,9 @@ private[spark] class DAGScheduler( } catch { case _: TimeoutException => timedOut = true - logInfo(s"Timed out on waiting for merge results from all " + - s"$numMergers mergers for shuffle $shuffleId") + logInfo(log"Timed out on waiting for merge results from all " + + log"${MDC(NUM_MERGERS, numMergers)} mergers for " + + log"shuffle ${MDC(SHUFFLE_ID, shuffleId)}") } finally { if (timedOut || !registerMergeResults) { cancelFinalizeShuffleMergeFutures(scheduledFutures, @@ -2504,9 +2536,9 @@ private[spark] class DAGScheduler( private def processShuffleMapStageCompletion(shuffleStage: ShuffleMapStage): Unit = { markStageAsFinished(shuffleStage) logInfo("looking for newly runnable stages") - logInfo("running: " + runningStages) - logInfo("waiting: " + waitingStages) - logInfo("failed: " + failedStages) + logInfo(log"running: ${MDC(STAGES, runningStages)}") + logInfo(log"waiting: ${MDC(STAGES, waitingStages)}") + logInfo(log"failed: ${MDC(STAGES, failedStages)}") // This call to increment the epoch may not be strictly necessary, but it is retained // for now in order to minimize the changes in behavior from an earlier version of the @@ -2522,9 +2554,10 @@ private[spark] class DAGScheduler( if (!shuffleStage.isAvailable) { // Some tasks had failed; let's resubmit this shuffleStage. 
// TODO: Lower-level scheduler should also deal with this - logInfo("Resubmitting " + shuffleStage + " (" + shuffleStage.name + - ") because some of its tasks had failed: " + - shuffleStage.findMissingPartitions().mkString(", ")) + logInfo(log"Resubmitting ${MDC(STAGE, shuffleStage)} " + + log"(${MDC(STAGE_NAME, shuffleStage.name)}) " + + log"because some of its tasks had failed: " + + log"${MDC(PARTITION_IDS, shuffleStage.findMissingPartitions().mkString(", "))}") submitStage(shuffleStage) } else { markMapStageJobsAsFinished(shuffleStage) @@ -2596,7 +2629,7 @@ private[spark] class DAGScheduler( } private def handleResubmittedFailure(task: Task[_], stage: Stage): Unit = { - logInfo(s"Resubmitted $task, so marking it as still running.") + logInfo(log"Resubmitted ${MDC(TASK_NAME, task)}, so marking it as still running.") stage match { case sms: ShuffleMapStage => sms.pendingPartitions += task.partitionId @@ -2672,7 +2705,7 @@ private[spark] class DAGScheduler( if (!isShuffleMerger && (!executorFailureEpoch.contains(execId) || executorFailureEpoch(execId) < currentEpoch)) { executorFailureEpoch(execId) = currentEpoch - logInfo(s"Executor lost: $execId (epoch $currentEpoch)") + logInfo(log"Executor lost: ${MDC(EXECUTOR_ID, execId)} (epoch ${MDC(EPOCH, currentEpoch)})") if (pushBasedShuffleEnabled) { // Remove fetchFailed host in the shuffle push merger list for push based shuffle hostToUnregisterOutputs.foreach( @@ -2696,10 +2729,12 @@ private[spark] class DAGScheduler( if (remove) { hostToUnregisterOutputs match { case Some(host) => - logInfo(s"Shuffle files lost for host: $host (epoch $currentEpoch)") + logInfo(log"Shuffle files lost for host: ${MDC(HOST, host)} (epoch " + + log"${MDC(EPOCH, currentEpoch)}") mapOutputTracker.removeOutputsOnHost(host) case None => - logInfo(s"Shuffle files lost for executor: $execId (epoch $currentEpoch)") + logInfo(log"Shuffle files lost for executor: ${MDC(EXECUTOR_ID, execId)} " + + log"(epoch ${MDC(EPOCH, currentEpoch)})") mapOutputTracker.removeOutputsOnExecutor(execId) } } @@ -2721,7 +2756,8 @@ private[spark] class DAGScheduler( workerId: String, host: String, message: String): Unit = { - logInfo("Shuffle files lost for worker %s on host %s".format(workerId, host)) + logInfo(log"Shuffle files lost for worker ${MDC(WORKER_ID, workerId)} " + + log"on host ${MDC(HOST, host)}") mapOutputTracker.removeOutputsOnHost(host) clearCacheLocs() } @@ -2729,7 +2765,7 @@ private[spark] class DAGScheduler( private[scheduler] def handleExecutorAdded(execId: String, host: String): Unit = { // remove from executorFailureEpoch(execId) ? 
     if (executorFailureEpoch.contains(execId)) {
-      logInfo("Host added was in lost list earlier: " + host)
+      logInfo(log"Host added was in lost list earlier: ${MDC(HOST, host)}")
       executorFailureEpoch -= execId
     }
     shuffleFileLostEpoch -= execId
@@ -2742,10 +2778,10 @@ private[spark] class DAGScheduler(
     }.foreach { case (_, stage: ShuffleMapStage) =>
       configureShufflePushMergerLocations(stage)
       if (stage.shuffleDep.getMergerLocs.nonEmpty) {
-        logInfo(s"Shuffle merge enabled adaptively for $stage with shuffle" +
-          s" ${stage.shuffleDep.shuffleId} and shuffle merge" +
-          s" ${stage.shuffleDep.shuffleMergeId} with ${stage.shuffleDep.getMergerLocs.size}" +
-          s" merger locations")
+        logInfo(log"Shuffle merge enabled adaptively for ${MDC(STAGE, stage)} with shuffle" +
+          log" ${MDC(SHUFFLE_ID, stage.shuffleDep.shuffleId)} and shuffle merge" +
+          log" ${MDC(SHUFFLE_MERGE_ID, stage.shuffleDep.shuffleMergeId)} with " +
+          log"${MDC(NUM_MERGER_LOCATIONS, stage.shuffleDep.getMergerLocs.size)} merger locations")
       }
     }
   }
@@ -2765,7 +2801,7 @@ private[spark] class DAGScheduler(
           handleJobCancellation(jobId, Option(reasonStr))
         }
       case None =>
-        logInfo("No active jobs to kill for Stage " + stageId)
+        logInfo(log"No active jobs to kill for Stage ${MDC(STAGE_ID, stageId)}")
     }
   }
@@ -2788,11 +2824,12 @@ private[spark] class DAGScheduler(
       errorMessage: Option[String] = None,
       willRetry: Boolean = false): Unit = {
     val serviceTime = stage.latestInfo.submissionTime match {
-      case Some(t) => "%.03f".format((clock.getTimeMillis() - t) / 1000.0)
+      case Some(t) => clock.getTimeMillis() - t
       case _ => "Unknown"
     }
     if (errorMessage.isEmpty) {
-      logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime))
+      logInfo(log"${MDC(STAGE, stage)} (${MDC(STAGE_NAME, stage.name)}) " +
+        log"finished in ${MDC(TIME_UNITS, serviceTime)} ms")
       stage.latestInfo.completionTime = Some(clock.getTimeMillis())
       // Clear failure count for this stage, now that it's succeeded.
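
// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: the hunks above all follow the same
// migration, replacing s"..." interpolation with the structured log"..."
// interpolator plus MDC keys. A minimal sketch of the target pattern, assuming
// spark-core 4.x with org.apache.spark.internal.{Logging, MDC, LogKeys} on the
// classpath; the class and method names here are hypothetical.
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys.{STAGE_ID, TIME_UNITS}

class StageTimingLogger extends Logging {
  def reportFinished(stageId: Int, submittedAtMs: Long, nowMs: Long): Unit = {
    // Raw milliseconds, matching the markStageAsFinished change just above.
    val elapsedMs = nowMs - submittedAtMs
    // Each ${MDC(KEY, value)} renders as plain text in the message and is also
    // carried as a key/value pair for the structured log output; multiple
    // log"..." parts are concatenated with `+`.
    logInfo(log"Stage ${MDC(STAGE_ID, stageId)} " +
      log"finished in ${MDC(TIME_UNITS, elapsedMs)} ms")
  }
}
// ---------------------------------------------------------------------------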
@@ -2802,7 +2839,8 @@ private[spark] class DAGScheduler( stage.clearFailures() } else { stage.latestInfo.stageFailed(errorMessage.get) - logInfo(s"$stage (${stage.name}) failed in $serviceTime s due to ${errorMessage.get}") + logInfo(log"${MDC(STAGE, stage)} (${MDC(STAGE_NAME, stage.name)}) failed in " + + log"${MDC(TIME_UNITS, serviceTime)} ms due to ${MDC(ERROR, errorMessage.get)}") } updateStageInfoForPushBasedShuffle(stage) if (!willRetry) { @@ -2848,7 +2886,8 @@ private[spark] class DAGScheduler( failJobAndIndependentStages(job, finalException) } if (dependentJobs.isEmpty) { - logInfo("Ignoring failure of " + failedStage + " because all jobs depending on it are done") + logInfo(log"Ignoring failure of ${MDC(FAILED_STAGE, failedStage)} because all jobs " + + log"depending on it are done") } } @@ -2892,7 +2931,7 @@ private[spark] class DAGScheduler( markStageAsFinished(stage, Some(reason)) } catch { case e: UnsupportedOperationException => - logWarning(s"Could not cancel tasks for stage $stageId", e) + logWarning(log"Could not cancel tasks for stage ${MDC(STAGE_ID, stageId)}", e) ableToCancelStages = false } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala index aaa9e5bdd9e1c..1606072153906 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} import org.apache.spark.{ExecutorAllocationClient, SparkConf, SparkContext} import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.HOST +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -111,8 +111,8 @@ private[scheduler] class HealthTracker ( val execsToInclude = executorIdToExcludedStatus.filter(_._2.expiryTime < now).keys if (execsToInclude.nonEmpty) { // Include any executors that have been excluded longer than the excludeOnFailure timeout. - logInfo(s"Removing executors $execsToInclude from exclude list because the " + - s"the executors have reached the timed out") + logInfo(log"Removing executors ${MDC(EXECUTOR_IDS, execsToInclude)} from " + + log"exclude list because the executors have reached the timed out") execsToInclude.foreach { exec => val status = executorIdToExcludedStatus.remove(exec).get val failedExecsOnNode = nodeToExcludedExecs(status.node) @@ -128,8 +128,8 @@ private[scheduler] class HealthTracker ( val nodesToInclude = nodeIdToExcludedExpiryTime.filter(_._2 < now).keys if (nodesToInclude.nonEmpty) { // Include any nodes that have been excluded longer than the excludeOnFailure timeout. 
- logInfo(s"Removing nodes $nodesToInclude from exclude list because the " + - s"nodes have reached has timed out") + logInfo(log"Removing nodes ${MDC(NODES, nodesToInclude)} from exclude list because the " + + log"nodes have reached has timed out") nodesToInclude.foreach { node => nodeIdToExcludedExpiryTime.remove(node) // post both to keep backwards compatibility @@ -173,8 +173,8 @@ private[scheduler] class HealthTracker ( force = true) } case None => - logInfo(s"Not attempting to kill excluded executor id $exec " + - s"since allocation client is not defined.") + logInfo(log"Not attempting to kill excluded executor id ${MDC(EXECUTOR_ID, exec)}" + + log" since allocation client is not defined.") } } @@ -196,21 +196,23 @@ private[scheduler] class HealthTracker ( allocationClient match { case Some(a) => if (EXCLUDE_ON_FAILURE_DECOMMISSION_ENABLED) { - logInfo(s"Decommissioning all executors on excluded host $node " + - s"since ${config.EXCLUDE_ON_FAILURE_KILL_ENABLED.key} is set.") + logInfo(log"Decommissioning all executors on excluded host ${MDC(HOST, node)} " + + log"since ${MDC(CONFIG, config.EXCLUDE_ON_FAILURE_KILL_ENABLED.key)} " + + log"is set.") if (!a.decommissionExecutorsOnHost(node)) { logError(log"Decommissioning executors on ${MDC(HOST, node)} failed.") } } else { - logInfo(s"Killing all executors on excluded host $node " + - s"since ${config.EXCLUDE_ON_FAILURE_KILL_ENABLED.key} is set.") + logInfo(log"Killing all executors on excluded host ${MDC(HOST, node)} " + + log"since ${MDC(CONFIG, config.EXCLUDE_ON_FAILURE_KILL_ENABLED.key)} is set.") if (!a.killExecutorsOnHost(node)) { logError(log"Killing executors on node ${MDC(HOST, node)} failed.") } } case None => - logWarning(s"Not attempting to kill executors on excluded host $node " + - s"since allocation client is not defined.") + logWarning( + log"Not attempting to kill executors on excluded host ${MDC(HOST_PORT, node)} " + + log"since allocation client is not defined.") } } } @@ -230,7 +232,8 @@ private[scheduler] class HealthTracker ( if (conf.get(config.SHUFFLE_SERVICE_ENABLED)) { if (!nodeIdToExcludedExpiryTime.contains(host)) { - logInfo(s"excluding node $host due to fetch failure of external shuffle service") + logInfo(log"excluding node ${MDC(HOST, host)} due to fetch failure of " + + log"external shuffle service") nodeIdToExcludedExpiryTime.put(host, expiryTimeForNewExcludes) // post both to keep backwards compatibility @@ -241,7 +244,7 @@ private[scheduler] class HealthTracker ( updateNextExpiryTime() } } else if (!executorIdToExcludedStatus.contains(exec)) { - logInfo(s"Excluding executor $exec due to fetch failure") + logInfo(log"Excluding executor ${MDC(EXECUTOR_ID, exec)} due to fetch failure") executorIdToExcludedStatus.put(exec, ExcludedExecutor(host, expiryTimeForNewExcludes)) // We hardcoded number of failure tasks to 1 for fetch failure, because there's no @@ -279,8 +282,8 @@ private[scheduler] class HealthTracker ( // some of the logic around expiry times a little more confusing. But it also wouldn't be a // problem to re-exclude, with a later expiry time. 
if (newTotal >= MAX_FAILURES_PER_EXEC && !executorIdToExcludedStatus.contains(exec)) { - logInfo(s"Excluding executor id: $exec because it has $newTotal" + - s" task failures in successful task sets") + logInfo(log"Excluding executor id: ${MDC(EXECUTOR_ID, exec)} because it has " + + log"${MDC(TOTAL, newTotal)} task failures in successful task sets") val node = failuresInTaskSet.node executorIdToExcludedStatus.put(exec, ExcludedExecutor(node, expiryTimeForNewExcludes)) // post both to keep backwards compatibility @@ -298,8 +301,9 @@ private[scheduler] class HealthTracker ( // time. if (excludedExecsOnNode.size >= MAX_FAILED_EXEC_PER_NODE && !nodeIdToExcludedExpiryTime.contains(node)) { - logInfo(s"Excluding node $node because it has ${excludedExecsOnNode.size} " + - s"executors excluded: ${excludedExecsOnNode}") + logInfo(log"Excluding node ${MDC(HOST, node)} because it has " + + log"${MDC(NUM_EXECUTORS, excludedExecsOnNode.size)} executors " + + log"excluded: ${MDC(EXECUTOR_IDS, excludedExecsOnNode)}") nodeIdToExcludedExpiryTime.put(node, expiryTimeForNewExcludes) // post both to keep backwards compatibility listenerBus.post(SparkListenerNodeBlacklisted(now, node, excludedExecsOnNode.size)) @@ -437,10 +441,12 @@ private[spark] object HealthTracker extends Logging { val legacyKey = config.EXCLUDE_ON_FAILURE_LEGACY_TIMEOUT_CONF.key conf.get(config.EXCLUDE_ON_FAILURE_LEGACY_TIMEOUT_CONF).exists { legacyTimeout => if (legacyTimeout == 0) { - logWarning(s"Turning off excludeOnFailure due to legacy configuration: $legacyKey == 0") + logWarning(log"Turning off excludeOnFailure due to legacy configuration: " + + log"${MDC(CONFIG, legacyKey)} == 0") false } else { - logWarning(s"Turning on excludeOnFailure due to legacy configuration: $legacyKey > 0") + logWarning(log"Turning on excludeOnFailure due to legacy configuration: " + + log"${MDC(CONFIG, legacyKey)} > 0") true } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index bd0bff18ff578..7251eb2c86ea1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -31,7 +31,7 @@ import com.codahale.metrics.{Counter, MetricRegistry, Timer} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CLASS_NAME, MAX_SIZE} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, MAX_SIZE} import org.apache.spark.internal.config._ import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index cd5d6b8f9c90d..df28a97a349ea 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -20,7 +20,7 @@ package org.apache.spark.scheduler import scala.collection.mutable import org.apache.spark._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.util.{RpcUtils, ThreadUtils} @@ -44,10 +44,7 @@ private case class AskPermissionToCommitOutput( * This class was introduced in 
SPARK-4879; see that JIRA issue (and the associated pull requests) * for an extensive design discussion. */ -private[spark] class OutputCommitCoordinator( - conf: SparkConf, - isDriver: Boolean, - sc: Option[SparkContext] = None) extends Logging { +private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) extends Logging { // Initialized by SparkEnv var coordinatorRef: Option[RpcEndpointRef] = None @@ -124,7 +121,7 @@ private[spark] class OutputCommitCoordinator( stageStates.get(stage) match { case Some(state) => require(state.authorizedCommitters.length == maxPartitionId + 1) - logInfo(s"Reusing state from previous attempt of stage $stage.") + logInfo(log"Reusing state from previous attempt of stage ${MDC(LogKeys.STAGE_ID, stage)}") case _ => stageStates(stage) = new StageState(maxPartitionId + 1) @@ -151,17 +148,18 @@ private[spark] class OutputCommitCoordinator( case Success => // The task output has been committed successfully case _: TaskCommitDenied => - logInfo(s"Task was denied committing, stage: $stage.$stageAttempt, " + - s"partition: $partition, attempt: $attemptNumber") + logInfo(log"Task was denied committing, stage: ${MDC(LogKeys.STAGE_ID, stage)}." + + log"${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)}, " + + log"partition: ${MDC(LogKeys.PARTITION_ID, partition)}, " + + log"attempt: ${MDC(LogKeys.NUM_ATTEMPT, attemptNumber)}") case _ => // Mark the attempt as failed to exclude from future commit protocol val taskId = TaskIdentifier(stageAttempt, attemptNumber) stageState.failures.getOrElseUpdate(partition, mutable.Set()) += taskId if (stageState.authorizedCommitters(partition) == taskId) { - sc.foreach(_.dagScheduler.stageFailed(stage, s"Authorized committer " + - s"(attemptNumber=$attemptNumber, stage=$stage, partition=$partition) failed; " + - s"but task commit success, data duplication may happen. " + - s"reason=$reason")) + logDebug(s"Authorized committer (attemptNumber=$attemptNumber, stage=$stage, " + + s"partition=$partition) failed; clearing lock") + stageState.authorizedCommitters(partition) = null } } } @@ -182,8 +180,10 @@ private[spark] class OutputCommitCoordinator( attemptNumber: Int): Boolean = synchronized { stageStates.get(stage) match { case Some(state) if attemptFailed(state, stageAttempt, partition, attemptNumber) => - logInfo(s"Commit denied for stage=$stage.$stageAttempt, partition=$partition: " + - s"task attempt $attemptNumber already marked as failed.") + logInfo(log"Commit denied for stage=${MDC(LogKeys.STAGE_ID, stage)}." 
+ + log"${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)}, partition=" + + log"${MDC(LogKeys.PARTITION_ID, partition)}: task attempt " + + log"${MDC(LogKeys.NUM_ATTEMPT, attemptNumber)} already marked as failed.") false case Some(state) => val existing = state.authorizedCommitters(partition) diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 24c25d2377948..26c7486010c02 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -25,7 +25,7 @@ import com.fasterxml.jackson.core.JsonParseException import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{LINE, LINE_NUM, PATH} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.util.JsonProtocol @@ -92,7 +92,7 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { // Ignore unknown events, parse through the event log file. // To avoid spamming, warnings are only displayed once for each unknown event. if (!unrecognizedEvents.contains(e.getMessage)) { - logWarning(s"Drop unrecognized event: ${e.getMessage}") + logWarning(log"Drop unrecognized event: ${MDC(ERROR, e.getMessage)}") unrecognizedEvents.add(e.getMessage) } logDebug(s"Drop incompatible event log: $currentLine") @@ -100,7 +100,7 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { // Ignore unrecognized properties, parse through the event log file. // To avoid spamming, warnings are only displayed once for each unrecognized property. 
if (!unrecognizedProperties.contains(e.getMessage)) { - logWarning(s"Drop unrecognized property: ${e.getMessage}") + logWarning(log"Drop unrecognized property: ${MDC(ERROR, e.getMessage)}") unrecognizedProperties.add(e.getMessage) } logDebug(s"Drop incompatible event log: $currentLine") @@ -111,8 +111,9 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { if (!maybeTruncated || lineEntries.hasNext) { throw jpe } else { - logWarning(s"Got JsonParseException from log file $sourceName" + - s" at line $lineNumber, the file might not have finished writing cleanly.") + logWarning(log"Got JsonParseException from log file ${MDC(FILE_NAME, sourceName)}" + + log" at line ${MDC(LINE_NUM, lineNumber)}, " + + log"the file might not have finished writing cleanly.") } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala index 7e61dad3c141b..6f64dff3f39d6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala @@ -27,7 +27,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{SCHEDULER_ALLOCATION_FILE, SCHEDULER_MODE} import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.util.Utils @@ -79,20 +80,23 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext fileData = schedulerAllocFile.map { f => val filePath = new Path(f) val fis = filePath.getFileSystem(sc.hadoopConfiguration).open(filePath) - logInfo(s"Creating Fair Scheduler pools from $f") + logInfo(log"Creating Fair Scheduler pools from ${MDC(LogKeys.FILE_NAME, f)}") Some((fis, f)) }.getOrElse { val is = Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_SCHEDULER_FILE) if (is != null) { - logInfo(s"Creating Fair Scheduler pools from default file: $DEFAULT_SCHEDULER_FILE") + logInfo(log"Creating Fair Scheduler pools from default file: " + + log"${MDC(LogKeys.FILE_NAME, DEFAULT_SCHEDULER_FILE)}") Some((is, DEFAULT_SCHEDULER_FILE)) } else { val schedulingMode = SchedulingMode.withName(sc.conf.get(SCHEDULER_MODE)) rootPool.addSchedulable(new Pool( DEFAULT_POOL_NAME, schedulingMode, DEFAULT_MINIMUM_SHARE, DEFAULT_WEIGHT)) - logInfo("Fair scheduler configuration not found, created default pool: " + - "%s, schedulingMode: %s, minShare: %d, weight: %d".format( - DEFAULT_POOL_NAME, schedulingMode, DEFAULT_MINIMUM_SHARE, DEFAULT_WEIGHT)) + logInfo(log"Fair scheduler configuration not found, created default pool: " + + log"${MDC(LogKeys.DEFAULT_NAME, DEFAULT_POOL_NAME)}, " + + log"schedulingMode: ${MDC(LogKeys.SCHEDULING_MODE, schedulingMode)}, " + + log"minShare: ${MDC(LogKeys.MIN_SHARE, DEFAULT_MINIMUM_SHARE)}, " + + log"weight: ${MDC(LogKeys.WEIGHT, DEFAULT_WEIGHT)}") None } } @@ -121,8 +125,10 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext val pool = new Pool(DEFAULT_POOL_NAME, DEFAULT_SCHEDULING_MODE, DEFAULT_MINIMUM_SHARE, DEFAULT_WEIGHT) rootPool.addSchedulable(pool) - logInfo("Created default pool: %s, schedulingMode: %s, minShare: %d, weight: %d".format( - DEFAULT_POOL_NAME, DEFAULT_SCHEDULING_MODE, DEFAULT_MINIMUM_SHARE, DEFAULT_WEIGHT)) + logInfo(log"Created default pool: 
${MDC(LogKeys.POOL_NAME, DEFAULT_POOL_NAME)}, " + + log"schedulingMode: ${MDC(LogKeys.SCHEDULING_MODE, DEFAULT_SCHEDULING_MODE)}, " + + log"minShare: ${MDC(LogKeys.MIN_SHARE, DEFAULT_MINIMUM_SHARE)}, " + + log"weight: ${MDC(LogKeys.WEIGHT, DEFAULT_WEIGHT)}") } } @@ -141,8 +147,10 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext rootPool.addSchedulable(new Pool(poolName, schedulingMode, minShare, weight)) - logInfo("Created pool: %s, schedulingMode: %s, minShare: %d, weight: %d".format( - poolName, schedulingMode, minShare, weight)) + logInfo(log"Created pool: ${MDC(LogKeys.POOL_NAME, poolName)}, " + + log"schedulingMode: ${MDC(LogKeys.SCHEDULING_MODE, schedulingMode)}, " + + log"minShare: ${MDC(LogKeys.MIN_SHARE, minShare)}, " + + log"weight: ${MDC(LogKeys.WEIGHT, weight)}") } } @@ -154,9 +162,12 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext val xmlSchedulingMode = (poolNode \ SCHEDULING_MODE_PROPERTY).text.trim.toUpperCase(Locale.ROOT) - val warningMessage = s"Unsupported schedulingMode: $xmlSchedulingMode found in " + - s"Fair Scheduler configuration file: $fileName, using " + - s"the default schedulingMode: $defaultValue for pool: $poolName" + val warningMessage = log"Unsupported schedulingMode: " + + log"${MDC(XML_SCHEDULING_MODE, xmlSchedulingMode)} found in " + + log"Fair Scheduler configuration file: ${MDC(FILE_NAME, fileName)}, using " + + log"the default schedulingMode: " + + log"${MDC(LogKeys.SCHEDULING_MODE, defaultValue)} for pool: " + + log"${MDC(POOL_NAME, poolName)}" try { if (SchedulingMode.withName(xmlSchedulingMode) != SchedulingMode.NONE) { SchedulingMode.withName(xmlSchedulingMode) @@ -165,7 +176,7 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext defaultValue } } catch { - case e: NoSuchElementException => + case _: NoSuchElementException => logWarning(warningMessage) defaultValue } @@ -182,10 +193,12 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext try { data.toInt } catch { - case e: NumberFormatException => - logWarning(s"Error while loading fair scheduler configuration from $fileName: " + - s"$propertyName is blank or invalid: $data, using the default $propertyName: " + - s"$defaultValue for pool: $poolName") + case _: NumberFormatException => + logWarning(log"Error while loading fair scheduler configuration from " + + log"${MDC(FILE_NAME, fileName)}: " + + log"${MDC(PROPERTY_NAME, propertyName)} is blank or invalid: ${MDC(DATA, data)}, " + + log"using the default ${MDC(DEFAULT_NAME, propertyName)}: " + + log"${MDC(DEFAULT_VALUE, defaultValue)} for pool: ${MDC(POOL_NAME, poolName)}") defaultValue } } @@ -203,13 +216,18 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, sc: SparkContext parentPool = new Pool(poolName, DEFAULT_SCHEDULING_MODE, DEFAULT_MINIMUM_SHARE, DEFAULT_WEIGHT) rootPool.addSchedulable(parentPool) - logWarning(s"A job was submitted with scheduler pool $poolName, which has not been " + - "configured. This can happen when the file that pools are read from isn't set, or " + - s"when that file doesn't contain $poolName. Created $poolName with default " + - s"configuration (schedulingMode: $DEFAULT_SCHEDULING_MODE, " + - s"minShare: $DEFAULT_MINIMUM_SHARE, weight: $DEFAULT_WEIGHT)") + logWarning(log"A job was submitted with scheduler pool " + + log"${MDC(SCHEDULER_POOL_NAME, poolName)}, which has not been " + + log"configured. 
This can happen when the file that pools are read from isn't set, or " + + log"when that file doesn't contain ${MDC(POOL_NAME, poolName)}. " + + log"Created ${MDC(CREATED_POOL_NAME, poolName)} with default " + + log"configuration (schedulingMode: " + + log"${MDC(LogKeys.SCHEDULING_MODE, DEFAULT_SCHEDULING_MODE)}, " + + log"minShare: ${MDC(MIN_SHARE, DEFAULT_MINIMUM_SHARE)}, " + + log"weight: ${MDC(WEIGHT, DEFAULT_WEIGHT)}") } parentPool.addSchedulable(manager) - logInfo("Added task set " + manager.name + " tasks to pool " + poolName) + logInfo(log"Added task set ${MDC(LogKeys.TASK_SET_MANAGER, manager.name)} tasks to pool " + + log"${MDC(LogKeys.POOL_NAME, poolName)}") } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala index 1f12b46412bc5..e46dde5561a26 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala @@ -21,7 +21,7 @@ import scala.collection.mutable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.util.{Distribution, Utils} @@ -46,7 +46,8 @@ class StatsReportListener extends SparkListener with Logging { override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { implicit val sc = stageCompleted - this.logInfo(s"Finished stage: ${getStatusDetail(stageCompleted.stageInfo)}") + this.logInfo( + log"Finished stage: ${MDC(LogKeys.STAGE, getStatusDetail(stageCompleted.stageInfo))}") showMillisDistribution("task runtime:", (info, _) => info.duration, taskInfoMetrics.toSeq) // Shuffle write @@ -111,9 +112,9 @@ private[spark] object StatsReportListener extends Logging { def showDistribution(heading: String, d: Distribution, formatNumber: Double => String): Unit = { val stats = d.statCounter val quantiles = d.getQuantiles(probabilities).map(formatNumber) - logInfo(heading + stats) + logInfo(log"${MDC(LogKeys.DESCRIPTION, heading)}${MDC(LogKeys.STATS, stats)}") logInfo(percentilesHeader) - logInfo("\t" + quantiles.mkString("\t")) + logInfo(log"\t" + log"${MDC(LogKeys.QUANTILES, quantiles.mkString("\t"))}") } def showDistribution( diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index e93bc0747349c..97c539bb05a58 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -26,7 +26,7 @@ import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_LOADER +import org.apache.spark.internal.LogKeys.CLASS_LOADER import org.apache.spark.serializer.{SerializerHelper, SerializerInstance} import org.apache.spark.util.{LongAccumulator, ThreadUtils, Utils} diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 17c44926d6265..ec678256a708e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -33,8 +33,8 @@ import 
org.apache.spark.InternalAccumulator.{input, shuffleRead} import org.apache.spark.TaskState.TaskState import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.executor.ExecutorMetrics -import org.apache.spark.internal.{config, Logging, LogKey, MDC} -import org.apache.spark.internal.LogKey.{REASON, TASK_SET_NAME, TASK_STATE, TID} +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.RpcEndpoint @@ -250,8 +250,9 @@ private[spark] class TaskSchedulerImpl( override def submitTasks(taskSet: TaskSet): Unit = { val tasks = taskSet.tasks - logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks " - + "resource profile " + taskSet.resourceProfileId) + logInfo(log"Adding task set " + taskSet.logId + + log" with ${MDC(LogKeys.NUM_TASKS, tasks.length)} tasks resource profile " + + log"${MDC(LogKeys.RESOURCE_PROFILE_ID, taskSet.resourceProfileId)}") this.synchronized { val manager = createTaskSetManager(taskSet, maxTaskFailures) val stage = taskSet.stageId @@ -306,9 +307,10 @@ private[spark] class TaskSchedulerImpl( stageId: Int, interruptThread: Boolean, reason: String): Unit = synchronized { - logInfo("Cancelling stage " + stageId) + logInfo(log"Canceling stage ${MDC(LogKeys.STAGE_ID, stageId)}") // Kill all running tasks for the stage. - logInfo(s"Killing all running tasks in stage $stageId: $reason") + logInfo(log"Killing all running tasks in stage ${MDC(LogKeys.STAGE_ID, stageId)}: " + + log"${MDC(LogKeys.REASON, reason)}") taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts => attempts.foreach { case (_, tsm) => // There are two possible cases here: @@ -322,7 +324,8 @@ private[spark] class TaskSchedulerImpl( } } tsm.suspend() - logInfo("Stage %s.%s was cancelled".format(stageId, tsm.taskSet.stageAttemptId)) + logInfo(log"Stage ${MDC(LogKeys.STAGE_ID, stageId)}." 
+ + log"${MDC(LogKeys.STAGE_ATTEMPT, tsm.taskSet.stageAttemptId)} was cancelled") } } } @@ -331,13 +334,14 @@ private[spark] class TaskSchedulerImpl( taskId: Long, interruptThread: Boolean, reason: String): Boolean = synchronized { - logInfo(s"Killing task $taskId: $reason") + logInfo(log"Killing task ${MDC(LogKeys.TASK_ID, taskId)}: ${MDC(LogKeys.REASON, reason)}") val execId = taskIdToExecutorId.get(taskId) if (execId.isDefined) { backend.killTask(taskId, execId.get, interruptThread, reason) true } else { - logWarning(s"Could not kill task $taskId because no task with that ID was found.") + logWarning(log"Could not kill task ${MDC(TASK_ID, taskId)} " + + log"because no task with that ID was found.") false } } @@ -360,8 +364,9 @@ private[spark] class TaskSchedulerImpl( } noRejectsSinceLastReset -= manager.taskSet manager.parent.removeSchedulable(manager) - logInfo(s"Removed TaskSet ${manager.taskSet.id}, whose tasks have all completed, from pool" + - s" ${manager.parent.name}") + logInfo(log"Removed TaskSet " + manager.taskSet.logId + + log" whose tasks have all completed, from pool ${MDC(LogKeys.POOL_NAME, manager.parent.name)}" + ) } /** @@ -428,7 +433,8 @@ private[spark] class TaskSchedulerImpl( } catch { case e: TaskNotSerializableException => // scalastyle:off line.size.limit - logError(log"Resource offer failed, task set ${MDC(TASK_SET_NAME, taskSet.name)} was not serializable") + logError(log"Resource offer failed, task set " + + log"${MDC(LogKeys.TASK_SET_NAME, taskSet.name)} was not serializable") // scalastyle:on // Do not offer resources for this task, but don't throw an error to allow other // task sets to be submitted. @@ -557,9 +563,10 @@ private[spark] class TaskSchedulerImpl( // Skip the launch process. // TODO SPARK-24819 If the job requires more slots than available (both busy and free // slots), fail the job on submit. - logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " + - s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " + - s"number of available slots is $numBarrierSlotsAvailable.") + logInfo(log"Skip current round of resource offers for barrier stage " + + log"${MDC(LogKeys.STAGE_ID, taskSet.stageId)} because the barrier taskSet requires " + + log"${MDC(LogKeys.TASK_SET_NAME, taskSet.numTasks)} slots, while the total " + + log"number of available slots is ${MDC(LogKeys.NUM_SLOTS, numBarrierSlotsAvailable)}.") } else { var launchedAnyTask = false var noDelaySchedulingRejects = true @@ -617,18 +624,18 @@ private[spark] class TaskSchedulerImpl( // in order to provision more executors to make them schedulable if (Utils.isDynamicAllocationEnabled(conf)) { if (!unschedulableTaskSetToExpiryTime.contains(taskSet)) { - logInfo("Notifying ExecutorAllocationManager to allocate more executors to" + - " schedule the unschedulable task before aborting" + - s" stage ${taskSet.stageId}.") + logInfo(log"Notifying ExecutorAllocationManager to allocate more executors to" + + log" schedule the unschedulable task before aborting" + + log" stage ${MDC(LogKeys.STAGE_ID, taskSet.stageId)}.") dagScheduler.unschedulableTaskSetAdded(taskSet.taskSet.stageId, taskSet.taskSet.stageAttemptId) updateUnschedulableTaskSetTimeoutAndStartAbortTimer(taskSet, taskIndex) } } else { // Abort Immediately - logInfo("Cannot schedule any task because all executors excluded from " + - "failures. No idle executors can be found to kill. 
Aborting stage " + - s"${taskSet.stageId}.") + logInfo(log"Cannot schedule any task because all executors excluded from " + + log"failures. No idle executors can be found to kill. Aborting stage " + + log"${MDC(LogKeys.STAGE_ID, taskSet.stageId)}.") taskSet.abortSinceCompletelyExcludedOnFailure(taskIndex) } } @@ -641,8 +648,8 @@ private[spark] class TaskSchedulerImpl( // non-excluded executor and the abort timer doesn't kick in because of a constant // submission of new TaskSets. See the PR for more details. if (unschedulableTaskSetToExpiryTime.nonEmpty) { - logInfo("Clearing the expiry times for all unschedulable taskSets as a task was " + - "recently scheduled.") + logInfo(log"Clearing the expiry times for all unschedulable taskSets as a task " + + log"was recently scheduled.") // Notify ExecutorAllocationManager as well as other subscribers that a task now // recently becomes schedulable dagScheduler.unschedulableTaskSetRemoved(taskSet.taskSet.stageId, @@ -662,20 +669,23 @@ private[spark] class TaskSchedulerImpl( // always reject the offered resources. As a result, the barrier taskset can't get // launched. And if we retry the resourceOffer, we'd go through the same path again // and get into the endless loop in the end. - val errorMsg = s"Fail resource offers for barrier stage ${taskSet.stageId} " + - s"because only ${barrierPendingLaunchTasks.length} out of a total number " + - s"of ${taskSet.numTasks} tasks got resource offers. We highly recommend " + - "you to use the non-legacy delay scheduling by setting " + - s"${LEGACY_LOCALITY_WAIT_RESET.key} to false to get rid of this error." - logWarning(errorMsg) - taskSet.abort(errorMsg) - throw SparkCoreErrors.sparkError(errorMsg) + val logMsg = log"Fail resource offers for barrier stage " + + log"${MDC(STAGE_ID, taskSet.stageId)} because only " + + log"${MDC(NUM_PENDING_LAUNCH_TASKS, barrierPendingLaunchTasks.length)} " + + log"out of a total number " + + log"of ${MDC(NUM_TASKS, taskSet.numTasks)} tasks got resource offers. " + + log"We highly recommend you to use the non-legacy delay scheduling by setting " + + log"${MDC(CONFIG, LEGACY_LOCALITY_WAIT_RESET.key)} to false " + + log"to get rid of this error." + logWarning(logMsg) + taskSet.abort(logMsg.message) + throw SparkCoreErrors.sparkError(logMsg.message) } else { val curTime = clock.getTimeMillis() if (curTime - taskSet.lastResourceOfferFailLogTime > TaskSetManager.BARRIER_LOGGING_INTERVAL) { - logInfo("Releasing the assigned resource offers since only partial tasks can " + - "be launched. Waiting for later round resource offers.") + logInfo(log"Releasing the assigned resource offers since only partial tasks can " + + log"be launched. 
Waiting for later round resource offers.") taskSet.lastResourceOfferFailLogTime = curTime } barrierPendingLaunchTasks.foreach { task => @@ -717,8 +727,8 @@ private[spark] class TaskSchedulerImpl( .mkString(",") addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr)) - logInfo(s"Successfully scheduled all the ${addressesWithDescs.length} tasks for " + - s"barrier stage ${taskSet.stageId}.") + logInfo(log"Successfully scheduled all the ${MDC(LogKeys.NUM_TASKS, addressesWithDescs.length)} " + + log"tasks for barrier stage ${MDC(LogKeys.STAGE_ID, taskSet.stageId)}.") } taskSet.barrierPendingLaunchTasks.clear() } @@ -738,8 +748,8 @@ private[spark] class TaskSchedulerImpl( taskIndex: Int): Unit = { val timeout = conf.get(config.UNSCHEDULABLE_TASKSET_TIMEOUT) * 1000 unschedulableTaskSetToExpiryTime(taskSet) = clock.getTimeMillis() + timeout - logInfo(s"Waiting for $timeout ms for completely " + - s"excluded task to be schedulable again before aborting stage ${taskSet.stageId}.") + logInfo(log"Waiting for ${MDC(LogKeys.TIMEOUT, timeout)} ms for completely " + + log"excluded task to be schedulable again before aborting stage ${MDC(LogKeys.STAGE_ID, taskSet.stageId)}.") abortTimer.schedule( createUnschedulableTaskSetAbortTimer(taskSet, taskIndex), timeout, TimeUnit.MILLISECONDS) } @@ -751,8 +761,8 @@ private[spark] class TaskSchedulerImpl( override def run(): Unit = TaskSchedulerImpl.this.synchronized { if (unschedulableTaskSetToExpiryTime.contains(taskSet) && unschedulableTaskSetToExpiryTime(taskSet) <= clock.getTimeMillis()) { - logInfo("Cannot schedule any task because all executors excluded due to failures. " + - s"Wait time for scheduling expired. Aborting stage ${taskSet.stageId}.") + logInfo(log"Cannot schedule any task because all executors excluded due to failures. " + + log"Wait time for scheduling expired. Aborting stage ${MDC(LogKeys.STAGE_ID, taskSet.stageId)}.") taskSet.abortSinceCompletelyExcludedOnFailure(taskIndex) } else { this.cancel() @@ -807,9 +817,10 @@ private[spark] class TaskSchedulerImpl( taskSet.taskInfos(tid).launchSucceeded() } case None => - logError( - log"Ignoring update with state ${MDC(TASK_STATE, state)} for TID ${MDC(TID, tid)} because its task set is gone (this is " + - log"likely the result of receiving duplicate task finished status updates) or its executor has been marked as failed.") + logError(log"Ignoring update with state ${MDC(LogKeys.TASK_STATE, state)} for " + + log"TID ${MDC(LogKeys.TASK_ID, tid)} because its task set is gone (this is " + + log"likely the result of receiving duplicate task finished status updates) or its " + + log"executor has been marked as failed.") } } catch { case e: Exception => logError("Exception in statusUpdate", e) @@ -1024,7 +1035,8 @@ private[spark] class TaskSchedulerImpl( // one may be triggered by a dropped connection from the worker while another may be a // report of executor termination. We produce log messages for both so we // eventually report the termination reason. 
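
// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: the barrier-stage abort above builds
// the structured message once and then reuses its rendered text via .message
// for the non-logging paths (abort reason, exception). A minimal sketch of that
// pattern, assuming spark-core 4.x; FlakyStore and save are hypothetical.
import org.apache.spark.SparkException
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys.{NUM_RETRIES, PATH}

class FlakyStore extends Logging {
  def save(path: String, retriesLeft: Int): Unit = {
    if (retriesLeft == 0) {
      // Build the MessageWithContext once...
      val logMsg = log"Giving up writing to ${MDC(PATH, path)} after " +
        log"${MDC(NUM_RETRIES, 3)} retries"
      // ...log it with its structured context, then reuse the plain text.
      logWarning(logMsg)
      throw new SparkException(logMsg.message)
    }
  }
}
// ---------------------------------------------------------------------------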
- logError(log"Lost an executor ${MDC(LogKey.EXECUTOR_ID, executorId)} (already removed): ${MDC(REASON, reason)}") + logError(log"Lost an executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} " + + log"(already removed): ${MDC(LogKeys.REASON, reason)}") } } } @@ -1036,7 +1048,8 @@ private[spark] class TaskSchedulerImpl( } override def workerRemoved(workerId: String, host: String, message: String): Unit = { - logInfo(s"Handle removed worker $workerId: $message") + logInfo(log"Handle removed worker ${MDC(LogKeys.WORKER_ID, workerId)}: " + + log"${MDC(LogKeys.MESSAGE, message)}") dagScheduler.workerRemoved(workerId, host, message) } @@ -1047,12 +1060,15 @@ private[spark] class TaskSchedulerImpl( case LossReasonPending => logDebug(s"Executor $executorId on $hostPort lost, but reason not yet known.") case ExecutorKilled => - logInfo(s"Executor $executorId on $hostPort killed by driver.") + logInfo(log"Executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} on " + + log"${MDC(LogKeys.HOST_PORT, hostPort)} killed by driver.") case _: ExecutorDecommission => - logInfo(s"Executor $executorId on $hostPort is decommissioned" + - s"${getDecommissionDuration(executorId)}.") + logInfo(log"Executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} on " + + log"${MDC(LogKeys.HOST_PORT, hostPort)} is decommissioned" + + log"${MDC(DURATION, getDecommissionDuration(executorId))}.") case _ => - logError(log"Lost executor ${MDC(LogKey.EXECUTOR_ID, executorId)} on ${MDC(LogKey.HOST, hostPort)}: ${MDC(REASON, reason)}") + logError(log"Lost executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} on " + + log"${MDC(LogKeys.HOST, hostPort)}: ${MDC(LogKeys.REASON, reason)}") } // return decommission duration in string or "" if decommission startTime not exists diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala index e03c4101709cd..2474a1342eb2e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala @@ -19,6 +19,9 @@ package org.apache.spark.scheduler import java.util.Properties +import org.apache.spark.internal.LogKeys.{STAGE_ATTEMPT, STAGE_ID} +import org.apache.spark.internal.MessageWithContext + /** * A set of tasks submitted together to the low-level TaskScheduler, usually representing * missing partitions of a particular stage. @@ -34,4 +37,12 @@ private[spark] class TaskSet( val id: String = s"$stageId.$stageAttemptId" override def toString: String = "TaskSet " + id + + // Identifier used in the structured logging framework. 
+ lazy val logId: MessageWithContext = { + val hashMap = new java.util.HashMap[String, String]() + hashMap.put(STAGE_ID.name, stageId.toString) + hashMap.put(STAGE_ATTEMPT.name, stageAttemptId.toString) + MessageWithContext(id, hashMap) + } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetExcludeList.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetExcludeList.scala index f479e5e32bc2f..c9aa74e0852be 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetExcludeList.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetExcludeList.scala @@ -19,8 +19,7 @@ package org.apache.spark.scheduler import scala.collection.mutable.{HashMap, HashSet} import org.apache.spark.SparkConf -import org.apache.spark.internal.Logging -import org.apache.spark.internal.config +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} import org.apache.spark.util.Clock /** @@ -134,7 +133,8 @@ private[scheduler] class TaskSetExcludelist( val numFailures = execFailures.numUniqueTasksWithFailures if (numFailures >= MAX_FAILURES_PER_EXEC_STAGE) { if (excludedExecs.add(exec)) { - logInfo(s"Excluding executor ${exec} for stage $stageId") + logInfo(log"Excluding executor ${MDC(LogKeys.EXECUTOR_ID, exec)} for stage " + + log"${MDC(LogKeys.STAGE_ID, stageId)}") // This executor has been excluded for this stage. Let's check if it // the whole node should be excluded. val excludedExecutorsOnNode = @@ -149,7 +149,8 @@ private[scheduler] class TaskSetExcludelist( val numFailExec = excludedExecutorsOnNode.size if (numFailExec >= MAX_FAILED_EXEC_PER_NODE_STAGE) { if (excludedNodes.add(host)) { - logInfo(s"Excluding ${host} for stage $stageId") + logInfo(log"Excluding ${MDC(LogKeys.HOST, host)} for " + + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}") // SparkListenerNodeBlacklistedForStage is deprecated but post both events // to keep backward compatibility listenerBus.post( diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 1418901e3442c..7dba4a6dc8fc4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -30,8 +30,8 @@ import org.apache.spark.InternalAccumulator import org.apache.spark.InternalAccumulator.{input, shuffleRead} import org.apache.spark.TaskState.TaskState import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.scheduler.SchedulingMode._ import org.apache.spark.util.{AccumulatorV2, Clock, LongAccumulator, SystemClock, Utils} @@ -279,8 +279,9 @@ private[spark] class TaskSetManager( for (e <- set) { pendingTaskSetToAddTo.forExecutor.getOrElseUpdate(e, new ArrayBuffer) += index } - logInfo(s"Pending task $index has a cached location at ${e.host} " + - ", where there are executors " + set.mkString(",")) + logInfo(log"Pending task ${MDC(INDEX, index)} has a cached location at " + + log"${MDC(HOST, e.host)}, where there are executors " + + log"${MDC(EXECUTOR_IDS, set.mkString(","))}") case None => logDebug(s"Pending task $index has a cached location at ${e.host} " + ", but there are no executors alive there.") } @@ -542,9 +543,10 @@ private[spark] class TaskSetManager( if 
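// [Illustrative aside, not part of the patch] The new TaskSet.logId above is a prebuilt
// MessageWithContext: its rendered text is the "stageId.stageAttemptId" identifier and its
// context map carries STAGE_ID/STAGE_ATTEMPT, so TaskSetManager call sites later in this diff
// can splice it between log"..." fragments (e.g. log"Task ..." + taskSet.logId + log" failed ...")
// instead of wrapping both ids in MDC(...) at every call site. Equivalent standalone sketch
// (the object and method names are hypothetical; these are Spark-internal APIs):
import org.apache.spark.internal.LogKeys.{STAGE_ATTEMPT, STAGE_ID}
import org.apache.spark.internal.MessageWithContext

object TaskSetLogIdSketch {
  def logId(stageId: Int, stageAttemptId: Int): MessageWithContext = {
    val context = new java.util.HashMap[String, String]()
    context.put(STAGE_ID.name, stageId.toString)
    context.put(STAGE_ATTEMPT.name, stageAttemptId.toString)
    MessageWithContext(s"$stageId.$stageAttemptId", context)
  }
}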
(serializedTask.limit() > TaskSetManager.TASK_SIZE_TO_WARN_KIB * 1024 && !emittedTaskSizeWarning) { emittedTaskSizeWarning = true - logWarning(s"Stage ${task.stageId} contains a task of very large size " + - s"(${serializedTask.limit() / 1024} KiB). The maximum recommended task size is " + - s"${TaskSetManager.TASK_SIZE_TO_WARN_KIB} KiB.") + logWarning(log"Stage ${MDC(STAGE_ID, task.stageId)} contains a task of very large size " + + log"(${MDC(NUM_BYTES, serializedTask.limit() / 1024)} KiB). " + + log"The maximum recommended task size is " + + log"${MDC(NUM_BYTES_TO_WARN, TaskSetManager.TASK_SIZE_TO_WARN_KIB)} KiB.") } addRunningTask(taskId) @@ -552,10 +554,16 @@ private[spark] class TaskSetManager( // a good proxy to task serialization time. // val timeTaken = clock.getTime() - startTime val tName = taskName(taskId) - logInfo(s"Starting $tName ($host, executor ${info.executorId}, " + - s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit()} bytes) " + - (if (taskResourceAssignments.nonEmpty) s"taskResourceAssignments ${taskResourceAssignments}" - else "")) + logInfo(log"Starting ${MDC(TASK_NAME, tName)} (${MDC(HOST, host)}," + + log"executor ${MDC(LogKeys.EXECUTOR_ID, info.executorId)}, " + + log"partition ${MDC(PARTITION_ID, task.partitionId)}, " + + log"${MDC(TASK_LOCALITY, taskLocality)}, " + + log"${MDC(SIZE, serializedTask.limit())} bytes) " + + (if (taskResourceAssignments.nonEmpty) { + log"taskResourceAssignments ${MDC(TASK_RESOURCE_ASSIGNMENTS, taskResourceAssignments)}" + } else { + log"" + })) sched.dagScheduler.taskStarted(task, info) new TaskDescription( @@ -827,8 +835,11 @@ private[spark] class TaskSetManager( // Kill any other attempts for the same task (since those are unnecessary now that one // attempt completed successfully). for (attemptInfo <- taskAttempts(index) if attemptInfo.running) { - logInfo(s"Killing attempt ${attemptInfo.attemptNumber} for ${taskName(attemptInfo.taskId)}" + - s" on ${attemptInfo.host} as the attempt ${info.attemptNumber} succeeded on ${info.host}") + logInfo(log"Killing attempt ${MDC(NUM_ATTEMPT, attemptInfo.attemptNumber)} for " + + log"${MDC(TASK_NAME, taskName(attemptInfo.taskId))} on " + + log"${MDC(HOST, attemptInfo.host)} as the attempt " + + log"${MDC(TASK_ATTEMPT_ID, info.attemptNumber)} succeeded on " + + log"${MDC(HOST, info.host)}") killedByOtherAttempt += attemptInfo.taskId sched.backend.killTask( attemptInfo.taskId, @@ -838,8 +849,10 @@ private[spark] class TaskSetManager( } if (!successful(index)) { tasksSuccessful += 1 - logInfo(s"Finished ${taskName(info.taskId)} in ${info.duration} ms " + - s"on ${info.host} (executor ${info.executorId}) ($tasksSuccessful/$numTasks)") + logInfo(log"Finished ${MDC(TASK_NAME, taskName(info.taskId))} in " + + log"${MDC(DURATION, info.duration)} ms on ${MDC(HOST, info.host)} " + + log"(executor ${MDC(LogKeys.EXECUTOR_ID, info.executorId)}) " + + log"(${MDC(NUM_SUCCESSFUL_TASKS, tasksSuccessful)}/${MDC(NUM_TASKS, numTasks)})") // Mark successful and stop if all the tasks have succeeded. 
successful(index) = true numFailures(index) = 0 @@ -847,8 +860,9 @@ private[spark] class TaskSetManager( isZombie = true } } else { - logInfo(s"Ignoring task-finished event for ${taskName(info.taskId)} " + - s"because it has already completed successfully") + logInfo(log"Ignoring task-finished event for " + + log"${MDC(TASK_NAME, taskName(info.taskId))} " + + log"because it has already completed successfully") } // This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the // "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not @@ -936,7 +950,11 @@ private[spark] class TaskSetManager( copiesRunning(index) -= 1 var accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty var metricPeaks: Array[Long] = Array.empty - val failureReason = s"Lost ${taskName(tid)} (${info.host} " + + val failureReason = log"Lost ${MDC(TASK_NAME, taskName(tid))} " + + log"(${MDC(HOST_PORT, info.host)} " + + log"executor ${MDC(LogKeys.EXECUTOR_ID, info.executorId)}): " + + log"${MDC(ERROR, reason.toErrorString)}" + val failureReasonString = s"Lost ${taskName(tid)} (${info.host} " + s"executor ${info.executorId}): ${reason.toErrorString}" val failureException: Option[Throwable] = reason match { case fetchFailed: FetchFailed => @@ -971,9 +989,11 @@ private[spark] class TaskSetManager( if (ef.className == classOf[TaskOutputFileAlreadyExistException].getName) { // If we can not write to output file in the task, there's no point in trying to // re-execute it. - logError(log"Task ${MDC(TASK_ID, info.id)} in stage ${MDC(STAGE_ID, taskSet.id)} " + - log"(TID ${MDC(TID, tid)}) can not write to output file: " + - log"${MDC(ERROR, ef.description)}; not retrying") + logError( + log"Task ${MDC(TASK_INDEX, info.index)}.${MDC(TASK_ATTEMPT_ID, info.attemptNumber)} " + + log"in stage ${MDC(STAGE_ID, taskSet.stageId)}." + + log"${MDC(STAGE_ATTEMPT, taskSet.stageAttemptId)} (TID ${MDC(TASK_ID, tid)}) " + + log"can not write to output file: ${MDC(ERROR, ef.description)}; not retrying") emptyTaskInfoAccumulablesAndNotifyDagScheduler(tid, tasks(index), reason, null, accumUpdates, metricPeaks) abort("Task %s in stage %s (TID %d) can not write to output file: %s".format( @@ -1001,8 +1021,10 @@ private[spark] class TaskSetManager( logWarning(failureReason) } else { logInfo( - s"Lost $task on ${info.host}, executor ${info.executorId}: " + - s"${ef.className} (${ef.description}) [duplicate $dupCount]") + log"Lost ${MDC(TASK_NAME, task)} on ${MDC(HOST, info.host)}, " + + log"executor ${MDC(LogKeys.EXECUTOR_ID, info.executorId)}: " + + log"${MDC(CLASS_NAME, ef.className)} " + + log"(${MDC(DESCRIPTION, ef.description)}) [duplicate ${MDC(COUNT, dupCount)}]") } ef.exception @@ -1014,12 +1036,12 @@ private[spark] class TaskSetManager( None case e: ExecutorLostFailure if !e.exitCausedByApp => - logInfo(s"${taskName(tid)} failed because while it was being computed, its executor " + - "exited for a reason unrelated to the task. Not counting this failure towards the " + - "maximum number of failures for the task.") + logInfo(log"${MDC(TASK_NAME, taskName(tid))} failed because while it was being computed," + + log" its executor exited for a reason unrelated to the task. 
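// [Illustrative aside, not part of the patch] The failure reason above now exists in two
// forms: the log"..." version is a MessageWithContext for the structured logWarning/logError
// sinks, while failureReasonString keeps the plain interpolation for APIs that still take a
// String (abort() and updateExcludedForFailedTask in the next hunks). A minimal sketch of the
// split, with a hypothetical class and method name:
import org.apache.spark.internal.{Logging, LogKeys, MDC}

class FailureReasonSketch extends Logging {
  def report(taskName: String, host: String, error: String): String = {
    val failureReason = log"Lost ${MDC(LogKeys.TASK_NAME, taskName)} " +
      log"(${MDC(LogKeys.HOST, host)}): ${MDC(LogKeys.ERROR, error)}"
    logWarning(failureReason)          // structured sink keeps the MDC key/value context
    s"Lost $taskName ($host): $error"  // plain String, returned for String-only APIs
  }
}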
" + + log"Not counting this failure towards the maximum number of failures for the task.") None - case e: TaskFailedReason => // TaskResultLost and others + case _: TaskFailedReason => // TaskResultLost and others logWarning(failureReason) None } @@ -1034,22 +1056,22 @@ private[spark] class TaskSetManager( if (!isZombie && reason.countTowardsTaskFailures) { assert (null != failureReason) taskSetExcludelistHelperOpt.foreach(_.updateExcludedForFailedTask( - info.host, info.executorId, index, failureReason)) + info.host, info.executorId, index, failureReasonString)) numFailures(index) += 1 if (numFailures(index) >= maxTaskFailures) { - logError(log"Task ${MDC(TASK_ID, index)} in stage ${MDC(STAGE_ID, taskSet.id)} failed " + - log"${MDC(MAX_ATTEMPTS, maxTaskFailures)} times; aborting job") + logError(log"Task ${MDC(TASK_INDEX, index)} in stage " + taskSet.logId + + log" failed ${MDC(MAX_ATTEMPTS, maxTaskFailures)} times; aborting job") abort("Task %d in stage %s failed %d times, most recent failure: %s\nDriver stacktrace:" - .format(index, taskSet.id, maxTaskFailures, failureReason), failureException) + .format(index, taskSet.id, maxTaskFailures, failureReasonString), failureException) return } } if (successful(index)) { - logInfo(s"${taskName(info.taskId)} failed, but the task will not" + - " be re-executed (either because the task failed with a shuffle data fetch failure," + - " so the previous stage needs to be re-run, or because a different copy of the task" + - " has already succeeded).") + logInfo(log"${MDC(LogKeys.TASK_NAME, taskName(info.taskId))} failed, but the task will not" + + log" be re-executed (either because the task failed with a shuffle data fetch failure," + + log" so the previous stage needs to be re-run, or because a different copy of the task" + + log" has already succeeded).") } else { addPendingTask(index) } @@ -1232,9 +1254,10 @@ private[spark] class TaskSetManager( if (speculated) { addPendingTask(index, speculatable = true) logInfo( - ("Marking task %d in stage %s (on %s) as speculatable because it ran more" + - " than %.0f ms(%d speculatable tasks in this taskset now)") - .format(index, taskSet.id, info.host, threshold, speculatableTasks.size + 1)) + log"Marking task ${MDC(TASK_INDEX, index)} in stage " + taskSet.logId + + log" (on ${MDC(HOST, info.host)}) as speculatable because it ran more than " + + log"${MDC(TIMEOUT, threshold)} ms(${MDC(NUM_TASKS, speculatableTasks.size + 1)}" + + log"speculatable tasks in this taskset now)") speculatableTasks += index sched.dagScheduler.speculativeTaskSubmitted(tasks(index), index) } @@ -1281,7 +1304,8 @@ private[spark] class TaskSetManager( if (foundTasks) { val elapsedMs = clock.getTimeMillis() - timeMs if (elapsedMs > minTimeToSpeculation) { - logWarning(s"Time to checkSpeculatableTasks ${elapsedMs}ms > ${minTimeToSpeculation}ms") + logWarning(log"Time to checkSpeculatableTasks ${MDC(TIME_UNITS, elapsedMs)}ms > " + + log"${MDC(MIN_TIME, minTimeToSpeculation)}ms") } } foundTasks diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 06cfb53e2dede..deaa1b4e47906 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -33,7 +33,8 @@ import org.apache.spark.deploy.security.HadoopDelegationTokenManager import 
org.apache.spark.errors.SparkCoreErrors import org.apache.spark.executor.ExecutorLogUrlHandler import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.ERROR +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Network._ import org.apache.spark.resource.ResourceProfile @@ -41,6 +42,7 @@ import org.apache.spark.rpc._ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.ENDPOINT_NAME +import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS import org.apache.spark.status.api.v1.ThreadStackTrace import org.apache.spark.util.{RpcUtils, SerializableBuffer, ThreadUtils, Utils} import org.apache.spark.util.ArrayImplicits._ @@ -181,8 +183,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp makeOffers(executorId) case None => // Ignoring the update since we don't know about the executor. - logWarning(s"Ignored task status update ($taskId state $state) " + - s"from unknown executor with ID $executorId") + logWarning(log"Ignored task status update (${MDC(TASK_ID, taskId)} " + + log"state ${MDC(TASK_STATE, state)}) " + + log"from unknown executor with ID ${MDC(LogKeys.EXECUTOR_ID, executorId)}") } } @@ -199,7 +202,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp KillTask(taskId, executorId, interruptThread, reason)) case None => // Ignoring the task kill since the executor is not registered. - logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.") + logWarning(log"Attempted to kill task ${MDC(TASK_ID, taskId)} " + + log"for unknown executor ${MDC(LogKeys.EXECUTOR_ID, executorId)}.") } case KillExecutorsOnHost(host) => @@ -255,7 +259,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // If the cluster manager gives us an executor on an excluded node (because it // already started allocating those resources before we informed it of our exclusion, // or if it ignored our exclusion), then we reject that executor immediately. 
- logInfo(s"Rejecting $executorId as it has been excluded.") + logInfo(log"Rejecting ${MDC(LogKeys.EXECUTOR_ID, executorId)} as it has been excluded.") context.sendFailure( new IllegalStateException(s"Executor is excluded due to failures: $executorId")) } else { @@ -266,8 +270,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } else { context.senderAddress } - logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId, " + - s" ResourceProfileId $resourceProfileId") + logInfo(log"Registered executor ${MDC(LogKeys.RPC_ENDPOINT_REF, executorRef)} " + + log"(${MDC(LogKeys.RPC_ADDRESS, executorAddress)}) " + + log"with ID ${MDC(LogKeys.EXECUTOR_ID, executorId)}, " + + log"ResourceProfileId ${MDC(LogKeys.RESOURCE_PROFILE_ID, resourceProfileId)}") addressToExecutorId(executorAddress) = executorId totalCoreCount.addAndGet(cores) totalRegisteredExecutors.addAndGet(1) @@ -321,7 +327,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp case UpdateExecutorsLogLevel(logLevel) => currentLogLevel = Some(logLevel) - logInfo(s"Asking each executor to refresh the log level to $logLevel") + logInfo(log"Asking each executor to refresh the log level to " + + log"${MDC(LogKeys.LOG_LEVEL, logLevel)}") for ((_, executorData) <- executorDataMap) { executorData.executorEndpoint.send(UpdateExecutorLogLevel(logLevel)) } @@ -340,7 +347,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // Do not change this code without running the K8s integration suites case ExecutorDecommissioning(executorId) => - logWarning(s"Received executor $executorId decommissioned message") + logWarning(log"Received executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} " + + log"decommissioned message") context.reply( decommissionExecutor( executorId, @@ -493,7 +501,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // forever. Therefore, we should also post `SparkListenerExecutorRemoved` here. 
listenerBus.post(SparkListenerExecutorRemoved( System.currentTimeMillis(), executorId, reason.toString)) - logInfo(s"Asked to remove non-existent executor $executorId") + logInfo( + log"Asked to remove non-existent executor ${MDC(LogKeys.EXECUTOR_ID, executorId)}") } } @@ -522,7 +531,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } if (shouldDisable) { - logInfo(s"Disabling executor $executorId.") + logInfo(log"Disabling executor ${MDC(LogKeys.EXECUTOR_ID, executorId)}.") scheduler.executorLost(executorId, LossReasonPending) } @@ -566,7 +575,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp return executorsToDecommission.toImmutableArraySeq } - logInfo(s"Decommission executors: ${executorsToDecommission.mkString(", ")}") + logInfo(log"Decommission executors: " + + log"${MDC(LogKeys.EXECUTOR_IDS, executorsToDecommission.mkString(", "))}") // If we don't want to replace the executors we are decommissioning if (adjustTargetNumExecutors) { @@ -585,7 +595,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp if (!triggeredByExecutor) { executorsToDecommission.foreach { executorId => - logInfo(s"Notify executor $executorId to decommission.") + logInfo(log"Notify executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} to decommission.") executorDataMap(executorId).executorEndpoint.send(DecommissionExecutor) } } @@ -597,7 +607,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp executorsToDecommission.filter(executorsPendingDecommission.contains) } if (stragglers.nonEmpty) { - logInfo(s"${stragglers.toList} failed to decommission in ${cleanupInterval}, killing.") + logInfo( + log"${MDC(LogKeys.EXECUTOR_IDS, stragglers.toList)} failed to decommission in " + + log"${MDC(LogKeys.INTERVAL, cleanupInterval)}, killing.") killExecutors(stragglers.toImmutableArraySeq, false, false, true) } } @@ -714,13 +726,14 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp override def isReady(): Boolean = { if (sufficientResourcesRegistered()) { - logInfo("SchedulerBackend is ready for scheduling beginning after " + - s"reached minRegisteredResourcesRatio: $minRegisteredRatio") + logInfo(log"SchedulerBackend is ready for scheduling beginning after " + + log"reached minRegisteredResourcesRatio: ${MDC(LogKeys.MIN_SIZE, minRegisteredRatio)}") return true } if ((System.nanoTime() - createTimeNs) >= maxRegisteredWaitingTimeNs) { - logInfo("SchedulerBackend is ready for scheduling beginning after waiting " + - s"maxRegisteredResourcesWaitingTime: $maxRegisteredWaitingTimeNs(ns)") + logInfo(log"SchedulerBackend is ready for scheduling beginning after waiting " + + log"maxRegisteredResourcesWaitingTime: " + + log"${MDC(LogKeys.TIMEOUT, maxRegisteredWaitingTimeNs / NANOS_PER_MILLIS.toDouble)}(ms)") return true } false @@ -797,7 +810,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp "Attempted to request a negative number of additional executor(s) " + s"$numAdditionalExecutors from the cluster manager. 
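// [Illustrative aside, not part of the patch] The removed isReady() message printed the raw
// nanosecond budget ("$maxRegisteredWaitingTimeNs(ns)"); the replacement divides by
// NANOS_PER_MILLIS so the TIMEOUT field is reported in milliseconds. Sketch of the conversion
// (the constant's value, 1,000,000 ns per ms, is assumed here):
object WaitingTimeLogSketch {
  private val NanosPerMillis = 1000000L
  def toMillis(maxRegisteredWaitingTimeNs: Long): Double =
    maxRegisteredWaitingTimeNs / NanosPerMillis.toDouble  // e.g. 30000000000L -> 30000.0
}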
Please specify a positive number!") } - logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager") + logInfo(log"Requesting ${MDC(LogKeys.NUM_EXECUTORS, numAdditionalExecutors)} additional " + + log"executor(s) from the cluster manager") val response = synchronized { val defaultProf = scheduler.sc.resourceProfileManager.defaultResourceProfile @@ -947,12 +961,13 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp adjustTargetNumExecutors: Boolean, countFailures: Boolean, force: Boolean): Seq[String] = { - logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}") + logInfo( + log"Requesting to kill executor(s) ${MDC(LogKeys.EXECUTOR_IDS, executorIds.mkString(", "))}") val response = withLock { val (knownExecutors, unknownExecutors) = executorIds.partition(executorDataMap.contains) unknownExecutors.foreach { id => - logWarning(s"Executor to kill $id does not exist!") + logWarning(log"Executor to kill ${MDC(LogKeys.EXECUTOR_ID, id)} does not exist!") } // If an executor is already pending to be removed, do not kill it again (SPARK-9795) @@ -962,7 +977,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp .filter { id => force || !scheduler.isExecutorBusy(id) } executorsToKill.foreach { id => executorsPendingToRemove(id) = !countFailures } - logInfo(s"Actual list of executor(s) to be killed is ${executorsToKill.mkString(", ")}") + logInfo(log"Actual list of executor(s) to be killed is " + + log"${MDC(LogKeys.EXECUTOR_IDS, executorsToKill.mkString(", "))}") // If we do not wish to replace the executors we kill, sync the target number of executors // with the cluster manager to avoid allocating new ones. When computing the new target, @@ -1003,7 +1019,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * @return whether the decommission request is acknowledged. */ final override def decommissionExecutorsOnHost(host: String): Boolean = { - logInfo(s"Requesting to kill any and all executors on host $host") + logInfo(log"Requesting to kill any and all executors on host ${MDC(LogKeys.HOST, host)}") // A potential race exists if a new executor attempts to register on a host // that is on the exclude list and is no longer valid. To avoid this race, // all executor registration and decommissioning happens in the event loop. This way, either @@ -1019,7 +1035,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * @return whether the kill request is acknowledged. */ final override def killExecutorsOnHost(host: String): Boolean = { - logInfo(s"Requesting to kill any and all executors on host $host") + logInfo(log"Requesting to kill any and all executors on host ${MDC(LogKeys.HOST, host)}") // A potential race exists if a new executor attempts to register on a host // that is on the exclude list and is no longer valid. To avoid this race, // all executor registration and killing happens in the event loop. 
This way, either diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index f92756105977c..f4caecd7d6741 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -27,8 +27,7 @@ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.deploy.client.{StandaloneAppClient, StandaloneAppClientListener} import org.apache.spark.executor.ExecutorExitCode -import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.REASON +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} import org.apache.spark.internal.config.EXECUTOR_REMOVE_DELAY import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} @@ -145,7 +144,7 @@ private[spark] class StandaloneSchedulerBackend( } override def connected(appId: String): Unit = { - logInfo("Connected to Spark cluster with app ID " + appId) + logInfo(log"Connected to Spark cluster with app ID ${MDC(LogKeys.APP_ID, appId)}") this.appId = appId notifyContext() launcherBackend.setAppId(appId) @@ -162,7 +161,7 @@ private[spark] class StandaloneSchedulerBackend( notifyContext() if (!stopping.get) { launcherBackend.setState(SparkAppHandle.State.KILLED) - logError(log"Application has been killed. Reason: ${MDC(REASON, reason)}") + logError(log"Application has been killed. Reason: ${MDC(LogKeys.REASON, reason)}") try { scheduler.error(reason) } finally { @@ -174,8 +173,9 @@ private[spark] class StandaloneSchedulerBackend( override def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, memory: Int): Unit = { - logInfo("Granted executor ID %s on hostPort %s with %d core(s), %s RAM".format( - fullId, hostPort, cores, Utils.megabytesToString(memory))) + logInfo(log"Granted executor ID ${MDC(LogKeys.EXECUTOR_ID, fullId)} on hostPort " + + log"${MDC(LogKeys.HOST_PORT, hostPort)} with ${MDC(LogKeys.NUM_CORES, cores)} core(s), " + + log"${MDC(LogKeys.MEMORY_SIZE, Utils.megabytesToString(memory))} RAM") } override def executorRemoved( @@ -192,23 +192,28 @@ private[spark] class StandaloneSchedulerBackend( case Some(code) => ExecutorExited(code, exitCausedByApp = true, message) case None => ExecutorProcessLost(message, workerHost, causedByApp = workerHost.isEmpty) } - logInfo("Executor %s removed: %s".format(fullId, message)) + logInfo( + log"Executor ${MDC(LogKeys.EXECUTOR_ID, fullId)} removed: ${MDC(LogKeys.MESSAGE, message)}") removeExecutor(fullId.split("/")(1), reason) } override def executorDecommissioned(fullId: String, decommissionInfo: ExecutorDecommissionInfo): Unit = { - logInfo(s"Asked to decommission executor $fullId") + logInfo(log"Asked to decommission executor ${MDC(LogKeys.EXECUTOR_ID, fullId)}") val execId = fullId.split("/")(1) decommissionExecutors( Array((execId, decommissionInfo)), adjustTargetNumExecutors = false, triggeredByExecutor = false) - logInfo("Executor %s decommissioned: %s".format(fullId, decommissionInfo)) + logInfo( + log"Executor ${MDC(LogKeys.EXECUTOR_ID, fullId)} " + + log"decommissioned: ${MDC(LogKeys.DESCRIPTION, decommissionInfo)}" + ) } override def workerRemoved(workerId: String, host: String, message: String): Unit = { - logInfo("Worker %s removed: 
%s".format(workerId, message)) + logInfo(log"Worker ${MDC(LogKeys.WORKER_ID, workerId)} removed: " + + log"${MDC(LogKeys.MESSAGE, message)}") removeWorker(workerId, host, message) } @@ -349,8 +354,8 @@ private[spark] class StandaloneSchedulerBackend( _executorRemoveDelay, TimeUnit.MILLISECONDS) } catch { case _: RejectedExecutionException if stopping.get() => - logWarning( - "Skipping onDisconnected RemoveExecutor call because the scheduler is stopping") + logWarning("Skipping onDisconnected RemoveExecutor call " + + "because the scheduler is stopping") } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala index c389b0c988f4d..57505c87f879e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala @@ -25,7 +25,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark._ import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID import org.apache.spark.scheduler._ @@ -342,7 +342,8 @@ private[spark] class ExecutorMonitor( override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = { val exec = ensureExecutorIsTracked(event.executorId, event.executorInfo.resourceProfileId) exec.updateRunningTasks(0) - logInfo(s"New executor ${event.executorId} has registered (new total is ${executors.size()})") + logInfo(log"New executor ${MDC(LogKeys.EXECUTOR_ID, event.executorId)} has registered " + + log"(new total is ${MDC(LogKeys.COUNT, executors.size())})") } private def decrementExecResourceProfileCount(rpId: Int): Unit = { @@ -365,11 +366,14 @@ private[spark] class ExecutorMonitor( } else { metrics.exitedUnexpectedly.inc() } - logInfo(s"Executor ${event.executorId} is removed. Remove reason statistics: (" + - s"gracefully decommissioned: ${metrics.gracefullyDecommissioned.getCount()}, " + - s"decommision unfinished: ${metrics.decommissionUnfinished.getCount()}, " + - s"driver killed: ${metrics.driverKilled.getCount()}, " + - s"unexpectedly exited: ${metrics.exitedUnexpectedly.getCount()}).") + // scalastyle:off line.size.limit + logInfo(log"Executor ${MDC(LogKeys.EXECUTOR_ID, event.executorId)} is removed. 
" + + log"Remove reason statistics: (gracefully decommissioned: " + + log"${MDC(LogKeys.NUM_DECOMMISSIONED, metrics.gracefullyDecommissioned.getCount())}, " + + log"decommission unfinished: ${MDC(LogKeys.NUM_UNFINISHED_DECOMMISSIONED, metrics.decommissionUnfinished.getCount())}, " + + log"driver killed: ${MDC(LogKeys.NUM_EXECUTORS_KILLED, metrics.driverKilled.getCount())}, " + + log"unexpectedly exited: ${MDC(LogKeys.NUM_EXECUTORS_EXITED, metrics.exitedUnexpectedly.getCount())}).") + // scalastyle:on line.size.limit if (!removed.pendingRemoval || !removed.decommissioning) { nextTimeout.set(Long.MinValue) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala index a00fe2a06899f..298669327a39c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala @@ -142,6 +142,7 @@ private[spark] class LocalSchedulerBackend( Map.empty))) launcherBackend.setAppId(appId) launcherBackend.setState(SparkAppHandle.State.RUNNING) + reviveOffers() } override def stop(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index 409223132a626..1ee46d51ce70b 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -31,7 +31,8 @@ import org.apache.commons.crypto.random._ import org.apache.commons.crypto.stream._ import org.apache.spark.SparkConf -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.network.util.{CryptoUtils, JavaUtils} @@ -131,8 +132,8 @@ private[spark] object CryptoStreamUtils extends Logging { val initialIVFinish = System.nanoTime() val initialIVTime = TimeUnit.NANOSECONDS.toMillis(initialIVFinish - initialIVStart) if (initialIVTime > 2000) { - logWarning(s"It costs ${initialIVTime} milliseconds to create the Initialization Vector " + - s"used by CryptoStream") + logWarning(log"It costs ${MDC(TIME_UNITS, initialIVTime)} milliseconds " + + log"to create the Initialization Vector used by CryptoStream") } iv } diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 5a0b2ba3735c5..ec5d53e91b3e0 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -40,7 +40,7 @@ import org.roaringbitmap.RoaringBitmap import org.apache.spark._ import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.LogKeys.CLASS_NAME import org.apache.spark.internal.config.Kryo._ import org.apache.spark.internal.io.FileCommitProtocol._ import org.apache.spark.network.util.ByteUnit @@ -227,10 +227,7 @@ class KryoSerializer(conf: SparkConf) // scalastyle:on - kryo.register(None.getClass) - kryo.register(Nil.getClass) kryo.register(Utils.classForName("scala.collection.immutable.ArraySeq$ofRef")) - kryo.register(Utils.classForName("scala.collection.immutable.$colon$colon")) 
kryo.register(Utils.classForName("scala.collection.immutable.Map$EmptyMap$")) kryo.register(Utils.classForName("scala.math.Ordering$Reverse")) kryo.register(Utils.classForName("scala.reflect.ClassTag$GenericClassTag")) diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index b878c88c43b03..30bc1382fb021 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -21,13 +21,16 @@ import java.io._ import java.nio.ByteBuffer import java.nio.channels.Channels import java.nio.file.Files +import java.util.{Collections, Map => JMap} import scala.collection.mutable.ArrayBuffer +import com.google.common.cache.CacheBuilder + import org.apache.spark.{SecurityManager, SparkConf, SparkEnv, SparkException} import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.io.NioBufferedFileInputStream import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.client.StreamCallbackWithID @@ -38,6 +41,7 @@ import org.apache.spark.serializer.SerializerManager import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID import org.apache.spark.storage._ import org.apache.spark.util.Utils +import org.apache.spark.util.collection.OpenHashSet /** * Create and maintain the shuffle blocks' mapping between logic block and physical file location. @@ -53,10 +57,23 @@ import org.apache.spark.util.Utils private[spark] class IndexShuffleBlockResolver( conf: SparkConf, // var for testing - var _blockManager: BlockManager = null) + var _blockManager: BlockManager, + val taskIdMapsForShuffle: JMap[Int, OpenHashSet[Long]]) extends ShuffleBlockResolver with Logging with MigratableResolver { + def this(conf: SparkConf) = { + this(conf, null, Collections.emptyMap()) + } + + def this(conf: SparkConf, _blockManager: BlockManager) = { + this(conf, _blockManager, Collections.emptyMap()) + } + + def this(conf: SparkConf, taskIdMapsForShuffle: JMap[Int, OpenHashSet[Long]]) = { + this(conf, null, taskIdMapsForShuffle) + } + private lazy val blockManager = Option(_blockManager).getOrElse(SparkEnv.get.blockManager) private val transportConf = { @@ -76,13 +93,21 @@ private[spark] class IndexShuffleBlockResolver( override def getStoredShuffles(): Seq[ShuffleBlockInfo] = { val allBlocks = blockManager.diskBlockManager.getAllBlocks() allBlocks.flatMap { - case ShuffleIndexBlockId(shuffleId, mapId, _) => + case ShuffleIndexBlockId(shuffleId, mapId, _) + if Option(shuffleIdsToSkip.getIfPresent(shuffleId)).isEmpty => Some(ShuffleBlockInfo(shuffleId, mapId)) case _ => None } } + private val shuffleIdsToSkip = + CacheBuilder.newBuilder().maximumSize(1000).build[java.lang.Integer, java.lang.Boolean]() + + override def addShuffleToSkip(shuffleId: ShuffleId): Unit = { + shuffleIdsToSkip.put(shuffleId, true) + } + private def getShuffleBytesStored(): Long = { val shuffleFiles: Seq[File] = getStoredShuffles().map { si => getDataFile(si.shuffleId, si.mapId) @@ -162,17 +187,17 @@ private[spark] class IndexShuffleBlockResolver( def removeDataByMap(shuffleId: Int, mapId: Long): Unit = { var file = getDataFile(shuffleId, mapId) if (file.exists() 
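// [Illustrative aside, not part of the patch] The shuffleIdsToSkip cache added to
// IndexShuffleBlockResolver above is a bounded Guava cache of shuffle ids that should not be
// migrated; getStoredShuffles() now drops any id present in it. Minimal standalone sketch of
// the same pattern (object and method names are hypothetical):
import com.google.common.cache.CacheBuilder

object ShuffleSkipSketch {
  private val shuffleIdsToSkip =
    CacheBuilder.newBuilder().maximumSize(1000).build[java.lang.Integer, java.lang.Boolean]()

  def addShuffleToSkip(shuffleId: Int): Unit = shuffleIdsToSkip.put(shuffleId, true)

  // Mirrors the filter in getStoredShuffles(): only ids absent from the cache are offered
  // for migration.
  def shouldMigrate(shuffleId: Int): Boolean =
    Option(shuffleIdsToSkip.getIfPresent(shuffleId)).isEmpty
}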
&& !file.delete()) { - logWarning(s"Error deleting data ${file.getPath()}") + logWarning(log"Error deleting data ${MDC(PATH, file.getPath())}") } file = getIndexFile(shuffleId, mapId) if (file.exists() && !file.delete()) { - logWarning(s"Error deleting index ${file.getPath()}") + logWarning(log"Error deleting index ${MDC(PATH, file.getPath())}") } file = getChecksumFile(shuffleId, mapId, conf.get(config.SHUFFLE_CHECKSUM_ALGORITHM)) if (file.exists() && !file.delete()) { - logWarning(s"Error deleting checksum ${file.getPath()}") + logWarning(log"Error deleting checksum ${MDC(PATH, file.getPath())}") } } @@ -275,12 +300,27 @@ private[spark] class IndexShuffleBlockResolver( throw SparkCoreErrors.failedRenameTempFileError(fileTmp, file) } } + blockId match { + case ShuffleIndexBlockId(shuffleId, mapId, _) => + val mapTaskIds = taskIdMapsForShuffle.computeIfAbsent( + shuffleId, _ => new OpenHashSet[Long](8) + ) + mapTaskIds.add(mapId) + + case ShuffleDataBlockId(shuffleId, mapId, _) => + val mapTaskIds = taskIdMapsForShuffle.computeIfAbsent( + shuffleId, _ => new OpenHashSet[Long](8) + ) + mapTaskIds.add(mapId) + + case _ => // Unreachable + } blockManager.reportBlockStatus(blockId, BlockStatus(StorageLevel.DISK_ONLY, 0, diskSize)) } override def onFailure(streamId: String, cause: Throwable): Unit = { // the framework handles the connection itself, we just need to do local cleanup - logWarning(s"Error while uploading $blockId", cause) + logWarning(log"Error while uploading ${MDC(BLOCK_ID, blockId)}", cause) channel.close() fileTmp.delete() } @@ -318,8 +358,9 @@ private[spark] class IndexShuffleBlockResolver( } } catch { case _: Exception => // If we can't load the blocks ignore them. - logWarning(s"Failed to resolve shuffle block ${shuffleBlockInfo}. " + - "This is expected to occur if a block is removed after decommissioning has started.") + logWarning(log"Failed to resolve shuffle block " + + log"${MDC(SHUFFLE_BLOCK_INFO, shuffleBlockInfo)}. " + + log"This is expected to occur if a block is removed after decommissioning has started.") List.empty[(BlockId, ManagedBuffer)] } } @@ -425,8 +466,8 @@ private[spark] class IndexShuffleBlockResolver( if (checksumTmp.exists()) { try { if (!checksumTmp.delete()) { - logError(s"Failed to delete temporary checksum file " + - s"at ${checksumTmp.getAbsolutePath}") + logError(log"Failed to delete temporary checksum file at " + + log"${MDC(LogKeys.PATH, checksumTmp.getAbsolutePath)}") } } catch { case e: Exception => @@ -475,7 +516,8 @@ private[spark] class IndexShuffleBlockResolver( if (propagateError) { throw SparkCoreErrors.failedRenameTempFileError(tmpFile, targetFile) } else { - logWarning(s"fail to rename file $tmpFile to $targetFile") + logWarning(log"fail to rename file ${MDC(TEMP_FILE, tmpFile)} " + + log"to ${MDC(TARGET_PATH, targetFile)}") } } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/MigratableResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/MigratableResolver.scala index 9908281deed84..19835d515fec2 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/MigratableResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/MigratableResolver.scala @@ -35,6 +35,11 @@ trait MigratableResolver { */ def getStoredShuffles(): Seq[ShuffleBlockInfo] + /** + * Mark a shuffle that should not be migrated. + */ + def addShuffleToSkip(shuffleId: Int): Unit = {} + /** * Write a provided shuffle block as a stream. Used for block migrations. 
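// [Illustrative aside, not part of the patch] The blockId match added above records the map
// task id of every migrated shuffle index/data block in the shared taskIdMapsForShuffle map
// (shuffle id -> OpenHashSet of map ids), presumably so the receiving shuffle manager can
// later clean those files up the same way it cleans up locally written ones. Sketch of the
// bookkeeping (OpenHashSet is package-private, so this assumes code inside the
// org.apache.spark package tree; names are hypothetical):
import java.util.concurrent.ConcurrentHashMap
import org.apache.spark.util.collection.OpenHashSet

object MigratedShuffleBookkeepingSketch {
  private val taskIdMapsForShuffle = new ConcurrentHashMap[Int, OpenHashSet[Long]]()

  def recordMigratedMapOutput(shuffleId: Int, mapId: Long): Unit = {
    val mapTaskIds =
      taskIdMapsForShuffle.computeIfAbsent(shuffleId, _ => new OpenHashSet[Long](8))
    mapTaskIds.add(mapId)
  }
}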
* Up to the implementation to support STORAGE_REMOTE_SHUFFLE_MAX_DISK diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockPusher.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockPusher.scala index bd03934cada45..4e3191e44fbdf 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockPusher.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockPusher.scala @@ -27,7 +27,8 @@ import scala.util.control.NonFatal import org.apache.spark.{SecurityManager, ShuffleDependency, SparkConf, SparkContext, SparkEnv} import org.apache.spark.annotation.Since import org.apache.spark.executor.{CoarseGrainedExecutorBackend, ExecutorBackend} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.launcher.SparkLauncher import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer, NioManagedBuffer} @@ -248,7 +249,8 @@ private[spark] class ShuffleBlockPusher(conf: SparkConf) extends Logging { if (!errorHandler.shouldLogError(exception)) { logTrace(s"Pushing block $blockId to $address failed.", exception) } else { - logWarning(s"Pushing block $blockId to $address failed.", exception) + logWarning(log"Pushing block ${MDC(BLOCK_ID, blockId)} " + + log"to ${MDC(HOST_PORT, address)} failed.", exception) } handleResult(PushResult(blockId, exception)) } @@ -329,9 +331,9 @@ private[spark] class ShuffleBlockPusher(conf: SparkConf) extends Logging { unreachableBlockMgrs.add(address) removed += pushRequests.dequeueAll(req => req.address == address).length removed += deferredPushRequests.remove(address).map(_.length).getOrElse(0) - logWarning(s"Received a ConnectException from $address. " + - s"Dropping $removed push-requests and " + - s"not pushing any more blocks to this address.") + logWarning(log"Received a ConnectException from ${MDC(HOST_PORT, address)}. 
" + + log"Dropping ${MDC(NUM_REQUESTS, removed)} push-requests and " + + log"not pushing any more blocks to this address.") } } if (pushResult.failure != null && !errorHandler.shouldRetryError(pushResult.failure)) { @@ -360,7 +362,8 @@ private[spark] class ShuffleBlockPusher(conf: SparkConf) extends Logging { case Some(cb: CoarseGrainedExecutorBackend) => cb.notifyDriverAboutPushCompletion(shuffleId, shuffleMergeId, mapIndex) case Some(eb: ExecutorBackend) => - logWarning(s"Currently $eb doesn't support push-based shuffle") + logWarning(log"Currently ${MDC(EXECUTOR_BACKEND, eb)} " + + log"doesn't support push-based shuffle") case None => } pushCompletionNotified = true diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala index 1ab77643c0364..be42af092f24a 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala @@ -18,7 +18,8 @@ package org.apache.spark.shuffle import org.apache.spark.{ShuffleDependency, SparkEnv, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{NUM_MERGER_LOCATIONS, SHUFFLE_ID, STAGE_ID} import org.apache.spark.scheduler.MapStatus /** @@ -72,8 +73,10 @@ private[spark] class ShuffleWriteProcessor extends Serializable with Logging { if (!dep.shuffleMergeFinalized) { manager.shuffleBlockResolver match { case resolver: IndexShuffleBlockResolver => - logInfo(s"Shuffle merge enabled with ${dep.getMergerLocs.size} merger locations " + - s" for stage ${context.stageId()} with shuffle ID ${dep.shuffleId}") + logInfo(log"Shuffle merge enabled with" + + log" ${MDC(NUM_MERGER_LOCATIONS, dep.getMergerLocs.size)} merger locations" + + log" for stage ${MDC(STAGE_ID, context.stageId())}" + + log" with shuffle ID ${MDC(SHUFFLE_ID, dep.shuffleId)}") logDebug(s"Starting pushing blocks for the task ${context.taskAttemptId()}") val dataFile = resolver.getDataFile(dep.shuffleId, mapId) new ShuffleBlockPusher(SparkEnv.get.conf) diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index 344020935f211..efffda43695cc 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -87,7 +87,8 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager private lazy val shuffleExecutorComponents = loadShuffleExecutorComponents(conf) - override val shuffleBlockResolver = new IndexShuffleBlockResolver(conf) + override val shuffleBlockResolver = + new IndexShuffleBlockResolver(conf, taskIdMapsForShuffle = taskIdMapsForShuffle) /** * Obtains a [[ShuffleHandle]] to pass to tasks. 
@@ -176,7 +177,7 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager metrics, shuffleExecutorComponents) case other: BaseShuffleHandle[K @unchecked, V @unchecked, _] => - new SortShuffleWriter(other, mapId, context, shuffleExecutorComponents) + new SortShuffleWriter(other, mapId, context, metrics, shuffleExecutorComponents) } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala index 8613fe11a4c2f..3be7d24f7e4ec 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala @@ -21,6 +21,7 @@ import org.apache.spark._ import org.apache.spark.internal.{config, Logging} import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleWriter} +import org.apache.spark.shuffle.ShuffleWriteMetricsReporter import org.apache.spark.shuffle.api.ShuffleExecutorComponents import org.apache.spark.util.collection.ExternalSorter @@ -28,6 +29,7 @@ private[spark] class SortShuffleWriter[K, V, C]( handle: BaseShuffleHandle[K, V, C], mapId: Long, context: TaskContext, + writeMetrics: ShuffleWriteMetricsReporter, shuffleExecutorComponents: ShuffleExecutorComponents) extends ShuffleWriter[K, V] with Logging { @@ -46,8 +48,6 @@ private[spark] class SortShuffleWriter[K, V, C]( private var partitionLengths: Array[Long] = _ - private val writeMetrics = context.taskMetrics().shuffleWriteMetrics - /** Write a bunch of records to this task's output */ override def write(records: Iterator[Product2[K, V]]): Unit = { sorter = if (dep.mapSideCombine) { @@ -67,7 +67,7 @@ private[spark] class SortShuffleWriter[K, V, C]( // (see SPARK-3570). 
val mapOutputWriter = shuffleExecutorComponents.createMapOutputWriter( dep.shuffleId, mapId, dep.partitioner.numPartitions) - sorter.writePartitionedMapOutput(dep.shuffleId, mapId, mapOutputWriter) + sorter.writePartitionedMapOutput(dep.shuffleId, mapId, mapOutputWriter, writeMetrics) partitionLengths = mapOutputWriter.commitAllPartitions(sorter.getChecksums).getPartitionLengths mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths, mapId) } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 24f4ff1bd6728..5c93bf4bf77a0 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -26,7 +26,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.CPUS_PER_TASK import org.apache.spark.internal.config.Status._ import org.apache.spark.resource.ResourceProfile.CPUS @@ -662,7 +662,7 @@ private[spark] class AppStatusListener( case e: TaskFailedReason => // All other failure cases Some(e.toErrorString) case other => - logInfo(s"Unhandled task end reason: $other") + logInfo(log"Unhandled task end reason: ${MDC(LogKeys.REASON, other)}") None } task.errorMessage = errorMessage diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 109a9a2e3eb94..87f876467c30e 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -26,7 +26,7 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.{JobExecutionStatus, SparkConf, SparkContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.internal.config.Status.LIVE_UI_LOCAL_STORE_DIR import org.apache.spark.status.AppStatusUtils.getQuantilesValue import org.apache.spark.status.api.v1 @@ -861,7 +861,7 @@ private[spark] object AppStatusStore extends Logging { def createStorePath(rootDir: String): Option[File] = { try { val localDir = Utils.createDirectory(rootDir, "spark-ui") - logInfo(s"Created spark ui store directory at $rootDir") + logInfo(log"Created spark ui store directory at ${MDC(PATH, rootDir)}") Some(localDir) } catch { case e: IOException => diff --git a/core/src/main/scala/org/apache/spark/status/KVUtils.scala b/core/src/main/scala/org/apache/spark/status/KVUtils.scala index 821686803f469..e334626413dc0 100644 --- a/core/src/main/scala/org/apache/spark/status/KVUtils.scala +++ b/core/src/main/scala/org/apache/spark/status/KVUtils.scala @@ -31,7 +31,8 @@ import org.rocksdb.RocksDBException import org.apache.spark.SparkConf import org.apache.spark.deploy.history.{FsHistoryProvider, FsHistoryProviderMetadata} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.History import org.apache.spark.internal.config.History.HYBRID_STORE_DISK_BACKEND import org.apache.spark.internal.config.History.HybridStoreDiskBackend @@ -154,7 +155,7 @@ private[spark] object KVUtils extends Logging { open(dbPath, 
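// [Illustrative aside, not part of the patch] With the change above, SortShuffleWriter no
// longer pulls its metrics from TaskContext; the ShuffleWriteMetricsReporter is injected by
// the caller and threaded through to writePartitionedMapOutput. The call shape from
// SortShuffleManager.getWriter now looks roughly like the following (sketch only; `handle`,
// `mapId`, `context`, `metrics` and `shuffleExecutorComponents` are the surrounding
// parameters in that method):
//
//   new SortShuffleWriter(handle, mapId, context, metrics, shuffleExecutorComponents)
//
// which lets callers supply any reporter instead of the writer being hard-wired to
// context.taskMetrics().shuffleWriteMetrics.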
metadata, conf, live) case dbExc @ (_: NativeDB.DBException | _: RocksDBException) => // Get rid of the corrupted data and re-create it. - logWarning(s"Failed to load disk store $dbPath :", dbExc) + logWarning(log"Failed to load disk store ${MDC(PATH, dbPath)} :", dbExc) Utils.deleteRecursively(dbPath) open(dbPath, metadata, conf, live) } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 7a0c69e294883..6ae1dce57f31c 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -510,7 +510,7 @@ case class StackTrace(elems: Seq[String]) { override def toString: String = elems.mkString def html: NodeSeq = { - val withNewLine = elems.foldLeft(NodeSeq.Empty) { (acc, elem) => + val withNewLine = elems.map(_.stripLineEnd).foldLeft(NodeSeq.Empty) { (acc, elem) => if (acc.isEmpty) { acc :+ Text(elem) } else { diff --git a/core/src/main/scala/org/apache/spark/storage/BlockId.scala b/core/src/main/scala/org/apache/spark/storage/BlockId.scala index 585d9a886b473..6eb015d56b2c7 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockId.scala @@ -170,6 +170,11 @@ case class StreamBlockId(streamId: Int, uniqueId: Long) extends BlockId { override def name: String = "input-" + streamId + "-" + uniqueId } +@DeveloperApi +case class PythonStreamBlockId(streamId: Int, uniqueId: Long) extends BlockId { + override def name: String = "python-stream-" + streamId + "-" + uniqueId +} + /** Id associated with temporary local data managed as blocks. Not serializable. */ private[spark] case class TempLocalBlockId(id: UUID) extends BlockId { override def name: String = "temp_local_" + id @@ -213,6 +218,7 @@ object BlockId { val BROADCAST = "broadcast_([0-9]+)([_A-Za-z0-9]*)".r val TASKRESULT = "taskresult_([0-9]+)".r val STREAM = "input-([0-9]+)-([0-9]+)".r + val PYTHON_STREAM = "python-stream-([0-9]+)-([0-9]+)".r val TEMP_LOCAL = "temp_local_([-A-Fa-f0-9]+)".r val TEMP_SHUFFLE = "temp_shuffle_([-A-Fa-f0-9]+)".r val TEST = "test_(.*)".r @@ -250,6 +256,8 @@ object BlockId { TaskResultBlockId(taskId.toLong) case STREAM(streamId, uniqueId) => StreamBlockId(streamId.toInt, uniqueId.toLong) + case PYTHON_STREAM(streamId, uniqueId) => + PythonStreamBlockId(streamId.toInt, uniqueId.toLong) case TEMP_LOCAL(uuid) => TempLocalBlockId(UUID.fromString(uuid)) case TEMP_SHUFFLE(uuid) => diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 31669e688a197..8655b72310795 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -42,7 +42,7 @@ import org.apache.spark._ import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.executor.DataReadMethod import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.{BLOCK_ID, COUNT, SLEEP_TIME} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{Network, RDD_CACHE_VISIBILITY_TRACKING_ENABLED, Tests} import org.apache.spark.memory.{MemoryManager, MemoryMode} import org.apache.spark.metrics.source.Source @@ -305,7 +305,7 @@ private[spark] class BlockManager( // This is a lazy val so someone can migrating RDDs even if they don't have a MigratableResolver // for shuffles. 
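// [Illustrative aside, not part of the patch] The new PythonStreamBlockId round-trips through
// BlockId.apply like the other block id types, because its name matches the PYTHON_STREAM
// regex registered above. Sketch (the object name is hypothetical):
import org.apache.spark.storage.{BlockId, PythonStreamBlockId}

object PythonStreamBlockIdSketch {
  def roundTrip(): Unit = {
    val id = PythonStreamBlockId(streamId = 1, uniqueId = 7L)
    assert(id.name == "python-stream-1-7")
    assert(BlockId("python-stream-1-7") == id)  // parsed back by BlockId.apply
  }
}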
Used in BlockManagerDecommissioner & block puts. - private[storage] lazy val migratableResolver: MigratableResolver = { + lazy val migratableResolver: MigratableResolver = { shuffleManager.shuffleBlockResolver.asInstanceOf[MigratableResolver] } @@ -423,7 +423,7 @@ private[spark] class BlockManager( saveSerializedValuesToMemoryStore(readToByteBuffer()) } if (!putSucceeded && level.useDisk) { - logWarning(s"Persisting block $blockId to disk instead.") + logWarning(log"Persisting block ${MDC(BLOCK_ID, blockId)} to disk instead.") saveToDiskStore() } } else if (level.useDisk) { @@ -535,7 +535,7 @@ private[spark] class BlockManager( val priorityClass = conf.get(config.STORAGE_REPLICATION_POLICY) val clazz = Utils.classForName(priorityClass) val ret = clazz.getConstructor().newInstance().asInstanceOf[BlockReplicationPolicy] - logInfo(s"Using $priorityClass for block replication policy") + logInfo(log"Using ${MDC(CLASS_NAME, priorityClass)} for block replication policy") ret } @@ -547,7 +547,7 @@ private[spark] class BlockManager( // the registration with the ESS. Therefore, this registration should be prior to // the BlockManager registration. See SPARK-39647. if (externalShuffleServiceEnabled) { - logInfo(s"external shuffle service port = $externalShuffleServicePort") + logInfo(log"external shuffle service port = ${MDC(PORT, externalShuffleServicePort)}") shuffleServerId = BlockManagerId(executorId, blockTransferService.hostName, externalShuffleServicePort) if (!isDriver && !(Utils.isTesting && conf.get(Tests.TEST_SKIP_ESS_REGISTER))) { @@ -585,7 +585,7 @@ private[spark] class BlockManager( } } - logInfo(s"Initialized BlockManager: $blockManagerId") + logInfo(log"Initialized BlockManager: ${MDC(BLOCK_MANAGER_ID, blockManagerId)}") } def shuffleMetricsSource: Source = { @@ -646,7 +646,7 @@ private[spark] class BlockManager( * will be made then. */ private def reportAllBlocks(): Unit = { - logInfo(s"Reporting ${blockInfoManager.size} blocks to the master.") + logInfo(log"Reporting ${MDC(NUM_BLOCKS, blockInfoManager.size)} blocks to the master.") for ((blockId, info) <- blockInfoManager.entries) { val status = getCurrentBlockStatus(blockId, info) if (info.tellMaster && !tryToReportBlockStatus(blockId, status)) { @@ -664,7 +664,7 @@ private[spark] class BlockManager( */ def reregister(): Unit = { // TODO: We might need to rate limit re-registering. - logInfo(s"BlockManager $blockManagerId re-registering with master") + logInfo(log"BlockManager ${MDC(BLOCK_MANAGER_ID, blockManagerId)} re-registering with master") val id = master.registerBlockManager(blockManagerId, diskBlockManager.localDirsString, maxOnHeapMemory, maxOffHeapMemory, storageEndpoint, isReRegister = true) if (id.executorId != BlockManagerId.INVALID_EXECUTOR_ID) { @@ -875,7 +875,7 @@ private[spark] class BlockManager( droppedMemorySize: Long = 0L): Unit = { val needReregister = !tryToReportBlockStatus(blockId, status, droppedMemorySize) if (needReregister) { - logInfo(s"Got told to re-register updating block $blockId") + logInfo(log"Got told to re-register updating block ${MDC(BLOCK_ID, blockId)}") // Re-registering will report our new block for free. asyncReregister() } @@ -1139,8 +1139,9 @@ private[spark] class BlockManager( None } } - logInfo(s"Read $blockId from the disk of a same host executor is " + - (if (res.isDefined) "successful." else "failed.")) + logInfo( + log"Read ${MDC(BLOCK_ID, blockId)} from the disk of a same host executor is " + + log"${MDC(STATUS, if (res.isDefined) "successful." 
else "failed.")}") res }.orElse { fetchRemoteManagedBuffer(blockId, blockSize, locationsAndStatus).map(bufferTransformer) @@ -1214,14 +1215,16 @@ private[spark] class BlockManager( // Give up trying anymore locations. Either we've tried all of the original locations, // or we've refreshed the list of locations from the master, and have still // hit failures after trying locations from the refreshed list. - logWarning(s"Failed to fetch remote block $blockId " + - s"from [${locations.mkString(", ")}] after $totalFailureCount fetch failures. " + - s"Most recent failure cause:", e) + logWarning(log"Failed to fetch remote block ${MDC(BLOCK_ID, blockId)} " + + log"from [${MDC(BLOCK_MANAGER_IDS, locations.mkString(", "))}] " + + log"after ${MDC(NUM_FAILURES, totalFailureCount)} fetch failures. " + + log"Most recent failure cause:", e) return None } - logWarning(s"Failed to fetch remote block $blockId " + - s"from $loc (failed attempt $runningFailureCount)", e) + logWarning(log"Failed to fetch remote block ${MDC(BLOCK_ID, blockId)} " + + log"from ${MDC(BLOCK_MANAGER_ID, loc)} " + + log"(failed attempt ${MDC(NUM_FAILURES, runningFailureCount)}", e) // If there is a large number of executors then locations list can contain a // large number of stale entries causing a large number of retries that may @@ -1306,12 +1309,12 @@ private[spark] class BlockManager( def get[T: ClassTag](blockId: BlockId): Option[BlockResult] = { val local = getLocalValues(blockId) if (local.isDefined) { - logInfo(s"Found block $blockId locally") + logInfo(log"Found block ${MDC(BLOCK_ID, blockId)} locally") return local } val remote = getRemoteValues[T](blockId) if (remote.isDefined) { - logInfo(s"Found block $blockId remotely") + logInfo(log"Found block ${MDC(BLOCK_ID, blockId)} remotely") return remote } None @@ -1335,7 +1338,8 @@ private[spark] class BlockManager( // SPARK-27666. When a task completes, Spark automatically releases all the blocks locked // by this task. We should not release any locks for a task that is already completed. if (taskContext.isDefined && taskContext.get.isCompleted()) { - logWarning(s"Task ${taskAttemptId.get} already completed, not releasing lock for $blockId") + logWarning(log"Task ${MDC(TASK_ATTEMPT_ID, taskAttemptId.get)} " + + log"already completed, not releasing lock for ${MDC(BLOCK_ID, blockId)}") } else { blockInfoManager.unlock(blockId, taskAttemptId) } @@ -1544,7 +1548,8 @@ private[spark] class BlockManager( if (blockInfoManager.lockNewBlockForWriting(blockId, newInfo, keepReadLock)) { newInfo } else { - logWarning(s"Block $blockId already exists on this machine; not re-adding it") + logWarning(log"Block ${MDC(BLOCK_ID, blockId)} " + + log"already exists on this machine; not re-adding it") return None } } @@ -1562,7 +1567,7 @@ private[spark] class BlockManager( blockInfoManager.unlock(blockId) } } else { - logWarning(s"Putting block $blockId failed") + logWarning(log"Putting block ${MDC(BLOCK_ID, blockId)} failed") removeBlockInternal(blockId, tellMaster = false) } res @@ -1570,7 +1575,8 @@ private[spark] class BlockManager( // Since removeBlockInternal may throw exception, // we should print exception first to show root cause. 
case NonFatal(e) => - logWarning(s"Putting block $blockId failed due to exception $e.") + logWarning(log"Putting block ${MDC(BLOCK_ID, blockId)} " + + log"failed due to exception ${MDC(ERROR, e)}.") throw e } finally { // This cleanup is performed in a finally block rather than a `catch` to avoid having to @@ -1631,7 +1637,7 @@ private[spark] class BlockManager( case Left(iter) => // Not enough space to unroll this block; drop to disk if applicable if (level.useDisk) { - logWarning(s"Persisting block $blockId to disk instead.") + logWarning(log"Persisting block ${MDC(BLOCK_ID, blockId)} to disk instead.") diskStore.put(blockId) { channel => val out = Channels.newOutputStream(channel) serializerManager.dataSerializeStream(blockId, out, iter)(classTag) @@ -1648,7 +1654,7 @@ private[spark] class BlockManager( case Left(partiallySerializedValues) => // Not enough space to unroll this block; drop to disk if applicable if (level.useDisk) { - logWarning(s"Persisting block $blockId to disk instead.") + logWarning(log"Persisting block ${MDC(BLOCK_ID, blockId)} to disk instead.") diskStore.put(blockId) { channel => val out = Channels.newOutputStream(channel) partiallySerializedValues.finishWritingToStream(out) @@ -1815,7 +1821,8 @@ private[spark] class BlockManager( existingReplicas: Set[BlockManagerId], maxReplicas: Int, maxReplicationFailures: Option[Int] = None): Boolean = { - logInfo(s"Using $blockManagerId to pro-actively replicate $blockId") + logInfo(log"Using ${MDC(BLOCK_MANAGER_ID, blockManagerId)} to pro-actively replicate " + + log"${MDC(BLOCK_ID, blockId)}") blockInfoManager.lockForReading(blockId).forall { info => val data = doGetLocalBytes(blockId, info) val storageLevel = StorageLevel( @@ -1904,7 +1911,9 @@ private[spark] class BlockManager( throw e // Everything else we may retry case NonFatal(e) => - logWarning(s"Failed to replicate $blockId to $peer, failure #$numFailures", e) + logWarning(log"Failed to replicate ${MDC(BLOCK_ID, blockId)} " + + log"to ${MDC(PEER, peer)}, " + + log"failure #${MDC(NUM_FAILURES, numFailures)}", e) peersFailedToReplicateTo += peer // we have a failed replication, so we get the list of peers again // we don't want peers we have already replicated to and the ones that @@ -1925,8 +1934,9 @@ private[spark] class BlockManager( logDebug(s"Replicating $blockId of ${data.size} bytes to " + s"${peersReplicatedTo.size} peer(s) took ${(System.nanoTime - startTime) / 1e6} ms") if (peersReplicatedTo.size < numPeersToReplicateTo) { - logWarning(s"Block $blockId replicated to only " + - s"${peersReplicatedTo.size} peer(s) instead of $numPeersToReplicateTo peers") + logWarning(log"Block ${MDC(BLOCK_ID, blockId)} replicated to only " + + log"${MDC(NUM_PEERS_REPLICATED_TO, peersReplicatedTo.size)} peer(s) " + + log"instead of ${MDC(NUM_PEERS_TO_REPLICATE_TO, numPeersToReplicateTo)} peers") return false } @@ -1969,14 +1979,14 @@ private[spark] class BlockManager( private[storage] override def dropFromMemory[T: ClassTag]( blockId: BlockId, data: () => Either[Array[T], ChunkedByteBuffer]): StorageLevel = { - logInfo(s"Dropping block $blockId from memory") + logInfo(log"Dropping block ${MDC(BLOCK_ID, blockId)} from memory") val info = blockInfoManager.assertBlockIsLockedForWriting(blockId) var blockIsUpdated = false val level = info.level // Drop to disk, if storage level requires if (level.useDisk && !diskStore.contains(blockId)) { - logInfo(s"Writing block $blockId to disk") + logInfo(log"Writing block ${MDC(BLOCK_ID, blockId)} to disk") data() match { case Left(elements) 
=> diskStore.put(blockId) { channel => @@ -1999,7 +2009,8 @@ private[spark] class BlockManager( if (blockIsRemoved) { blockIsUpdated = true } else { - logWarning(s"Block $blockId could not be dropped from memory as it does not exist") + logWarning(log"Block ${MDC(BLOCK_ID, blockId)} " + + log"could not be dropped from memory as it does not exist") } val status = getCurrentBlockStatus(blockId, info) @@ -2019,7 +2030,7 @@ private[spark] class BlockManager( */ def removeRdd(rddId: Int): Int = { // TODO: Avoid a linear scan by creating another mapping of RDD.id to blocks. - logInfo(s"Removing RDD $rddId") + logInfo(log"Removing RDD ${MDC(RDD_ID, rddId)}") val blocksToRemove = blockInfoManager.entries.flatMap(_._1.asRDDId).filter(_.rddId == rddId) blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster = false) } blocksToRemove.size @@ -2083,7 +2094,8 @@ private[spark] class BlockManager( blockInfoManager.lockForWriting(blockId) match { case None => // The block has already been removed; do nothing. - logWarning(s"Asked to remove block $blockId, which does not exist") + logWarning(log"Asked to remove block ${MDC(BLOCK_ID, blockId)}, " + + log"which does not exist") case Some(info) => removeBlockInternal(blockId, tellMaster = tellMaster && info.tellMaster) addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty) @@ -2106,7 +2118,8 @@ private[spark] class BlockManager( val removedFromMemory = memoryStore.remove(blockId) val removedFromDisk = diskStore.remove(blockId) if (!removedFromMemory && !removedFromDisk) { - logWarning(s"Block $blockId could not be removed as it was not found on disk or in memory") + logWarning(log"Block ${MDC(BLOCK_ID, blockId)} " + + log"could not be removed as it was not found on disk or in memory") } blockInfoManager.removeBlock(blockId) @@ -2118,7 +2131,7 @@ private[spark] class BlockManager( } } finally { if (!hasRemoveBlock) { - logWarning(s"Block $blockId was not removed normally.") + logWarning(log"Block ${MDC(BLOCK_ID, blockId)} was not removed normally.") blockInfoManager.removeBlock(blockId) } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala index 5b4ecef233f8f..19807453ee28c 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala @@ -27,7 +27,7 @@ import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.SHUFFLE_BLOCK_INFO +import org.apache.spark.internal.LogKeys._ import org.apache.spark.shuffle.ShuffleBlockInfo import org.apache.spark.storage.BlockManagerMessages.ReplicateBlock import org.apache.spark.util.{ThreadUtils, Utils} @@ -73,13 +73,15 @@ private[storage] class BlockManagerDecommissioner( private def allowRetry(shuffleBlock: ShuffleBlockInfo, failureNum: Int): Boolean = { if (failureNum < maxReplicationFailuresForDecommission) { - logInfo(s"Add $shuffleBlock back to migration queue for " + - s"retry ($failureNum / $maxReplicationFailuresForDecommission)") + logInfo(log"Add ${MDC(SHUFFLE_BLOCK_INFO, shuffleBlock)} back to migration queue for " + + log" retry (${MDC(FAILURES, failureNum)} / " + + log"${MDC(MAX_ATTEMPTS, maxReplicationFailuresForDecommission)})") // The block needs to retry so we should not mark it as finished 
shufflesToMigrate.add((shuffleBlock, failureNum)) } else { - logWarning(s"Give up migrating $shuffleBlock since it's been " + - s"failed for $maxReplicationFailuresForDecommission times") + logWarning(log"Give up migrating ${MDC(SHUFFLE_BLOCK_INFO, shuffleBlock)} " + + log"since it's been failed for " + + log"${MDC(MAX_ATTEMPTS, maxReplicationFailuresForDecommission)} times") false } } @@ -97,7 +99,7 @@ private[storage] class BlockManagerDecommissioner( } override def run(): Unit = { - logInfo(s"Starting shuffle block migration thread for $peer") + logInfo(log"Starting shuffle block migration thread for ${MDC(PEER, peer)}") // Once a block fails to transfer to an executor stop trying to transfer more blocks while (keepRunning) { try { @@ -106,10 +108,12 @@ private[storage] class BlockManagerDecommissioner( var isTargetDecommissioned = false // We only migrate a shuffle block when both index file and data file exist. if (blocks.isEmpty) { - logInfo(s"Ignore deleted shuffle block $shuffleBlockInfo") + logInfo(log"Ignore deleted shuffle block ${MDC(SHUFFLE_BLOCK_INFO, shuffleBlockInfo)}") } else { - logInfo(s"Got migration sub-blocks $blocks. Trying to migrate $shuffleBlockInfo " + - s"to $peer ($retryCount / $maxReplicationFailuresForDecommission)") + logInfo(log"Got migration sub-blocks ${MDC(BLOCK_IDS, blocks)}. Trying to migrate " + + log"${MDC(SHUFFLE_BLOCK_INFO, shuffleBlockInfo)} to ${MDC(PEER, peer)} " + + log"(${MDC(NUM_RETRY, retryCount)} / " + + log"${MDC(MAX_ATTEMPTS, maxReplicationFailuresForDecommission)}") // Migrate the components of the blocks. try { val startTime = System.currentTimeMillis() @@ -129,9 +133,10 @@ private[storage] class BlockManagerDecommissioner( logDebug(s"Migrated sub-block $blockId") } } - logInfo(s"Migrated $shuffleBlockInfo (" + - s"size: ${Utils.bytesToString(blocks.map(b => b._2.size()).sum)}) to $peer " + - s"in ${System.currentTimeMillis() - startTime} ms") + logInfo(log"Migrated ${MDC(SHUFFLE_BLOCK_INFO, shuffleBlockInfo)} (" + + log"size: ${MDC(SIZE, Utils.bytesToString(blocks.map(b => b._2.size()).sum))}) " + + log"to ${MDC(PEER, peer)} in " + + log"${MDC(DURATION, System.currentTimeMillis() - startTime)} ms") } catch { case e @ ( _ : IOException | _ : SparkException) => // If a block got deleted before netty opened the file handle, then trying to @@ -140,7 +145,8 @@ private[storage] class BlockManagerDecommissioner( // could also happen with manually managed shuffles or a GC event on the // driver a no longer referenced RDD with shuffle files. 
if (bm.migratableResolver.getMigrationBlocks(shuffleBlockInfo).size < blocks.size) { - logWarning(s"Skipping block $shuffleBlockInfo, block deleted.") + logWarning(log"Skipping block ${MDC(SHUFFLE_BLOCK_INFO, shuffleBlockInfo)}, " + + log"block deleted.") } else if (fallbackStorage.isDefined // Confirm peer is not the fallback BM ID because fallbackStorage would already // have been used in the try-block above so there's no point trying again @@ -165,7 +171,7 @@ private[storage] class BlockManagerDecommissioner( if (keepRunning) { numMigratedShuffles.incrementAndGet() } else { - logWarning(s"Stop migrating shuffle blocks to $peer") + logWarning(log"Stop migrating shuffle blocks to ${MDC(PEER, peer)}") val newRetryCount = if (isTargetDecommissioned) { retryCount @@ -179,7 +185,11 @@ private[storage] class BlockManagerDecommissioner( } } catch { case _: InterruptedException => - logInfo(s"Stop shuffle block migration${if (keepRunning) " unexpectedly"}.") + if (keepRunning) { + logInfo("Stop shuffle block migration unexpectedly.") + } else { + logInfo("Stop shuffle block migration.") + } keepRunning = false case NonFatal(e) => keepRunning = false @@ -232,12 +242,16 @@ private[storage] class BlockManagerDecommissioner( logInfo("Attempting to migrate all cached RDD blocks") rddBlocksLeft = decommissionRddCacheBlocks() lastRDDMigrationTime = startTime - logInfo(s"Finished current round RDD blocks migration, " + - s"waiting for ${sleepInterval}ms before the next round migration.") + logInfo(log"Finished current round RDD blocks migration, " + + log"waiting for ${MDC(SLEEP_TIME, sleepInterval)}ms before the next round migration.") Thread.sleep(sleepInterval) } catch { case _: InterruptedException => - logInfo(s"Stop RDD blocks migration${if (!stopped && !stoppedRDD) " unexpectedly"}.") + if (!stopped && !stoppedRDD) { + logInfo("Stop RDD blocks migration unexpectedly.") + } else { + logInfo("Stop RDD blocks migration.") + } stoppedRDD = true case NonFatal(e) => logError("Error occurred during RDD blocks migration.", e) @@ -263,8 +277,9 @@ private[storage] class BlockManagerDecommissioner( val startTime = System.nanoTime() shuffleBlocksLeft = refreshMigratableShuffleBlocks() lastShuffleMigrationTime = startTime - logInfo(s"Finished current round refreshing migratable shuffle blocks, " + - s"waiting for ${sleepInterval}ms before the next round refreshing.") + logInfo(log"Finished current round refreshing migratable shuffle blocks, " + + log"waiting for ${MDC(SLEEP_TIME, sleepInterval)}ms before the " + + log"next round refreshing.") Thread.sleep(sleepInterval) } catch { case _: InterruptedException if stopped => @@ -300,8 +315,9 @@ private[storage] class BlockManagerDecommissioner( shufflesToMigrate.addAll(newShufflesToMigrate.map(x => (x, 0)).asJava) migratingShuffles ++= newShufflesToMigrate val remainedShuffles = migratingShuffles.size - numMigratedShuffles.get() - logInfo(s"${newShufflesToMigrate.size} of ${localShuffles.size} local shuffles " + - s"are added. In total, $remainedShuffles shuffles are remained.") + logInfo(log"${MDC(COUNT, newShufflesToMigrate.size)} of " + + log"${MDC(TOTAL, localShuffles.size)} local shuffles are added. " + + log"In total, ${MDC(NUM_REMAINED, remainedShuffles)} shuffles are remained.") // Update the threads doing migrations val livePeerSet = bm.getPeers(false).toSet @@ -348,10 +364,11 @@ private[storage] class BlockManagerDecommissioner( // Refresh peers and validate we have somewhere to move blocks. 
if (replicateBlocksInfo.nonEmpty) { - logInfo(s"Need to replicate ${replicateBlocksInfo.size} RDD blocks " + - "for block manager decommissioning") + logInfo( + log"Need to replicate ${MDC(NUM_REPLICAS, replicateBlocksInfo.size)} RDD blocks " + + log"for block manager decommissioning") } else { - logWarning(s"Asked to decommission RDD cache blocks, but no blocks to migrate") + logWarning("Asked to decommission RDD cache blocks, but no blocks to migrate") return false } @@ -362,8 +379,8 @@ private[storage] class BlockManagerDecommissioner( (replicateBlock.blockId, replicatedSuccessfully) }.filterNot(_._2).map(_._1) if (blocksFailedReplication.nonEmpty) { - logWarning("Blocks failed replication in cache decommissioning " + - s"process: ${blocksFailedReplication.mkString(",")}") + logWarning(log"Blocks failed replication in cache decommissioning " + + log"process: ${MDC(BLOCK_IDS, blocksFailedReplication.mkString(","))}") return true } false @@ -376,11 +393,12 @@ private[storage] class BlockManagerDecommissioner( blockToReplicate.maxReplicas, maxReplicationFailures = Some(maxReplicationFailuresForDecommission)) if (replicatedSuccessfully) { - logInfo(s"Block ${blockToReplicate.blockId} migrated successfully, Removing block now") + logInfo(log"Block ${MDC(BLOCK_ID, blockToReplicate.blockId)} migrated " + + log"successfully, Removing block now") bm.removeBlock(blockToReplicate.blockId) - logInfo(s"Block ${blockToReplicate.blockId} removed") + logInfo(log"Block ${MDC(BLOCK_ID, blockToReplicate.blockId)} removed") } else { - logWarning(s"Failed to migrate block ${blockToReplicate.blockId}") + logWarning(log"Failed to migrate block ${MDC(BLOCK_ID, blockToReplicate.blockId)}") } replicatedSuccessfully } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index 2b961317e01d9..276bd63e14237 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -23,7 +23,8 @@ import scala.concurrent.Future import org.apache.spark.SparkConf import org.apache.spark.errors.SparkCoreErrors -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{RpcUtils, ThreadUtils} @@ -41,7 +42,7 @@ class BlockManagerMaster( /** Remove a dead executor from the driver endpoint. This is only called on the driver side. 
*/ def removeExecutor(execId: String): Unit = { tell(RemoveExecutor(execId)) - logInfo("Removed " + execId + " successfully in removeExecutor") + logInfo(log"Removed ${MDC(EXECUTOR_ID, execId)} successfully in removeExecutor") } /** Decommission block managers corresponding to given set of executors @@ -61,7 +62,7 @@ class BlockManagerMaster( */ def removeExecutorAsync(execId: String): Unit = { driverEndpoint.ask[Boolean](RemoveExecutor(execId)) - logInfo("Removal of executor " + execId + " requested") + logInfo(log"Removal of executor ${MDC(EXECUTOR_ID, execId)} requested") } /** @@ -76,7 +77,7 @@ class BlockManagerMaster( maxOffHeapMemSize: Long, storageEndpoint: RpcEndpointRef, isReRegister: Boolean = false): BlockManagerId = { - logInfo(s"Registering BlockManager $id") + logInfo(log"Registering BlockManager ${MDC(BLOCK_MANAGER_ID, id)}") val updatedId = driverEndpoint.askSync[BlockManagerId]( RegisterBlockManager( id, @@ -89,9 +90,9 @@ class BlockManagerMaster( ) if (updatedId.executorId == BlockManagerId.INVALID_EXECUTOR_ID) { assert(isReRegister, "Got invalid executor id from non re-register case") - logInfo(s"Re-register BlockManager $id failed") + logInfo(log"Re-register BlockManager ${MDC(BLOCK_MANAGER_ID, id)} failed") } else { - logInfo(s"Registered BlockManager $updatedId") + logInfo(log"Registered BlockManager ${MDC(BLOCK_MANAGER_ID, updatedId)}") } updatedId } @@ -189,7 +190,8 @@ class BlockManagerMaster( def removeRdd(rddId: Int, blocking: Boolean): Unit = { val future = driverEndpoint.askSync[Future[Seq[Int]]](RemoveRdd(rddId)) future.failed.foreach(e => - logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}", e) + logWarning(log"Failed to remove RDD ${MDC(RDD_ID, rddId)} - " + + log"${MDC(ERROR, e.getMessage)}", e) )(ThreadUtils.sameThread) if (blocking) { // the underlying Futures will timeout anyway, so it's safe to use infinite timeout here @@ -201,7 +203,8 @@ class BlockManagerMaster( def removeShuffle(shuffleId: Int, blocking: Boolean): Unit = { val future = driverEndpoint.askSync[Future[Seq[Boolean]]](RemoveShuffle(shuffleId)) future.failed.foreach(e => - logWarning(s"Failed to remove shuffle $shuffleId - ${e.getMessage}", e) + logWarning(log"Failed to remove shuffle ${MDC(SHUFFLE_ID, shuffleId)} - " + + log"${MDC(ERROR, e.getMessage)}", e) )(ThreadUtils.sameThread) if (blocking) { // the underlying Futures will timeout anyway, so it's safe to use infinite timeout here @@ -214,8 +217,9 @@ class BlockManagerMaster( val future = driverEndpoint.askSync[Future[Seq[Int]]]( RemoveBroadcast(broadcastId, removeFromMaster)) future.failed.foreach(e => - logWarning(s"Failed to remove broadcast $broadcastId" + - s" with removeFromMaster = $removeFromMaster - ${e.getMessage}", e) + logWarning(log"Failed to remove broadcast ${MDC(BROADCAST_ID, broadcastId)}" + + log" with removeFromMaster = ${MDC(REMOVE_FROM_MASTER, removeFromMaster)} - " + + log"${MDC(ERROR, e.getMessage)}", e) )(ThreadUtils.sameThread) if (blocking) { // the underlying Futures will timeout anyway, so it's safe to use infinite timeout here diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 5bb4e096c029c..73f89ea0e86e5 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -32,7 +32,7 @@ import com.google.common.cache.CacheBuilder import 
org.apache.spark.{MapOutputTrackerMaster, SparkConf, SparkContext, SparkEnv} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.{BLOCK_MANAGER_ID, EXECUTOR_ID, OLD_BLOCK_MANAGER_ID} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.RDD_CACHE_VISIBILITY_TRACKING_ENABLED import org.apache.spark.network.shuffle.{ExternalBlockStoreClient, RemoteBlockPushResolver} import org.apache.spark.rpc.{IsolatedThreadSafeRpcEndpoint, RpcCallContext, RpcEndpointRef, RpcEnv} @@ -110,7 +110,7 @@ class BlockManagerMasterEndpoint( val clazz = Utils.classForName(topologyMapperClassName) val mapper = clazz.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[TopologyMapper] - logInfo(s"Using $topologyMapperClassName for getting topology information") + logInfo(log"Using ${MDC(CLASS_NAME, topologyMapperClassName)} for getting topology information") mapper } @@ -218,7 +218,8 @@ class BlockManagerMasterEndpoint( // executor is notified(see BlockManager.decommissionSelf), so we don't need to send the // notification here. val bms = executorIds.flatMap(blockManagerIdByExecutor.get) - logInfo(s"Mark BlockManagers (${bms.mkString(", ")}) as being decommissioning.") + logInfo(log"Mark BlockManagers (${MDC(BLOCK_MANAGER_IDS, bms.mkString(", "))}) as " + + log"being decommissioning.") decommissioningBlockManagerSet ++= bms context.reply(true) @@ -314,8 +315,9 @@ class BlockManagerMasterEndpoint( defaultValue: T): PartialFunction[Throwable, T] = { case e: IOException => if (!SparkContext.getActive.map(_.isStopped).getOrElse(true)) { - logWarning(s"Error trying to remove $blockType $blockId" + - s" from block manager $bmId", e) + logWarning(log"Error trying to remove ${MDC(BLOCK_TYPE, blockType)} " + + log"${MDC(BLOCK_ID, blockId)}" + + log" from block manager ${MDC(BLOCK_MANAGER_ID, bmId)}", e) } defaultValue @@ -333,8 +335,9 @@ class BlockManagerMasterEndpoint( false } if (!isAlive) { - logWarning(s"Error trying to remove $blockType $blockId. " + - s"The executor $executorId may have been lost.", t) + logWarning(log"Error trying to remove ${MDC(BLOCK_TYPE, blockType)} " + + log"${MDC(BLOCK_ID, blockId)}. " + + log"The executor ${MDC(EXECUTOR_ID, executorId)} may have been lost.", t) defaultValue } else { throw t @@ -516,7 +519,7 @@ class BlockManagerMasterEndpoint( // etc.) as replication doesn't make much sense in that context. 
if (locations.isEmpty) { blockLocations.remove(blockId) - logWarning(s"No more replicas available for $blockId !") + logWarning(log"No more replicas available for ${MDC(BLOCK_ID, blockId)}!") } else if (proactivelyReplicate && (blockId.isRDD || blockId.isInstanceOf[TestBlockId])) { // As a heuristic, assume single executor failure to find out the number of replicas that // existed before failure @@ -533,7 +536,7 @@ class BlockManagerMasterEndpoint( } listenerBus.post(SparkListenerBlockManagerRemoved(System.currentTimeMillis(), blockManagerId)) - logInfo(s"Removing block manager $blockManagerId") + logInfo(log"Removing block manager ${MDC(BLOCK_MANAGER_ID, blockManagerId)}") } @@ -549,7 +552,7 @@ class BlockManagerMasterEndpoint( } private def removeExecutor(execId: String): Unit = { - logInfo("Trying to remove executor " + execId + " from BlockManagerMaster.") + logInfo(log"Trying to remove executor ${MDC(EXECUTOR_ID, execId)} from BlockManagerMaster.") blockManagerIdByExecutor.get(execId).foreach(removeBlockManager) } @@ -705,8 +708,9 @@ class BlockManagerMasterEndpoint( removeExecutor(id.executorId) case None => } - logInfo("Registering block manager %s with %s RAM, %s".format( - id.hostPort, Utils.bytesToString(maxOnHeapMemSize + maxOffHeapMemSize), id)) + logInfo(log"Registering block manager ${MDC(HOST_PORT, id.hostPort)} with " + + log"${MDC(MEMORY_SIZE, Utils.bytesToString(maxOnHeapMemSize + maxOffHeapMemSize))} RAM, " + + log"${MDC(BLOCK_MANAGER_ID, id)}") blockManagerIdByExecutor(id.executorId) = id @@ -736,8 +740,8 @@ class BlockManagerMasterEndpoint( assert(!blockManagerInfo.contains(id), "BlockManager re-registration shouldn't succeed when the executor is lost") - logInfo(s"BlockManager ($id) re-registration is rejected since " + - s"the executor (${id.executorId}) has been lost") + logInfo(log"BlockManager (${MDC(BLOCK_MANAGER_ID, id)}) re-registration is rejected since " + + log"the executor (${MDC(EXECUTOR_ID, id.executorId)}) has been lost") // Use "invalid" as the return executor id to indicate the block manager that // re-registration failed. 
It's a bit hacky but fine since the returned block @@ -1055,26 +1059,30 @@ private[spark] class BlockManagerInfo( _blocks.put(blockId, blockStatus) _remainingMem -= memSize if (blockExists) { - logInfo(s"Updated $blockId in memory on ${blockManagerId.hostPort}" + - s" (current size: ${Utils.bytesToString(memSize)}," + - s" original size: ${Utils.bytesToString(originalMemSize)}," + - s" free: ${Utils.bytesToString(_remainingMem)})") + logInfo(log"Updated ${MDC(BLOCK_ID, blockId)} in memory on " + + log"${MDC(HOST_PORT, blockManagerId.hostPort)} (current size: " + + log"${MDC(CURRENT_MEMORY_SIZE, Utils.bytesToString(memSize))}, original " + + log"size: ${MDC(ORIGINAL_MEMORY_SIZE, Utils.bytesToString(originalMemSize))}, " + + log"free: ${MDC(FREE_MEMORY_SIZE, Utils.bytesToString(_remainingMem))})") } else { - logInfo(s"Added $blockId in memory on ${blockManagerId.hostPort}" + - s" (size: ${Utils.bytesToString(memSize)}," + - s" free: ${Utils.bytesToString(_remainingMem)})") + logInfo(log"Added ${MDC(BLOCK_ID, blockId)} in memory on " + + log"${MDC(HOST_PORT, blockManagerId.hostPort)} " + + log"(size: ${MDC(CURRENT_MEMORY_SIZE, Utils.bytesToString(memSize))}, " + + log"free: ${MDC(FREE_MEMORY_SIZE, Utils.bytesToString(_remainingMem))})") } } if (storageLevel.useDisk) { blockStatus = BlockStatus(storageLevel, memSize = 0, diskSize = diskSize) _blocks.put(blockId, blockStatus) if (blockExists) { - logInfo(s"Updated $blockId on disk on ${blockManagerId.hostPort}" + - s" (current size: ${Utils.bytesToString(diskSize)}," + - s" original size: ${Utils.bytesToString(originalDiskSize)})") + logInfo(log"Updated ${MDC(BLOCK_ID, blockId)} on disk on " + + log"${MDC(HOST_PORT, blockManagerId.hostPort)} " + + log"(current size: ${MDC(CURRENT_DISK_SIZE, Utils.bytesToString(diskSize))}," + + log" original size: ${MDC(ORIGINAL_DISK_SIZE, Utils.bytesToString(originalDiskSize))})") } else { - logInfo(s"Added $blockId on disk on ${blockManagerId.hostPort}" + - s" (size: ${Utils.bytesToString(diskSize)})") + logInfo(log"Added ${MDC(BLOCK_ID, blockId)} on disk on " + + log"${MDC(HOST_PORT, blockManagerId.hostPort)} (size: " + + log"${MDC(CURRENT_DISK_SIZE, Utils.bytesToString(diskSize))})") } } @@ -1090,13 +1098,15 @@ private[spark] class BlockManagerInfo( blockStatus.remove(blockId) } if (originalLevel.useMemory) { - logInfo(s"Removed $blockId on ${blockManagerId.hostPort} in memory" + - s" (size: ${Utils.bytesToString(originalMemSize)}," + - s" free: ${Utils.bytesToString(_remainingMem)})") + logInfo(log"Removed ${MDC(BLOCK_ID, blockId)} on " + + log"${MDC(HOST_PORT, blockManagerId.hostPort)} in memory " + + log"(size: ${MDC(ORIGINAL_MEMORY_SIZE, Utils.bytesToString(originalMemSize))}, " + + log"free: ${MDC(FREE_MEMORY_SIZE, Utils.bytesToString(_remainingMem))})") } if (originalLevel.useDisk) { - logInfo(s"Removed $blockId on ${blockManagerId.hostPort} on disk" + - s" (size: ${Utils.bytesToString(originalDiskSize)})") + logInfo(log"Removed ${MDC(BLOCK_ID, blockId)} on " + + log"${MDC(HOST_PORT, blockManagerId.hostPort)} on disk" + + log" (size: ${MDC(ORIGINAL_DISK_SIZE, Utils.bytesToString(originalDiskSize))})") } } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerStorageEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerStorageEndpoint.scala index 1fccbd16ced5b..686ac1eb786e0 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerStorageEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerStorageEndpoint.scala @@ -21,7 +21,7 
@@ import scala.concurrent.{ExecutionContext, ExecutionContextExecutorService, Futu import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.{Logging, MDC, MessageWithContext} -import org.apache.spark.internal.LogKey.{BLOCK_ID, BROADCAST_ID, RDD_ID, SHUFFLE_ID} +import org.apache.spark.internal.LogKeys.{BLOCK_ID, BROADCAST_ID, RDD_ID, SHUFFLE_ID} import org.apache.spark.rpc.{IsolatedThreadSafeRpcEndpoint, RpcCallContext, RpcEnv} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} diff --git a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala index 893b5605414e4..5186cbfa217cc 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala @@ -21,7 +21,8 @@ import scala.collection.mutable import scala.util.Random import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ /** * ::DeveloperApi:: @@ -120,7 +121,8 @@ class RandomBlockReplicationPolicy BlockReplicationUtils.getRandomSample(peers, numReplicas, random) } else { if (peers.size < numReplicas) { - logWarning(s"Expecting ${numReplicas} replicas with only ${peers.size} peer/s.") + logWarning(log"Expecting ${MDC(NUM_REPLICAS, numReplicas)} " + + log"replicas with only ${MDC(NUM_PEERS, peers.size)} peer/s.") } random.shuffle(peers).toList } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 4c0b5f4a14f64..72d8dc0b19d21 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -31,7 +31,7 @@ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.executor.ExecutorExitCode import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.{MERGE_DIR_NAME, PATH} +import org.apache.spark.internal.LogKeys.{MERGE_DIR_NAME, PATH} import org.apache.spark.network.shuffle.ExecutorDiskUtils import org.apache.spark.storage.DiskBlockManager.ATTEMPT_ID_KEY import org.apache.spark.storage.DiskBlockManager.MERGE_DIR_KEY @@ -252,7 +252,7 @@ private[spark] class DiskBlockManager( Utils.getConfiguredLocalDirs(conf).flatMap { rootDir => try { val localDir = Utils.createDirectory(rootDir, "blockmgr") - logInfo(s"Created local directory at $localDir") + logInfo(log"Created local directory at ${MDC(PATH, localDir)}") Some(localDir) } catch { case e: IOException => @@ -290,7 +290,7 @@ private[spark] class DiskBlockManager( } } } - logInfo(s"Merge directory and its sub dirs get created at $mergeDir") + logInfo(log"Merge directory and its sub dirs get created at ${MDC(PATH, mergeDir)}") } catch { case e: IOException => logError( @@ -325,8 +325,8 @@ private[spark] class DiskBlockManager( logDebug(s"Created directory at ${dirToCreate.getAbsolutePath} with permission 770") } catch { case e: SecurityException => - logWarning(s"Failed to create directory ${dirToCreate.getAbsolutePath} " + - s"with permission 770", e) + logWarning(log"Failed to create directory ${MDC(PATH, dirToCreate.getAbsolutePath)} " + + log"with permission 770", e) created = null; } } diff --git 
a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index 0b6e33ff5fb37..efcdb7fa8c69e 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -25,7 +25,7 @@ import java.util.zip.Checksum import org.apache.spark.SparkException import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, PATH} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.io.MutableCheckedOutputStream import org.apache.spark.serializer.{SerializationStream, SerializerInstance, SerializerManager} import org.apache.spark.shuffle.ShuffleWriteMetricsReporter @@ -126,6 +126,12 @@ private[spark] class DiskBlockObjectWriter( */ private var numRecordsCommitted = 0L + // For testing only. + private[storage] def getSerializerWrappedStream: OutputStream = bs + + // For testing only. + private[storage] def getSerializationStream: SerializationStream = objOut + /** * Set the checksum that the checksumOutputStream should use */ @@ -174,19 +180,36 @@ private[spark] class DiskBlockObjectWriter( * Should call after committing or reverting partial writes. */ private def closeResources(): Unit = { - if (initialized) { - Utils.tryWithSafeFinally { - mcs.manualClose() - } { - channel = null - mcs = null - bs = null - fos = null - ts = null - objOut = null - initialized = false - streamOpen = false - hasBeenClosed = true + try { + if (streamOpen) { + Utils.tryWithSafeFinally { + if (null != objOut) objOut.close() + bs = null + } { + objOut = null + if (null != bs) bs.close() + bs = null + } + } + } catch { + case e: IOException => + logInfo(log"Exception occurred while closing the output stream" + + log"${MDC(ERROR, e.getMessage)}") + } finally { + if (initialized) { + Utils.tryWithSafeFinally { + mcs.manualClose() + } { + channel = null + mcs = null + bs = null + fos = null + ts = null + objOut = null + initialized = false + streamOpen = false + hasBeenClosed = true + } } } } @@ -297,7 +320,7 @@ private[spark] class DiskBlockObjectWriter( } } { if (!Files.deleteIfExists(file.toPath)) { - logWarning(s"Error deleting $file") + logWarning(log"Error deleting ${MDC(FILE_NAME, file)}") } } } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala index 54c5d0b2dce71..1498b224b0c92 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala @@ -30,7 +30,8 @@ import io.netty.channel.DefaultFileRegion import org.apache.commons.io.FileUtils import org.apache.spark.{SecurityManager, SparkConf, SparkException} -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.util.{AbstractFileRegion, JavaUtils} import org.apache.spark.security.CryptoStreamUtils @@ -62,7 +63,7 @@ private[spark] class DiskStore( */ def put(blockId: BlockId)(writeFunc: WritableByteChannel => Unit): Unit = { if (contains(blockId)) { - logWarning(s"Block $blockId is already present in the disk store") + logWarning(log"Block ${MDC(BLOCK_ID, blockId)} is already present in the disk store") try { diskManager.getFile(blockId).delete() } catch { 
@@ -133,7 +134,7 @@ private[spark] class DiskStore( if (file.exists()) { val ret = file.delete() if (!ret) { - logWarning(s"Error deleting ${file.getPath()}") + logWarning(log"Error deleting ${MDC(PATH, file.getPath())}") } ret } else { @@ -148,6 +149,7 @@ private[spark] class DiskStore( def moveFileToBlock(sourceFile: File, blockSize: Long, targetBlockId: BlockId): Unit = { blockSizes.put(targetBlockId, blockSize) val targetFile = diskManager.getFile(targetBlockId.name) + logDebug(s"${sourceFile.getPath()} -> ${targetFile.getPath()}") FileUtils.moveFile(sourceFile, targetFile) } diff --git a/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala b/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala index 161120393490f..0f2bfaede4454 100644 --- a/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala +++ b/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{STORAGE_DECOMMISSION_FALLBACK_STORAGE_CLEANUP, STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH} import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.util.JavaUtils @@ -84,7 +85,7 @@ private[storage] class FallbackStorage(conf: SparkConf) extends Logging { } } case r => - logWarning(s"Unsupported Resolver: ${r.getClass.getName}") + logWarning(log"Unsupported Resolver: ${MDC(CLASS_NAME, r.getClass.getName)}") } } @@ -138,10 +139,10 @@ private[spark] object FallbackStorage extends Logging { // The fallback directory for this app may not be created yet. if (fallbackFileSystem.exists(fallbackPath)) { if (fallbackFileSystem.delete(fallbackPath, true)) { - logInfo(s"Succeed to clean up: $fallbackUri") + logInfo(log"Succeed to clean up: ${MDC(URI, fallbackUri)}") } else { // Clean-up can fail due to the permission issues. - logWarning(s"Failed to clean up: $fallbackUri") + logWarning(log"Failed to clean up: ${MDC(URI, fallbackUri)}") } } } @@ -158,7 +159,7 @@ private[spark] object FallbackStorage extends Logging { * Read a ManagedBuffer. 
*/ def read(conf: SparkConf, blockId: BlockId): ManagedBuffer = { - logInfo(s"Read $blockId") + logInfo(log"Read ${MDC(BLOCK_ID, blockId)}") val fallbackPath = new Path(conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).get) val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) val fallbackFileSystem = FileSystem.get(fallbackPath.toUri, hadoopConf) diff --git a/core/src/main/scala/org/apache/spark/storage/PushBasedFetchHelper.scala b/core/src/main/scala/org/apache/spark/storage/PushBasedFetchHelper.scala index 31958af84e54b..8a3ca3066961c 100644 --- a/core/src/main/scala/org/apache/spark/storage/PushBasedFetchHelper.scala +++ b/core/src/main/scala/org/apache/spark/storage/PushBasedFetchHelper.scala @@ -29,7 +29,7 @@ import org.roaringbitmap.RoaringBitmap import org.apache.spark.MapOutputTracker import org.apache.spark.MapOutputTracker.SHUFFLE_PUSH_MAP_ID import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{HOST, PORT, REDUCE_ID, SHUFFLE_ID, SHUFFLE_MERGE_ID} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.network.shuffle.{BlockStoreClient, MergedBlockMeta, MergedBlocksMetaListener} import org.apache.spark.shuffle.ShuffleReadMetricsReporter import org.apache.spark.storage.BlockManagerId.SHUFFLE_MERGER_IDENTIFIER @@ -246,8 +246,9 @@ private class PushBasedFetchHelper( case Failure(throwable) => // If we see an exception with getting the local dirs for push-merged-local blocks, // we fallback to fetch the original blocks. We do not report block fetch failure. - logWarning(s"Error while fetching the merged dirs for push-merged-local " + - s"blocks: ${pushMergedLocalBlocks.mkString(", ")}. Fetch the original blocks instead", + logWarning(log"Error while fetching the merged dirs for push-merged-local " + + log"blocks: ${MDC(BLOCK_IDS, pushMergedLocalBlocks.mkString(", "))}. " + + log"Fetch the original blocks instead", throwable) pushMergedLocalBlocks.foreach { blockId => @@ -280,8 +281,8 @@ private class PushBasedFetchHelper( // If we see an exception with reading a push-merged-local meta, we fallback to // fetch the original blocks. We do not report block fetch failure // and will continue with the remaining local block read. - logWarning(s"Error occurred while fetching push-merged-local meta, " + - s"prepare to fetch the original blocks", e) + logWarning(log"Error occurred while fetching push-merged-local meta, " + + log"prepare to fetch the original blocks", e) iterator.addToResultsQueue( FallbackOnPushMergedFailureResult(blockId, blockManagerId, 0, isNetworkReqDone = false)) } @@ -315,7 +316,8 @@ private class PushBasedFetchHelper( blockId: BlockId, address: BlockManagerId): Unit = { assert(blockId.isInstanceOf[ShuffleMergedBlockId] || blockId.isInstanceOf[ShuffleBlockChunkId]) - logWarning(s"Falling back to fetch the original blocks for push-merged block $blockId") + logWarning(log"Falling back to fetch the original blocks for push-merged block " + + log"${MDC(BLOCK_ID, blockId)}") shuffleMetrics.incMergedFetchFallbackCount(1) // Increase the blocks processed since we will process another block in the next iteration of // the while loop in ShuffleBlockFetcherIterator.next(). 
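Note on the pattern applied throughout the storage hunks above: each s"..." message becomes a log"..." message whose variables are wrapped in MDC(<LogKey>, value), so every value appears both in the rendered text and as a structured key/value field. The following is a minimal sketch of the call-site conversion, not part of the patch; the class and method names are illustrative, while the imports and log keys are the ones the hunks above already use.

    package org.apache.spark.storage

    import org.apache.spark.internal.{Logging, MDC}
    import org.apache.spark.internal.LogKeys.{BLOCK_ID, NUM_FAILURES}

    // Illustrative helper only -- it exists to show the call-site shape, not a class touched by this patch.
    private[spark] class FetchFailureLogger extends Logging {
      def onFetchFailure(blockId: BlockId, failures: Int, e: Throwable): Unit = {
        // Before the conversion: plain string interpolation, no structured context.
        // logWarning(s"Failed to fetch block $blockId after $failures failures", e)

        // After the conversion: the rendered text is the same, but BLOCK_ID and
        // NUM_FAILURES also travel as structured key/value fields.
        logWarning(log"Failed to fetch block ${MDC(BLOCK_ID, blockId)} after " +
          log"${MDC(NUM_FAILURES, failures)} failures", e)
      }
    }

Longer messages are built by concatenating log"..." fragments with +, exactly as the hunks above do, so line-wrapping a message does not change which keys are attached to it.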
@@ -340,7 +342,8 @@ private class PushBasedFetchHelper( // Fallback for all the pending fetch requests val pendingShuffleChunks = iterator.removePendingChunks(shuffleChunkId, address) pendingShuffleChunks.foreach { pendingBlockId => - logInfo(s"Falling back immediately for shuffle chunk $pendingBlockId") + logInfo( + log"Falling back immediately for shuffle chunk ${MDC(BLOCK_ID, pendingBlockId)}") shuffleMetrics.incMergedFetchFallbackCount(1) val bitmapOfPendingChunk: RoaringBitmap = chunksMetaMap.remove(pendingBlockId).get chunkBitmap.or(bitmapOfPendingChunk) diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index d22ce3dbed772..ff1799d8ff3e1 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -37,7 +37,7 @@ import org.apache.spark.{MapOutputTracker, SparkException, TaskContext} import org.apache.spark.MapOutputTracker.SHUFFLE_PUSH_MAP_ID import org.apache.spark.errors.SparkCoreErrors import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{BLOCK_ID, ERROR, HOST, MAX_ATTEMPTS, PORT} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle._ import org.apache.spark.network.shuffle.checksum.{Cause, ShuffleChecksumHelper} @@ -249,7 +249,7 @@ final class ShuffleBlockFetcherIterator( } shuffleFilesSet.foreach { file => if (!file.delete()) { - logWarning("Failed to cleanup shuffle fetch temp file " + file.path()) + logWarning(log"Failed to cleanup shuffle fetch temp file ${MDC(PATH, file.path())}") } } } @@ -342,8 +342,8 @@ final class ShuffleBlockFetcherIterator( if (isNettyOOMOnShuffle.compareAndSet(false, true)) { // The fetcher can fail remaining blocks in batch for the same error. So we only // log the warning once to avoid flooding the logs. 
- logInfo(s"Block $blockId has failed $failureTimes times " + - s"due to Netty OOM, will retry") + logInfo(log"Block ${MDC(BLOCK_ID, blockId)} has failed " + + log"${MDC(FAILURES, failureTimes)} times due to Netty OOM, will retry") } remainingBlocks -= blockId deferredBlocks += blockId @@ -448,14 +448,17 @@ final class ShuffleBlockFetcherIterator( s"the number of host-local blocks ${numHostLocalBlocks} " + s"the number of push-merged-local blocks ${pushMergedLocalBlocks.size} " + s"+ the number of remote blocks ${numRemoteBlocks} ") - logInfo(s"Getting $blocksToFetchCurrentIteration " + - s"(${Utils.bytesToString(totalBytes)}) non-empty blocks including " + - s"${localBlocks.size} (${Utils.bytesToString(localBlockBytes)}) local and " + - s"${numHostLocalBlocks} (${Utils.bytesToString(hostLocalBlockBytes)}) " + - s"host-local and ${pushMergedLocalBlocks.size} " + - s"(${Utils.bytesToString(pushMergedLocalBlockBytes)}) " + - s"push-merged-local and $numRemoteBlocks (${Utils.bytesToString(remoteBlockBytes)}) " + - s"remote blocks") + logInfo( + log"Getting ${MDC(NUM_BLOCKS, blocksToFetchCurrentIteration)} " + + log"(${MDC(TOTAL_SIZE, Utils.bytesToString(totalBytes))}) non-empty blocks including " + + log"${MDC(NUM_LOCAL_BLOCKS, localBlocks.size)} " + + log"(${MDC(LOCAL_BLOCKS_SIZE, Utils.bytesToString(localBlockBytes))}) local and " + + log"${MDC(NUM_HOST_LOCAL_BLOCKS, numHostLocalBlocks)} " + + log"(${MDC(HOST_LOCAL_BLOCKS_SIZE, Utils.bytesToString(hostLocalBlockBytes))}) " + + log"host-local and ${MDC(NUM_PUSH_MERGED_LOCAL_BLOCKS, pushMergedLocalBlocks.size)} " + + log"(${MDC(PUSH_MERGED_LOCAL_BLOCKS_SIZE, Utils.bytesToString(pushMergedLocalBlockBytes))})" + + log" push-merged-local and ${MDC(NUM_REMOTE_BLOCKS, numRemoteBlocks)} " + + log"(${MDC(REMOTE_BLOCKS_SIZE, Utils.bytesToString(remoteBlockBytes))}) remote blocks") this.hostLocalBlocks ++= hostLocalBlocksByExecutor.values .flatMap { infos => infos.map(info => (info._1, info._3)) } collectedRemoteRequests @@ -719,8 +722,10 @@ final class ShuffleBlockFetcherIterator( val numDeferredRequest = deferredFetchRequests.values.map(_.size).sum val numFetches = remoteRequests.size - fetchRequests.size - numDeferredRequest - logInfo(s"Started $numFetches remote fetches in ${Utils.getUsedTimeNs(startTimeNs)}" + - (if (numDeferredRequest > 0 ) s", deferred $numDeferredRequest requests" else "")) + logInfo(log"Started ${MDC(COUNT, numFetches)} remote fetches in " + + log"${MDC(DURATION, Utils.getUsedTimeNs(startTimeNs))}" + + (if (numDeferredRequest > 0) log", deferred ${MDC(NUM_REQUESTS, numDeferredRequest)} requests" + else log"")) // Get Local Blocks fetchLocalBlocks(localBlocks) @@ -846,8 +851,10 @@ final class ShuffleBlockFetcherIterator( // uses are shared by the UnsafeShuffleWriter (both writers use DiskBlockObjectWriter // which returns a zero-size from commitAndGet() in case no records were written // since the last call. 
- val msg = s"Received a zero-size buffer for block $blockId from $address " + - s"(expectedApproxSize = $size, isNetworkReqDone=$isNetworkReqDone)" + val msg = log"Received a zero-size buffer for block ${MDC(BLOCK_ID, blockId)} " + + log"from ${MDC(URI, address)} " + + log"(expectedApproxSize = ${MDC(NUM_BYTES, size)}, " + + log"isNetworkReqDone=${MDC(IS_NETWORK_REQUEST_DONE, isNetworkReqDone)})" if (blockId.isShuffleChunk) { // Zero-size block may come from nodes with hardware failures, For shuffle chunks, // the original shuffle blocks that belong to that zero-size shuffle chunk is @@ -859,7 +866,7 @@ final class ShuffleBlockFetcherIterator( result = null null } else { - throwFetchFailedException(blockId, mapIndex, address, new IOException(msg)) + throwFetchFailedException(blockId, mapIndex, address, new IOException(msg.message)) } } else { try { @@ -945,7 +952,8 @@ final class ShuffleBlockFetcherIterator( } } else { // It's the first time this block is detected corrupted - logWarning(s"got an corrupted block $blockId from $address, fetch again", e) + logWarning(log"got an corrupted block ${MDC(BLOCK_ID, blockId)} " + + log"from ${MDC(URI, address)}, fetch again", e) corruptedBlocks += blockId fetchRequests += FetchRequest( address, Array(FetchBlockInfo(blockId, size, mapIndex))) @@ -1033,8 +1041,8 @@ final class ShuffleBlockFetcherIterator( // If we see an exception with reading push-merged-local index file, we fallback // to fetch the original blocks. We do not report block fetch failure // and will continue with the remaining local block read. - logWarning(s"Error occurred while reading push-merged-local index, " + - s"prepare to fetch the original blocks", e) + logWarning("Error occurred while reading push-merged-local index, " + + "prepare to fetch the original blocks", e) pushBasedFetchHelper.initiateFallbackFetchForPushMergedBlock( shuffleBlockId, pushBasedFetchHelper.localShuffleMergerBlockMgrId) } @@ -1138,14 +1146,16 @@ final class ShuffleBlockFetcherIterator( case otherCause => s"Block $blockId is corrupted due to $otherCause" } - logInfo(s"Finished corruption diagnosis in $duration ms. $diagnosisResponse") + logInfo(log"Finished corruption diagnosis in ${MDC(DURATION, duration)} ms. " + + log"${MDC(STATUS, diagnosisResponse)}") diagnosisResponse case shuffleBlockChunk: ShuffleBlockChunkId => // TODO SPARK-36284 Add shuffle checksum support for push-based shuffle - val diagnosisResponse = s"BlockChunk $shuffleBlockChunk is corrupted but corruption " + + logWarning(log"BlockChunk ${MDC(SHUFFLE_BLOCK_INFO, shuffleBlockChunk)} " + + log"is corrupted but corruption diagnosis is skipped due to lack of shuffle " + + log"checksum support for push-based shuffle.") + s"BlockChunk $shuffleBlockChunk is corrupted but corruption " + s"diagnosis is skipped due to lack of shuffle checksum support for push-based shuffle." 
- logWarning(diagnosisResponse) - diagnosisResponse case unexpected: BlockId => throw SparkException.internalError( s"Unexpected type of BlockId, $unexpected", category = "STORAGE") @@ -1273,7 +1283,8 @@ final class ShuffleBlockFetcherIterator( originalLocalBlocks, originalHostLocalBlocksByExecutor, originalMergedLocalBlocks) // Add the remote requests into our queue in a random order fetchRequests ++= Utils.randomize(originalRemoteReqs) - logInfo(s"Created ${originalRemoteReqs.size} fallback remote requests for push-merged") + logInfo(log"Created ${MDC(NUM_REQUESTS, originalRemoteReqs.size)} fallback remote requests " + + log"for push-merged") // fetch all the fallback blocks that are local. fetchLocalBlocks(originalLocalBlocks) // Merged local blocks should be empty during fallback diff --git a/core/src/main/scala/org/apache/spark/storage/TopologyMapper.scala b/core/src/main/scala/org/apache/spark/storage/TopologyMapper.scala index 3c2c4b46dc4ca..f1dca53c7e3b1 100644 --- a/core/src/main/scala/org/apache/spark/storage/TopologyMapper.scala +++ b/core/src/main/scala/org/apache/spark/storage/TopologyMapper.scala @@ -19,7 +19,8 @@ package org.apache.spark.storage import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.Utils /** @@ -78,7 +79,7 @@ class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with if (topology.isDefined) { logDebug(s"$hostname -> ${topology.get}") } else { - logWarning(s"$hostname does not have any topology information") + logWarning(log"${MDC(HOST_PORT, hostname)} does not have any topology information") } topology } diff --git a/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala index 48d2ef68b41ab..6746bbd490c42 100644 --- a/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala @@ -30,7 +30,8 @@ import scala.util.control.NonFatal import com.google.common.io.ByteStreams import org.apache.spark.{SparkConf, SparkException, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.{STORAGE_UNROLL_MEMORY_THRESHOLD, UNROLL_MEMORY_CHECK_PERIOD, UNROLL_MEMORY_GROWTH_FACTOR} import org.apache.spark.memory.{MemoryManager, MemoryMode} import org.apache.spark.serializer.{SerializationStream, SerializerManager} @@ -110,12 +111,14 @@ private[spark] class MemoryStore( } if (maxMemory < unrollMemoryThreshold) { - logWarning(s"Max memory ${Utils.bytesToString(maxMemory)} is less than the initial memory " + - s"threshold ${Utils.bytesToString(unrollMemoryThreshold)} needed to store a block in " + - s"memory. Please configure Spark with more memory.") + logWarning(log"Max memory ${MDC(NUM_BYTES, Utils.bytesToString(maxMemory))} " + + log"is less than the initial memory " + + log"threshold ${MDC(MAX_SIZE, Utils.bytesToString(unrollMemoryThreshold))} " + + log"needed to store a block in memory. 
Please configure Spark with more memory.") } - logInfo("MemoryStore started with capacity %s".format(Utils.bytesToString(maxMemory))) + logInfo(log"MemoryStore started with capacity " + + log"${MDC(MEMORY_SIZE, Utils.bytesToString(maxMemory))}") /** Total storage memory used including unroll memory, in bytes. */ private def memoryUsed: Long = memoryManager.storageMemoryUsed @@ -156,8 +159,9 @@ private[spark] class MemoryStore( entries.synchronized { entries.put(blockId, entry) } - logInfo("Block %s stored as bytes in memory (estimated size %s, free %s)".format( - blockId, Utils.bytesToString(size), Utils.bytesToString(maxMemory - blocksMemoryUsed))) + logInfo(log"Block ${MDC(BLOCK_ID, blockId)} stored as bytes in memory " + + log"(estimated size ${MDC(SIZE, Utils.bytesToString(size))}, " + + log"free ${MDC(MEMORY_SIZE, Utils.bytesToString(maxMemory - blocksMemoryUsed))})") true } else { false @@ -213,8 +217,9 @@ private[spark] class MemoryStore( reserveUnrollMemoryForThisTask(blockId, initialMemoryThreshold, memoryMode) if (!keepUnrolling) { - logWarning(s"Failed to reserve initial memory threshold of " + - s"${Utils.bytesToString(initialMemoryThreshold)} for computing block $blockId in memory.") + logWarning(log"Failed to reserve initial memory threshold of " + + log"${MDC(NUM_BYTES, Utils.bytesToString(initialMemoryThreshold))} " + + log"for computing block ${MDC(BLOCK_ID, blockId)} in memory.") } else { unrollMemoryUsedByThisBlock += initialMemoryThreshold } @@ -247,7 +252,8 @@ private[spark] class MemoryStore( // SPARK-45025 - if a thread interrupt was received, we log a warning and return used memory // to avoid getting killed by task reaper eventually. if (shouldCheckThreadInterruption && Thread.currentThread().isInterrupted) { - logInfo(s"Failed to unroll block=$blockId since thread interrupt was received") + logInfo( + log"Failed to unroll block=${MDC(BLOCK_ID, blockId)} since thread interrupt was received") Left(unrollMemoryUsedByThisBlock) } else if (keepUnrolling) { // Make sure that we have enough memory to store the block. By this point, it is possible that @@ -276,8 +282,9 @@ private[spark] class MemoryStore( entries.put(blockId, entry) } - logInfo("Block %s stored as values in memory (estimated size %s, free %s)".format(blockId, - Utils.bytesToString(entry.size), Utils.bytesToString(maxMemory - blocksMemoryUsed))) + logInfo(log"Block ${MDC(BLOCK_ID, blockId)} stored as values in memory " + + log"(estimated size ${MDC(MEMORY_SIZE, Utils.bytesToString(entry.size))}, free " + + log"${MDC(FREE_MEMORY_SIZE, Utils.bytesToString(maxMemory - blocksMemoryUsed))})") Right(entry.size) } else { // We ran out of space while unrolling the values for this block @@ -348,9 +355,10 @@ private[spark] class MemoryStore( // Initial per-task memory to request for unrolling blocks (bytes). val initialMemoryThreshold = unrollMemoryThreshold val chunkSize = if (initialMemoryThreshold > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { - logWarning(s"Initial memory threshold of ${Utils.bytesToString(initialMemoryThreshold)} " + - s"is too large to be set as chunk size. Chunk size has been capped to " + - s"${Utils.bytesToString(ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH)}") + logWarning(log"Initial memory threshold of " + + log"${MDC(NUM_BYTES, Utils.bytesToString(initialMemoryThreshold))} " + + log"is too large to be set as chunk size. 
Chunk size has been capped to " + + log"${MDC(MAX_SIZE, Utils.bytesToString(ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH))}") ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH } else { initialMemoryThreshold.toInt @@ -517,8 +525,8 @@ private[spark] class MemoryStore( if (freedMemory >= space) { var lastSuccessfulBlock = -1 try { - logInfo(s"${selectedBlocks.size} blocks selected for dropping " + - s"(${Utils.bytesToString(freedMemory)} bytes)") + logInfo(log"${MDC(NUM_BLOCKS, selectedBlocks.size)} blocks selected for dropping " + + log"(${MDC(MEMORY_SIZE, Utils.bytesToString(freedMemory))} bytes)") selectedBlocks.indices.foreach { idx => val blockId = selectedBlocks(idx) val entry = entries.synchronized { @@ -533,8 +541,9 @@ private[spark] class MemoryStore( } lastSuccessfulBlock = idx } - logInfo(s"After dropping ${selectedBlocks.size} blocks, " + - s"free memory is ${Utils.bytesToString(maxMemory - blocksMemoryUsed)}") + logInfo( + log"After dropping ${MDC(NUM_BLOCKS, selectedBlocks.size)} blocks, free memory is " + + log"${MDC(FREE_MEMORY_SIZE, Utils.bytesToString(maxMemory - blocksMemoryUsed))}") freedMemory } finally { // like BlockManager.doPut, we use a finally rather than a catch to avoid having to deal @@ -549,7 +558,7 @@ private[spark] class MemoryStore( } } else { blockId.foreach { id => - logInfo(s"Will not store $id") + logInfo(log"Will not store ${MDC(BLOCK_ID, id)}") } selectedBlocks.foreach { id => blockInfoManager.unlock(id) @@ -645,11 +654,11 @@ private[spark] class MemoryStore( */ private def logMemoryUsage(): Unit = { logInfo( - s"Memory use = ${Utils.bytesToString(blocksMemoryUsed)} (blocks) + " + - s"${Utils.bytesToString(currentUnrollMemory)} (scratch space shared across " + - s"$numTasksUnrolling tasks(s)) = ${Utils.bytesToString(memoryUsed)}. " + - s"Storage limit = ${Utils.bytesToString(maxMemory)}." - ) + log"Memory use = ${MDC(CURRENT_MEMORY_SIZE, Utils.bytesToString(blocksMemoryUsed))} " + + log"(blocks) + ${MDC(FREE_MEMORY_SIZE, Utils.bytesToString(currentUnrollMemory))} " + + log"(scratch space shared across ${MDC(NUM_TASKS, numTasksUnrolling)} " + + log"task(s)) = ${MDC(STORAGE_MEMORY_SIZE, Utils.bytesToString(memoryUsed))}. " + + log"Storage limit = ${MDC(MAX_MEMORY_SIZE, Utils.bytesToString(maxMemory))}.") } /** @@ -660,8 +669,8 @@ private[spark] class MemoryStore( */ private def logUnrollFailureMessage(blockId: BlockId, finalVectorSize: Long): Unit = { logWarning( - s"Not enough space to cache $blockId in memory! " + - s"(computed ${Utils.bytesToString(finalVectorSize)} so far)" + log"Not enough space to cache ${MDC(BLOCK_ID, blockId)} in memory! 
" + + log"(computed ${MDC(NUM_BYTES, Utils.bytesToString(finalVectorSize))} so far)" ) logMemoryUsage() } diff --git a/core/src/main/scala/org/apache/spark/ui/DriverLogPage.scala b/core/src/main/scala/org/apache/spark/ui/DriverLogPage.scala index 3102115159994..8b4eebc26b3ba 100644 --- a/core/src/main/scala/org/apache/spark/ui/DriverLogPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/DriverLogPage.scala @@ -22,7 +22,7 @@ import jakarta.servlet.http.HttpServletRequest import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{LOG_TYPE, PATH} +import org.apache.spark.internal.LogKeys.{LOG_TYPE, PATH} import org.apache.spark.internal.config.DRIVER_LOG_LOCAL_DIR import org.apache.spark.util.Utils import org.apache.spark.util.logging.DriverLogger.DRIVER_LOG_FILE diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 5e567a891d587..f503be908c072 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -40,7 +40,9 @@ import org.json4s.JValue import org.json4s.jackson.JsonMethods.{pretty, render} import org.apache.spark.{SecurityManager, SparkConf, SSLOptions} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.UI._ import org.apache.spark.util.Utils @@ -84,7 +86,8 @@ private[spark] object JettyUtils extends Logging { case e: IllegalArgumentException => response.sendError(HttpServletResponse.SC_BAD_REQUEST, e.getMessage) case e: Exception => - logWarning(s"GET ${request.getRequestURI} failed: $e", e) + logWarning(log"GET ${MDC(LogKeys.URI, request.getRequestURI)} failed: " + + log"${MDC(ERROR, e)}", e) throw e } } @@ -247,7 +250,8 @@ private[spark] object JettyUtils extends Logging { poolSize: Int = 200): ServerInfo = { val stopTimeout = conf.get(UI_JETTY_STOP_TIMEOUT) - logInfo(s"Start Jetty $hostName:$port for $serverName") + logInfo(log"Start Jetty ${MDC(HOST, hostName)}:${MDC(PORT, port)}" + + log" for ${MDC(SERVER_NAME, serverName)}") // Start the server first, with no connectors. 
val pool = new QueuedThreadPool(poolSize) if (serverName.nonEmpty) { @@ -555,7 +559,9 @@ private[spark] case class ServerInfo( */ private def addFilters(handler: ServletContextHandler, securityMgr: SecurityManager): Unit = { conf.get(UI_FILTERS).foreach { filter => - logInfo(s"Adding filter to ${handler.getContextPath()}: $filter") + logInfo(log"Adding filter to" + + log" ${MDC(SERVLET_CONTEXT_HANDLER_PATH, handler.getContextPath())}:" + + log" ${MDC(UI_FILTER, filter)}") val oldParams = conf.getOption(s"spark.$filter.params").toSeq .flatMap(Utils.stringToSeq) .flatMap { param => diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index ddf451c16f3a2..b8d422c9d9fbb 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -24,7 +24,7 @@ import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.{SecurityManager, SparkConf, SparkContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.LogKeys.{CLASS_NAME, WEB_URL} import org.apache.spark.internal.config.DRIVER_LOG_LOCAL_DIR import org.apache.spark.internal.config.UI._ import org.apache.spark.scheduler._ @@ -164,7 +164,7 @@ private[spark] class SparkUI private ( /** Stop the server behind this web interface. Only valid after bind(). */ override def stop(): Unit = { super.stop() - logInfo(s"Stopped Spark web UI at $webUrl") + logInfo(log"Stopped Spark web UI at ${MDC(WEB_URL, webUrl)}") } override def withSparkUI[T](appId: String, attemptId: Option[String])(fn: SparkUI => T): T = { diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala index baeed322e8ad3..60d4e5db99d7e 100644 --- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala @@ -30,7 +30,7 @@ import org.json4s.JsonAST.{JNothing, JValue} import org.apache.spark.{SecurityManager, SparkConf, SSLOptions} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.Utils @@ -156,7 +156,8 @@ private[spark] abstract class WebUI( serverInfo = Some(server) val hostName = Option(conf.getenv("SPARK_LOCAL_IP")) .getOrElse(if (Utils.preferIPv6) "[::]" else "0.0.0.0") - logInfo(s"Bound $className to $hostName, and started at $webUrl") + logInfo(log"Bound ${MDC(CLASS_NAME, className)} to ${MDC(HOST, hostName)}," + + log" and started at ${MDC(WEB_URL, webUrl)}") } catch { case e: Exception => logError(log"Failed to bind ${MDC(CLASS_NAME, className)}", e) diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index 9a035e0f1e834..ee7f67233bbd5 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -25,7 +25,8 @@ import scala.xml.Utility import org.apache.commons.text.StringEscapeUtils -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.DeterministicLevel import org.apache.spark.scheduler.StageInfo import 
org.apache.spark.storage.StorageLevel @@ -214,7 +215,8 @@ private[spark] object RDDOperationGraph extends Logging { case (true, false) => outgoingEdges += e case (false, true) => incomingEdges += e // should never happen - case _ => logWarning(s"Found an orphan edge in stage ${stage.stageId}: $e") + case _ => logWarning(log"Found an orphan edge in stage " + + log"${MDC(STAGE_ID, stage.stageId)}: ${MDC(ERROR, e)}") } } diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index c6d8073a0c2fa..3237a321f1c3e 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -23,7 +23,8 @@ import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong import org.apache.spark.{InternalAccumulator, SparkContext, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.scheduler.AccumulableInfo import org.apache.spark.util.AccumulatorContext.internOption @@ -276,7 +277,8 @@ private[spark] object AccumulatorContext extends Logging { // Since we are storing weak references, warn when the underlying data is not valid. val acc = ref.get if (acc eq null) { - logWarning(s"Attempted to access garbage collected accumulator $id") + logWarning(log"Attempted to access garbage collected accumulator " + + log"${MDC(ACCUMULATOR_ID, id)}") } Option(acc) } diff --git a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala index 14851d8772895..8526a21254586 100644 --- a/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/DependencyUtils.scala @@ -26,7 +26,9 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.SparkSubmit -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.util.ArrayImplicits._ @@ -223,10 +225,10 @@ private[spark] object DependencyUtils extends Logging { if (file.exists()) { loader.addURL(file.toURI.toURL) } else { - logWarning(s"Local jar $file does not exist, skipping.") + logWarning(log"Local jar ${MDC(FILE_NAME, file)} does not exist, skipping.") } case _ => - logWarning(s"Skip remote jar $uri.") + logWarning(log"Skip remote jar ${MDC(LogKeys.URI, uri)}.") } } diff --git a/core/src/main/scala/org/apache/spark/util/Distribution.scala b/core/src/main/scala/org/apache/spark/util/Distribution.scala index 49aab5575f843..07ea9720b0b8d 100644 --- a/core/src/main/scala/org/apache/spark/util/Distribution.scala +++ b/core/src/main/scala/org/apache/spark/util/Distribution.scala @@ -19,8 +19,6 @@ package org.apache.spark.util import java.io.PrintStream -import scala.collection.immutable.IndexedSeq - /** * Util for getting some stats from a small sample of numeric values, with some handy * summary functions. 
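The hunks above all apply the same structured-logging migration: an s-interpolated message becomes a log-interpolated message whose dynamic values are wrapped in MDC(<LogKey>, value), so they are emitted as structured context in addition to plain text. Below is a minimal sketch of that pattern with an illustrative class, reusing the BLOCK_ID and NUM_BYTES keys that already appear in this diff; note that adjacent log fragments are concatenated verbatim, so each fragment has to carry its own separating spaces.

    import org.apache.spark.internal.{Logging, MDC}
    import org.apache.spark.internal.LogKeys._
    import org.apache.spark.storage.BlockId

    class ExampleStore extends Logging {
      def report(blockId: BlockId, bytes: Long): Unit = {
        // Before: logInfo(s"Stored block $blockId ($bytes bytes)")
        logInfo(log"Stored block ${MDC(BLOCK_ID, blockId)} " +
          log"(${MDC(NUM_BYTES, bytes)} bytes)")
      }
    }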
diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index eaa9ef517294e..b9de661b63c4f 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.util.control.NonFatal import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.EVENT_LOOP +import org.apache.spark.internal.LogKeys.EVENT_LOOP /** * An event loop to receive events from the caller and process all events in the event thread. It diff --git a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala index a4b9ba7bb0169..f8f5bb4f72a40 100644 --- a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.fs.viewfs.ViewFileSystem import org.apache.hadoop.hdfs.DistributedFileSystem import org.apache.spark._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.util.ArrayImplicits._ @@ -86,7 +87,7 @@ private[spark] object HadoopFSUtils extends Logging { path: Path, hadoopConf: Configuration, filter: PathFilter): Seq[(Path, Seq[FileStatus])] = { - logInfo(s"Listing $path with listFiles API") + logInfo(log"Listing ${MDC(PATH, path)} with listFiles API") try { val prefixLength = path.toString.length val remoteIter = path.getFileSystem(hadoopConf).listFiles(path, true) @@ -99,7 +100,8 @@ private[spark] object HadoopFSUtils extends Logging { Seq((path, statues.toImmutableArraySeq)) } catch { case _: FileNotFoundException => - logWarning(s"The root directory $path was not found. Was it deleted very recently?") + logWarning(log"The root directory ${MDC(PATH, path)} " + + log"was not found. Was it deleted very recently?") Seq((path, Seq.empty[FileStatus])) } } @@ -132,8 +134,9 @@ private[spark] object HadoopFSUtils extends Logging { } } - logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." + - s" The first several paths are: ${paths.take(10).mkString(", ")}.") + logInfo(log"Listing leaf files and directories in parallel under " + + log"${MDC(NUM_PATHS, paths.length)} paths." + + log" The first several paths are: ${MDC(PATHS, paths.take(10).mkString(", "))}.") HiveCatalogMetrics.incrementParallelListingJobCount(1) val serializableConfiguration = new SerializableConfiguration(hadoopConf) @@ -235,7 +238,8 @@ private[spark] object HadoopFSUtils extends Logging { // InMemoryFileIndex construction. However, it's still a net improvement to detect and // fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion. case _: FileNotFoundException if isRootPath || ignoreMissingFiles => - logWarning(s"The directory $path was not found. Was it deleted very recently?") + logWarning(log"The directory ${MDC(PATH, path)} " + + log"was not found. 
Was it deleted very recently?") Array.empty[FileStatus] } @@ -323,8 +327,8 @@ private[spark] object HadoopFSUtils extends Logging { } if (missingFiles.nonEmpty) { - logWarning( - s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}") + logWarning(log"the following files were missing during file scan:\n " + + log"${MDC(PATHS, missingFiles.mkString("\n "))}") } resolvedLeafStatuses.toImmutableArraySeq diff --git a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala index 814201d8c959c..4f01cd6ac2136 100644 --- a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala @@ -27,7 +27,7 @@ import com.codahale.metrics.Timer import org.apache.spark.SparkEnv import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.LISTENER +import org.apache.spark.internal.LogKeys.{EVENT, LISTENER, TOTAL_TIME} import org.apache.spark.scheduler.EventLoggingListener import org.apache.spark.scheduler.SparkListenerEnvironmentUpdate @@ -132,8 +132,9 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { if (maybeTimerContext != null) { val elapsed = maybeTimerContext.stop() if (logSlowEventEnabled && elapsed > logSlowEventThreshold) { - logInfo(s"Process of event ${redactEvent(event)} by listener ${listenerName} took " + - s"${elapsed / 1000000000d}s.") + logInfo(log"Process of event ${MDC(EVENT, redactEvent(event))} by " + + log"listener ${MDC(LISTENER, listenerName)} took " + + log"${MDC(TOTAL_TIME, elapsed / 1000000d)}ms.") } } } diff --git a/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala b/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala index f01645d82303e..7a98c4830db92 100644 --- a/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala +++ b/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala @@ -23,7 +23,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ /** @@ -184,9 +185,9 @@ private[spark] object PeriodicCheckpointer extends Logging { val fs = path.getFileSystem(conf) fs.delete(path, true) } catch { - case e: Exception => - logWarning("PeriodicCheckpointer could not remove old checkpoint file: " + - checkpointFile) + case _: Exception => + logWarning(log"PeriodicCheckpointer could not remove old checkpoint file: " + + log"${MDC(FILE_NAME, checkpointFile)}") } } } diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index b9dece19f2651..993352c6a6379 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.fs.FileSystem import org.apache.spark.SparkConf import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.internal.config.SPARK_SHUTDOWN_TIMEOUT_MS diff --git a/core/src/main/scala/org/apache/spark/util/SignalUtils.scala b/core/src/main/scala/org/apache/spark/util/SignalUtils.scala index 775dc44fc1a13..b41166a50efd2 100644 --- 
a/core/src/main/scala/org/apache/spark/util/SignalUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/SignalUtils.scala @@ -25,7 +25,8 @@ import org.apache.commons.lang3.SystemUtils import org.slf4j.Logger import sun.misc.{Signal, SignalHandler} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys._ /** * Contains utilities for working with posix signals. @@ -58,7 +59,7 @@ private[spark] object SignalUtils extends Logging { */ def register(signal: String)(action: => Boolean): Unit = { if (SystemUtils.IS_OS_UNIX) { - register(signal, s"Failed to register signal handler for $signal", + register(signal, log"Failed to register signal handler for ${MDC(SIGNAL, signal)}", logStackTrace = true)(action) } } @@ -74,12 +75,12 @@ private[spark] object SignalUtils extends Logging { */ def register( signal: String, - failMessage: String, + failMessage: MessageWithContext, logStackTrace: Boolean = true)( action: => Boolean): Unit = synchronized { try { val handler = handlers.getOrElseUpdate(signal, { - logInfo(s"Registering signal handler for $signal") + logInfo(log"Registering signal handler for ${MDC(SIGNAL, signal)}") new ActionHandler(new Signal(signal)) }) handler.register(action) diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala index 1447a3e752de7..88fe64859a214 100644 --- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala +++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala @@ -151,11 +151,11 @@ object SizeEstimator extends Logging { // TODO: We could use reflection on the VMOption returned ? getVMMethod.invoke(bean, "UseCompressedOops").toString.contains("true") } catch { - case e: Exception => + case _: Exception => // Guess whether they've enabled UseCompressedOops based on whether maxMemory < 32 GB val guess = Runtime.getRuntime.maxMemory < (32L*1024*1024*1024) - val guessInWords = if (guess) "yes" else "not" - logWarning("Failed to check whether UseCompressedOops is set; assuming " + guessInWords) + logWarning(log"Failed to check whether UseCompressedOops is set; " + + log"assuming " + (if (guess) log"yes" else log"not")) guess } } diff --git a/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala b/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala index 74f1474f9cf78..c1ea4f929101f 100644 --- a/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala +++ b/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala @@ -17,8 +17,9 @@ package org.apache.spark.util +import org.apache.spark.executor.{ExecutorExitCode, KilledByTaskReaperException} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.THREAD +import org.apache.spark.internal.LogKeys.THREAD /** * The default uncaught exception handler for Spark daemons. It terminates the whole process for @@ -56,6 +57,8 @@ private[spark] class SparkUncaughtExceptionHandler(val exitOnUncaughtException: // SPARK-24294: This is defensive code, in case that SparkFatalException is // misused and uncaught. 
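The SignalUtils change above moves the failure message from a plain String to a MessageWithContext built with the log interpolator. A hedged usage sketch follows, assuming the caller mixes in Logging so the interpolator and the SIGNAL key are in scope; the signal name, handler body, and return value are illustrative only.

    import org.apache.spark.internal.{Logging, MDC}
    import org.apache.spark.internal.LogKeys.SIGNAL
    import org.apache.spark.util.SignalUtils

    object ExampleSignalSetup extends Logging {
      def install(): Unit = {
        SignalUtils.register(
          "TERM",
          log"Failed to register signal handler for ${MDC(SIGNAL, "TERM")}",
          logStackTrace = false) {
          // Illustrative action; the Boolean result keeps the existing register(signal)(action) contract.
          false
        }
      }
    }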
System.exit(SparkExitCode.OOM) + case _: KilledByTaskReaperException if exitOnUncaughtException => + System.exit(ExecutorExitCode.KILLED_BY_TASK_REAPER) case _ if exitOnUncaughtException => System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) case _ => diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index e4167c43ab9f6..7f61b3f0b2c24 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -231,7 +231,7 @@ private[spark] object ThreadUtils { /** * Run a piece of code in a new thread and return the result. Exception in the new thread is * thrown in the caller thread with an adjusted stack trace that removes references to this - * method for clarity. The exception stack traces will be like the following + * method for clarity. The exception stack traces will be like the following: * * SomeException: exception-message * at CallerClass.body-method (sourcefile.scala) @@ -261,31 +261,51 @@ private[spark] object ThreadUtils { exception match { case Some(realException) => - // Remove the part of the stack that shows method calls into this helper method - // This means drop everything from the top until the stack element - // ThreadUtils.runInNewThread(), and then drop that as well (hence the `drop(1)`). - val baseStackTrace = Thread.currentThread().getStackTrace().dropWhile( - ! _.getClassName.contains(this.getClass.getSimpleName)).drop(1) - - // Remove the part of the new thread stack that shows methods call from this helper method - val extraStackTrace = realException.getStackTrace.takeWhile( - ! _.getClassName.contains(this.getClass.getSimpleName)) - - // Combine the two stack traces, with a place holder just specifying that there - // was a helper method used, without any further details of the helper - val placeHolderStackElem = new StackTraceElement( - s"... run in separate thread using ${ThreadUtils.getClass.getName.stripSuffix("$")} ..", - " ", "", -1) - val finalStackTrace = extraStackTrace ++ Seq(placeHolderStackElem) ++ baseStackTrace - - // Update the stack trace and rethrow the exception in the caller thread - realException.setStackTrace(finalStackTrace) - throw realException + throw wrapCallerStacktrace(realException, dropStacks = 2) case None => result } } + /** + * Adjust the exception stack trace to wrap it with the caller-side thread stack trace. + * The exception stack traces will be like the following: + * + * SomeException: exception-message + * at CallerClass.body-method (sourcefile.scala) + * at ... run in separate thread using org.apache.spark.util.ThreadUtils ... () + * at CallerClass.caller-method (sourcefile.scala) + * ... + */ + def wrapCallerStacktrace[T <: Throwable]( + realException: T, + combineMessage: String = + s"run in separate thread using ${ThreadUtils.getClass.getName.stripSuffix("$")}", + dropStacks: Int = 1): T = { + require(dropStacks >= 0, "dropStacks must be zero or positive") + val simpleName = this.getClass.getSimpleName + // Remove the part of the stack that shows method calls into this helper method + // This means drop everything from the top until the stack element + // ThreadUtils.wrapCallerStacktrace(), and then drop that as well (hence dropStacks defaults to 1). + // A larger dropStacks lets the caller drop additional helper frames. 
+ val baseStackTrace = Thread.currentThread().getStackTrace + .dropWhile(!_.getClassName.contains(simpleName)) + .drop(dropStacks) + + // Remove the part of the new thread stack that shows methods call from this helper method + val extraStackTrace = realException.getStackTrace + .takeWhile(!_.getClassName.contains(simpleName)) + + // Combine the two stack traces, with a place holder just specifying that there + // was a helper method used, without any further details of the helper + val placeHolderStackElem = new StackTraceElement(s"... $combineMessage ..", " ", "", -1) + val finalStackTrace = extraStackTrace ++ Seq(placeHolderStackElem) ++ baseStackTrace + + // Update the stack trace and rethrow the exception in the caller thread + realException.setStackTrace(finalStackTrace) + realException + } + /** * Construct a new ForkJoinPool with a specified max parallelism and name prefix. */ diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index af91a4b32c6fc..a37aedfcb635a 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -69,7 +69,8 @@ import org.slf4j.Logger import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC, MessageWithContext} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Streaming._ import org.apache.spark.internal.config.Tests.IS_TESTING @@ -400,14 +401,14 @@ private[spark] object Utils "Untarring behavior will be deprecated at spark.files and " + "SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive " + "instead.") - logInfo("Untarring " + fileName) + logInfo(log"Untarring ${MDC(FILE_NAME, fileName)}") executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir) } else if (fileName.endsWith(".tar")) { logWarning( "Untarring behavior will be deprecated at spark.files and " + "SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive " + "instead.") - logInfo("Untarring " + fileName) + logInfo(log"Untarring ${MDC(FILE_NAME, fileName)}") executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir) } } @@ -444,7 +445,8 @@ private[spark] object Utils // TODO(SPARK-38632): should keep file permissions. Java implementation doesn't. 
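With the refactor above, runInNewThread delegates the stack splicing to the new wrapCallerStacktrace helper, passing dropStacks = 2 so that both the helper frame and the runInNewThread frame are dropped from the caller-side trace. A rough sketch of the caller-visible behavior, assuming the existing runInNewThread(threadName)(body) signature:

    import org.apache.spark.util.ThreadUtils

    // Any exception thrown by the body is rethrown here in the calling thread, with the body's
    // frames on top, the caller's frames below, and a placeholder frame of the form
    // "... run in separate thread using org.apache.spark.util.ThreadUtils .." in between.
    val answer: Int = ThreadUtils.runInNewThread("example-worker") {
      require(2 + 2 == 4, "illustrative body")
      42
    }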
unTarUsingJava(source, dest) } else { - logWarning(s"Cannot unpack $source, just copying it to $dest.") + logWarning(log"Cannot unpack ${MDC(LogKeys.FILE_NAME, source)}, " + + log"just copying it to ${MDC(FILE_NAME2, dest)}.") copyRecursive(source, dest) } } @@ -501,7 +503,7 @@ private[spark] object Utils fileOverwrite: Boolean): Unit = { val tempFile = File.createTempFile("fetchFileTemp", null, new File(destFile.getParentFile.getAbsolutePath)) - logInfo(s"Fetching $url to $tempFile") + logInfo(log"Fetching ${MDC(LogKeys.URL, url)} to ${MDC(FILE_ABSOLUTE_PATH, tempFile)}") try { val out = new FileOutputStream(tempFile) @@ -543,7 +545,8 @@ private[spark] object Utils if (!filesEqualRecursive(sourceFile, destFile)) { if (fileOverwrite) { logInfo( - s"File $destFile exists and does not match contents of $url, replacing it with $url" + log"File ${MDC(DESTINATION_PATH, destFile)} exists and does not match contents of" + + log" ${MDC(LogKeys.URL, url)}, replacing it with ${MDC(LogKeys.URL2, url)}" ) if (!destFile.delete()) { throw new SparkException( @@ -561,10 +564,8 @@ private[spark] object Utils // Do nothing if the file contents are the same, i.e. this file has been copied // previously. logInfo( - "%s has been previously copied to %s".format( - sourceFile.getAbsolutePath, - destFile.getAbsolutePath - ) + log"${MDC(SOURCE_PATH, sourceFile.getAbsolutePath)} has been previously" + + log" copied to ${MDC(DESTINATION_PATH, destFile.getAbsolutePath)}" ) return } @@ -574,7 +575,8 @@ private[spark] object Utils if (removeSourceFile) { Files.move(sourceFile.toPath, destFile.toPath) } else { - logInfo(s"Copying ${sourceFile.getAbsolutePath} to ${destFile.getAbsolutePath}") + logInfo(log"Copying ${MDC(SOURCE_PATH, sourceFile.getAbsolutePath)}" + + log" to ${MDC(DESTINATION_PATH, destFile.getAbsolutePath)}") copyRecursive(sourceFile, destFile) } } @@ -797,8 +799,10 @@ private[spark] object Utils } if (uris.nonEmpty) { logWarning( - "The configured local directories are not expected to be URIs; however, got suspicious " + - s"values [${uris.mkString(", ")}]. Please check your configured local directories.") + log"The configured local directories are not expected to be URIs; " + + log"however, got suspicious values [" + + log"${MDC(LogKeys.URIS, uris.mkString(", "))}]. " + + log"Please check your configured local directories.") } configuredLocalDirs.flatMap { root => @@ -889,16 +893,17 @@ private[spark] object Utils // because of Inet6Address.toHostName may add interface at the end if it knows about it val strippedAddress = InetAddress.getByAddress(addr.getAddress) // We've found an address that looks reasonable! 
- logWarning("Your hostname, " + InetAddress.getLocalHost.getHostName + " resolves to" + - " a loopback address: " + address.getHostAddress + "; using " + - strippedAddress.getHostAddress + " instead (on interface " + ni.getName + ")") + logWarning(log"Your hostname, ${MDC(HOST, InetAddress.getLocalHost.getHostName)}, " + + log"resolves to a loopback address: ${MDC(HOST_PORT, address.getHostAddress)}; " + + log"using ${MDC(HOST_PORT2, strippedAddress.getHostAddress)} instead (on interface " + + log"${MDC(NETWORK_IF, ni.getName)})") logWarning("Set SPARK_LOCAL_IP if you need to bind to another address") return strippedAddress } } - logWarning("Your hostname, " + InetAddress.getLocalHost.getHostName + " resolves to" + - " a loopback address: " + address.getHostAddress + ", but we couldn't find any" + - " external IP address!") + logWarning(log"Your hostname, ${MDC(HOST, InetAddress.getLocalHost.getHostName)}, " + + log"resolves to a loopback address: ${MDC(HOST_PORT, address.getHostAddress)}, " + + log"but we couldn't find any external IP address!") logWarning("Set SPARK_LOCAL_IP if you need to bind to another address") } address @@ -1197,7 +1202,7 @@ private[spark] object Utils val process = builder.start() if (redirectStderr) { val threadName = "redirect stderr for command " + command(0) - def log(s: String): Unit = logInfo(s) + def log(s: String): Unit = logInfo(log"${MDC(LINE, s)}") processStreamByLine(threadName, process.getErrorStream, log) } process @@ -1330,7 +1335,7 @@ private[spark] object Utils case t: Throwable => if (originalThrowable != t) { originalThrowable.addSuppressed(t) - logWarning(s"Suppressing exception in catch: ${t.getMessage}", t) + logWarning(log"Suppressing exception in catch: ${MDC(ERROR, t.getMessage)}", t) } } throw originalThrowable @@ -1340,7 +1345,7 @@ private[spark] object Utils } catch { case t: Throwable if (originalThrowable != null && originalThrowable != t) => originalThrowable.addSuppressed(t) - logWarning(s"Suppressing exception in finally: ${t.getMessage}", t) + logWarning(log"Suppressing exception in finally: ${MDC(ERROR, t.getMessage)}", t) throw originalThrowable } } @@ -2129,6 +2134,18 @@ private[spark] object Utils (base + offset - 1024) % (65536 - 1024) + 1024 } + /** + * Attempt to start a service on the given port, or fail after a number of attempts. + * Use a shared configuration for the maximum number of port retries. + */ + def startServiceOnPort[T]( + startPort: Int, + startService: Int => (T, Int), + conf: SparkConf, + serviceName: String = ""): (T, Int) = { + startServiceOnPort(startPort, startService, portMaxRetries(conf), serviceName) + } + /** * Attempt to start a service on the given port, or fail after a number of attempts. * Each subsequent attempt uses 1 + the port used in the previous attempt (unless the port is 0). @@ -2136,21 +2153,20 @@ private[spark] object Utils * @param startPort The initial port to start the service on. * @param startService Function to start service on a given port. * This is expected to throw java.net.BindException on port collision. - * @param conf A SparkConf used to get the maximum number of retries when binding to a port. + * @param maxRetries The maximum number of retries when binding to a port. * @param serviceName Name of the service. 
* @return (service: T, port: Int) */ def startServiceOnPort[T]( startPort: Int, startService: Int => (T, Int), - conf: SparkConf, - serviceName: String = ""): (T, Int) = { + maxRetries: Int, + serviceName: String): (T, Int) = { require(startPort == 0 || (1024 <= startPort && startPort < 65536), "startPort should be between 1024 and 65535 (inclusive), or 0 for a random free port.") val serviceString = if (serviceName.isEmpty) "" else s" '$serviceName'" - val maxRetries = portMaxRetries(conf) for (offset <- 0 to maxRetries) { // Do not increment port if startPort is 0, which is treated as a special port val tryPort = if (startPort == 0) { @@ -2160,7 +2176,8 @@ private[spark] object Utils } try { val (service, port) = startService(tryPort) - logInfo(s"Successfully started service$serviceString on port $port.") + logInfo(log"Successfully started service${MDC(SERVICE_NAME, serviceString)}" + + log" on port ${MDC(PORT, port)}.") return (service, port) } catch { case e: Exception if isBindCollision(e) => @@ -2185,11 +2202,13 @@ private[spark] object Utils if (startPort == 0) { // As startPort 0 is for a random free port, it is most possibly binding address is // not correct. - logWarning(s"Service$serviceString could not bind on a random free port. " + - "You may check whether configuring an appropriate binding address.") + logWarning(log"Service${MDC(SERVICE_NAME, serviceString)} " + + log"could not bind on a random free port. " + + log"You may check whether configuring an appropriate binding address.") } else { - logWarning(s"Service$serviceString could not bind on port $tryPort. " + - s"Attempting port ${tryPort + 1}.") + logWarning(log"Service${MDC(SERVICE_NAME, serviceString)} " + + log"could not bind on port ${MDC(PORT, tryPort)}. " + + log"Attempting port ${MDC(PORT2, tryPort + 1)}.") } } } @@ -2212,6 +2231,9 @@ private[spark] object Utils case e: NativeIoException => (e.getMessage != null && e.getMessage.startsWith("bind() failed: ")) || isBindCollision(e.getCause) + case e: IOException => + (e.getMessage != null && e.getMessage.startsWith("Failed to bind to address")) || + isBindCollision(e.getCause) case e: Exception => isBindCollision(e.getCause) case _ => false } @@ -2454,9 +2476,9 @@ private[spark] object Utils (isShuffleServiceAndYarn || isTesting) && ioEncryptionDisabled && serializerIsSupported } if (!canDoPushBasedShuffle) { - logWarning("Push-based shuffle can only be enabled when the application is submitted " + - "to run in YARN mode, with external shuffle service enabled, IO encryption disabled, " + - "and relocation of serialized objects supported.") + logWarning(log"Push-based shuffle can only be enabled when the application is submitted " + + log"to run in YARN mode, with external shuffle service enabled, IO encryption " + + log"disabled, and relocation of serialized objects supported.") } canDoPushBasedShuffle @@ -2517,15 +2539,15 @@ private[spark] object Utils */ def getDynamicAllocationInitialExecutors(conf: SparkConf): Int = { if (conf.get(DYN_ALLOCATION_INITIAL_EXECUTORS) < conf.get(DYN_ALLOCATION_MIN_EXECUTORS)) { - logWarning(s"${DYN_ALLOCATION_INITIAL_EXECUTORS.key} less than " + - s"${DYN_ALLOCATION_MIN_EXECUTORS.key} is invalid, ignoring its setting, " + - "please update your configs.") + logWarning(log"${MDC(CONFIG, DYN_ALLOCATION_INITIAL_EXECUTORS.key)} less than " + + log"${MDC(CONFIG2, DYN_ALLOCATION_MIN_EXECUTORS.key)} is invalid, ignoring its setting, " + + log"please update your configs.") } if (conf.get(EXECUTOR_INSTANCES).getOrElse(0) < 
conf.get(DYN_ALLOCATION_MIN_EXECUTORS)) { - logWarning(s"${EXECUTOR_INSTANCES.key} less than " + - s"${DYN_ALLOCATION_MIN_EXECUTORS.key} is invalid, ignoring its setting, " + - "please update your configs.") + logWarning(log"${MDC(CONFIG, EXECUTOR_INSTANCES.key)} less than " + + log"${MDC(CONFIG2, DYN_ALLOCATION_MIN_EXECUTORS.key)} is invalid, ignoring its setting, " + + log"please update your configs.") } val initialExecutors = Seq( @@ -2533,9 +2555,10 @@ private[spark] object Utils conf.get(DYN_ALLOCATION_INITIAL_EXECUTORS), conf.get(EXECUTOR_INSTANCES).getOrElse(0)).max - logInfo(s"Using initial executors = $initialExecutors, max of " + - s"${DYN_ALLOCATION_INITIAL_EXECUTORS.key}, ${DYN_ALLOCATION_MIN_EXECUTORS.key} and " + - s"${EXECUTOR_INSTANCES.key}") + logInfo(log"Using initial executors = ${MDC(NUM_EXECUTORS, initialExecutors)}, max of " + + log"${MDC(CONFIG, DYN_ALLOCATION_INITIAL_EXECUTORS.key)}, " + + log"${MDC(CONFIG2, DYN_ALLOCATION_MIN_EXECUTORS.key)} and" + + log" ${MDC(CONFIG3, EXECUTOR_INSTANCES.key)}") initialExecutors } @@ -2723,7 +2746,7 @@ private[spark] object Utils e.getCause() match { case uoe: UnsupportedOperationException => logDebug(s"Extension $name not being initialized.", uoe) - logInfo(s"Extension $name not being initialized.") + logInfo(log"Extension ${MDC(CLASS_NAME, name)} not being initialized.") None case null => throw e @@ -2747,8 +2770,8 @@ private[spark] object Utils // To handle master URLs, e.g., k8s://host:port. if (!masterWithoutK8sPrefix.contains("://")) { val resolvedURL = s"https://$masterWithoutK8sPrefix" - logInfo("No scheme specified for kubernetes master URL, so defaulting to https. Resolved " + - s"URL is $resolvedURL.") + logInfo(log"No scheme specified for kubernetes master URL, so defaulting to https." 
+ + log" Resolved URL is ${MDC(LogKeys.URL, resolvedURL)}.") return s"k8s://$resolvedURL" } @@ -2758,7 +2781,7 @@ private[spark] object Utils case Some("https") => masterWithoutK8sPrefix case Some("http") => - logWarning("Kubernetes master URL uses HTTP instead of HTTPS.") + logWarning(log"Kubernetes master URL uses HTTP instead of HTTPS.") masterWithoutK8sPrefix case _ => throw new IllegalArgumentException("Invalid Kubernetes master scheme: " + masterScheme @@ -3001,7 +3024,7 @@ private[spark] object Utils entry = in.getNextEntry() } in.close() // so that any error in closing does not get ignored - logInfo(s"Unzipped from $dfsZipFile\n\t${files.mkString("\n\t")}") + logInfo(log"Unzipped from ${MDC(PATH, dfsZipFile)}\n\t${MDC(PATHS, files.mkString("\n\t"))}") } finally { // Close everything no matter what happened IOUtils.closeQuietly(in) @@ -3118,7 +3141,8 @@ private[spark] class CallerContext( context } else { val finalContext = context.substring(0, len) - logWarning(s"Truncated Spark caller context from $context to $finalContext") + logWarning(log"Truncated Spark caller context from ${MDC(CONTEXT, context)} " + + log"to ${MDC(FINAL_CONTEXT, finalContext)}") finalContext } } diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 8224472b75458..16a2f4fb6cad9 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -29,7 +29,8 @@ import com.google.common.io.ByteStreams import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.serializer.{DeserializationStream, Serializer, SerializerManager} import org.apache.spark.storage.{BlockId, BlockManager} import org.apache.spark.util.CompletionIterator @@ -544,7 +545,7 @@ class ExternalAppendOnlyMap[K, V, C]( } if (file.exists()) { if (!file.delete()) { - logWarning(s"Error deleting ${file}") + logWarning(log"Error deleting ${MDC(FILE_NAME, file)}") } } } @@ -565,8 +566,9 @@ class ExternalAppendOnlyMap[K, V, C]( if (hasSpilled) { false } else { - logInfo(s"Task ${context.taskAttemptId()} force spilling in-memory map to disk and " + - s"it will release ${org.apache.spark.util.Utils.bytesToString(getUsed())} memory") + logInfo(log"Task ${MDC(TASK_ATTEMPT_ID, context.taskAttemptId())} force spilling" + + log" in-memory map to disk and it will release " + + log"${MDC(NUM_BYTES, org.apache.spark.util.Utils.bytesToString(getUsed()))} memory") val nextUpstream = spillMemoryIteratorToDisk(upstream) assert(!upstream.hasNext) hasSpilled = true diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 77aec10a6b126..393cdbbef0a5a 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -28,9 +28,10 @@ import com.google.common.io.ByteStreams import org.apache.spark._ import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.internal.{config, Logging} +import org.apache.spark.internal.{config, Logging, MDC} 
+import org.apache.spark.internal.LogKeys.{NUM_BYTES, TASK_ATTEMPT_ID} import org.apache.spark.serializer._ -import org.apache.spark.shuffle.ShufflePartitionPairsWriter +import org.apache.spark.shuffle.{ShufflePartitionPairsWriter, ShuffleWriteMetricsReporter} import org.apache.spark.shuffle.api.{ShuffleMapOutputWriter, ShufflePartitionWriter} import org.apache.spark.shuffle.checksum.ShuffleChecksumSupport import org.apache.spark.storage.{BlockId, DiskBlockObjectWriter, ShuffleBlockId} @@ -693,7 +694,8 @@ private[spark] class ExternalSorter[K, V, C]( def writePartitionedMapOutput( shuffleId: Int, mapId: Long, - mapOutputWriter: ShuffleMapOutputWriter): Unit = { + mapOutputWriter: ShuffleMapOutputWriter, + writeMetrics: ShuffleWriteMetricsReporter): Unit = { if (spills.isEmpty) { // Case where we only have in-memory data val collection = if (aggregator.isDefined) map else buffer @@ -710,7 +712,7 @@ private[spark] class ExternalSorter[K, V, C]( serializerManager, serInstance, blockId, - context.taskMetrics().shuffleWriteMetrics, + writeMetrics, if (partitionChecksums.nonEmpty) partitionChecksums(partitionId) else null) while (it.hasNext && it.nextPartition() == partitionId) { it.writeNext(partitionPairsWriter) @@ -734,7 +736,7 @@ private[spark] class ExternalSorter[K, V, C]( serializerManager, serInstance, blockId, - context.taskMetrics().shuffleWriteMetrics, + writeMetrics, if (partitionChecksums.nonEmpty) partitionChecksums(id) else null) if (elements.hasNext) { for (elem <- elements) { @@ -815,8 +817,9 @@ private[spark] class ExternalSorter[K, V, C]( false } else { val inMemoryIterator = new WritablePartitionedIterator[K, C](upstream) - logInfo(s"Task ${TaskContext.get().taskAttemptId()} force spilling in-memory map to disk " + - s"and it will release ${org.apache.spark.util.Utils.bytesToString(getUsed())} memory") + logInfo(log"Task ${MDC(TASK_ATTEMPT_ID, TaskContext.get().taskAttemptId())}" + + log" force spilling in-memory map to disk and it will release" + + log" ${MDC(NUM_BYTES, org.apache.spark.util.Utils.bytesToString(getUsed()))} memory") val spillFile = spillMemoryIteratorToDisk(inMemoryIterator) forceSpillFiles += spillFile val spillReader = new SpillReader(spillFile) diff --git a/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala b/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala index ffb13db515822..840fb59d410a6 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala @@ -33,7 +33,8 @@ import org.apache.logging.log4j.core.layout.PatternLayout import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config._ import org.apache.spark.network.util.JavaUtils import org.apache.spark.util.{ThreadUtils, Utils} @@ -79,7 +80,7 @@ private[spark] class DriverLogger(conf: SparkConf) extends Logging { val fa = log4jFileAppender() logger.addAppender(fa) fa.start() - logInfo(s"Added a local log appender at: $localLogFile") + logInfo(log"Added a local log appender at: ${MDC(FILE_NAME, localLogFile)}") } def startSync(hadoopConf: Configuration): Unit = { @@ -144,7 +145,7 @@ private[spark] class DriverLogger(conf: SparkConf) extends Logging { threadpool = ThreadUtils.newDaemonSingleThreadScheduledExecutor("dfsSyncThread") threadpool.scheduleWithFixedDelay(this, 
UPLOAD_INTERVAL_IN_SECS, UPLOAD_INTERVAL_IN_SECS, TimeUnit.SECONDS) - logInfo(s"Started driver log file sync to: ${dfsLogFile}") + logInfo(log"Started driver log file sync to: ${MDC(PATH, dfsLogFile)}") } def run(): Unit = { @@ -229,8 +230,8 @@ private[spark] object DriverLogger extends Logging { // Return None because we don't need DFS-related logic in SparkContext and DfsAsyncWriter None } else { - logWarning(s"Driver logs are not persisted because" + - s" ${DRIVER_LOG_DFS_DIR.key} is not configured") + logWarning(log"Driver logs are not persisted because" + + log" ${MDC(CONFIG, DRIVER_LOG_DFS_DIR.key)} is not configured") None } } else { diff --git a/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala b/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala index 1dadf15da40fa..202c919362951 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala @@ -21,7 +21,7 @@ import java.io.{File, FileOutputStream, InputStream, IOException} import org.apache.spark.SparkConf import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.{IntParam, Utils} /** @@ -138,20 +138,24 @@ private[spark] object FileAppender extends Logging { def createTimeBasedAppender(): FileAppender = { val validatedParams: Option[(Long, String)] = rollingInterval match { case "daily" => - logInfo(s"Rolling executor logs enabled for $file with daily rolling") + logInfo(log"Rolling executor logs enabled for ${MDC(FILE_NAME, file)} with daily rolling") Some((24 * 60 * 60 * 1000L, "--yyyy-MM-dd")) case "hourly" => - logInfo(s"Rolling executor logs enabled for $file with hourly rolling") + logInfo(log"Rolling executor logs enabled for ${MDC(FILE_NAME, file)}" + + log" with hourly rolling") Some((60 * 60 * 1000L, "--yyyy-MM-dd--HH")) case "minutely" => - logInfo(s"Rolling executor logs enabled for $file with rolling every minute") + logInfo(log"Rolling executor logs enabled for ${MDC(FILE_NAME, file)}" + + log" with rolling every minute") Some((60 * 1000L, "--yyyy-MM-dd--HH-mm")) case IntParam(seconds) => - logInfo(s"Rolling executor logs enabled for $file with rolling $seconds seconds") + logInfo(log"Rolling executor logs enabled for ${MDC(FILE_NAME, file)}" + + log" with rolling ${MDC(TIME_UNITS, seconds)} seconds") Some((seconds * 1000L, "--yyyy-MM-dd--HH-mm-ss")) case _ => - logWarning(s"Illegal interval for rolling executor logs [$rollingInterval], " + - s"rolling logs not enabled") + logWarning(log"Illegal interval for rolling executor logs [" + + log"${MDC(TIME_UNITS, rollingInterval)}], " + + log"rolling logs not enabled") None } validatedParams.map { @@ -167,12 +171,14 @@ private[spark] object FileAppender extends Logging { def createSizeBasedAppender(): FileAppender = { rollingSizeBytes match { case IntParam(bytes) => - logInfo(s"Rolling executor logs enabled for $file with rolling every $bytes bytes") + logInfo(log"Rolling executor logs enabled for ${MDC(FILE_NAME, file)}" + + log" with rolling every ${MDC(NUM_BYTES, bytes)} bytes") new RollingFileAppender( inputStream, file, new SizeBasedRollingPolicy(bytes), conf, closeStreams = closeStreams) case _ => logWarning( - s"Illegal size [$rollingSizeBytes] for rolling executor logs, rolling logs not enabled") + log"Illegal size [${MDC(NUM_BYTES, rollingSizeBytes)}] " + + log"for rolling executor logs, rolling logs not enabled") new 
FileAppender(inputStream, file, closeStreams = closeStreams) } } diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala index f8f144f6e3885..6927c119a91c5 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala @@ -25,7 +25,7 @@ import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.internal.{config, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys._ import org.apache.spark.util.ArrayImplicits._ /** @@ -118,7 +118,7 @@ private[spark] class RollingFileAppender( if (activeFile.exists) { if (!rolloverFileExist(rolloverFile)) { rotateFile(activeFile, rolloverFile) - logInfo(s"Rolled over $activeFile to $rolloverFile") + logInfo(log"Rolled over ${MDC(FILE_NAME, activeFile)} to ${MDC(FILE_NAME2, rolloverFile)}") } else { // In case the rollover file name clashes, make a unique file name. // The resultant file names are long and ugly, so this is used only @@ -132,12 +132,13 @@ private[spark] class RollingFileAppender( i += 1 } while (i < 10000 && rolloverFileExist(altRolloverFile)) - logWarning(s"Rollover file $rolloverFile already exists, " + - s"rolled over $activeFile to file $altRolloverFile") + logWarning(log"Rollover file ${MDC(FILE_NAME, rolloverFile)} already exists, " + + log"rolled over ${MDC(FILE_NAME2, activeFile)} " + + log"to file ${MDC(FILE_NAME3, altRolloverFile)}") rotateFile(activeFile, altRolloverFile) } } else { - logWarning(s"File $activeFile does not exist") + logWarning(log"File ${MDC(FILE_NAME, activeFile)} does not exist") } } @@ -152,7 +153,8 @@ private[spark] class RollingFileAppender( val filesToBeDeleted = rolledoverFiles.take( math.max(0, rolledoverFiles.length - maxRetainedFiles)) filesToBeDeleted.foreach { file => - logInfo(s"Deleting file executor log file ${file.getAbsolutePath}") + logInfo(log"Deleting file executor log file" + + log" ${MDC(FILE_ABSOLUTE_PATH, file.getAbsolutePath)}") file.delete() } } catch { diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala index 5327ecd3e56a9..310e895930943 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala @@ -20,7 +20,8 @@ package org.apache.spark.util.logging import java.text.SimpleDateFormat import java.util.{Calendar, Locale} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ /** * Defines the policy based on which [[org.apache.spark.util.logging.RollingFileAppender]] will @@ -53,8 +54,9 @@ private[spark] class TimeBasedRollingPolicy( import TimeBasedRollingPolicy._ if (checkIntervalConstraint && rolloverIntervalMillis < MINIMUM_INTERVAL_SECONDS * 1000L) { - logWarning(s"Rolling interval [${rolloverIntervalMillis/1000L} seconds] is too small. " + - s"Setting the interval to the acceptable minimum of $MINIMUM_INTERVAL_SECONDS seconds.") + logWarning(log"Rolling interval [${MDC(TIME_UNITS, rolloverIntervalMillis)} " + + log"ms] is too small. 
Setting the interval to the acceptable minimum of " + + log"${MDC(MIN_TIME, MINIMUM_INTERVAL_SECONDS * 1000)} ms.") rolloverIntervalMillis = MINIMUM_INTERVAL_SECONDS * 1000L } @@ -103,8 +105,9 @@ private[spark] class SizeBasedRollingPolicy( import SizeBasedRollingPolicy._ if (checkSizeConstraint && rolloverSizeBytes < MINIMUM_SIZE_BYTES) { - logWarning(s"Rolling size [$rolloverSizeBytes bytes] is too small. " + - s"Setting the size to the acceptable minimum of $MINIMUM_SIZE_BYTES bytes.") + logWarning(log"Rolling size [${MDC(NUM_BYTES, rolloverSizeBytes)} bytes] is too small. " + + log"Setting the size to the acceptable minimum of ${MDC(MIN_SIZE, MINIMUM_SIZE_BYTES)} " + + log"bytes.") rolloverSizeBytes = MINIMUM_SIZE_BYTES } diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java index 063d391bb4bfd..d95d648c2d732 100644 --- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java @@ -314,7 +314,8 @@ public void writeWithoutSpilling() throws Exception { @Test public void writeChecksumFileWithoutSpill() throws Exception { - IndexShuffleBlockResolver blockResolver = new IndexShuffleBlockResolver(conf, blockManager); + IndexShuffleBlockResolver blockResolver = + new IndexShuffleBlockResolver(conf, blockManager, Collections.emptyMap()); ShuffleChecksumBlockId checksumBlockId = new ShuffleChecksumBlockId(0, 0, IndexShuffleBlockResolver.NOOP_REDUCE_ID()); String checksumAlgorithm = conf.get(package$.MODULE$.SHUFFLE_CHECKSUM_ALGORITHM()); @@ -344,7 +345,8 @@ public void writeChecksumFileWithoutSpill() throws Exception { @Test public void writeChecksumFileWithSpill() throws Exception { - IndexShuffleBlockResolver blockResolver = new IndexShuffleBlockResolver(conf, blockManager); + IndexShuffleBlockResolver blockResolver = + new IndexShuffleBlockResolver(conf, blockManager, Collections.emptyMap()); ShuffleChecksumBlockId checksumBlockId = new ShuffleChecksumBlockId(0, 0, IndexShuffleBlockResolver.NOOP_REDUCE_ID()); String checksumAlgorithm = conf.get(package$.MODULE$.SHUFFLE_CHECKSUM_ALGORITHM()); diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index 874f4896bb01e..7a39ba4ab382b 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -669,4 +669,20 @@ class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext { assert(rdd.firstParent.isInstanceOf[ReliableCheckpointRDD[_]]) } } + + test("SPARK-48268: checkpoint directory via configuration") { + withTempDir { checkpointDir => + val conf = new SparkConf() + .set("spark.checkpoint.dir", checkpointDir.toString) + .set(UI_ENABLED.key, "false") + sc = new SparkContext("local", "test", conf) + val parCollection = sc.makeRDD(1 to 4) + val flatMappedRDD = parCollection.flatMap(x => 1 to x) + flatMappedRDD.checkpoint() + assert(flatMappedRDD.dependencies.head.rdd === parCollection) + val result = flatMappedRDD.collect() + assert(flatMappedRDD.dependencies.head.rdd != parCollection) + assert(flatMappedRDD.collect() === result) + } + } } diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index c15fdf098bb56..58cf14e969e50 100644 --- 
a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark import java.util.concurrent.{Semaphore, TimeUnit} import java.util.concurrent.atomic.AtomicInteger +import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} // scalastyle:off executioncontextglobal import scala.concurrent.ExecutionContext.Implicits.global @@ -29,9 +30,10 @@ import scala.concurrent.duration._ import org.scalatest.BeforeAndAfter import org.scalatest.matchers.must.Matchers +import org.apache.spark.executor.ExecutorExitCode import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Deploy._ -import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerTaskEnd, SparkListenerTaskStart} +import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorRemoved, SparkListenerJobEnd, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerTaskEnd, SparkListenerTaskStart} import org.apache.spark.util.ThreadUtils /** @@ -429,12 +431,20 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft .set(TASK_REAPER_KILL_TIMEOUT.key, "5s") sc = new SparkContext("local-cluster[2,1,1024]", "test", conf) - // Add a listener to release the semaphore once any tasks are launched. + // Add a listener to release a semaphore once any tasks are launched, and another semaphore + // once an executor is removed. val sem = new Semaphore(0) + val semExec = new Semaphore(0) + val execLossReason = new ArrayBuffer[String]() sc.addSparkListener(new SparkListener { override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } + + override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { + execLossReason += executorRemoved.reason + semExec.release() + } }) // jobA is the one to be cancelled. @@ -455,6 +465,9 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft sc.cancelJobGroup("jobA") val e = intercept[SparkException] { ThreadUtils.awaitResult(jobA, 15.seconds) }.getCause assert(e.getMessage contains "cancel") + semExec.acquire(2) + val expectedReason = s"Command exited with code ${ExecutorExitCode.KILLED_BY_TASK_REAPER}" + assert(execLossReason == Seq(expectedReason, expectedReason)) // Once A is cancelled, job B should finish fairly quickly. 
assert(ThreadUtils.awaitResult(jobB, 1.minute) === 100) diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index 7aec8eeaad423..26dc218c30c74 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -1110,4 +1110,59 @@ class MapOutputTrackerSuite extends SparkFunSuite with LocalSparkContext { rpcEnv.shutdown() } } + + test( + "SPARK-48394: mapIdToMapIndex should cleanup unused mapIndexes after removeOutputsByFilter" + ) { + val rpcEnv = createRpcEnv("test") + val tracker = newTrackerMaster() + try { + tracker.trackerEndpoint = rpcEnv.setupEndpoint(MapOutputTracker.ENDPOINT_NAME, + new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, conf)) + tracker.registerShuffle(0, 1, 1) + tracker.registerMapOutput(0, 0, MapStatus(BlockManagerId("exec-1", "hostA", 1000), + Array(2L), 0)) + tracker.removeOutputsOnHost("hostA") + assert(tracker.shuffleStatuses(0).mapIdToMapIndex.filter(_._2 == 0).size == 0) + } finally { + tracker.stop() + rpcEnv.shutdown() + } + } + + test("SPARK-48394: mapIdToMapIndex should cleanup unused mapIndexes after unregisterMapOutput") { + val rpcEnv = createRpcEnv("test") + val tracker = newTrackerMaster() + try { + tracker.trackerEndpoint = rpcEnv.setupEndpoint(MapOutputTracker.ENDPOINT_NAME, + new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, conf)) + tracker.registerShuffle(0, 1, 1) + tracker.registerMapOutput(0, 0, MapStatus(BlockManagerId("exec-1", "hostA", 1000), + Array(2L), 0)) + tracker.unregisterMapOutput(0, 0, BlockManagerId("exec-1", "hostA", 1000)) + assert(tracker.shuffleStatuses(0).mapIdToMapIndex.filter(_._2 == 0).size == 0) + } finally { + tracker.stop() + rpcEnv.shutdown() + } + } + + test("SPARK-48394: mapIdToMapIndex should cleanup unused mapIndexes after registerMapOutput") { + val rpcEnv = createRpcEnv("test") + val tracker = newTrackerMaster() + try { + tracker.trackerEndpoint = rpcEnv.setupEndpoint(MapOutputTracker.ENDPOINT_NAME, + new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, conf)) + tracker.registerShuffle(0, 1, 1) + tracker.registerMapOutput(0, 0, MapStatus(BlockManagerId("exec-1", "hostA", 1000), + Array(2L), 0)) + // Another task also finished working on partition 0. 
+ tracker.registerMapOutput(0, 0, MapStatus(BlockManagerId("exec-2", "hostB", 1000), + Array(2L), 1)) + assert(tracker.shuffleStatuses(0).mapIdToMapIndex.filter(_._2 == 0).size == 1) + } finally { + tracker.stop() + rpcEnv.shutdown() + } + } } diff --git a/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala b/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala index ca85ffda4e602..75f952d063d33 100644 --- a/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala @@ -123,7 +123,6 @@ object MapStatusesSerDeserBenchmark extends BenchmarkBase { } override def afterAll(): Unit = { - tracker.stop() if (sc != null) { sc.stop() } diff --git a/core/src/test/scala/org/apache/spark/SparkThrowableSuite.scala b/core/src/test/scala/org/apache/spark/SparkThrowableSuite.scala index 1492543c56bbc..231cfdc3f32fc 100644 --- a/core/src/test/scala/org/apache/spark/SparkThrowableSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkThrowableSuite.scala @@ -20,11 +20,8 @@ package org.apache.spark import java.io.File import java.nio.charset.StandardCharsets import java.nio.file.Files -import java.util.Locale -import scala.jdk.CollectionConverters._ import scala.util.Properties.lineSeparator -import scala.util.matching.Regex import com.fasterxml.jackson.annotation.JsonInclude.Include import com.fasterxml.jackson.core.JsonParser.Feature.STRICT_DUPLICATE_DETECTION @@ -48,19 +45,13 @@ class SparkThrowableSuite extends SparkFunSuite { SPARK_GENERATE_GOLDEN_FILES=1 build/sbt \ "core/testOnly *SparkThrowableSuite -- -t \"Error classes are correctly formatted\"" }}} - - To regenerate the error class document. Run: - {{{ - SPARK_GENERATE_GOLDEN_FILES=1 build/sbt \ - "core/testOnly *SparkThrowableSuite -- -t \"Error classes match with document\"" - }}} */ private val regenerateCommand = "SPARK_GENERATE_GOLDEN_FILES=1 build/sbt " + "\"core/testOnly *SparkThrowableSuite -- -t \\\"Error classes match with document\\\"\"" private val errorJsonFilePath = getWorkspaceFilePath( // Note that though we call them "error classes" here, the proper name is "error conditions", - // hence why the name of the JSON file different. We will address this inconsistency as part + // hence why the name of the JSON file is different. 
We will address this inconsistency as part // of this ticket: https://issues.apache.org/jira/browse/SPARK-47429 "common", "utils", "src", "main", "resources", "error", "error-conditions.json") @@ -173,219 +164,6 @@ class SparkThrowableSuite extends SparkFunSuite { checkIfUnique(messageFormats) } - test("Error classes match with document") { - val errors = errorReader.errorInfoMap - - // the black list of error class name which should not add quote - val contentQuoteBlackList = Seq( - "INCOMPLETE_TYPE_DEFINITION.MAP", - "INCOMPLETE_TYPE_DEFINITION.STRUCT") - - def quoteParameter(content: String, errorName: String): String = { - if (contentQuoteBlackList.contains(errorName)) { - content - } else { - "<(.*?)>".r.replaceAllIn(content, (m: Regex.Match) => { - val matchStr = m.group(1) - if (matchStr.nonEmpty) { - s"`<$matchStr>`" - } else { - m.matched - } - }).replaceAll("%(.*?)\\$", "`\\%$1\\$`") - } - } - - val sqlStates = IOUtils.toString(getWorkspaceFilePath("docs", - "sql-error-conditions-sqlstates.md").toUri, StandardCharsets.UTF_8).split("\n") - .filter(_.startsWith("##")).map(s => { - - val errorHeader = s.split("[`|:|#|\\s]+").filter(_.nonEmpty) - val sqlState = errorHeader(1) - (sqlState, errorHeader.head.toLowerCase(Locale.ROOT) + "-" + sqlState + "-" + - errorHeader.takeRight(errorHeader.length - 2).mkString("-").toLowerCase(Locale.ROOT)) - }).toMap - - def getSqlState(sqlState: Option[String]): String = { - if (sqlState.isDefined) { - val prefix = sqlState.get.substring(0, 2) - if (sqlStates.contains(prefix)) { - s"[SQLSTATE: ${sqlState.get}](sql-error-conditions-sqlstates.html#${sqlStates(prefix)})" - } else { - "SQLSTATE: " + sqlState.get - } - } else { - "SQLSTATE: none assigned" - } - } - - def getErrorPath(error: String): String = { - s"sql-error-conditions-${error.toLowerCase(Locale.ROOT).replaceAll("_", "-")}-error-class" - } - - def getHeader(title: String): String = { - s"""--- - |layout: global - |title: $title - |displayTitle: $title - |license: | - | Licensed to the Apache Software Foundation (ASF) under one or more - | contributor license agreements. See the NOTICE file distributed with - | this work for additional information regarding copyright ownership. - | The ASF licenses this file to You under the Apache License, Version 2.0 - | (the "License"); you may not use this file except in compliance with - | the License. You may obtain a copy of the License at - | - | http://www.apache.org/licenses/LICENSE-2.0 - | - | Unless required by applicable law or agreed to in writing, software - | distributed under the License is distributed on an "AS IS" BASIS, - | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - | See the License for the specific language governing permissions and - | limitations under the License. 
- |--- - | - |""".stripMargin - } - - def orphanedGoldenFiles(): Iterable[File] = { - val subErrorFileNames = errors.filter(_._2.subClass.isDefined).map(error => { - getErrorPath(error._1) + ".md" - }).toSet - - val docsDir = getWorkspaceFilePath("docs") - val orphans = FileUtils.listFiles(docsDir.toFile, Array("md"), false).asScala.filter { f => - (f.getName.startsWith("sql-error-conditions-") && f.getName.endsWith("-error-class.md")) && - !subErrorFileNames.contains(f.getName) - } - orphans - } - - val sqlErrorParentDocContent = errors.toSeq.filter(!_._1.startsWith("_LEGACY_ERROR")) - .sortBy(_._1).map(error => { - val name = error._1 - val info = error._2 - if (info.subClass.isDefined) { - val title = s"[$name](${getErrorPath(name)}.html)" - s"""|### $title - | - |${getSqlState(info.sqlState)} - | - |${quoteParameter(info.messageTemplate, name)} - | - |For more details see $title - |""".stripMargin - } else { - s"""|### $name - | - |${getSqlState(info.sqlState)} - | - |${quoteParameter(info.messageTemplate, name)} - |""".stripMargin - } - }).mkString("\n") - - val sqlErrorParentDoc = - s"""${getHeader("Error Conditions")} - | - |This is a list of common, named error conditions returned by Spark SQL. - | - |Also see [SQLSTATE Codes](sql-error-conditions-sqlstates.html). - | - |$sqlErrorParentDocContent""".stripMargin - - errors.filter(_._2.subClass.isDefined).foreach(error => { - val name = error._1 - val info = error._2 - - val subErrorContent = info.subClass.get.toSeq.sortBy(_._1).map(subError => { - s"""|## ${subError._1} - | - |${quoteParameter(subError._2.messageTemplate, s"$name.${subError._1}")} - |""".stripMargin - }).mkString("\n") - - val subErrorDoc = - s"""${getHeader(name + " error class")} - | - |${getSqlState(info.sqlState)} - | - |${quoteParameter(info.messageTemplate, name)} - | - |This error class has the following derived error classes: - | - |$subErrorContent - |""".stripMargin - - val errorDocPath = getWorkspaceFilePath("docs", getErrorPath(name) + ".md") - val errorsInDoc = if (errorDocPath.toFile.exists()) { - IOUtils.toString(errorDocPath.toUri, StandardCharsets.UTF_8) - } else { - "" - } - if (regenerateGoldenFiles) { - if (subErrorDoc.trim != errorsInDoc.trim) { - logInfo(s"Regenerating sub error class document $errorDocPath") - if (errorDocPath.toFile.exists()) { - Files.delete(errorDocPath) - } - FileUtils.writeStringToFile( - errorDocPath.toFile, - subErrorDoc + lineSeparator, - StandardCharsets.UTF_8) - } - } else { - assert(subErrorDoc.trim == errorsInDoc.trim, - "The error class document is not up to date. " + - s"Please regenerate it by running `$regenerateCommand`") - } - }) - - val parentDocPath = getWorkspaceFilePath("docs", "sql-error-conditions.md") - val commonErrorsInDoc = if (parentDocPath.toFile.exists()) { - IOUtils.toString(parentDocPath.toUri, StandardCharsets.UTF_8) - } else { - "" - } - if (regenerateGoldenFiles) { - if (sqlErrorParentDoc.trim != commonErrorsInDoc.trim) { - logInfo(s"Regenerating error class document $parentDocPath") - if (parentDocPath.toFile.exists()) { - Files.delete(parentDocPath) - } - FileUtils.writeStringToFile( - parentDocPath.toFile, - sqlErrorParentDoc, - StandardCharsets.UTF_8) - } - } else { - assert(sqlErrorParentDoc.trim == commonErrorsInDoc.trim, - "The error class document is not up to date. 
" + - s"Please regenerate it by running `$regenerateCommand`") - } - - val orphans = orphanedGoldenFiles() - if (regenerateGoldenFiles) { - if (orphans.nonEmpty) { - logInfo(s"Orphaned error class documents (${orphans.size}) is not empty, " + - "executing cleanup operation.") - orphans.foreach { f => - FileUtils.deleteQuietly(f) - logInfo(s"Cleanup orphaned error document: ${f.getName}.") - } - } else { - logInfo("Orphaned error class documents is empty") - } - } else { - assert(orphans.isEmpty, - "Exist orphaned error class documents. " + - s"Please regenerate it by running `$regenerateCommand`") - } - } - test("Round trip") { val tmpFile = File.createTempFile("rewritten", ".json") val mapper = JsonMapper.builder() diff --git a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala index a8ede31f1d30d..77f5268f79cae 100644 --- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala @@ -127,7 +127,7 @@ class RPackageUtilsSuite RPackageUtils.checkAndBuildRPackage(jar.getAbsolutePath, new BufferPrintStream, verbose = true) val output = lineBuffer.mkString("\n") - assert(output.contains(RPackageUtils.RJarDoc)) + assert(output.contains(RPackageUtils.RJarDoc.message)) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index f55c00d7d61a5..40d8eae644a07 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -1107,12 +1107,49 @@ class SparkSubmitSuite "--master", "local", unusedJar.toString) val appArgs = new SparkSubmitArguments(args, env = Map("SPARK_CONF_DIR" -> path)) - assert(appArgs.propertiesFile != null) - assert(appArgs.propertiesFile.startsWith(path)) appArgs.executorMemory should be ("3g") } } + test("SPARK-48392: load spark-defaults.conf when --load-spark-defaults is set") { + forConfDir(Map("spark.executor.memory" -> "3g", "spark.driver.memory" -> "3g")) { path => + withPropertyFile("spark-conf.properties", + Map("spark.executor.cores" -> "16", "spark.driver.memory" -> "4g")) { propsFile => + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + val args = Seq( + "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), + "--name", "testApp", + "--master", "local", + "--properties-file", propsFile, + "--load-spark-defaults", + unusedJar.toString) + val appArgs = new SparkSubmitArguments(args, env = Map("SPARK_CONF_DIR" -> path)) + appArgs.executorCores should be("16") + appArgs.executorMemory should be("3g") + appArgs.driverMemory should be("4g") + } + } + } + + test("SPARK-48392: should skip spark-defaults.conf when --load-spark-defaults is not set") { + forConfDir(Map("spark.executor.memory" -> "3g", "spark.driver.memory" -> "3g")) { path => + withPropertyFile("spark-conf.properties", + Map("spark.executor.cores" -> "16", "spark.driver.memory" -> "4g")) { propsFile => + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + val args = Seq( + "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), + "--name", "testApp", + "--master", "local", + "--properties-file", propsFile, + unusedJar.toString) + val appArgs = new SparkSubmitArguments(args, env = Map("SPARK_CONF_DIR" -> path)) + appArgs.executorCores should be("16") + appArgs.driverMemory should be("4g") + 
appArgs.executorMemory should be(null) + } + } + } + test("support glob path") { withTempDir { tmpJarDir => withTempDir { tmpFileDir => @@ -1623,6 +1660,22 @@ class SparkSubmitSuite } } + private def withPropertyFile(fileName: String, conf: Map[String, String])(f: String => Unit) = { + withTempDir { tmpDir => + val props = new java.util.Properties() + val propsFile = File.createTempFile(fileName, "", tmpDir) + val propsOutputStream = new FileOutputStream(propsFile) + try { + conf.foreach { case (k, v) => props.put(k, v) } + props.store(propsOutputStream, "") + } finally { + propsOutputStream.close() + } + + f(propsFile.getPath) + } + } + private def updateConfWithFakeS3Fs(conf: Configuration): Unit = { conf.set("fs.s3a.impl", classOf[TestFileSystem].getCanonicalName) conf.set("fs.s3a.impl.disable.cache", "true") @@ -1694,40 +1747,31 @@ class SparkSubmitSuite val infixDelimFromFile = s"${delimKey}infixDelimFromFile" -> s"${CR}blah${LF}" val nonDelimSpaceFromFile = s"${delimKey}nonDelimSpaceFromFile" -> " blah\f" - val testProps = Seq(leadingDelimKeyFromFile, trailingDelimKeyFromFile, infixDelimFromFile, + val testProps = Map(leadingDelimKeyFromFile, trailingDelimKeyFromFile, infixDelimFromFile, nonDelimSpaceFromFile) - val props = new java.util.Properties() - val propsFile = File.createTempFile("test-spark-conf", ".properties", - Utils.createTempDir()) - val propsOutputStream = new FileOutputStream(propsFile) - try { - testProps.foreach { case (k, v) => props.put(k, v) } - props.store(propsOutputStream, "test whitespace") - } finally { - propsOutputStream.close() - } + withPropertyFile("test-spark-conf.properties", testProps) { propsFile => + val clArgs = Seq( + "--class", "org.SomeClass", + "--conf", s"${lineFeedFromCommandLine._1}=${lineFeedFromCommandLine._2}", + "--conf", "spark.master=yarn", + "--properties-file", propsFile, + "thejar.jar") - val clArgs = Seq( - "--class", "org.SomeClass", - "--conf", s"${lineFeedFromCommandLine._1}=${lineFeedFromCommandLine._2}", - "--conf", "spark.master=yarn", - "--properties-file", propsFile.getPath, - "thejar.jar") + val appArgs = new SparkSubmitArguments(clArgs) + val (_, _, conf, _) = submit.prepareSubmitEnvironment(appArgs) - val appArgs = new SparkSubmitArguments(clArgs) - val (_, _, conf, _) = submit.prepareSubmitEnvironment(appArgs) + Seq( + lineFeedFromCommandLine, + leadingDelimKeyFromFile, + trailingDelimKeyFromFile, + infixDelimFromFile + ).foreach { case (k, v) => + conf.get(k) should be (v) + } - Seq( - lineFeedFromCommandLine, - leadingDelimKeyFromFile, - trailingDelimKeyFromFile, - infixDelimFromFile - ).foreach { case (k, v) => - conf.get(k) should be (v) + conf.get(nonDelimSpaceFromFile._1) should be ("blah") } - - conf.get(nonDelimSpaceFromFile._1) should be ("blah") } test("get a Spark configuration from arguments") { diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala index 5903ae71ec66e..2b9b110a41424 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala @@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets._ import com.google.common.io.Files import org.apache.spark._ +import org.apache.spark.internal.config.{ConfigEntry, History} import org.apache.spark.internal.config.History._ import org.apache.spark.internal.config.Tests._ @@ -52,4 +53,16 @@ class 
HistoryServerArgumentsSuite extends SparkFunSuite { assert(conf.get("spark.test.CustomPropertyB") === "notblah") } } + + test("SPARK-48471: all history configurations should have documentations") { + val configs = History.getClass.getDeclaredFields + .filter(f => classOf[ConfigEntry[_]].isAssignableFrom(f.getType)) + .map { f => + f.setAccessible(true) + f.get(History).asInstanceOf[ConfigEntry[_]] + } + configs.foreach { config => + assert(config.doc.nonEmpty, s"Config ${config.key} doesn't have documentation") + } + } } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 0d739b647eab9..91a93bbe01d7f 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -626,28 +626,6 @@ abstract class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with } test("access history application defaults to the last attempt id") { - - def getRedirectUrl(url: URL): (Int, String) = { - val connection = url.openConnection().asInstanceOf[HttpURLConnection] - connection.setRequestMethod("GET") - connection.setUseCaches(false) - connection.setDefaultUseCaches(false) - connection.setInstanceFollowRedirects(false) - connection.connect() - val code = connection.getResponseCode() - val location = connection.getHeaderField("Location") - (code, location) - } - - def buildPageAttemptUrl(appId: String, attemptId: Option[Int]): URL = { - attemptId match { - case Some(id) => - new URL(s"http://$localhost:$port/history/$appId/$id") - case None => - new URL(s"http://$localhost:$port/history/$appId") - } - } - val oneAttemptAppId = "local-1430917381534" HistoryServerSuite.getUrl(buildPageAttemptUrl(oneAttemptAppId, None)) @@ -668,6 +646,42 @@ abstract class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with } } + test("Redirect URLs should end with a slash") { + val oneAttemptAppId = "local-1430917381534" + val multiAttemptAppid = "local-1430917381535" + + val url = buildPageAttemptUrl(oneAttemptAppId, None) + val (code, location) = getRedirectUrl(url) + assert(code === 302, s"Unexpected status code $code for $url") + assert(location === url.toString + "/") + + val url2 = buildPageAttemptUrl(multiAttemptAppid, None) + val (code2, location2) = getRedirectUrl(url2) + assert(code2 === 302, s"Unexpected status code $code2 for $url2") + assert(location2 === url2.toString + "/2/") + } + + def getRedirectUrl(url: URL): (Int, String) = { + val connection = url.openConnection().asInstanceOf[HttpURLConnection] + connection.setRequestMethod("GET") + connection.setUseCaches(false) + connection.setDefaultUseCaches(false) + connection.setInstanceFollowRedirects(false) + connection.connect() + val code = connection.getResponseCode() + val location = connection.getHeaderField("Location") + (code, location) + } + + def buildPageAttemptUrl(appId: String, attemptId: Option[Int]): URL = { + attemptId match { + case Some(id) => + new URL(s"http://$localhost:$port/history/$appId/$id") + case None => + new URL(s"http://$localhost:$port/history/$appId") + } + } + def getContentAndCode(path: String, port: Int = port): (Int, Option[String], Option[String]) = { HistoryServerSuite.getContentAndCode(new URL(s"http://$localhost:$port/api/v1/$path")) } diff --git a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineBenchmark.scala 
b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineBenchmark.scala index 34a447efe5281..2a06ee5ed947b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineBenchmark.scala @@ -29,7 +29,7 @@ import org.apache.spark.deploy.{DeployTestUtils, DriverDescription} import org.apache.spark.internal.config.Deploy.ZOOKEEPER_URL import org.apache.spark.io.CompressionCodec import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} -import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} +import org.apache.spark.serializer.JavaSerializer import org.apache.spark.util.Utils @@ -49,7 +49,7 @@ import org.apache.spark.util.Utils object PersistenceEngineBenchmark extends BenchmarkBase { val conf = new SparkConf() - val serializers = Seq(new JavaSerializer(conf), new KryoSerializer(conf)) + val serializers = Seq(new JavaSerializer(conf)) val zkTestServer = new TestingServer(findFreePort(conf)) override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { diff --git a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala index 01b7e46eb2a8f..6839afdeeff8e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.internal.config.Deploy.ZOOKEEPER_URL import org.apache.spark.io.CompressionCodec import org.apache.spark.rpc.{RpcEndpoint, RpcEnv} -import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, Serializer} +import org.apache.spark.serializer.{JavaSerializer, Serializer} import org.apache.spark.util.Utils class PersistenceEngineSuite extends SparkFunSuite { @@ -103,18 +103,6 @@ class PersistenceEngineSuite extends SparkFunSuite { } } - test("SPARK-46205: Support KryoSerializer in FileSystemPersistenceEngine") { - withTempDir { dir => - val conf = new SparkConf() - val serializer = new KryoSerializer(conf) - val engine = new FileSystemPersistenceEngine(dir.getAbsolutePath, serializer) - engine.persist("test_1", "test_1_value") - engine.read[String]("test_1") - engine.unpersist("test_1") - engine.close() - } - } - test("SPARK-46216: FileSystemPersistenceEngine with compression") { val conf = new SparkConf() CompressionCodec.ALL_COMPRESSION_CODECS.foreach { c => diff --git a/core/src/test/scala/org/apache/spark/deploy/master/RecoverySuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/RecoverySuite.scala index 5e2939738cdfb..18b22e7352c92 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/RecoverySuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/RecoverySuite.scala @@ -38,7 +38,7 @@ import org.apache.spark.io.LZ4CompressionCodec import org.apache.spark.resource.{ResourceInformation, ResourceProfile, ResourceRequirement} import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} import org.apache.spark.rpc.{RpcAddress, RpcEndpoint, RpcEndpointRef, RpcEnv} -import org.apache.spark.serializer.KryoSerializer +import org.apache.spark.serializer.JavaSerializer class RecoverySuite extends MasterSuiteBase { test("can use a custom recovery mode factory") { @@ -474,26 +474,6 @@ class RecoverySuite extends MasterSuiteBase { } } - test("SPARK-46205: Recovery with Kryo Serializer") { - val 
conf = new SparkConf(loadDefaults = false) - conf.set(RECOVERY_MODE, "FILESYSTEM") - conf.set(RECOVERY_SERIALIZER, "Kryo") - conf.set(RECOVERY_DIRECTORY, System.getProperty("java.io.tmpdir")) - - var master: Master = null - try { - master = makeAliveMaster(conf) - val e = master.invokePrivate(_persistenceEngine()).asInstanceOf[FileSystemPersistenceEngine] - assert(e.serializer.isInstanceOf[KryoSerializer]) - } finally { - if (master != null) { - master.rpcEnv.shutdown() - master.rpcEnv.awaitTermination() - master = null - } - } - } - test("SPARK-46216: Recovery without compression") { val conf = new SparkConf(loadDefaults = false) conf.set(RECOVERY_MODE, "FILESYSTEM") @@ -536,14 +516,13 @@ class RecoverySuite extends MasterSuiteBase { test("SPARK-46258: Recovery with RocksDB") { val conf = new SparkConf(loadDefaults = false) conf.set(RECOVERY_MODE, "ROCKSDB") - conf.set(RECOVERY_SERIALIZER, "Kryo") conf.set(RECOVERY_DIRECTORY, System.getProperty("java.io.tmpdir")) var master: Master = null try { master = makeAliveMaster(conf) val e = master.invokePrivate(_persistenceEngine()).asInstanceOf[RocksDBPersistenceEngine] - assert(e.serializer.isInstanceOf[KryoSerializer]) + assert(e.serializer.isInstanceOf[JavaSerializer]) } finally { if (master != null) { master.rpcEnv.shutdown() diff --git a/core/src/test/scala/org/apache/spark/deploy/master/ui/ReadOnlyMasterWebUISuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/ui/ReadOnlyMasterWebUISuite.scala index d7f05754a7cd1..20ff932eb01a3 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/ui/ReadOnlyMasterWebUISuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/ui/ReadOnlyMasterWebUISuite.scala @@ -17,7 +17,9 @@ package org.apache.spark.deploy.master.ui -import jakarta.servlet.http.HttpServletResponse.SC_METHOD_NOT_ALLOWED +import scala.io.Source + +import jakarta.servlet.http.HttpServletResponse.{SC_METHOD_NOT_ALLOWED, SC_OK} import org.mockito.Mockito.{mock, when} import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} @@ -74,4 +76,14 @@ class ReadOnlyMasterWebUISuite extends SparkFunSuite { val body = convPostDataToString(hostnames.map(("host", _))) assert(sendHttpRequest(url, "POST", body).getResponseCode === SC_METHOD_NOT_ALLOWED) } + + test("SPARK-47894: /environment") { + val url = s"http://${Utils.localHostNameForURI()}:${masterWebUI.boundPort}/environment" + val conn = sendHttpRequest(url, "GET", "") + assert(conn.getResponseCode === SC_OK) + val result = Source.fromInputStream(conn.getInputStream).mkString + assert(result.contains("Runtime Information")) + assert(result.contains("Spark Properties")) + assert(result.contains("Hadoop Properties")) + } } diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala index 26c1be259ad23..1d258888fa0bc 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.input import java.io.{DataOutputStream, File, FileOutputStream} -import scala.collection.immutable.IndexedSeq - import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} /** diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala index 0fc0b7536067e..9caf778de3848 100644 --- 
a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala @@ -21,8 +21,6 @@ import java.io.DataOutputStream import java.io.File import java.io.FileOutputStream -import scala.collection.immutable.IndexedSeq - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.{CompressionCodecFactory, GzipCodec} diff --git a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala index 38063c47ec96a..ae99735084056 100644 --- a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala @@ -196,12 +196,6 @@ class ConfigEntrySuite extends SparkFunSuite { assert(conversionError.getMessage === s"${conversionTest.key} should be double, but was abc") } - test("default value handling is null-safe") { - val conf = new SparkConf() - val stringConf = ConfigBuilder(testKey("string")).stringConf.createWithDefault(null) - assert(conf.get(stringConf) === null) - } - test("variable expansion of spark config entries") { val env = Map("ENV1" -> "env1") val conf = new SparkConfWithEnv(env) @@ -220,7 +214,7 @@ class ConfigEntrySuite extends SparkFunSuite { val refConf = ConfigBuilder(testKey("configReferenceTest")) .stringConf - .createWithDefault(null) + .createWithDefault("") def ref(entry: ConfigEntry[_]): String = "${" + entry.key + "}" @@ -250,12 +244,6 @@ class ConfigEntrySuite extends SparkFunSuite { // Make sure SparkConf's env override works. conf.set(refConf, "${env:ENV1}") assert(conf.get(refConf) === env("ENV1")) - - // Conf with null default value is not expanded. 
- val nullConf = ConfigBuilder(testKey("nullString")) - .stringConf - .createWithDefault(null) - testEntryRef(nullConf, ref(nullConf)) } test("conf entry : default function") { diff --git a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala index cdbe5553bc95d..79fa8d21bf3f1 100644 --- a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.internal.plugin import java.io.File import java.nio.charset.StandardCharsets import java.util.{Map => JMap} +import java.util.concurrent.CountDownLatch import java.util.concurrent.atomic.AtomicInteger import scala.concurrent.duration._ @@ -40,6 +41,7 @@ import org.apache.spark.memory.MemoryMode import org.apache.spark.resource.ResourceInformation import org.apache.spark.resource.ResourceUtils.GPU import org.apache.spark.resource.TestResourceIDs.{DRIVER_GPU_ID, EXECUTOR_GPU_ID, WORKER_GPU_ID} +import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.util.Utils class PluginContainerSuite extends SparkFunSuite with LocalSparkContext { @@ -256,6 +258,40 @@ class PluginContainerSuite extends SparkFunSuite with LocalSparkContext { } } } + + test("The plugin should be shutdown before the listener bus is stopped") { + + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local[1]") + .set(PLUGINS, Seq(classOf[TestSparkPlugin].getName())) + + val sc = new SparkContext(conf) + + val countDownLatch = new CountDownLatch(1) + sc.addSparkListener(new SparkListener { + + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case _: TestSparkPluginEvent => + // Count down upon receiving the event sent from the plugin during shutdown. + countDownLatch.countDown() + } + } + }) + + TestSparkPlugin.driverPluginShutdownHook = () => { + // The listener bus should still be active when the plugin is shutdown + sc.listenerBus.post(TestSparkPluginEvent()) + } + + // Stop the context + sc.stop() + countDownLatch.await() + // The listener should receive the event posted by the plugin on shutdown. + // If the listener bus is stopped before the plugin is shutdown, + // then the event will be dropped and won't be delivered to the listener. 
+ } } class MemoryOverridePlugin extends SparkPlugin { @@ -392,6 +428,12 @@ private class TestDriverPlugin extends DriverPlugin { case other => throw new IllegalArgumentException(s"unknown: $other") } + override def shutdown(): Unit = { + if (TestSparkPlugin.driverPluginShutdownHook != null) { + TestSparkPlugin.driverPluginShutdownHook() + } + } + } private class TestExecutorPlugin extends ExecutorPlugin { @@ -420,9 +462,12 @@ private class TestExecutorPlugin extends ExecutorPlugin { } } +case class TestSparkPluginEvent() extends SparkListenerEvent + private object TestSparkPlugin { var driverPlugin: TestDriverPlugin = _ var driverContext: PluginContext = _ + var driverPluginShutdownHook: () => Unit = _ var executorPlugin: TestExecutorPlugin = _ var executorContext: PluginContext = _ @@ -432,6 +477,7 @@ private object TestSparkPlugin { def reset(): Unit = { driverPlugin = null driverContext = null + driverPluginShutdownHook = null executorPlugin = null executorContext = null extraConf = null diff --git a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala index 729fcecff1207..5c09a1f965b9e 100644 --- a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala +++ b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.io import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.util.Locale import com.google.common.io.ByteStreams @@ -160,4 +161,18 @@ class CompressionCodecSuite extends SparkFunSuite { ByteStreams.readFully(concatenatedBytes, decompressed) assert(decompressed.toSeq === (0 to 127)) } + + test("SPARK-48506: CompressionCodec getShortName is case insensitive for short names") { + CompressionCodec.shortCompressionCodecNames.foreach { case (shortName, codecClass) => + assert(CompressionCodec.getShortName(shortName) === shortName) + assert(CompressionCodec.getShortName(shortName.toUpperCase(Locale.ROOT)) === shortName) + assert(CompressionCodec.getShortName(codecClass) === shortName) + checkError( + exception = intercept[SparkIllegalArgumentException] { + CompressionCodec.getShortName(codecClass.toUpperCase(Locale.ROOT)) + }, + errorClass = "CODEC_SHORT_NAME_NOT_FOUND", + parameters = Map("codecName" -> codecClass.toUpperCase(Locale.ROOT))) + } + } } diff --git a/core/src/test/scala/org/apache/spark/io/LZFBenchmark.scala b/core/src/test/scala/org/apache/spark/io/LZFBenchmark.scala new file mode 100644 index 0000000000000..1934bd5169703 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/io/LZFBenchmark.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.io + +import java.io.{ByteArrayOutputStream, ObjectOutputStream} +import java.lang.management.ManagementFactory + +import org.apache.spark.SparkConf +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.internal.config.IO_COMPRESSION_LZF_PARALLEL + +/** + * Benchmark for LZF codec performance. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class <this class> <spark core test jar> + * 2. build/sbt "core/Test/runMain <this class>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "core/Test/runMain <this class>" + * Results will be written to "benchmarks/LZFBenchmark-results.txt". + * }}} + */ +object LZFBenchmark extends BenchmarkBase { + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("Benchmark LZFCompressionCodec") { + compressSmallObjects() + compressLargeObjects() + } + } + + private def compressSmallObjects(): Unit = { + val N = 256_000_000 + val benchmark = new Benchmark("Compress small objects", N, output = output) + Seq(true, false).foreach { parallel => + val conf = new SparkConf(false).set(IO_COMPRESSION_LZF_PARALLEL, parallel) + val condition = if (parallel) "in parallel" else "single-threaded" + benchmark.addCase(s"Compression $N int values $condition") { _ => + val os = new LZFCompressionCodec(conf).compressedOutputStream(new ByteArrayOutputStream()) + for (i <- 1 until N) { + os.write(i) + } + os.close() + } + } + benchmark.run() + } + + private def compressLargeObjects(): Unit = { + val N = 1024 + val data: Array[Byte] = (1 until 128 * 1024 * 1024).map(_.toByte).toArray + val benchmark = new Benchmark(s"Compress large objects", N, output = output) + + // com.ning.compress.lzf.parallel.PLZFOutputStream.getNThreads + def getNThreads: Int = { + var nThreads = Runtime.getRuntime.availableProcessors + val jmx = ManagementFactory.getOperatingSystemMXBean + if (jmx != null) { + val loadAverage = jmx.getSystemLoadAverage.toInt + if (nThreads > 1 && loadAverage >= 1) nThreads = Math.max(1, nThreads - loadAverage) + } + nThreads + } + Seq(true, false).foreach { parallel => + val conf = new SparkConf(false).set(IO_COMPRESSION_LZF_PARALLEL, parallel) + val condition = if (parallel) s"in $getNThreads threads" else "single-threaded" + benchmark.addCase(s"Compression $N array values $condition") { _ => + val baos = new ByteArrayOutputStream() + val zcos = new LZFCompressionCodec(conf).compressedOutputStream(baos) + val oos = new ObjectOutputStream(zcos) + 1 to N foreach { _ => + oos.writeObject(data) + } + oos.close() + } + } + benchmark.run() + } +} diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index c94e2d76b9f8c..7c5db914cd5ba 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -914,6 +914,22 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(c.cartesian[Int](a).collect().toList.sorted === c_cartesian_a) } + test("SPARK-48656: number of cartesian partitions overflow") { + val numSlices: Int = 65536 + val rdd1 = sc.parallelize(Seq(1, 2, 3), numSlices = numSlices) + val rdd2 = sc.parallelize(Seq(1, 2, 3), numSlices = numSlices) + checkError( + exception = intercept[SparkIllegalArgumentException] { + rdd1.cartesian(rdd2).partitions + }, + errorClass = "COLLECTION_SIZE_LIMIT_EXCEEDED.INITIALIZE", + sqlState = "54000", + parameters = Map( + "numberOfElements" -> (numSlices.toLong *
numSlices.toLong).toString, + "maxRoundedArrayLength" -> Int.MaxValue.toString) + ) + } + test("intersection") { val all = sc.parallelize(1 to 10) val evens = sc.parallelize(2 to 10 by 2) @@ -1317,7 +1333,9 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val thrown = intercept[IllegalStateException] { block } - assert(thrown.getMessage.contains("stopped")) + assert(thrown.getMessage.contains("Cannot call methods on a stopped SparkContext")) + assert(thrown.getMessage.contains("This stopped SparkContext was created at:")) + assert(thrown.getMessage.contains("And it was stopped at:")) } assertFails { sc.parallelize(1 to 100) } assertFails { sc.textFile("/nonexistent-path") } diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala index fcacd223814c7..ecc91560714d1 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala @@ -19,8 +19,9 @@ package org.apache.spark.scheduler import org.apache.hadoop.mapred.{FileOutputCommitter, TaskAttemptContext} import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} +import org.scalatest.time.{Seconds, Span} -import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite, TaskContext} +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite, TaskContext} /** * Integration tests for the OutputCommitCoordinator. @@ -44,15 +45,13 @@ class OutputCommitCoordinatorIntegrationSuite sc = new SparkContext("local[2, 4]", "test", conf) } - test("SPARK-39195: exception thrown in OutputCommitter.commitTask()") { + test("exception thrown in OutputCommitter.commitTask()") { // Regression test for SPARK-10381 - val e = intercept[SparkException] { + failAfter(Span(60, Seconds)) { withTempDir { tempDir => sc.parallelize(1 to 4, 2).map(_.toString).saveAsTextFile(tempDir.getAbsolutePath + "/out") } - }.getCause.getMessage - assert(e.contains("failed; but task commit success, data duplication may happen.") && - e.contains("Intentional exception")) + } } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index f1a4b97c2981d..46b95177e7719 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -87,12 +87,11 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { isLocal: Boolean, listenerBus: LiveListenerBus): SparkEnv = { outputCommitCoordinator = - spy[OutputCommitCoordinator]( - new OutputCommitCoordinator(conf, isDriver = true, Option(this))) + spy[OutputCommitCoordinator](new OutputCommitCoordinator(conf, isDriver = true)) // Use Mockito.spy() to maintain the default infrastructure everywhere else. // This mocking allows us to control the coordinator responses in test cases. 
SparkEnv.createDriverEnv(conf, isLocal, listenerBus, - SparkContext.numDriverCores(master), this, Some(outputCommitCoordinator)) + SparkContext.numDriverCores(master), Some(outputCommitCoordinator)) } } // Use Mockito.spy() to maintain the default infrastructure everywhere else @@ -190,9 +189,12 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { // The authorized committer now fails, clearing the lock outputCommitCoordinator.taskCompleted(stage, stageAttempt, partition, attemptNumber = authorizedCommitter, reason = TaskKilled("test")) - // A new task should not be allowed to become stage failed because of potential data duplication - assert(!outputCommitCoordinator.canCommit(stage, stageAttempt, partition, + // A new task should now be allowed to become the authorized committer + assert(outputCommitCoordinator.canCommit(stage, stageAttempt, partition, nonAuthorizedCommitter + 2)) + // There can only be one authorized committer + assert(!outputCommitCoordinator.canCommit(stage, stageAttempt, partition, + nonAuthorizedCommitter + 3)) } test("SPARK-19631: Do not allow failed attempts to be authorized for committing") { @@ -226,8 +228,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { assert(outputCommitCoordinator.canCommit(stage, 2, partition, taskAttempt)) // Commit the 1st attempt, fail the 2nd attempt, make sure 3rd attempt cannot commit, - // then fail the 1st attempt and since stage failed because of potential data duplication, - // make sure fail the 4th attempt. + // then fail the 1st attempt and make sure the 4th one can commit again. stage += 1 outputCommitCoordinator.stageStart(stage, maxPartitionId = 1) assert(outputCommitCoordinator.canCommit(stage, 1, partition, taskAttempt)) @@ -236,9 +237,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { assert(!outputCommitCoordinator.canCommit(stage, 3, partition, taskAttempt)) outputCommitCoordinator.taskCompleted(stage, 1, partition, taskAttempt, ExecutorLostFailure("0", exitCausedByApp = true, None)) - // A new task should not be allowed to become the authorized committer since stage failed - // because of potential data duplication - assert(!outputCommitCoordinator.canCommit(stage, 4, partition, taskAttempt)) + assert(outputCommitCoordinator.canCommit(stage, 4, partition, taskAttempt)) } test("SPARK-24589: Make sure stage state is cleaned up") { diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala index 9e52b5e15143b..99402abb16cac 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala @@ -85,6 +85,7 @@ class SortShuffleWriterSuite shuffleHandle, mapId = 1, context, + context.taskMetrics().shuffleWriteMetrics, shuffleExecutorComponents) writer.write(Iterator.empty) writer.stop(success = true) @@ -102,6 +103,7 @@ class SortShuffleWriterSuite shuffleHandle, mapId = 2, context, + context.taskMetrics().shuffleWriteMetrics, shuffleExecutorComponents) writer.write(records.iterator) writer.stop(success = true) @@ -158,6 +160,7 @@ class SortShuffleWriterSuite shuffleHandle, mapId = 0, context, + context.taskMetrics().shuffleWriteMetrics, new LocalDiskShuffleExecutorComponents( conf, shuffleBlockResolver._blockManager, shuffleBlockResolver)) writer.write(records.iterator) diff --git 
a/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala index 3db7527262568..7ab2cb864234f 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala @@ -71,7 +71,7 @@ class LocalDiskShuffleMapOutputWriterSuite extends SparkFunSuite { partitionSizesInMergedFile = null conf = new SparkConf() .set("spark.app.id", "example.spark.app") - .set("spark.shuffle.unsafe.file.output.buffer", "16k") + .set("spark.shuffle.localDisk.file.output.buffer", "16k") when(blockResolver.getDataFile(anyInt, anyLong)).thenReturn(mergedOutputFile) when(blockResolver.createTempFile(any(classOf[File]))) .thenAnswer { invocationOnMock => diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala index ba665600a1cb7..febe1ac4bb4cf 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala @@ -17,12 +17,14 @@ package org.apache.spark.storage +import java.io.File import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue, Semaphore, TimeUnit} import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.jdk.CollectionConverters._ +import org.apache.commons.io.FileUtils import org.scalatest.concurrent.Eventually import org.apache.spark._ @@ -353,4 +355,78 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS import scala.language.reflectiveCalls assert(listener.removeReasonValidated) } + + test("SPARK-46957: Migrated shuffle files should be able to cleanup from executor") { + + val sparkTempDir = System.getProperty("java.io.tmpdir") + + def shuffleFiles: Seq[File] = { + FileUtils + .listFiles(new File(sparkTempDir), Array("data", "index"), true) + .asScala + .toSeq + } + + val existingShuffleFiles = shuffleFiles + + val conf = new SparkConf() + .setAppName("SPARK-46957") + .setMaster("local-cluster[2,1,1024]") + .set(config.DECOMMISSION_ENABLED, true) + .set(config.STORAGE_DECOMMISSION_ENABLED, true) + .set(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED, true) + sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) + val shuffleBlockUpdates = new ArrayBuffer[BlockId]() + var isDecommissionedExecutorRemoved = false + val execToDecommission = sc.getExecutorIds().head + sc.addSparkListener(new SparkListener { + override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { + if (blockUpdated.blockUpdatedInfo.blockId.isShuffle) { + shuffleBlockUpdates += blockUpdated.blockUpdatedInfo.blockId + } + } + + override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { + assert(execToDecommission === executorRemoved.executorId) + isDecommissionedExecutorRemoved = true + } + }) + + // Run a job to create shuffle data + val result = sc.parallelize(1 to 1000, 10) + .map { i => (i % 2, i) } + .reduceByKey(_ + _).collect() + + assert(result.head === (0, 250500)) + assert(result.tail.head === (1, 250000)) + sc.schedulerBackend + .asInstanceOf[StandaloneSchedulerBackend] + .decommissionExecutor( + execToDecommission, + 
ExecutorDecommissionInfo("test", None), + adjustTargetNumExecutors = true + ) + + eventually(timeout(1.minute), interval(10.milliseconds)) { + assert(isDecommissionedExecutorRemoved) + // Ensure there are shuffle data have been migrated + assert(shuffleBlockUpdates.size >= 2) + } + + val shuffleId = shuffleBlockUpdates + .find(_.isInstanceOf[ShuffleIndexBlockId]) + .map(_.asInstanceOf[ShuffleIndexBlockId].shuffleId) + .get + + val newShuffleFiles = shuffleFiles.diff(existingShuffleFiles) + assert(newShuffleFiles.size >= shuffleBlockUpdates.size) + + // Remove the shuffle data + sc.shuffleDriverComponents.removeShuffle(shuffleId, true) + + eventually(timeout(1.minute), interval(10.milliseconds)) { + assert(newShuffleFiles.intersect(shuffleFiles).isEmpty) + } + } } diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala index 70a57eed07acd..4352436c872fe 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala @@ -16,11 +16,14 @@ */ package org.apache.spark.storage -import java.io.File +import java.io.{File, InputStream, OutputStream} +import java.nio.ByteBuffer + +import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.serializer.{JavaSerializer, SerializerManager} +import org.apache.spark.serializer.{DeserializationStream, JavaSerializer, SerializationStream, Serializer, SerializerInstance, SerializerManager} import org.apache.spark.util.Utils class DiskBlockObjectWriterSuite extends SparkFunSuite { @@ -43,10 +46,14 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite { private def createWriter(): (DiskBlockObjectWriter, File, ShuffleWriteMetrics) = { val file = new File(tempDir, "somefile") val conf = new SparkConf() - val serializerManager = new SerializerManager(new JavaSerializer(conf), conf) + val serializerManager = new CustomSerializerManager(new JavaSerializer(conf), conf, None) val writeMetrics = new ShuffleWriteMetrics() val writer = new DiskBlockObjectWriter( - file, serializerManager, new JavaSerializer(new SparkConf()).newInstance(), 1024, true, + file, + serializerManager, + new CustomJavaSerializer(new SparkConf()).newInstance(), + 1024, + true, writeMetrics) (writer, file, writeMetrics) } @@ -196,9 +203,76 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite { for (i <- 1 to 500) { writer.write(i, i) } + + val bs = writer.getSerializerWrappedStream.asInstanceOf[OutputStreamWithCloseDetecting] + val objOut = writer.getSerializationStream.asInstanceOf[SerializationStreamWithCloseDetecting] + writer.closeAndDelete() assert(!file.exists()) assert(writeMetrics.bytesWritten == 0) assert(writeMetrics.recordsWritten == 0) + assert(bs.isClosed) + assert(objOut.isClosed) + } +} + +trait CloseDetecting { + var isClosed = false +} + +class OutputStreamWithCloseDetecting(outputStream: OutputStream) + extends OutputStream + with CloseDetecting { + override def write(b: Int): Unit = outputStream.write(b) + + override def close(): Unit = { + isClosed = true + outputStream.close() + } +} + +class CustomSerializerManager( + defaultSerializer: Serializer, + conf: SparkConf, + encryptionKey: Option[Array[Byte]]) + extends SerializerManager(defaultSerializer, conf, encryptionKey) { + override def wrapStream(blockId: BlockId, s: 
OutputStream): OutputStream = { + new OutputStreamWithCloseDetecting(wrapForCompression(blockId, wrapForEncryption(s))) + } +} + +class CustomJavaSerializer(conf: SparkConf) extends JavaSerializer(conf) { + + override def newInstance(): SerializerInstance = { + new CustomJavaSerializerInstance(super.newInstance()) } } + +class SerializationStreamWithCloseDetecting(serializationStream: SerializationStream) + extends SerializationStream with CloseDetecting { + + override def close(): Unit = { + isClosed = true + serializationStream.close() + } + + override def writeObject[T: ClassTag](t: T): SerializationStream = + serializationStream.writeObject(t) + + override def flush(): Unit = serializationStream.flush() +} + +class CustomJavaSerializerInstance(instance: SerializerInstance) extends SerializerInstance { + override def serializeStream(s: OutputStream): SerializationStream = + new SerializationStreamWithCloseDetecting(instance.serializeStream(s)) + + override def serialize[T: ClassTag](t: T): ByteBuffer = instance.serialize(t) + + override def deserialize[T: ClassTag](bytes: ByteBuffer): T = instance.deserialize(bytes) + + override def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T = + instance.deserialize(bytes, loader) + + override def deserializeStream(s: InputStream): DeserializationStream = + instance.deserializeStream(s) +} diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index 02900e14b1f67..ca77d2c7b7097 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -392,9 +392,11 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT configureMockTransfer(Map()) val iterator = createShuffleBlockIteratorWithDefaults( - Map(hostLocalBmId -> toBlockList(hostLocalBlocks.keys, 1L, 1)) + Map(hostLocalBmId -> toBlockList(hostLocalBlocks.keys, 1L, 1)), + blockManager = Some(blockManager) ) intercept[FetchFailedException] { iterator.next() } + verify(mockExternalBlockStoreClient, times(1)).getHostLocalDirs(any(), any(), any(), any()) } test("Hit maxBytesInFlight limitation before maxBlocksInFlightPerAddress") { diff --git a/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala b/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala index 9e23b25493dfe..4843409661554 100644 --- a/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.util.Try import org.apache.spark.SparkFunSuite +import org.apache.spark.executor.{ExecutorExitCode, KilledByTaskReaperException} class SparkUncaughtExceptionHandlerSuite extends SparkFunSuite { @@ -33,6 +34,8 @@ class SparkUncaughtExceptionHandlerSuite extends SparkFunSuite { (ThrowableTypes.RuntimeException, false, 0), (ThrowableTypes.OutOfMemoryError, true, SparkExitCode.OOM), (ThrowableTypes.OutOfMemoryError, false, SparkExitCode.OOM), + (ThrowableTypes.KilledByTaskReaperException, true, ExecutorExitCode.KILLED_BY_TASK_REAPER), + (ThrowableTypes.KilledByTaskReaperException, false, 0), (ThrowableTypes.SparkFatalRuntimeException, true, SparkExitCode.UNCAUGHT_EXCEPTION), (ThrowableTypes.SparkFatalRuntimeException, false, 0), 
(ThrowableTypes.SparkFatalOutOfMemoryError, true, SparkExitCode.OOM), @@ -64,6 +67,8 @@ object ThrowableTypes extends Enumeration { val RuntimeException = ThrowableTypesVal("RuntimeException", new RuntimeException) val OutOfMemoryError = ThrowableTypesVal("OutOfMemoryError", new OutOfMemoryError) + val KilledByTaskReaperException = ThrowableTypesVal("KilledByTaskReaperException", + new KilledByTaskReaperException("dummy message")) val SparkFatalRuntimeException = ThrowableTypesVal("SparkFatalException(RuntimeException)", new SparkFatalException(new RuntimeException)) val SparkFatalOutOfMemoryError = ThrowableTypesVal("SparkFatalException(OutOfMemoryError)", diff --git a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala index d907fe1a27c83..04f661db691e5 100644 --- a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala @@ -119,11 +119,46 @@ class ThreadUtilsSuite extends SparkFunSuite { runInNewThread("thread-name") { throw new IllegalArgumentException(uniqueExceptionMessage) } } assert(exception.getMessage === uniqueExceptionMessage) - assert(exception.getStackTrace.mkString("\n").contains( + val stacktrace = exception.getStackTrace.mkString("\n") + assert(stacktrace.contains( "... run in separate thread using org.apache.spark.util.ThreadUtils ..."), "stack trace does not contain expected place holder" ) - assert(exception.getStackTrace.mkString("\n").contains("ThreadUtils.scala") === false, + assert(!stacktrace.contains("ThreadUtils.scala"), + "stack trace contains unexpected references to ThreadUtils" + ) + } + + test("SPARK-47833: wrapCallerStacktrace") { + var runnerThreadName: String = null + var exception: Throwable = null + val t = new Thread() { + override def run(): Unit = { + runnerThreadName = Thread.currentThread().getName + internalMethod() + } + private def internalMethod(): Unit = { + throw new RuntimeException(s"Error occurred on $runnerThreadName") + } + } + t.setDaemon(true) + t.setUncaughtExceptionHandler { case (_, e) => exception = e } + t.start() + t.join() + + ThreadUtils.wrapCallerStacktrace(exception, s"run in separate thread: $runnerThreadName") + + val stacktrace = exception.getStackTrace.mkString("\n") + assert(stacktrace.contains("internalMethod"), + "stack trace does not contain real exception stack trace" + ) + assert(stacktrace.contains(s"... 
run in separate thread: $runnerThreadName ..."), + "stack trace does not contain expected place holder" + ) + assert(stacktrace.contains("org.scalatest.Suite.run"), + "stack trace does not contain caller stack trace" + ) + assert(!stacktrace.contains("ThreadUtils.scala"), "stack trace contains unexpected references to ThreadUtils" ) } diff --git a/dev/.scalafmt.conf b/dev/.scalafmt.conf index 9a01136dfaf88..bb16145f4df7d 100644 --- a/dev/.scalafmt.conf +++ b/dev/.scalafmt.conf @@ -27,4 +27,4 @@ danglingParentheses.preset = false docstrings.style = Asterisk maxColumn = 98 runner.dialect = scala213 -version = 3.8.0 +version = 3.8.2 diff --git a/dev/checkstyle-suppressions.xml b/dev/checkstyle-suppressions.xml index 834265f48aa8c..677381704427c 100644 --- a/dev/checkstyle-suppressions.xml +++ b/dev/checkstyle-suppressions.xml @@ -64,4 +64,8 @@ files="src/main/java/org/apache/spark/sql/api/java/*"/> + + diff --git a/dev/checkstyle.xml b/dev/checkstyle.xml index 7add947428160..c4023a84ee3cf 100644 --- a/dev/checkstyle.xml +++ b/dev/checkstyle.xml @@ -71,6 +71,12 @@ + + + + + + + + + + diff --git a/dev/connect-gen-protos.sh b/dev/connect-gen-protos.sh index 9ae3bac45933d..33b06167d67cc 100755 --- a/dev/connect-gen-protos.sh +++ b/dev/connect-gen-protos.sh @@ -76,7 +76,7 @@ for f in `find gen/proto/python -name "*.py*"`; do sed -e "s/DESCRIPTOR, 'spark.connect/DESCRIPTOR, 'pyspark.sql.connect.proto/g" $f > $f.tmp mv $f.tmp $f elif [[ $f == *.pyi ]]; then - sed -e 's/import spark.connect./import pyspark.sql.connect.proto./g' -e 's/spark.connect./pyspark.sql.connect.proto./g' $f > $f.tmp + sed -e 's/import spark.connect./import pyspark.sql.connect.proto./g' -e 's/spark.connect./pyspark.sql.connect.proto./g' -e '/ *@typing_extensions\.final/d' $f > $f.tmp mv $f.tmp $f fi diff --git a/dev/create-release/do-release-docker.sh b/dev/create-release/do-release-docker.sh index c44d0193069b8..132f6b78c3db6 100755 --- a/dev/create-release/do-release-docker.sh +++ b/dev/create-release/do-release-docker.sh @@ -84,8 +84,8 @@ if [ ! -z "$RELEASE_STEP" ] && [ "$RELEASE_STEP" = "finalize" ]; then error "Exiting." fi - if [ -z "$PYPI_PASSWORD" ]; then - stty -echo && printf "PyPi password: " && read PYPI_PASSWORD && printf '\n' && stty echo + if [ -z "$PYPI_API_TOKEN" ]; then + stty -echo && printf "PyPi API token: " && read PYPI_API_TOKEN && printf '\n' && stty echo fi fi @@ -142,7 +142,7 @@ GIT_NAME=$GIT_NAME GIT_EMAIL=$GIT_EMAIL GPG_KEY=$GPG_KEY ASF_PASSWORD=$ASF_PASSWORD -PYPI_PASSWORD=$PYPI_PASSWORD +PYPI_API_TOKEN=$PYPI_API_TOKEN GPG_PASSPHRASE=$GPG_PASSPHRASE RELEASE_STEP=$RELEASE_STEP USER=$USER diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 62d172ef74ca4..19589b951a6e1 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -80,6 +80,9 @@ done export LC_ALL=C.UTF-8 export LANG=C.UTF-8 +export PYSPARK_PYTHON=/usr/local/bin/python +export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python + # Commit ref to checkout when building GIT_REF=${GIT_REF:-master} @@ -95,8 +98,8 @@ init_java init_maven_sbt if [[ "$1" == "finalize" ]]; then - if [[ -z "$PYPI_PASSWORD" ]]; then - error 'The environment variable PYPI_PASSWORD is not set. Exiting.' + if [[ -z "$PYPI_API_TOKEN" ]]; then + error 'The environment variable PYPI_API_TOKEN is not set. Exiting.' 
fi git config --global user.name "$GIT_NAME" @@ -104,31 +107,36 @@ if [[ "$1" == "finalize" ]]; then # Create the git tag for the new release echo "Creating the git tag for the new release" - rm -rf spark - git clone "https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO" -b master - cd spark - git tag "v$RELEASE_VERSION" "$RELEASE_TAG" - git push origin "v$RELEASE_VERSION" - cd .. - rm -rf spark - echo "git tag v$RELEASE_VERSION created" + if check_for_tag "v$RELEASE_VERSION"; then + echo "v$RELEASE_VERSION already exists. Skip creating it." + else + rm -rf spark + git clone "https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO" -b master + cd spark + git tag "v$RELEASE_VERSION" "$RELEASE_TAG" + git push origin "v$RELEASE_VERSION" + cd .. + rm -rf spark + echo "git tag v$RELEASE_VERSION created" + fi # download PySpark binary from the dev directory and upload to PyPi. echo "Uploading PySpark to PyPi" svn co --depth=empty "$RELEASE_STAGING_LOCATION/$RELEASE_TAG-bin" svn-spark cd svn-spark - svn update "pyspark-$RELEASE_VERSION.tar.gz" - svn update "pyspark-$RELEASE_VERSION.tar.gz.asc" - TWINE_USERNAME=spark-upload TWINE_PASSWORD="$PYPI_PASSWORD" twine upload \ + PYSPARK_VERSION=`echo "$RELEASE_VERSION" | sed -e "s/-/./" -e "s/preview/dev/"` + svn update "pyspark-$PYSPARK_VERSION.tar.gz" + svn update "pyspark-$PYSPARK_VERSION.tar.gz.asc" + twine upload -u __token__ -p $PYPI_API_TOKEN \ --repository-url https://upload.pypi.org/legacy/ \ - "pyspark-$RELEASE_VERSION.tar.gz" \ - "pyspark-$RELEASE_VERSION.tar.gz.asc" - svn update "pyspark-connect-$RELEASE_VERSION.tar.gz" - svn update "pyspark-connect-$RELEASE_VERSION.tar.gz.asc" - TWINE_USERNAME=spark-upload TWINE_PASSWORD="$PYPI_PASSWORD" twine upload \ + "pyspark-$PYSPARK_VERSION.tar.gz" \ + "pyspark-$PYSPARK_VERSION.tar.gz.asc" + svn update "pyspark_connect-$PYSPARK_VERSION.tar.gz" + svn update "pyspark_connect-$PYSPARK_VERSION.tar.gz.asc" + twine upload -u __token__ -p $PYPI_API_TOKEN \ --repository-url https://upload.pypi.org/legacy/ \ - "pyspark-connect-$RELEASE_VERSION.tar.gz" \ - "pyspark-connect-$RELEASE_VERSION.tar.gz.asc" + "pyspark_connect-$PYSPARK_VERSION.tar.gz" \ + "pyspark_connect-$PYSPARK_VERSION.tar.gz.asc" cd .. rm -rf svn-spark echo "PySpark uploaded" @@ -194,6 +202,8 @@ fi PUBLISH_SCALA_2_12=1 if [[ $SPARK_VERSION > "3.5.99" ]]; then PUBLISH_SCALA_2_12=0 + # There is no longer scala-2.13 profile since 4.0.0 + SCALA_2_13_PROFILES="" fi SCALA_2_12_PROFILES="-Pscala-2.12" @@ -201,7 +211,7 @@ SCALA_2_12_PROFILES="-Pscala-2.12" HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central # We use Apache Hive 2.3 for publishing -PUBLISH_PROFILES="$BASE_PROFILES $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl -Phadoop-cloud" +PUBLISH_PROFILES="$BASE_PROFILES $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl -Phadoop-cloud -Pjvm-profiler" # Profiles for building binary releases BASE_RELEASE_PROFILES="$BASE_PROFILES -Psparkr" @@ -309,7 +319,7 @@ if [[ "$1" == "package" ]]; then --detach-sig $PYTHON_DIST_NAME shasum -a 512 $PYTHON_DIST_NAME > $PYTHON_DIST_NAME.sha512 - PYTHON_CONNECT_DIST_NAME=pyspark-connect-$PYSPARK_VERSION.tar.gz + PYTHON_CONNECT_DIST_NAME=pyspark_connect-$PYSPARK_VERSION.tar.gz cp spark-$SPARK_VERSION-bin-$NAME/python/dist/$PYTHON_CONNECT_DIST_NAME . 
echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ @@ -345,21 +355,25 @@ if [[ "$1" == "package" ]]; then declare -A BINARY_PKGS_EXTRA BINARY_PKGS_EXTRA["hadoop3"]="withpip,withr" - if [[ $PUBLISH_SCALA_2_13 = 1 ]]; then - key="hadoop3-scala2.13" + # This is dead code as Scala 2.12 is no longer supported, but we keep it as a template for + # adding new Scala version support in the future. This secondary Scala version only has one + # binary package to avoid doubling the number of final packages. It doesn't build PySpark and + # SparkR as the primary Scala version will build them. + if [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + key="hadoop3-scala2.12" args="-Phadoop-3 $HIVE_PROFILES" extra="" - if ! make_binary_release "$key" "$SCALA_2_13_PROFILES $args" "$extra" "2.13"; then + if ! make_binary_release "$key" "$SCALA_2_12_PROFILES $args" "$extra" "2.12"; then error "Failed to build $key package. Check logs for details." fi fi - if [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + if [[ $PUBLISH_SCALA_2_13 = 1 ]]; then echo "Packages to build: ${!BINARY_PKGS_ARGS[@]}" for key in ${!BINARY_PKGS_ARGS[@]}; do args=${BINARY_PKGS_ARGS[$key]} extra=${BINARY_PKGS_EXTRA[$key]} - if ! make_binary_release "$key" "$SCALA_2_12_PROFILES $args" "$extra" "2.12"; then + if ! make_binary_release "$key" "$SCALA_2_13_PROFILES $args" "$extra" "2.13"; then error "Failed to build $key package. Check logs for details." fi done @@ -374,8 +388,8 @@ if [[ "$1" == "package" ]]; then echo "Copying release tarballs" cp spark-* "svn-spark/${DEST_DIR_NAME}-bin/" - cp pyspark-* "svn-spark/${DEST_DIR_NAME}-bin/" - cp SparkR_* "svn-spark/${DEST_DIR_NAME}-bin/" + cp pyspark* "svn-spark/${DEST_DIR_NAME}-bin/" + cp SparkR* "svn-spark/${DEST_DIR_NAME}-bin/" svn add "svn-spark/${DEST_DIR_NAME}-bin" cd svn-spark diff --git a/dev/create-release/release-util.sh b/dev/create-release/release-util.sh index 0394fb49c2fa0..b5edbf40d487d 100755 --- a/dev/create-release/release-util.sh +++ b/dev/create-release/release-util.sh @@ -128,6 +128,9 @@ function get_release_info { RC_COUNT=1 fi + if [ "$GIT_BRANCH" = "master" ]; then + RELEASE_VERSION="$RELEASE_VERSION-preview1" + fi export NEXT_VERSION export RELEASE_VERSION=$(read_config "Release" "$RELEASE_VERSION") diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index f51b24d583947..ca9e10bebfc53 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -15,74 +15,123 @@ # limitations under the License. # -# Image for building Spark releases. Based on Ubuntu 20.04. -# -# Includes: -# * Java 17 -# * Ivy -# * Python (3.8.5) -# * R-base/R-base-dev (4.0.3) -# * Ruby (2.7.0) -# -# You can test it as below: -# cd dev/create-release/spark-rm -# docker build -t spark-rm --build-arg UID=$UID . +# Image for building Spark releases. Based on Ubuntu 22.04. +FROM ubuntu:jammy-20240227 -FROM ubuntu:20.04 +ENV FULL_REFRESH_DATE 20240318 -# For apt to be noninteractive ENV DEBIAN_FRONTEND noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN true -# These arguments are just for reuse and not really meant to be customized. 
-ARG APT_INSTALL="apt-get install --no-install-recommends -y" +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + subversion \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + nodejs \ + npm \ + openjdk-17-jdk-headless \ + pandoc \ + pkg-config \ + python3.10 \ + python3-psutil \ + texlive-latex-base \ + texlive \ + texlive-fonts-extra \ + texinfo \ + texlive-latex-extra \ + qpdf \ + jq \ + r-base \ + ruby \ + ruby-dev \ + software-properties-common \ + wget \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* -ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==10.0.1 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" -ARG GEM_PKGS="bundler:2.3.8" -# Install extra needed repos and refresh. -# - CRAN repo -# - Ruby repo (for doc generation) -# -# This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch -# the most current package versions (instead of potentially using old versions cached by docker). -RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ - echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list && \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ - gpg -a --export E084DAB9 | apt-key add - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - apt-get clean && \ - apt-get update && \ - $APT_INSTALL software-properties-common && \ - apt-get update && \ - # Install openjdk 17. - $APT_INSTALL openjdk-17-jdk && \ - update-alternatives --set java $(ls /usr/lib/jvm/java-17-openjdk-*/bin/java) && \ - # Install build / source control tools - $APT_INSTALL curl wget git maven ivy subversion make gcc lsof libffi-dev \ - pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev && \ - curl -sL https://deb.nodesource.com/setup_12.x | bash && \ - $APT_INSTALL nodejs && \ - # Install needed python packages. Use pip for installing packages (for consistency). - $APT_INSTALL python-is-python3 python3-pip python3-setuptools && \ - # qpdf is required for CRAN checks to pass. - $APT_INSTALL qpdf jq && \ - pip3 install $PIP_PKGS && \ - # Install R packages and dependencies used when building. - # R depends on pandoc*, libssl (which are installed above). 
- # Note that PySpark doc generation also needs pandoc due to nbsphinx - $APT_INSTALL r-base r-base-dev && \ - $APT_INSTALL libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev && \ - $APT_INSTALL texlive-latex-base texlive texlive-fonts-extra texinfo qpdf texlive-latex-extra && \ - $APT_INSTALL libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev && \ - Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" && \ - Rscript -e "devtools::install_github('jimhester/lintr')" && \ - Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ - Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" && \ - # Install tools needed to build the documentation. - $APT_INSTALL ruby2.7 ruby2.7-dev && \ - gem install --no-document $GEM_PKGS +RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list +RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN gpg -a --export E084DAB9 | apt-key add - +RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' + +# See more in SPARK-39959, roxygen2 < 7.2.1 +RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \ + 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', \ + 'ggplot2', 'mvtnorm', 'statmod', 'xml2'), repos='https://cloud.r-project.org/')" && \ + Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" + +# See more in SPARK-39735 +ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" + + +RUN add-apt-repository ppa:pypy/ppa +RUN mkdir -p /usr/local/pypy/pypy3.9 && \ + curl -sqL https://downloads.python.org/pypy/pypy3.9-v7.3.16-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.9 --strip-components=1 && \ + ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.8 && \ + ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.2' scipy coverage matplotlib lxml + + +ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.2 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2 twine==3.4.1" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4" + +# Install Python 3.10 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +RUN python3.10 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.10 -m pip install --ignore-installed 'six==1.16.0' # Avoid `python3-six` installation +RUN python3.10 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.10 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.10 -m pip install deepspeed torcheval 
&& \ + python3.10 -m pip cache purge + +# Install Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.9 python3.9-distutils \ + && rm -rf /var/lib/apt/lists/* +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN python3.9 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.9 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.9 -m pip install torcheval && \ + python3.9 -m pip cache purge + +# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 +# See 'ipython_genutils' in SPARK-38517 +# See 'docutils<0.18.0' in SPARK-39421 +RUN python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ +ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ +'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ +'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ +'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' +RUN python3.9 -m pip list + +RUN gem install --no-document "bundler:2.4.22" +RUN ln -s "$(which python3.9)" "/usr/local/bin/python" WORKDIR /opt/spark-rm/output diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 466e8d09d89ed..5478fbde929db 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -1,10 +1,10 @@ HikariCP/2.5.1//HikariCP-2.5.1.jar JLargeArrays/1.5//JLargeArrays-1.5.jar JTransforms/3.1//JTransforms-3.1.jar -RoaringBitmap/1.0.5//RoaringBitmap-1.0.5.jar +RoaringBitmap/1.1.0//RoaringBitmap-1.1.0.jar ST4/4.0.4//ST4-4.0.4.jar activation/1.1.1//activation-1.1.1.jar -aircompressor/0.26//aircompressor-0.26.jar +aircompressor/0.27//aircompressor-0.27.jar algebra_2.13/2.8.0//algebra_2.13-2.8.0.jar aliyun-java-sdk-core/4.5.10//aliyun-java-sdk-core-4.5.10.jar aliyun-java-sdk-kms/2.11.0//aliyun-java-sdk-kms-2.11.0.jar @@ -16,10 +16,11 @@ antlr4-runtime/4.13.1//antlr4-runtime-4.13.1.jar aopalliance-repackaged/3.0.3//aopalliance-repackaged-3.0.3.jar arpack/3.0.3//arpack-3.0.3.jar arpack_combined_all/0.1//arpack_combined_all-0.1.jar -arrow-format/15.0.2//arrow-format-15.0.2.jar -arrow-memory-core/15.0.2//arrow-memory-core-15.0.2.jar -arrow-memory-netty/15.0.2//arrow-memory-netty-15.0.2.jar -arrow-vector/15.0.2//arrow-vector-15.0.2.jar +arrow-format/16.1.0//arrow-format-16.1.0.jar +arrow-memory-core/16.1.0//arrow-memory-core-16.1.0.jar +arrow-memory-netty-buffer-patch/16.1.0//arrow-memory-netty-buffer-patch-16.1.0.jar +arrow-memory-netty/16.1.0//arrow-memory-netty-16.1.0.jar +arrow-vector/16.1.0//arrow-vector-16.1.0.jar audience-annotations/0.12.0//audience-annotations-0.12.0.jar avro-ipc/1.11.3//avro-ipc-1.11.3.jar avro-mapred/1.11.3//avro-mapred-1.11.3.jar @@ -28,19 +29,19 @@ azure-data-lake-store-sdk/2.3.9//azure-data-lake-store-sdk-2.3.9.jar azure-keyvault-core/1.0.0//azure-keyvault-core-1.0.0.jar azure-storage/7.0.1//azure-storage-7.0.1.jar blas/3.0.3//blas-3.0.3.jar -bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar breeze-macros_2.13/2.1.0//breeze-macros_2.13-2.1.0.jar breeze_2.13/2.1.0//breeze_2.13-2.1.0.jar 
bundle/2.24.6//bundle-2.24.6.jar cats-kernel_2.13/2.8.0//cats-kernel_2.13-2.8.0.jar +checker-qual/3.42.0//checker-qual-3.42.0.jar chill-java/0.10.0//chill-java-0.10.0.jar chill_2.13/0.10.0//chill_2.13-0.10.0.jar -commons-cli/1.6.0//commons-cli-1.6.0.jar -commons-codec/1.16.1//commons-codec-1.16.1.jar +commons-cli/1.8.0//commons-cli-1.8.0.jar +commons-codec/1.17.0//commons-codec-1.17.0.jar commons-collections/3.2.2//commons-collections-3.2.2.jar commons-collections4/4.4//commons-collections4-4.4.jar commons-compiler/3.1.9//commons-compiler-3.1.9.jar -commons-compress/1.26.1//commons-compress-1.26.1.jar +commons-compress/1.26.2//commons-compress-1.26.2.jar commons-crypto/1.1.0//commons-crypto-1.1.0.jar commons-dbcp/1.4//commons-dbcp-1.4.jar commons-io/2.16.1//commons-io-2.16.1.jar @@ -48,22 +49,20 @@ commons-lang/2.6//commons-lang-2.6.jar commons-lang3/3.14.0//commons-lang3-3.14.0.jar commons-math3/3.6.1//commons-math3-3.6.1.jar commons-pool/1.5.4//commons-pool-1.5.4.jar -commons-text/1.11.0//commons-text-1.11.0.jar +commons-text/1.12.0//commons-text-1.12.0.jar compress-lzf/1.1.2//compress-lzf-1.1.2.jar -curator-client/5.6.0//curator-client-5.6.0.jar -curator-framework/5.6.0//curator-framework-5.6.0.jar -curator-recipes/5.6.0//curator-recipes-5.6.0.jar +curator-client/5.7.0//curator-client-5.7.0.jar +curator-framework/5.7.0//curator-framework-5.7.0.jar +curator-recipes/5.7.0//curator-recipes-5.7.0.jar datanucleus-api-jdo/4.2.4//datanucleus-api-jdo-4.2.4.jar datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar -datasketches-java/5.0.1//datasketches-java-5.0.1.jar +datasketches-java/6.0.0//datasketches-java-6.0.0.jar datasketches-memory/2.2.0//datasketches-memory-2.2.0.jar derby/10.16.1.1//derby-10.16.1.1.jar derbyshared/10.16.1.1//derbyshared-10.16.1.1.jar derbytools/10.16.1.1//derbytools-10.16.1.1.jar dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar -eclipse-collections-api/11.1.0//eclipse-collections-api-11.1.0.jar -eclipse-collections/11.1.0//eclipse-collections-11.1.0.jar esdk-obs-java/3.20.4.2//esdk-obs-java-3.20.4.2.jar flatbuffers-java/23.5.26//flatbuffers-java-23.5.26.jar gcs-connector/hadoop3-2.2.21/shaded/gcs-connector-hadoop3-2.2.21-shaded.jar @@ -80,39 +79,39 @@ hadoop-client-runtime/3.4.0//hadoop-client-runtime-3.4.0.jar hadoop-cloud-storage/3.4.0//hadoop-cloud-storage-3.4.0.jar hadoop-huaweicloud/3.4.0//hadoop-huaweicloud-3.4.0.jar hadoop-shaded-guava/1.2.0//hadoop-shaded-guava-1.2.0.jar -hadoop-yarn-server-web-proxy/3.4.0//hadoop-yarn-server-web-proxy-3.4.0.jar -hive-beeline/2.3.9//hive-beeline-2.3.9.jar -hive-cli/2.3.9//hive-cli-2.3.9.jar -hive-common/2.3.9//hive-common-2.3.9.jar -hive-exec/2.3.9/core/hive-exec-2.3.9-core.jar -hive-jdbc/2.3.9//hive-jdbc-2.3.9.jar -hive-llap-common/2.3.9//hive-llap-common-2.3.9.jar -hive-metastore/2.3.9//hive-metastore-2.3.9.jar -hive-serde/2.3.9//hive-serde-2.3.9.jar +hive-beeline/2.3.10//hive-beeline-2.3.10.jar +hive-cli/2.3.10//hive-cli-2.3.10.jar +hive-common/2.3.10//hive-common-2.3.10.jar +hive-exec/2.3.10/core/hive-exec-2.3.10-core.jar +hive-jdbc/2.3.10//hive-jdbc-2.3.10.jar +hive-llap-common/2.3.10//hive-llap-common-2.3.10.jar +hive-metastore/2.3.10//hive-metastore-2.3.10.jar +hive-serde/2.3.10//hive-serde-2.3.10.jar hive-service-rpc/4.0.0//hive-service-rpc-4.0.0.jar -hive-shims-0.23/2.3.9//hive-shims-0.23-2.3.9.jar -hive-shims-common/2.3.9//hive-shims-common-2.3.9.jar -hive-shims-scheduler/2.3.9//hive-shims-scheduler-2.3.9.jar 
-hive-shims/2.3.9//hive-shims-2.3.9.jar +hive-shims-0.23/2.3.10//hive-shims-0.23-2.3.10.jar +hive-shims-common/2.3.10//hive-shims-common-2.3.10.jar +hive-shims-scheduler/2.3.10//hive-shims-scheduler-2.3.10.jar +hive-shims/2.3.10//hive-shims-2.3.10.jar hive-storage-api/2.8.1//hive-storage-api-2.8.1.jar hk2-api/3.0.3//hk2-api-3.0.3.jar hk2-locator/3.0.3//hk2-locator-3.0.3.jar hk2-utils/3.0.3//hk2-utils-3.0.3.jar httpclient/4.5.14//httpclient-4.5.14.jar httpcore/4.4.16//httpcore-4.4.16.jar -icu4j/72.1//icu4j-72.1.jar +icu4j/75.1//icu4j-75.1.jar ini4j/0.5.4//ini4j-0.5.4.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.5.2//ivy-2.5.2.jar -jackson-annotations/2.17.0//jackson-annotations-2.17.0.jar +jackson-annotations/2.17.1//jackson-annotations-2.17.1.jar jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar -jackson-core/2.17.0//jackson-core-2.17.0.jar -jackson-databind/2.17.0//jackson-databind-2.17.0.jar -jackson-dataformat-cbor/2.17.0//jackson-dataformat-cbor-2.17.0.jar -jackson-dataformat-yaml/2.17.0//jackson-dataformat-yaml-2.17.0.jar -jackson-datatype-jsr310/2.17.0//jackson-datatype-jsr310-2.17.0.jar +jackson-core/2.17.1//jackson-core-2.17.1.jar +jackson-databind/2.17.1//jackson-databind-2.17.1.jar +jackson-dataformat-cbor/2.17.1//jackson-dataformat-cbor-2.17.1.jar +jackson-dataformat-yaml/2.17.1//jackson-dataformat-yaml-2.17.1.jar +jackson-datatype-jdk8/2.17.0//jackson-datatype-jdk8-2.17.0.jar +jackson-datatype-jsr310/2.17.1//jackson-datatype-jsr310-2.17.1.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.13/2.17.0//jackson-module-scala_2.13-2.17.0.jar +jackson-module-scala_2.13/2.17.1//jackson-module-scala_2.13-2.17.1.jar jakarta.annotation-api/2.0.0//jakarta.annotation-api-2.0.0.jar jakarta.inject-api/2.0.1//jakarta.inject-api-2.0.1.jar jakarta.servlet-api/5.0.0//jakarta.servlet-api-5.0.0.jar @@ -138,11 +137,11 @@ jersey-container-servlet/3.0.12//jersey-container-servlet-3.0.12.jar jersey-hk2/3.0.12//jersey-hk2-3.0.12.jar jersey-server/3.0.12//jersey-server-3.0.12.jar jettison/1.5.4//jettison-1.5.4.jar -jetty-util-ajax/11.0.20//jetty-util-ajax-11.0.20.jar -jetty-util/11.0.20//jetty-util-11.0.20.jar +jetty-util-ajax/11.0.21//jetty-util-ajax-11.0.21.jar +jetty-util/11.0.21//jetty-util-11.0.21.jar jline/2.14.6//jline-2.14.6.jar -jline/3.24.1//jline-3.24.1.jar -jna/5.13.0//jna-5.13.0.jar +jline/3.25.1//jline-3.25.1.jar +jna/5.14.0//jna-5.14.0.jar joda-time/2.12.7//joda-time-2.12.7.jar jodd-core/3.5.2//jodd-core-3.5.2.jar jpam/1.1//jpam-1.1.jar @@ -156,35 +155,35 @@ jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar jul-to-slf4j/2.0.13//jul-to-slf4j-2.0.13.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client-api/6.12.0//kubernetes-client-api-6.12.0.jar -kubernetes-client/6.12.0//kubernetes-client-6.12.0.jar -kubernetes-httpclient-okhttp/6.12.0//kubernetes-httpclient-okhttp-6.12.0.jar -kubernetes-model-admissionregistration/6.12.0//kubernetes-model-admissionregistration-6.12.0.jar -kubernetes-model-apiextensions/6.12.0//kubernetes-model-apiextensions-6.12.0.jar -kubernetes-model-apps/6.12.0//kubernetes-model-apps-6.12.0.jar -kubernetes-model-autoscaling/6.12.0//kubernetes-model-autoscaling-6.12.0.jar -kubernetes-model-batch/6.12.0//kubernetes-model-batch-6.12.0.jar -kubernetes-model-certificates/6.12.0//kubernetes-model-certificates-6.12.0.jar -kubernetes-model-common/6.12.0//kubernetes-model-common-6.12.0.jar -kubernetes-model-coordination/6.12.0//kubernetes-model-coordination-6.12.0.jar 
-kubernetes-model-core/6.12.0//kubernetes-model-core-6.12.0.jar -kubernetes-model-discovery/6.12.0//kubernetes-model-discovery-6.12.0.jar -kubernetes-model-events/6.12.0//kubernetes-model-events-6.12.0.jar -kubernetes-model-extensions/6.12.0//kubernetes-model-extensions-6.12.0.jar -kubernetes-model-flowcontrol/6.12.0//kubernetes-model-flowcontrol-6.12.0.jar -kubernetes-model-gatewayapi/6.12.0//kubernetes-model-gatewayapi-6.12.0.jar -kubernetes-model-metrics/6.12.0//kubernetes-model-metrics-6.12.0.jar -kubernetes-model-networking/6.12.0//kubernetes-model-networking-6.12.0.jar -kubernetes-model-node/6.12.0//kubernetes-model-node-6.12.0.jar -kubernetes-model-policy/6.12.0//kubernetes-model-policy-6.12.0.jar -kubernetes-model-rbac/6.12.0//kubernetes-model-rbac-6.12.0.jar -kubernetes-model-resource/6.12.0//kubernetes-model-resource-6.12.0.jar -kubernetes-model-scheduling/6.12.0//kubernetes-model-scheduling-6.12.0.jar -kubernetes-model-storageclass/6.12.0//kubernetes-model-storageclass-6.12.0.jar +kubernetes-client-api/6.13.0//kubernetes-client-api-6.13.0.jar +kubernetes-client/6.13.0//kubernetes-client-6.13.0.jar +kubernetes-httpclient-okhttp/6.13.0//kubernetes-httpclient-okhttp-6.13.0.jar +kubernetes-model-admissionregistration/6.13.0//kubernetes-model-admissionregistration-6.13.0.jar +kubernetes-model-apiextensions/6.13.0//kubernetes-model-apiextensions-6.13.0.jar +kubernetes-model-apps/6.13.0//kubernetes-model-apps-6.13.0.jar +kubernetes-model-autoscaling/6.13.0//kubernetes-model-autoscaling-6.13.0.jar +kubernetes-model-batch/6.13.0//kubernetes-model-batch-6.13.0.jar +kubernetes-model-certificates/6.13.0//kubernetes-model-certificates-6.13.0.jar +kubernetes-model-common/6.13.0//kubernetes-model-common-6.13.0.jar +kubernetes-model-coordination/6.13.0//kubernetes-model-coordination-6.13.0.jar +kubernetes-model-core/6.13.0//kubernetes-model-core-6.13.0.jar +kubernetes-model-discovery/6.13.0//kubernetes-model-discovery-6.13.0.jar +kubernetes-model-events/6.13.0//kubernetes-model-events-6.13.0.jar +kubernetes-model-extensions/6.13.0//kubernetes-model-extensions-6.13.0.jar +kubernetes-model-flowcontrol/6.13.0//kubernetes-model-flowcontrol-6.13.0.jar +kubernetes-model-gatewayapi/6.13.0//kubernetes-model-gatewayapi-6.13.0.jar +kubernetes-model-metrics/6.13.0//kubernetes-model-metrics-6.13.0.jar +kubernetes-model-networking/6.13.0//kubernetes-model-networking-6.13.0.jar +kubernetes-model-node/6.13.0//kubernetes-model-node-6.13.0.jar +kubernetes-model-policy/6.13.0//kubernetes-model-policy-6.13.0.jar +kubernetes-model-rbac/6.13.0//kubernetes-model-rbac-6.13.0.jar +kubernetes-model-resource/6.13.0//kubernetes-model-resource-6.13.0.jar +kubernetes-model-scheduling/6.13.0//kubernetes-model-scheduling-6.13.0.jar +kubernetes-model-storageclass/6.13.0//kubernetes-model-storageclass-6.13.0.jar lapack/3.0.3//lapack-3.0.3.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar libfb303/0.9.3//libfb303-0.9.3.jar -libthrift/0.12.0//libthrift-0.12.0.jar +libthrift/0.16.0//libthrift-0.16.0.jar log4j-1.2-api/2.22.1//log4j-1.2-api-2.22.1.jar log4j-api/2.22.1//log4j-api-2.22.1.jar log4j-core/2.22.1//log4j-core-2.22.1.jar @@ -192,38 +191,37 @@ log4j-layout-template-json/2.22.1//log4j-layout-template-json-2.22.1.jar log4j-slf4j2-impl/2.22.1//log4j-slf4j2-impl-2.22.1.jar logging-interceptor/3.12.12//logging-interceptor-3.12.12.jar lz4-java/1.8.0//lz4-java-1.8.0.jar -metrics-core/4.2.25//metrics-core-4.2.25.jar -metrics-graphite/4.2.25//metrics-graphite-4.2.25.jar -metrics-jmx/4.2.25//metrics-jmx-4.2.25.jar 
-metrics-json/4.2.25//metrics-json-4.2.25.jar -metrics-jvm/4.2.25//metrics-jvm-4.2.25.jar +metrics-core/4.2.26//metrics-core-4.2.26.jar +metrics-graphite/4.2.26//metrics-graphite-4.2.26.jar +metrics-jmx/4.2.26//metrics-jmx-4.2.26.jar +metrics-json/4.2.26//metrics-json-4.2.26.jar +metrics-jvm/4.2.26//metrics-jvm-4.2.26.jar minlog/1.3.0//minlog-1.3.0.jar -netty-all/4.1.108.Final//netty-all-4.1.108.Final.jar -netty-buffer/4.1.108.Final//netty-buffer-4.1.108.Final.jar -netty-codec-http/4.1.108.Final//netty-codec-http-4.1.108.Final.jar -netty-codec-http2/4.1.108.Final//netty-codec-http2-4.1.108.Final.jar -netty-codec-socks/4.1.108.Final//netty-codec-socks-4.1.108.Final.jar -netty-codec/4.1.108.Final//netty-codec-4.1.108.Final.jar -netty-common/4.1.108.Final//netty-common-4.1.108.Final.jar -netty-handler-proxy/4.1.108.Final//netty-handler-proxy-4.1.108.Final.jar -netty-handler/4.1.108.Final//netty-handler-4.1.108.Final.jar -netty-resolver/4.1.108.Final//netty-resolver-4.1.108.Final.jar -netty-tcnative-boringssl-static/2.0.61.Final//netty-tcnative-boringssl-static-2.0.61.Final.jar +netty-all/4.1.110.Final//netty-all-4.1.110.Final.jar +netty-buffer/4.1.110.Final//netty-buffer-4.1.110.Final.jar +netty-codec-http/4.1.110.Final//netty-codec-http-4.1.110.Final.jar +netty-codec-http2/4.1.110.Final//netty-codec-http2-4.1.110.Final.jar +netty-codec-socks/4.1.110.Final//netty-codec-socks-4.1.110.Final.jar +netty-codec/4.1.110.Final//netty-codec-4.1.110.Final.jar +netty-common/4.1.110.Final//netty-common-4.1.110.Final.jar +netty-handler-proxy/4.1.110.Final//netty-handler-proxy-4.1.110.Final.jar +netty-handler/4.1.110.Final//netty-handler-4.1.110.Final.jar +netty-resolver/4.1.110.Final//netty-resolver-4.1.110.Final.jar netty-tcnative-boringssl-static/2.0.65.Final/linux-aarch_64/netty-tcnative-boringssl-static-2.0.65.Final-linux-aarch_64.jar netty-tcnative-boringssl-static/2.0.65.Final/linux-x86_64/netty-tcnative-boringssl-static-2.0.65.Final-linux-x86_64.jar netty-tcnative-boringssl-static/2.0.65.Final/osx-aarch_64/netty-tcnative-boringssl-static-2.0.65.Final-osx-aarch_64.jar netty-tcnative-boringssl-static/2.0.65.Final/osx-x86_64/netty-tcnative-boringssl-static-2.0.65.Final-osx-x86_64.jar netty-tcnative-boringssl-static/2.0.65.Final/windows-x86_64/netty-tcnative-boringssl-static-2.0.65.Final-windows-x86_64.jar netty-tcnative-classes/2.0.65.Final//netty-tcnative-classes-2.0.65.Final.jar -netty-transport-classes-epoll/4.1.108.Final//netty-transport-classes-epoll-4.1.108.Final.jar -netty-transport-classes-kqueue/4.1.108.Final//netty-transport-classes-kqueue-4.1.108.Final.jar -netty-transport-native-epoll/4.1.108.Final/linux-aarch_64/netty-transport-native-epoll-4.1.108.Final-linux-aarch_64.jar -netty-transport-native-epoll/4.1.108.Final/linux-riscv64/netty-transport-native-epoll-4.1.108.Final-linux-riscv64.jar -netty-transport-native-epoll/4.1.108.Final/linux-x86_64/netty-transport-native-epoll-4.1.108.Final-linux-x86_64.jar -netty-transport-native-kqueue/4.1.108.Final/osx-aarch_64/netty-transport-native-kqueue-4.1.108.Final-osx-aarch_64.jar -netty-transport-native-kqueue/4.1.108.Final/osx-x86_64/netty-transport-native-kqueue-4.1.108.Final-osx-x86_64.jar -netty-transport-native-unix-common/4.1.108.Final//netty-transport-native-unix-common-4.1.108.Final.jar -netty-transport/4.1.108.Final//netty-transport-4.1.108.Final.jar +netty-transport-classes-epoll/4.1.110.Final//netty-transport-classes-epoll-4.1.110.Final.jar +netty-transport-classes-kqueue/4.1.110.Final//netty-transport-classes-kqueue-4.1.110.Final.jar 
+netty-transport-native-epoll/4.1.110.Final/linux-aarch_64/netty-transport-native-epoll-4.1.110.Final-linux-aarch_64.jar +netty-transport-native-epoll/4.1.110.Final/linux-riscv64/netty-transport-native-epoll-4.1.110.Final-linux-riscv64.jar +netty-transport-native-epoll/4.1.110.Final/linux-x86_64/netty-transport-native-epoll-4.1.110.Final-linux-x86_64.jar +netty-transport-native-kqueue/4.1.110.Final/osx-aarch_64/netty-transport-native-kqueue-4.1.110.Final-osx-aarch_64.jar +netty-transport-native-kqueue/4.1.110.Final/osx-x86_64/netty-transport-native-kqueue-4.1.110.Final-osx-x86_64.jar +netty-transport-native-unix-common/4.1.110.Final//netty-transport-native-unix-common-4.1.110.Final.jar +netty-transport/4.1.110.Final//netty-transport-4.1.110.Final.jar objenesis/3.3//objenesis-3.3.jar okhttp/3.12.12//okhttp-3.12.12.jar okio/1.15.0//okio-1.15.0.jar @@ -231,30 +229,30 @@ opencsv/2.3//opencsv-2.3.jar opentracing-api/0.33.0//opentracing-api-0.33.0.jar opentracing-noop/0.33.0//opentracing-noop-0.33.0.jar opentracing-util/0.33.0//opentracing-util-0.33.0.jar -orc-core/2.0.0/shaded-protobuf/orc-core-2.0.0-shaded-protobuf.jar +orc-core/2.0.1/shaded-protobuf/orc-core-2.0.1-shaded-protobuf.jar orc-format/1.0.0/shaded-protobuf/orc-format-1.0.0-shaded-protobuf.jar -orc-mapreduce/2.0.0/shaded-protobuf/orc-mapreduce-2.0.0-shaded-protobuf.jar -orc-shims/2.0.0//orc-shims-2.0.0.jar +orc-mapreduce/2.0.1/shaded-protobuf/orc-mapreduce-2.0.1-shaded-protobuf.jar +orc-shims/2.0.1//orc-shims-2.0.1.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar -parquet-column/1.13.1//parquet-column-1.13.1.jar -parquet-common/1.13.1//parquet-common-1.13.1.jar -parquet-encoding/1.13.1//parquet-encoding-1.13.1.jar -parquet-format-structures/1.13.1//parquet-format-structures-1.13.1.jar -parquet-hadoop/1.13.1//parquet-hadoop-1.13.1.jar -parquet-jackson/1.13.1//parquet-jackson-1.13.1.jar -pickle/1.3//pickle-1.3.jar +parquet-column/1.14.1//parquet-column-1.14.1.jar +parquet-common/1.14.1//parquet-common-1.14.1.jar +parquet-encoding/1.14.1//parquet-encoding-1.14.1.jar +parquet-format-structures/1.14.1//parquet-format-structures-1.14.1.jar +parquet-hadoop/1.14.1//parquet-hadoop-1.14.1.jar +parquet-jackson/1.14.1//parquet-jackson-1.14.1.jar +pickle/1.5//pickle-1.5.jar py4j/0.10.9.7//py4j-0.10.9.7.jar remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar -rocksdbjni/8.11.3//rocksdbjni-8.11.3.jar +rocksdbjni/9.2.1//rocksdbjni-9.2.1.jar scala-collection-compat_2.13/2.7.0//scala-collection-compat_2.13-2.7.0.jar -scala-compiler/2.13.13//scala-compiler-2.13.13.jar -scala-library/2.13.13//scala-library-2.13.13.jar +scala-compiler/2.13.14//scala-compiler-2.13.14.jar +scala-library/2.13.14//scala-library-2.13.14.jar scala-parallel-collections_2.13/1.0.4//scala-parallel-collections_2.13-1.0.4.jar -scala-parser-combinators_2.13/2.3.0//scala-parser-combinators_2.13-2.3.0.jar -scala-reflect/2.13.13//scala-reflect-2.13.13.jar -scala-xml_2.13/2.2.0//scala-xml_2.13-2.2.0.jar +scala-parser-combinators_2.13/2.4.0//scala-parser-combinators_2.13-2.4.0.jar +scala-reflect/2.13.14//scala-reflect-2.13.14.jar +scala-xml_2.13/2.3.0//scala-xml_2.13-2.3.0.jar slf4j-api/2.0.13//slf4j-api-2.0.13.jar snakeyaml-engine/2.7//snakeyaml-engine-2.7.jar snakeyaml/2.2//snakeyaml-2.2.jar @@ -264,7 +262,7 @@ spire-platform_2.13/0.18.0//spire-platform_2.13-0.18.0.jar spire-util_2.13/0.18.0//spire-util_2.13-0.18.0.jar spire_2.13/0.18.0//spire_2.13-0.18.0.jar stax-api/1.0.1//stax-api-1.0.1.jar 
-stream/2.9.6//stream-2.9.6.jar +stream/2.9.8//stream-2.9.8.jar super-csv/2.2.0//super-csv-2.2.0.jar threeten-extra/1.7.1//threeten-extra-1.7.1.jar tink/1.13.0//tink-1.13.0.jar @@ -272,10 +270,10 @@ transaction-api/1.1//transaction-api-1.1.jar txw2/3.0.2//txw2-3.0.2.jar univocity-parsers/2.9.1//univocity-parsers-2.9.1.jar wildfly-openssl/1.1.3.Final//wildfly-openssl-1.1.3.Final.jar -xbean-asm9-shaded/4.24//xbean-asm9-shaded-4.24.jar +xbean-asm9-shaded/4.25//xbean-asm9-shaded-4.25.jar xmlschema-core/2.3.1//xmlschema-core-2.3.1.jar xz/1.9//xz-1.9.jar zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar zookeeper-jute/3.9.2//zookeeper-jute-3.9.2.jar zookeeper/3.9.2//zookeeper-3.9.2.jar -zstd-jni/1.5.6-2//zstd-jni-1.5.6-2.jar +zstd-jni/1.5.6-3//zstd-jni-1.5.6-3.jar diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 0b0a478b4bf44..6ba9be87552ab 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -81,17 +81,17 @@ ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library RUN add-apt-repository ppa:pypy/ppa -RUN mkdir -p /usr/local/pypy/pypy3.8 && \ - curl -sqL https://downloads.python.org/pypy/pypy3.8-v7.3.11-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.8 --strip-components=1 && \ - ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \ - ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3 +RUN mkdir -p /usr/local/pypy/pypy3.9 && \ + curl -sqL https://downloads.python.org/pypy/pypy3.9-v7.3.16-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.9 --strip-components=1 && \ + ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.8 && \ + ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 -RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.2.2' scipy coverage matplotlib lxml +RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.2' scipy coverage matplotlib lxml -ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.2.2 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.2 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" # Python deps for Spark Connect -ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4" +ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4 graphviz==0.20.3" # Install Python 3.10 packages RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 @@ -138,9 +138,6 @@ RUN python3.12 -m pip install $BASIC_PIP_PKGS $CONNECT_PIP_PKGS lxml && \ python3.12 -m pip cache purge # Remove unused installation packages to free up disk space -RUN apt-get remove --purge -y \ - '^aspnet.*' '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*' \ - snapd google-chrome-stable microsoft-edge-stable firefox \ - azure-cli google-cloud-sdk mono-devel powershell libgl1-mesa-dri || true +RUN apt-get remove --purge -y 'gfortran-11' 'humanity-icon-theme' 'nodejs-doc' || true RUN apt-get autoremove --purge -y RUN apt-get clean diff --git a/dev/is-changed.py b/dev/is-changed.py index 85f0d3cda6df4..1962e244d5dd7 100755 --- a/dev/is-changed.py +++ b/dev/is-changed.py @@ -17,6 +17,8 @@ # limitations under the License. 
# +import warnings +import traceback import os import sys from argparse import ArgumentParser @@ -82,4 +84,8 @@ def main(): if __name__ == "__main__": - main() + try: + main() + except Exception: + warnings.warn(f"Ignored exception:\n\n{traceback.format_exc()}") + print("true") diff --git a/dev/java-file-header b/dev/java-file-header new file mode 100644 index 0000000000000..c6a5afeef509f --- /dev/null +++ b/dev/java-file-header @@ -0,0 +1,16 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ \ No newline at end of file diff --git a/dev/lint-java b/dev/lint-java index ac5a2c869404f..ff431301773f3 100755 --- a/dev/lint-java +++ b/dev/lint-java @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) +ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pspark-ganglia-lgpl -Pkubernetes -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) if test ! -z "$ERRORS"; then echo -e "Checkstyle checks failed at following occurrences:\n$ERRORS" diff --git a/dev/lint-python b/dev/lint-python index 8d587bd52aca7..b8703310bc4b6 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -84,7 +84,10 @@ function satisfies_min_version { local expected_version="$2" echo "$( "$PYTHON_EXECUTABLE" << EOM -from setuptools.extern.packaging import version +try: + from setuptools.extern.packaging import version +except ModuleNotFoundError: + from packaging import version print(version.parse('$provided_version') >= version.parse('$expected_version')) EOM )" @@ -122,6 +125,7 @@ function mypy_annotation_test { echo "starting mypy annotations test..." MYPY_REPORT=$( ($MYPY_BUILD \ + --python-executable $PYTHON_EXECUTABLE \ --namespace-packages \ --config-file python/mypy.ini \ --cache-dir /tmp/.mypy_cache/ \ @@ -181,6 +185,7 @@ function mypy_examples_test { echo "starting mypy examples test..." MYPY_REPORT=$( (MYPYPATH=python $MYPY_BUILD \ + --python-executable $PYTHON_EXECUTABLE \ --namespace-packages \ --config-file python/mypy.ini \ --exclude "mllib/*" \ diff --git a/dev/lint-scala b/dev/lint-scala index 6b3178312c106..03771954ff525 100755 --- a/dev/lint-scala +++ b/dev/lint-scala @@ -32,7 +32,7 @@ ERRORS=$(./build/mvn \ -pl connector/connect/common \ -pl connector/connect/server \ -pl connector/connect/client/jvm \ - 2>&1 | grep -e "^Requires formatting" \ + 2>&1 | grep -e "Unformatted files found" \ ) if test ! 
-z "$ERRORS"; then diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index c9893fd7e5a9d..4ebd3e4b951f5 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -118,9 +118,14 @@ def run_cmd(cmd): return subprocess.check_output(cmd.split(" ")).decode("utf-8") -def continue_maybe(prompt): +def continue_maybe(prompt, cherry=False): result = bold_input("%s (y/N): " % prompt) if result.lower() != "y": + if cherry: + try: + run_cmd("git cherry-pick --abort") + except Exception: + print_error("Unable to abort and get back to the state before cherry-pick") fail("Okay, exiting") @@ -234,9 +239,9 @@ def cherry_pick(pr_num, merge_hash, default_branch): run_cmd("git cherry-pick -sx %s" % merge_hash) except Exception as e: msg = "Error cherry-picking: %s\nWould you like to manually fix-up this merge?" % e - continue_maybe(msg) + continue_maybe(msg, True) msg = "Okay, please fix any conflicts and finish the cherry-pick. Finished?" - continue_maybe(msg) + continue_maybe(msg, True) continue_maybe( "Pick complete (local ref %s). Push to %s?" % (pick_branch_name, PUSH_REMOTE_NAME) @@ -257,16 +262,19 @@ def cherry_pick(pr_num, merge_hash, default_branch): def print_jira_issue_summary(issue): - summary = issue.fields.summary + summary = "Summary\t\t%s\n" % issue.fields.summary assignee = issue.fields.assignee if assignee is not None: assignee = assignee.displayName - status = issue.fields.status.name + assignee = "Assignee\t%s\n" % assignee + status = "Status\t\t%s\n" % issue.fields.status.name + url = "Url\t\t%s/%s\n" % (JIRA_BASE, issue.key) + target_versions = "Affected\t%s\n" % [x.name for x in issue.fields.versions] + fix_versions = "" + if len(issue.fields.fixVersions) > 0: + fix_versions = "Fixed\t\t%s\n" % [x.name for x in issue.fields.fixVersions] print("=== JIRA %s ===" % issue.key) - print( - "summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" - % (summary, assignee, status, JIRA_BASE, issue.key) - ) + print("%s%s%s%s%s%s" % (summary, assignee, status, url, target_versions, fix_versions)) def get_jira_issue(prompt, default_jira_id=""): @@ -501,12 +509,19 @@ def standardize_jira_ref(text): >>> standardize_jira_ref( ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' + >>> standardize_jira_ref( + ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"') + 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"' >>> standardize_jira_ref("Additional information for users building from source code") 'Additional information for users building from source code' """ jira_refs = [] components = [] + # If this is a Revert PR, no need to process any further + if text.startswith('Revert "') and text.endswith('"'): + return text + # If the string is compliant, no need to process any further if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text): return text @@ -678,6 +693,14 @@ def main(): ) continue_maybe(msg) + if asf_jira is not None: + jira_ids = re.findall("SPARK-[0-9]{4,5}", title) + for jira_id in jira_ids: + try: + print_jira_issue_summary(asf_jira.issue(jira_id)) + except Exception: + print_error("Unable to fetch summary of %s" % jira_id) + print("\n=== Pull Request #%s ===" % pr_num) print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (title, pr_repo_desc, target_ref, url)) continue_maybe("Proceed with merging pull request #%s?" 
% pr_num) diff --git a/dev/pyproject.toml b/dev/pyproject.toml index 4f462d14c7838..f19107b3782a6 100644 --- a/dev/pyproject.toml +++ b/dev/pyproject.toml @@ -29,6 +29,6 @@ testpaths = [ # GitHub workflow version and dev/reformat-python required-version = "23.9.1" line-length = 100 -target-version = ['py38'] +target-version = ['py39'] include = '\.pyi?$' extend-exclude = 'cloudpickle|error_classes.py' diff --git a/dev/requirements.txt b/dev/requirements.txt index d6530d8ce2821..88883a963950e 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -60,6 +60,9 @@ mypy-protobuf==3.3.0 googleapis-common-protos-stubs==2.2.0 grpc-stubs==1.24.11 +# Debug for Spark and Spark Connect +graphviz==0.20.3 + # TorchDistributor dependencies torch torchvision diff --git a/dev/run-pip-tests b/dev/run-pip-tests index f8a547b0c917c..91399ff1e25ea 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -87,6 +87,10 @@ for python in "${PYTHON_EXECS[@]}"; do VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python rm -rf "$VIRTUALENV_PATH" if [ -n "$USE_CONDA" ]; then + if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then + # See also https://github.com/conda/conda/issues/7980 + source "$CONDA_PREFIX/etc/profile.d/conda.sh" + fi conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools source activate "$VIRTUALENV_PATH" || conda activate "$VIRTUALENV_PATH" else diff --git a/dev/sbt-checkstyle b/dev/sbt-checkstyle index 99a46a3a0e38b..f2d5a0fa304ac 100755 --- a/dev/sbt-checkstyle +++ b/dev/sbt-checkstyle @@ -17,7 +17,7 @@ # limitations under the License. # -SPARK_PROFILES=${1:-"-Pkinesis-asl -Pkubernetes -Pyarn -Phive -Phive-thriftserver"} +SPARK_PROFILES=${1:-"-Pkinesis-asl -Pspark-ganglia-lgpl -Pkubernetes -Pyarn -Phive -Phive-thriftserver -Pjvm-profiler"} # NOTE: echo "q" is needed because SBT prompts the user for input on encountering a build file # with failure (either resolution or compilation); the "q" makes SBT quit. diff --git a/dev/scalastyle b/dev/scalastyle index 12457af1ae7b3..9de1fd1c9d9d5 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -17,7 +17,7 @@ # limitations under the License. # -SPARK_PROFILES=${1:-"-Pkubernetes -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive -Pvolcano"} +SPARK_PROFILES=${1:-"-Pkubernetes -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive -Pvolcano -Pjvm-profiler -Phadoop-cloud"} # NOTE: echo "q" is needed because SBT prompts the user for input on encountering a build file # with failure (either resolution or compilation); the "q" makes SBT quit. 
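As a rough local sanity check for the lint and style entry points touched above, they can be run directly from the repository root; a minimal sketch, assuming a full Spark checkout with `build/mvn` available (the optional first argument to the profile-driven scripts overrides the default profile list shown in the diff):

```sh
# Scala style checks with the new default profiles (now including -Pjvm-profiler and -Phadoop-cloud)
./dev/scalastyle

# Java checkstyle via SBT with the expanded profile set (now including -Pspark-ganglia-lgpl and -Pjvm-profiler)
./dev/sbt-checkstyle

# scalafmt check for the connect modules; failures are now detected via "Unformatted files found"
./dev/lint-scala

# The profile-driven scripts also accept an explicit profile list, e.g.:
./dev/scalastyle "-Pkubernetes -Pyarn -Phive -Phive-thriftserver"
```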
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 5e169eb119b45..44295e7e630e9 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -476,8 +476,9 @@ def __hash__(self): "pyspark.sql.session", "pyspark.sql.conf", "pyspark.sql.catalog", - "pyspark.sql.column", - "pyspark.sql.dataframe", + "pyspark.sql.classic.column", + "pyspark.sql.classic.dataframe", + "pyspark.sql.classic.window", "pyspark.sql.datasource", "pyspark.sql.group", "pyspark.sql.functions.builtin", @@ -488,7 +489,6 @@ def __hash__(self): "pyspark.sql.streaming.listener", "pyspark.sql.udf", "pyspark.sql.udtf", - "pyspark.sql.window", "pyspark.sql.avro.functions", "pyspark.sql.protobuf.functions", "pyspark.sql.pandas.conversion", @@ -1009,6 +1009,7 @@ def __hash__(self): # sql unittests "pyspark.sql.tests.connect.test_connect_plan", "pyspark.sql.tests.connect.test_connect_basic", + "pyspark.sql.tests.connect.test_connect_dataframe_property", "pyspark.sql.tests.connect.test_connect_error", "pyspark.sql.tests.connect.test_connect_function", "pyspark.sql.tests.connect.test_connect_collection", @@ -1050,6 +1051,7 @@ def __hash__(self): "pyspark.sql.tests.connect.test_parity_python_streaming_datasource", "pyspark.sql.tests.connect.test_utils", "pyspark.sql.tests.connect.client.test_artifact", + "pyspark.sql.tests.connect.client.test_artifact_localcluster", "pyspark.sql.tests.connect.client.test_client", "pyspark.sql.tests.connect.client.test_reattach", "pyspark.sql.tests.connect.streaming.test_parity_streaming", @@ -1062,6 +1064,7 @@ def __hash__(self): "pyspark.sql.tests.connect.test_parity_pandas_udf_window", "pyspark.sql.tests.connect.test_resources", "pyspark.sql.tests.connect.shell.test_progress", + "pyspark.sql.tests.connect.test_df_debug", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and @@ -1102,6 +1105,8 @@ def __hash__(self): "python/pyspark/pandas", ], python_test_goals=[ + # unittests dedicated for Spark Connect + "pyspark.pandas.tests.connect.test_connect_plotting", # pandas-on-Spark unittests "pyspark.pandas.tests.connect.test_parity_categorical", "pyspark.pandas.tests.connect.test_parity_config", @@ -1171,6 +1176,9 @@ def __hash__(self): "pyspark.pandas.tests.connect.indexes.test_parity_reindex", "pyspark.pandas.tests.connect.indexes.test_parity_rename", "pyspark.pandas.tests.connect.indexes.test_parity_reset_index", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_at", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_between", "pyspark.pandas.tests.connect.computation.test_parity_any_all", "pyspark.pandas.tests.connect.computation.test_parity_apply_func", "pyspark.pandas.tests.connect.computation.test_parity_binary_ops", @@ -1183,6 +1191,12 @@ def __hash__(self): "pyspark.pandas.tests.connect.computation.test_parity_describe", "pyspark.pandas.tests.connect.computation.test_parity_eval", "pyspark.pandas.tests.connect.computation.test_parity_melt", + "pyspark.pandas.tests.connect.computation.test_parity_missing_data", + "pyspark.pandas.tests.connect.groupby.test_parity_stat", + "pyspark.pandas.tests.connect.groupby.test_parity_stat_adv", + "pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof", + "pyspark.pandas.tests.connect.groupby.test_parity_stat_func", + "pyspark.pandas.tests.connect.groupby.test_parity_stat_prod", ], excluded_python_implementations=[ "PyPy" # Skip these tests under 
PyPy since they require numpy, pandas, and pyarrow and @@ -1248,6 +1262,18 @@ def __hash__(self): "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_object", "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_prefix", "pyspark.pandas.tests.connect.reshape.test_parity_merge_asof", + "pyspark.pandas.tests.connect.indexes.test_parity_append", + "pyspark.pandas.tests.connect.indexes.test_parity_intersection", + "pyspark.pandas.tests.connect.indexes.test_parity_monotonic", + "pyspark.pandas.tests.connect.indexes.test_parity_union", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property", + "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round", + "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift", + "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform", # fallback "pyspark.pandas.tests.connect.frame.test_parity_asfreq", "pyspark.pandas.tests.connect.frame.test_parity_asof", @@ -1273,7 +1299,6 @@ def __hash__(self): "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx", "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv", "pyspark.pandas.tests.connect.computation.test_parity_stats", - "pyspark.pandas.tests.connect.computation.test_parity_missing_data", "pyspark.pandas.tests.connect.frame.test_parity_interpolate", "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error", "pyspark.pandas.tests.connect.resample.test_parity_frame", @@ -1346,24 +1371,6 @@ def __hash__(self): "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion", "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io", "pyspark.pandas.tests.connect.io.test_parity_series_conversion", - "pyspark.pandas.tests.connect.groupby.test_parity_stat", - "pyspark.pandas.tests.connect.groupby.test_parity_stat_adv", - "pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof", - "pyspark.pandas.tests.connect.groupby.test_parity_stat_func", - "pyspark.pandas.tests.connect.groupby.test_parity_stat_prod", - "pyspark.pandas.tests.connect.indexes.test_parity_append", - "pyspark.pandas.tests.connect.indexes.test_parity_intersection", - "pyspark.pandas.tests.connect.indexes.test_parity_monotonic", - "pyspark.pandas.tests.connect.indexes.test_parity_union", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_at", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_between", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property", - "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext_float", @@ -1386,9 +1393,7 @@ def __hash__(self): "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff_len", 
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_fillna", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_filter", - "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_split_apply_combine", - "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_adv", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_count", diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 175f59a700941..563a7e1acab4f 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -31,7 +31,7 @@ export LC_ALL=C # NOTE: These should match those in the release publishing script, and be kept in sync with # dev/create-release/release-build.sh HADOOP_MODULE_PROFILES="-Phive-thriftserver -Pkubernetes -Pyarn -Phive \ - -Pspark-ganglia-lgpl -Pkinesis-asl -Phadoop-cloud" + -Pspark-ganglia-lgpl -Pkinesis-asl -Phadoop-cloud -Pjvm-profiler" MVN="build/mvn" HADOOP_HIVE_PROFILES=( hadoop-3-hive-2.3 @@ -49,7 +49,7 @@ OLD_VERSION=$($MVN -q \ --non-recursive \ org.codehaus.mojo:exec-maven-plugin:1.6.0:exec | grep -E '[0-9]+\.[0-9]+\.[0-9]+') # dependency:get for guava and jetty-io are workaround for SPARK-37302. -GUAVA_VERSION=$(build/mvn help:evaluate -Dexpression=guava.version -q -DforceStdout | grep -E "^[0-9.]+$") +GUAVA_VERSION=$(build/mvn help:evaluate -Dexpression=guava.version -q -DforceStdout | grep -E "^[0-9\.]+") build/mvn dependency:get -Dartifact=com.google.guava:guava:${GUAVA_VERSION} -q JETTY_VERSION=$(build/mvn help:evaluate -Dexpression=jetty.version -q -DforceStdout | grep -E "[0-9]+\.[0-9]+\.[0-9]+") build/mvn dependency:get -Dartifact=org.eclipse.jetty:jetty-io:${JETTY_VERSION} -q @@ -140,4 +140,8 @@ for HADOOP_HIVE_PROFILE in "${HADOOP_HIVE_PROFILES[@]}"; do fi done +if [[ -d "$FWDIR/dev/pr-deps" ]]; then + rm -rf "$FWDIR/dev/pr-deps" +fi + exit 0 diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 4e38f18703f3c..e137f0f039b97 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -4,16 +4,16 @@ GEM addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) colorator (1.1.0) - concurrent-ruby (1.2.2) + concurrent-ruby (1.2.3) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) eventmachine (1.2.7) ffi (1.16.3) forwardable-extended (2.6.0) - google-protobuf (3.25.2) + google-protobuf (3.25.3) http_parser.rb (0.8.0) - i18n (1.14.1) + i18n (1.14.5) concurrent-ruby (~> 1.0) jekyll (4.3.3) addressable (~> 2.4) @@ -42,22 +42,22 @@ GEM kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) liquid (4.0.4) - listen (3.8.0) + listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.4.0) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (5.0.4) - rake (13.1.0) + public_suffix (5.0.5) + rake (13.2.1) rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) rexml (3.2.6) rouge (3.30.0) safe_yaml (1.0.5) - sass-embedded (1.69.7) - google-protobuf (~> 3.25) + sass-embedded (1.63.6) + google-protobuf (~> 3.23) rake (>= 13.0.0) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) diff --git a/docs/README.md b/docs/README.md index fac9010d86922..363f1c2076363 100644 --- a/docs/README.md +++ b/docs/README.md @@ -36,27 +36,25 @@ You need to have [Ruby 3][ruby] and [Python 3][python] installed. 
Make sure the [python]: https://www.python.org/downloads/ ```sh -$ gem install bundler +$ gem install bundler -v 2.4.22 ``` -After this all the required ruby dependencies can be installed from the `docs/` directory via the Bundler: +After this all the required Ruby dependencies can be installed from the `docs/` directory via Bundler: ```sh -$ cd docs +$ cd "$SPARK_HOME"/docs $ bundle install ``` -To generate the Python or R docs, you'll need to [install Pandoc](https://pandoc.org/installing.html). - -### SQL and Python API Documentation (Optional) - -To generate SQL and Python API docs, you'll need to install these libraries: +And the required Python dependencies can be installed using pip: -Run the following command from $SPARK_HOME: ```sh +$ cd "$SPARK_HOME" $ pip install --upgrade -r dev/requirements.txt ``` +To generate the Python or R API docs, you'll also need to [install Pandoc](https://pandoc.org/installing.html). + ### R API Documentation (Optional) If you'd like to generate R API documentation, install these libraries: @@ -121,6 +119,10 @@ The jekyll plugin also generates the PySpark docs using [Sphinx](http://sphinx-d using [roxygen2](https://cran.r-project.org/web/packages/roxygen2/index.html) and SQL docs using [MkDocs](https://www.mkdocs.org/). -NOTE: To skip the step of building and copying over the Scala, Java, Python, R and SQL API docs, run `SKIP_API=1 -bundle exec jekyll build`. In addition, `SKIP_SCALADOC=1`, `SKIP_PYTHONDOC=1`, `SKIP_RDOC=1` and `SKIP_SQLDOC=1` can be used -to skip a single step of the corresponding language. `SKIP_SCALADOC` indicates skipping both the Scala and Java docs. +To control what API docs get built, you can set any combination of the following shell variables before you run `bundle exec jekyll build`: +* `SKIP_API=1`: Skip building all the API docs. +* `SKIP_SCALADOC=1`: Skip the Scala and Java API docs. +* `SKIP_PYTHONDOC=1`: Skip the Python API docs. +* `SKIP_RDOC=1`: Skip the R API docs. +* `SKIP_SQLDOC=1`: Skip the SQL API docs. 
+ diff --git a/docs/_config.yml b/docs/_config.yml index 19183f85df239..e74eda0470417 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -22,7 +22,7 @@ include: SPARK_VERSION: 4.0.0-SNAPSHOT SPARK_VERSION_SHORT: 4.0.0 SCALA_BINARY_VERSION: "2.13" -SCALA_VERSION: "2.13.13" +SCALA_VERSION: "2.13.14" SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark # Before a new release, we should: diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 04792ebf576fa..01c8a8076958f 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -63,6 +63,8 @@ url: sql-performance-tuning.html#optimizing-the-join-strategy - text: Adaptive Query Execution url: sql-performance-tuning.html#adaptive-query-execution + - text: Storage Partition Join + url: sql-performance-tuning.html#storage-partition-join - text: Distributed SQL Engine url: sql-distributed-sql-engine.html subitems: @@ -85,6 +87,8 @@ url: sql-ref-datetime-pattern.html - text: Number Pattern url: sql-ref-number-pattern.html + - text: Operators + url: sql-ref-operators.html - text: Functions url: sql-ref-functions.html - text: Identifiers @@ -106,48 +110,3 @@ url: sql-ref-syntax.html#auxiliary-statements - text: Error Conditions url: sql-error-conditions.html - subitems: - - text: SQLSTATE Codes - url: sql-error-conditions-sqlstates.html - - text: COLLECTION_SIZE_LIMIT_EXCEEDED error class - url: sql-error-conditions-collection-size-limit-exceeded-error-class.html - - text: CONNECT error class - url: sql-error-conditions-connect-error-class.html - - text: DATATYPE_MISMATCH error class - url: sql-error-conditions-datatype-mismatch-error-class.html - - text: INCOMPATIBLE_DATA_FOR_TABLE error class - url: sql-error-conditions-incompatible-data-for-table-error-class.html - - text: INCOMPLETE_TYPE_DEFINITION error class - url: sql-error-conditions-incomplete-type-definition-error-class.html - - text: INCONSISTENT_BEHAVIOR_CROSS_VERSION error class - url: sql-error-conditions-inconsistent-behavior-cross-version-error-class.html - - text: INVALID_FORMAT error class - url: sql-error-conditions-invalid-format-error-class.html - - text: INVALID_OPTIONS error class - url: sql-error-conditions-invalid-options-error-class.html - - text: INVALID_PARAMETER_VALUE error class - url: sql-error-conditions-invalid-parameter-value-error-class.html - - text: INVALID_SCHEMA error class - url: sql-error-conditions-invalid-schema-error-class.html - - text: INVALID_SUBQUERY_EXPRESSION error class - url: sql-error-conditions-invalid-subquery-expression-error-class.html - - text: NOT_NULL_CONSTRAINT_VIOLATION error class - url: sql-error-conditions-not-null-constraint-violation-error-class.html - - text: UNRESOLVED_COLUMN error class - url: sql-error-conditions-unresolved-column-error-class.html - - text: UNRESOLVED_FIELD error class - url: sql-error-conditions-unresolved-field-error-class.html - - text: UNRESOLVED_MAP_KEY error class - url: sql-error-conditions-unresolved-map-key-error-class.html - - text: UNSUPPORTED_DESERIALIZER error class - url: sql-error-conditions-unsupported-deserializer-error-class.html - - text: UNSUPPORTED_FEATURE error class - url: sql-error-conditions-unsupported-feature-error-class.html - - text: UNSUPPORTED_GENERATOR error class - url: sql-error-conditions-unsupported-generator-error-class.html - - text: UNSUPPORTED_SAVE_MODE error class - url: sql-error-conditions-unsupported-save-mode-error-class.html - - text: 
UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY error class - url: sql-error-conditions-unsupported-subquery-expression-category-error-class.html - - text: WRONG_NUM_ARGS error class - url: sql-error-conditions-wrong-num-args-error-class.html diff --git a/docs/_plugins/build_api_docs.rb b/docs/_plugins/build_api_docs.rb index 5b52f3799cc4a..8d3ef86ac3d66 100644 --- a/docs/_plugins/build_api_docs.rb +++ b/docs/_plugins/build_api_docs.rb @@ -188,6 +188,14 @@ def build_sql_docs cp_r("../sql/site/.", "api/sql") end +def build_error_docs + print_header "Building error docs." + system("python '#{SPARK_PROJECT_ROOT}/docs/util/build-error-docs.py'") \ + || raise("Error doc generation failed") +end + +build_error_docs + if not (ENV['SKIP_API'] == '1') if not (ENV['SKIP_SCALADOC'] == '1') build_scala_and_java_docs diff --git a/docs/building-spark.md b/docs/building-spark.md index d10dfc9434fec..9ac61bfb8a64a 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -27,7 +27,7 @@ license: | ## Apache Maven The Maven-based build is the build of reference for Apache Spark. -Building Spark using Maven requires Maven 3.9.6 and Java 17/21. +Building Spark using Maven requires Maven 3.9.8 and Java 17/21. Spark requires Scala 2.13; support for Scala 2.12 was removed in Spark 4.0.0. ### Setting up Maven's Memory Usage @@ -85,9 +85,9 @@ Example: To enable Hive integration for Spark SQL along with its JDBC server and CLI, add the `-Phive` and `-Phive-thriftserver` profiles to your existing build options. -By default Spark will build with Hive 2.3.9. +By default Spark will build with Hive 2.3.10. - # With Hive 2.3.9 support + # With Hive 2.3.10 support ./build/mvn -Pyarn -Phive -Phive-thriftserver -DskipTests clean package ## Packaging without Hadoop Dependencies for YARN @@ -117,6 +117,13 @@ where `spark-streaming_{{site.SCALA_BINARY_VERSION}}` is the `artifactId` as def ./build/mvn -Pconnect -DskipTests clean package +## Building with JVM Profile support + + ./build/mvn -Pjvm-profiler -DskipTests clean package + +**Note:** The `jvm-profiler` profile builds the assembly without including the dependency `ap-loader`, +you can download it manually from maven central repo and use it together with `spark-profiler_{{site.SCALA_BINARY_VERSION}}`. + ## Continuous Compilation We use the scala-maven-plugin which supports incremental and continuous compilation. E.g. diff --git a/docs/configuration.md b/docs/configuration.md index d5e2a569fdeaf..6833d4e54fd03 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -91,7 +91,7 @@ Then, you can supply configuration values at runtime: ```sh ./bin/spark-submit \ --name "My app" \ - --master local[4] \ + --master "local[4]" \ --conf spark.eventLog.enabled=false \ --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ myApp.jar @@ -103,20 +103,25 @@ such as `--master`, as shown above. `spark-submit` can accept any Spark property flag, but uses special flags for properties that play a part in launching the Spark application. Running `./bin/spark-submit --help` will show the entire list of these options. -`bin/spark-submit` will also read configuration options from `conf/spark-defaults.conf`, in which -each line consists of a key and a value separated by whitespace. For example: +When configurations are specified via the `--conf/-c` flags, `bin/spark-submit` will also read +configuration options from `conf/spark-defaults.conf`, in which each line consists of a key and +a value separated by whitespace. 
For example: spark.master spark://5.6.7.8:7077 spark.executor.memory 4g spark.eventLog.enabled true spark.serializer org.apache.spark.serializer.KryoSerializer +In addition, a property file with Spark configurations can be passed to `bin/spark-submit` via +`--properties-file` parameter. When this is set, Spark will no longer load configurations from +`conf/spark-defaults.conf` unless another parameter `--load-spark-defaults` is provided. + Any values specified as flags or in the properties file will be passed on to the application and merged with those specified through SparkConf. Properties set directly on the SparkConf -take highest precedence, then flags passed to `spark-submit` or `spark-shell`, then options -in the `spark-defaults.conf` file. A few configuration keys have been renamed since earlier -versions of Spark; in such cases, the older key names are still accepted, but take lower -precedence than any instance of the newer key. +take the highest precedence, then those through `--conf` flags or `--properties-file` passed to +`spark-submit` or `spark-shell`, then options in the `spark-defaults.conf` file. A few +configuration keys have been renamed since earlier versions of Spark; in such cases, the older +key names are still accepted, but take lower precedence than any instance of the newer key. Spark properties mainly can be divided into two kinds: one is related to deploy, like "spark.driver.memory", "spark.executor.instances", this kind of properties may not be affected when @@ -1028,11 +1033,19 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.unsafe.file.output.buffer 32k - The file system for this buffer size after each partition is written in unsafe shuffle writer. - In KiB unless otherwise specified. + Deprecated since Spark 4.0, please use spark.shuffle.localDisk.file.output.buffer. 2.3.0 + + spark.shuffle.localDisk.file.output.buffer + 32k + + The file system for this buffer size after each partition is written in all local disk shuffle writers. + In KiB unless otherwise specified. + + 4.0.0 + spark.shuffle.spill.diskWriteBufferSize 1024 * 1024 @@ -1795,6 +1808,15 @@ Apart from these, the following properties are also available, and may be useful 0.6.0 + + spark.checkpoint.dir + (none) + + Set the default directory for checkpointing. It can be overwritten by + SparkContext.setCheckpointDir. + + 4.0.0 + spark.checkpoint.compress false @@ -1881,6 +1903,14 @@ Apart from these, the following properties are also available, and may be useful 4.0.0 + + spark.io.compression.lzf.parallel.enabled + false + + When true, LZF compression will use multiple threads to compress data in parallel. + + 4.0.0 + spark.kryo.classesToRegister (none) @@ -3072,7 +3102,7 @@ Apart from these, the following properties are also available, and may be useful spark.stage.ignoreDecommissionFetchFailure - false + true Whether ignore stage fetch failure caused by executor decommission when count spark.stage.maxConsecutiveAttempts @@ -3387,7 +3417,7 @@ Spark subsystems. Runtime SQL configurations are per-session, mutable Spark SQL configurations. They can be set with initial values by the config file and command-line options with `--conf/-c` prefixed, or by setting `SparkConf` that are used to create `SparkSession`. 
-Also, they can be set and queried by SET commands and rest to their initial values by RESET command, +Also, they can be set and queried by SET commands and reset to their initial values by RESET command, or by `SparkSession.conf`'s setter and getter methods in runtime. {% include_api_gen generated-runtime-sql-config-table.html %} @@ -3670,14 +3700,36 @@ Note: When running Spark on YARN in `cluster` mode, environment variables need t # Configuring Logging Spark uses [log4j](http://logging.apache.org/log4j/) for logging. You can configure it by adding a -`log4j2.properties` file in the `conf` directory. One way to start is to copy the existing -`log4j2.properties.template` located there. +`log4j2.properties` file in the `conf` directory. One way to start is to copy the existing templates `log4j2.properties.template` or `log4j2.properties.pattern-layout-template` located there. + +## Structured Logging +Starting from version 4.0.0, `spark-submit` has adopted the [JSON Template Layout](https://logging.apache.org/log4j/2.x/manual/json-template-layout.html) for logging, which outputs logs in JSON format. This format facilitates querying logs using Spark SQL with the JSON data source. Additionally, the logs include all Mapped Diagnostic Context (MDC) information for search and debugging purposes. + +To configure the layout of structured logging, start with the `log4j2.properties.template` file. + +To query Spark logs using Spark SQL, you can use the following Python code snippet: + +```python +from pyspark.util import LogUtils + +logDf = spark.read.schema(LogUtils.LOG_SCHEMA).json("path/to/logs") +``` + +Or using the following Scala code snippet: +```scala +import org.apache.spark.util.LogUtils.LOG_SCHEMA + +val logDf = spark.read.schema(LOG_SCHEMA).json("path/to/logs") +``` + +## Plain Text Logging +If you prefer plain text logging, you have two options: +- Disable structured JSON logging by setting the Spark configuration `spark.log.structuredLogging.enabled` to `false`. +- Use a custom log4j configuration file. Rename `conf/log4j2.properties.pattern-layout-template` to `conf/log4j2.properties`. This reverts to the default configuration prior to Spark 4.0, which utilizes [PatternLayout](https://logging.apache.org/log4j/2.x/manual/layouts.html#PatternLayout) for logging all messages in plain text. -By default, Spark adds 1 record to the MDC (Mapped Diagnostic Context): `mdc.taskName`, which shows something -like `task 1.0 in stage 0.0`. You can add `%X{mdc.taskName}` to your patternLayout in -order to print it in the logs. +MDC information is not included by default when with plain text logging. In order to print it in the logs, you can update the patternLayout in the file. For example, you can add `%X{task_name}` to print the task name in the logs. Moreover, you can use `spark.sparkContext.setLocalProperty(s"mdc.$name", "value")` to add user specific data into MDC. -The key in MDC will be the string of "mdc.$name". +The key in MDC will be the string of `mdc.$name`. 
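For instance, a minimal PySpark sketch of attaching custom data to the MDC from the driver side might look like the following (the property name `mdc.appTag` and its value are purely illustrative placeholders, not part of any Spark API):

```python
# Minimal sketch: attach user-specific MDC data from the driver thread.
# Spark only prescribes the "mdc." prefix; the suffix "appTag" and the value
# below are illustrative assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("mdc-example").getOrCreate()
spark.sparkContext.setLocalProperty("mdc.appTag", "nightly-etl")

# Jobs triggered from this thread now carry the custom MDC entry in their logs.
spark.range(10).count()
```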
# Overriding configuration directory @@ -3728,7 +3780,7 @@ Also, you can modify or add configurations at runtime: {% highlight bash %} ./bin/spark-submit \ --name "My app" \ - --master local[4] \ + --master "local[4]" \ --conf spark.eventLog.enabled=false \ --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ --conf spark.hadoop.abc.def=xyz \ diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md index 5f3560883e593..26b0ff32cf5d9 100644 --- a/docs/core-migration-guide.md +++ b/docs/core-migration-guide.md @@ -42,9 +42,15 @@ license: | - Since Spark 4.0, Spark uses the external shuffle service for deleting shuffle blocks for deallocated executors when the shuffle is no longer needed. To restore the legacy behavior, you can set `spark.shuffle.service.removeShuffle` to `false`. -- Since Spark 4.0, the default log4j output has shifted from plain text to JSON lines to enhance analyzability. To revert to plain text output, you can either set `spark.log.structuredLogging.enabled` to `false`, or use a custom log4j configuration. +- Starting with Spark 4.0, the default logging format for `spark-submit` has changed from plain text to JSON lines to improve log analysis. If you prefer plain text logs, you have two options: + - Set the Spark configuration `spark.log.structuredLogging.enabled` to `false`. + - Use a custom log4j configuration file, such as renaming the template file `conf/log4j2.properties.pattern-layout-template` to `conf/log4j2.properties`. -- Since Spark 4.0, Spark performs speculative executions less agressively with `spark.speculation.multiplier=3` and `spark.speculation.quantile=0.9`. To restore the legacy behavior, you can set `spark.speculation.multiplier=1.5` and `spark.speculation.quantile=0.75`. +- Since Spark 4.0, the MDC (Mapped Diagnostic Context) key for Spark task names in Spark logs has been changed from `mdc.taskName` to `task_name`. To use the key `mdc.taskName`, you can set `spark.log.legacyTaskNameMdc.enabled` to `true`. + +- Since Spark 4.0, Spark performs speculative executions less aggressively with `spark.speculation.multiplier=3` and `spark.speculation.quantile=0.9`. To restore the legacy behavior, you can set `spark.speculation.multiplier=1.5` and `spark.speculation.quantile=0.75`. + +- Since Spark 4.0, `spark.shuffle.unsafe.file.output.buffer` is deprecated though still works. Use `spark.shuffle.localDisk.file.output.buffer` instead. ## Upgrading from Core 3.4 to 3.5 diff --git a/docs/css/custom.css b/docs/css/custom.css index 22175068023b7..9edb466606555 100644 --- a/docs/css/custom.css +++ b/docs/css/custom.css @@ -988,3 +988,26 @@ table.spark-config th:nth-child(4), table.spark-config td:nth-child(4) { width: 90px; } + +table#error-conditions { + table-layout: fixed; + + span.error-condition-name { + /* Any error names that wrap will have the wrapped lines indented + relative to the first line thanks to these three rules. + */ + padding-left: 2em; + text-indent: -2em; + display: block; + } + + th:nth-child(1), + td:nth-child(1) { + /* width: 85px; */ + width: 105px; + } + + td.error-sub-condition { + padding-left: 2.5em; + } +} diff --git a/docs/monitoring.md b/docs/monitoring.md index 5e11d5aef81eb..d04fb35cf7275 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -1301,6 +1301,17 @@ These metrics are exposed by Spark executors. 
- shuffleRemoteBytesReadToDisk.count - shuffleTotalBytesRead.count - shuffleWriteTime.count + - Metrics related to push-based shuffle: + - shuffleCorruptMergedBlockChunks + - shuffleMergedFetchFallbackCount + - shuffleMergedRemoteBlocksFetched + - shuffleMergedLocalBlocksFetched + - shuffleMergedRemoteChunksFetched + - shuffleMergedLocalChunksFetched + - shuffleMergedRemoteBytesRead + - shuffleMergedLocalBytesRead + - shuffleRemoteReqsDuration + - shuffleMergedRemoteReqsDuration - succeededTasks.count - threadpool.activeTasks - threadpool.completeTasks @@ -1424,12 +1435,14 @@ Note: applies to the shuffle service - blockTransferMessageRate (meter) - rate of block transfer messages, i.e. if batch fetches are enabled, this represents number of batches rather than number of blocks - blockTransferRateBytes (meter) -- blockTransferAvgTime_1min (gauge - 1-minute moving average) +- blockTransferAvgSize_1min (gauge - 1-minute moving average) - numActiveConnections.count - numRegisteredConnections.count - numCaughtExceptions.count -- openBlockRequestLatencyMillis (histogram) -- registerExecutorRequestLatencyMillis (histogram) +- openBlockRequestLatencyMillis (timer) +- registerExecutorRequestLatencyMillis (timer) +- fetchMergedBlocksMetaLatencyMillis (timer) +- finalizeShuffleMergeLatencyMillis (timer) - registeredExecutorsSize - shuffle-server.usedDirectMemory - shuffle-server.usedHeapMemory diff --git a/docs/quick-start.md b/docs/quick-start.md index 366970cf66c71..5a03af98cd832 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -286,7 +286,7 @@ We can run this application using the `bin/spark-submit` script: {% highlight bash %} # Use spark-submit to run your application $ YOUR_SPARK_HOME/bin/spark-submit \ - --master local[4] \ + --master "local[4]" \ SimpleApp.py ... Lines with a: 46, Lines with b: 23 @@ -371,7 +371,7 @@ $ sbt package # Use spark-submit to run your application $ YOUR_SPARK_HOME/bin/spark-submit \ --class "SimpleApp" \ - --master local[4] \ + --master "local[4]" \ target/scala-{{site.SCALA_BINARY_VERSION}}/simple-project_{{site.SCALA_BINARY_VERSION}}-1.0.jar ... Lines with a: 46, Lines with b: 23 @@ -452,7 +452,7 @@ $ mvn package # Use spark-submit to run your application $ YOUR_SPARK_HOME/bin/spark-submit \ --class "SimpleApp" \ - --master local[4] \ + --master "local[4]" \ target/simple-project-1.0.jar ... Lines with a: 46, Lines with b: 23 diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md index f75bda0ffafb0..cbbce4c082060 100644 --- a/docs/rdd-programming-guide.md +++ b/docs/rdd-programming-guide.md @@ -214,13 +214,13 @@ can be passed to the `--repositories` argument. For example, to run `bin/pyspark` on exactly four cores, use: {% highlight bash %} -$ ./bin/pyspark --master local[4] +$ ./bin/pyspark --master "local[4]" {% endhighlight %} Or, to also add `code.py` to the search path (in order to later be able to `import code`), use: {% highlight bash %} -$ ./bin/pyspark --master local[4] --py-files code.py +$ ./bin/pyspark --master "local[4]" --py-files code.py {% endhighlight %} For a complete list of options, run `pyspark --help`. Behind the scenes, @@ -260,19 +260,19 @@ can be passed to the `--repositories` argument. 
For example, to run `bin/spark-s four cores, use: {% highlight bash %} -$ ./bin/spark-shell --master local[4] +$ ./bin/spark-shell --master "local[4]" {% endhighlight %} Or, to also add `code.jar` to its classpath, use: {% highlight bash %} -$ ./bin/spark-shell --master local[4] --jars code.jar +$ ./bin/spark-shell --master "local[4]" --jars code.jar {% endhighlight %} To include a dependency using Maven coordinates: {% highlight bash %} -$ ./bin/spark-shell --master local[4] --packages "org.example:example:0.1" +$ ./bin/spark-shell --master "local[4]" --packages "org.example:example:0.1" {% endhighlight %} For a complete list of options, run `spark-shell --help`. Behind the scenes, @@ -781,7 +781,7 @@ One of the harder things about Spark is understanding the scope and life cycle o #### Example -Consider the naive RDD element sum below, which may behave differently depending on whether execution is happening within the same JVM. A common example of this is when running Spark in `local` mode (`--master = local[n]`) versus deploying a Spark application to a cluster (e.g. via spark-submit to YARN): +Consider the naive RDD element sum below, which may behave differently depending on whether execution is happening within the same JVM. A common example of this is when running Spark in `local` mode (`--master = "local[n]"`) versus deploying a Spark application to a cluster (e.g. via spark-submit to YARN):
diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 778af5f0751a8..7619dd728a2e5 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -44,7 +44,7 @@ Cluster administrators should use [Pod Security Policies](https://kubernetes.io/ # Prerequisites -* A running Kubernetes cluster at version >= 1.26 with access configured to it using +* A running Kubernetes cluster at version >= 1.27 with access configured to it using [kubectl](https://kubernetes.io/docs/reference/kubectl/). If you do not already have a working Kubernetes cluster, you may set up a test cluster on your local machine using [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/). @@ -1939,10 +1939,10 @@ Install Apache YuniKorn: ```bash helm repo add yunikorn https://apache.github.io/yunikorn-release helm repo update -helm install yunikorn yunikorn/yunikorn --namespace yunikorn --version 1.5.0 --create-namespace --set embedAdmissionController=false +helm install yunikorn yunikorn/yunikorn --namespace yunikorn --version 1.5.1 --create-namespace --set embedAdmissionController=false ``` -The above steps will install YuniKorn v1.5.0 on an existing Kubernetes cluster. +The above steps will install YuniKorn v1.5.1 on an existing Kubernetes cluster. ##### Get started diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index aab8ee60a256c..700ddefabea47 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -33,6 +33,9 @@ Please see [Spark Security](security.html) and the specific security sections in # Launching Spark on YARN +Apache Hadoop does not support Java 17 as of 3.4.0, while Apache Spark requires at least Java 17 since 4.0.0, so a different JDK should be configured for Spark applications. +Please refer to [Configuring different JDKs for Spark Applications](#configuring-different-jdks-for-spark-applications) for details. + Ensure that `HADOOP_CONF_DIR` or `YARN_CONF_DIR` points to the directory which contains the (client side) configuration files for the Hadoop cluster. These configs are used to write to HDFS and connect to the YARN ResourceManager. The configuration contained in this directory will be distributed to the YARN cluster so that all @@ -1032,3 +1035,34 @@ and one should be configured with: spark.shuffle.service.name = spark_shuffle_y spark.shuffle.service.port = ``` + +# Configuring different JDKs for Spark Applications + +In some cases it may be desirable to run Spark applications with a JDK different from the one used by the YARN NodeManager; +this can be achieved by setting the `JAVA_HOME` environment variable for YARN containers and the `spark-submit` +process. + +Note that Spark assumes that all JVM processes running in one application use the same version of the JDK; otherwise, +you may encounter JDK serialization issues. + +To configure a Spark application to use a JDK which has been pre-installed on all nodes at `/opt/openjdk-17`: + + $ export JAVA_HOME=/opt/openjdk-17 + $ ./bin/spark-submit --class path.to.your.Class \ + --master yarn \ + --conf spark.yarn.appMasterEnv.JAVA_HOME=/opt/openjdk-17 \ + --conf spark.executorEnv.JAVA_HOME=/opt/openjdk-17 \ + [app options] + +Optionally, the user may want to avoid installing a different JDK on the YARN cluster nodes; in such a case, +it is also possible to distribute the JDK using YARN's Distributed Cache.
For example, to use Java 21 to run +a Spark application, prepare a JDK 21 tarball `openjdk-21.tar.gz` and untar it to `/opt` on the local node, +then submit a Spark application: + + $ export JAVA_HOME=/opt/openjdk-21 + $ ./bin/spark-submit --class path.to.your.Class \ + --master yarn \ + --archives path/to/openjdk-21.tar.gz \ + --conf spark.yarn.appMasterEnv.JAVA_HOME=./openjdk-21.tar.gz/openjdk-21 \ + --conf spark.executorEnv.JAVA_HOME=./openjdk-21.tar.gz/openjdk-21 \ + [app options] diff --git a/docs/security.md b/docs/security.md index 455935fcffca3..1b5dcb3836457 100644 --- a/docs/security.md +++ b/docs/security.md @@ -207,6 +207,15 @@ The following table describes the different options available for configuring th 2.2.0 + + spark.network.crypto.cipher + AES/CTR/NoPadding + + Cipher mode to use. Defaults to "AES/CTR/NoPadding" for backward compatibility, which is not authenticated. + It is recommended to use "AES/GCM/NoPadding", which is an authenticated encryption mode. + + 4.0.0 + spark.network.crypto.authEngineVersion 1 @@ -308,7 +317,7 @@ The following settings cover enabling encryption for data written to disk: ## Authentication and Authorization -Enabling authentication for the Web UIs is done using [javax servlet filters](https://docs.oracle.com/javaee/6/api/javax/servlet/Filter.html). +Enabling authentication for the Web UIs is done using [jakarta servlet filters](https://jakarta.ee/specifications/servlet/5.0/apidocs/jakarta/servlet/filter). You will need a filter that implements the authentication method you want to deploy. Spark does not provide any built-in authentication filters. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 1eab3158e2e56..774c0bee31295 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -793,19 +793,11 @@ In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spa spark.deploy.recoveryDirectory "" The directory in which Spark will store recovery state, accessible from the Master's perspective. - Note that the directory should be clearly manualy if spark.deploy.recoveryMode, - spark.deploy.recoverySerializer, or spark.deploy.recoveryCompressionCodec is changed. + Note that the directory should be cleared manually if spark.deploy.recoveryMode + or spark.deploy.recoveryCompressionCodec is changed. 0.8.1 - - spark.deploy.recoverySerializer - JAVA - A serializer for writing/reading objects to/from persistence engines; JAVA (default) or KRYO. - Java serializer has been the default mode since Spark 0.8.1. - Kryo serializer is a new fast and compact mode from Spark 4.0.0. - 4.0.0 - spark.deploy.recoveryCompressionCodec (none) diff --git a/docs/sql-data-sources-avro.md b/docs/sql-data-sources-avro.md index d717899564299..3721f92d93266 100644 --- a/docs/sql-data-sources-avro.md +++ b/docs/sql-data-sources-avro.md @@ -225,6 +225,24 @@ write.stream( {% endhighlight %}
    +
    +{% highlight sql %} +CREATE TABLE t AS + SELECT NAMED_STRUCT('u', NAMED_STRUCT('member0', member0, 'member1', member1)) AS s + FROM VALUES (1, NULL), (NULL, 'a') tab(member0, member1); +DECLARE avro_schema STRING; +SET VARIABLE avro_schema = + '{ "type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }] }'; + +SELECT TO_AVRO(s, avro_schema) AS RESULT FROM t; + +SELECT FROM_AVRO(result, avro_schema, MAP()).u FROM ( + SELECT TO_AVRO(s, avro_schema) AS RESULT FROM t); + +DROP TEMPORARY VARIABLE avro_schema; +DROP TABLE t; +{% endhighlight %} +
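The same union round trip can also be sketched with the Python `to_avro`/`from_avro` functions. The snippet below is only illustrative: it assumes an active `SparkSession` named `spark` and the external `spark-avro` module on the classpath, and it mirrors the `VALUES` clause of the SQL example above.

```python
# Illustrative PySpark version of the SQL round trip above. The Avro union
# ["int", "string"] is represented on the Spark side as a struct with the
# fields member0 and member1.
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.functions import col, struct

avro_schema = (
    '{ "type": "record", "name": "struct", '
    '"fields": [{ "name": "u", "type": ["int", "string"] }] }'
)

df = spark.createDataFrame([(1, None), (None, "a")], "member0 int, member1 string")
s = struct(struct(col("member0"), col("member1")).alias("u"))

encoded = df.select(to_avro(s, avro_schema).alias("result"))
decoded = encoded.select(from_avro(col("result"), avro_schema).alias("s"))
decoded.select("s.u").show()
```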
    ## Data Source Option diff --git a/docs/sql-data-sources-hive-tables.md b/docs/sql-data-sources-hive-tables.md index b51cde53bd8fd..566dcb33a25d9 100644 --- a/docs/sql-data-sources-hive-tables.md +++ b/docs/sql-data-sources-hive-tables.md @@ -127,10 +127,10 @@ The following options can be used to configure the version of Hive that is used Property NameDefaultMeaningSince Version spark.sql.hive.metastore.version - 2.3.9 + 2.3.10 Version of the Hive metastore. Available - options are 2.0.0 through 2.3.9 and 3.0.0 through 3.1.3. + options are 2.0.0 through 2.3.10 and 3.0.0 through 3.1.3. 1.4.0 @@ -142,9 +142,9 @@ The following options can be used to configure the version of Hive that is used property can be one of four options:
    1. builtin
    2. - Use Hive 2.3.9, which is bundled with the Spark assembly when -Phive is + Use Hive 2.3.10, which is bundled with the Spark assembly when -Phive is enabled. When this option is chosen, spark.sql.hive.metastore.version must be - either 2.3.9 or not defined. + either 2.3.10 or not defined.
    3. maven
    4. Use Hive jars of specified version downloaded from Maven repositories. This configuration is not generally recommended for production deployments. diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md index edd1a51f39322..9ffd96cd40ee5 100644 --- a/docs/sql-data-sources-jdbc.md +++ b/docs/sql-data-sources-jdbc.md @@ -845,7 +845,7 @@ as the activated JDBC Driver. Note that, different JDBC drivers, or different ve numeric, decimal DecimalType - Since PostgreSQL 15, 's' can be negative. If 's<0' it'll be adjusted to DecimalType(min(p-s, 38), 0); Otherwise, DecimalType(p, s), and if 'p>38', the fraction part will be truncated if exceeded. And if any value of this column have an actual precision greater 38 will fail with NUMERIC_VALUE_OUT_OF_RANGE.WITHOUT_SUGGESTION error +
      • Since PostgreSQL 15, 's' can be negative. If 's<0' it'll be adjusted to DecimalType(min(p-s, 38), 0); Otherwise, DecimalType(p, s)
• If 'p>38', the fraction part will be truncated if it exceeds the limit. And if any value of this column has an actual precision greater than 38, it will fail with the NUMERIC_VALUE_OUT_OF_RANGE.WITHOUT_SUGGESTION error.
• The special numeric values 'NaN', 'infinity' and '-infinity' are not supported
      character varying(n), varchar(n) @@ -1074,8 +1074,8 @@ the [PostgreSQL JDBC Driver](https://mvnrepository.com/artifact/org.postgresql/p TimestampType - timestamp - + timestamp with time zone + Before Spark 4.0, it was mapped as timestamp. Please refer to the migration guide for more information TimestampNTZType @@ -1335,3 +1335,873 @@ as the activated JDBC Driver. + +### Mapping Spark SQL Data Types to Oracle + +The below table describes the data type conversions from Spark SQL Data Types to Oracle data types, +when creating, altering, or writing data to an Oracle table using the built-in jdbc data source with +the Oracle JDBC as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Spark SQL Data TypeOracle Data TypeRemarks
BooleanTypeNUMBER(1, 0)BooleanType maps to NUMBER(1, 0), as BOOLEAN was only introduced in Oracle Release 23c
      ByteTypeNUMBER(3)
      ShortTypeNUMBER(5)
      IntegerTypeNUMBER(10)
      LongTypeNUMBER(19)
      FloatTypeNUMBER(19, 4)
      DoubleTypeNUMBER(19, 4)
      DecimalType(p, s)NUMBER(p,s)
      DateTypeDATE
      TimestampTypeTIMESTAMP WITH LOCAL TIME ZONE
      TimestampNTZTypeTIMESTAMP
StringTypeVARCHAR2(255)For historical reasons, a string value has a maximum of 255 characters
      BinaryTypeBLOB
      CharType(n)CHAR(n)
      VarcharType(n)VARCHAR2(n)
      + +The Spark Catalyst data types below are not supported with suitable Oracle types. + +- DayTimeIntervalType +- YearMonthIntervalType +- CalendarIntervalType +- ArrayType +- MapType +- StructType +- UserDefinedType +- NullType +- ObjectType +- VariantType + +### Mapping Spark SQL Data Types from Microsoft SQL Server + +The below table describes the data type conversions from Microsoft SQL Server data types to Spark SQL Data Types, +when reading data from a Microsoft SQL Server table using the built-in jdbc data source with the mssql-jdbc +as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      SQL Server Data TypeSpark SQL Data TypeRemarks
      bitBooleanType
      tinyintShortType
      smallintShortType
      intIntegerType
      bigintLongType
      float(p), realFloatType1 ≤ p ≤ 24
      float[(p)]DoubleType25 ≤ p ≤ 53
      double precisionDoubleType
      smallmoneyDecimalType(10, 4)
      moneyDecimalType(19, 4)
      decimal[(p[, s])], numeric[(p[, s])]DecimalType(p, s)
      dateDateType
      datetimeTimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      datetimeTimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      datetime2 [ (fractional seconds precision) ]TimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      datetime2 [ (fractional seconds precision) ]TimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      datetimeoffset [ (fractional seconds precision) ]TimestampType
      smalldatetimeTimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      smalldatetimeTimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      time [ (fractional second scale) ]TimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      time [ (fractional second scale) ]TimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      binary [ ( n ) ]BinaryType
      varbinary [ ( n | max ) ]BinaryType
      char [ ( n ) ]CharType(n)
      varchar [ ( n | max ) ]VarcharType(n)
      nchar [ ( n ) ]StringType
      nvarchar [ ( n | max ) ]StringType
      textStringType
      ntextStringType
      imageStringType
      geographyBinaryType
      geometryBinaryType
      rowversionBinaryType
      sql_variantUNRECOGNIZED_SQL_TYPE error raised
      + +### Mapping Spark SQL Data Types to Microsoft SQL Server + +The below table describes the data type conversions from Spark SQL Data Types to Microsoft SQL Server data types, +when creating, altering, or writing data to a Microsoft SQL Server table using the built-in jdbc data source with +the mssql-jdbc as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Spark SQL Data TypeSQL Server Data TypeRemarks
      BooleanTypebit
ByteTypesmallintSupported since Spark 4.0.0; previous versions throw errors
      ShortTypesmallint
      IntegerTypeint
      LongTypebigint
      FloatTypereal
      DoubleTypedouble precision
      DecimalType(p, s)number(p,s)
      DateTypedate
      TimestampTypedatetime
      TimestampNTZTypedatetime
      StringTypenvarchar(max)
      BinaryTypevarbinary(max)
      CharType(n)char(n)
      VarcharType(n)varchar(n)
      + +The Spark Catalyst data types below are not supported with suitable SQL Server types. + +- DayTimeIntervalType +- YearMonthIntervalType +- CalendarIntervalType +- ArrayType +- MapType +- StructType +- UserDefinedType +- NullType +- ObjectType +- VariantType + +### Mapping Spark SQL Data Types from DB2 + +The below table describes the data type conversions from DB2 data types to Spark SQL Data Types, +when reading data from a DB2 table using the built-in jdbc data source with the [IBM Data Server Driver For JDBC and SQLJ](https://mvnrepository.com/artifact/com.ibm.db2/jcc) +as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      DB2 Data TypeSpark SQL Data TypeRemarks
      BOOLEANBinaryType
      SMALLINTShortType
      INTEGERIntegerType
      BIGINTLongType
      REALFloatType
      DOUBLE, FLOATDoubleTypeFLOAT is double precision floating-point in db2
      DECIMAL, NUMERIC, DECFLOATDecimalType
      DATEDateType
      TIMESTAMP, TIMESTAMP WITHOUT TIME ZONETimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      TIMESTAMP, TIMESTAMP WITHOUT TIME ZONETimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      TIMESTAMP WITH TIME ZONETimestampType
      TIMETimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      TIMETimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      CHAR(n)CharType(n)
      VARCHAR(n)VarcharType(n)
      CHAR(n) FOR BIT DATABinaryType
      VARCHAR(n) FOR BIT DATABinaryType
      BINARY(n)BinaryType
      VARBINARY(n)BinaryType
      CLOB(n)StringType
      DBCLOB(n)StringType
      BLOB(n)BinaryType
      GRAPHIC(n)StringType
      VARGRAPHIC(n)StringType
      XMLStringType
      ROWIDStringType
      + +### Mapping Spark SQL Data Types to DB2 + +The below table describes the data type conversions from Spark SQL Data Types to DB2 data types, +when creating, altering, or writing data to a DB2 table using the built-in jdbc data source with +the [IBM Data Server Driver For JDBC and SQLJ](https://mvnrepository.com/artifact/com.ibm.db2/jcc) as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Spark SQL Data TypeDB2 Data TypeRemarks
      BooleanTypeBOOLEAN
      ByteTypeSMALLINT
      ShortTypeSMALLINT
      IntegerTypeINTEGER
      LongTypeBIGINT
      FloatTypeREAL
      DoubleTypeDOUBLE PRECISION
      DecimalType(p, s)DECIMAL(p,s)The maximum value for 'p' is 31 in DB2, while it is 38 in Spark. It might fail when storing DecimalType(p>=32, s) to DB2
      DateTypeDATE
      TimestampTypeTIMESTAMP
      TimestampNTZTypeTIMESTAMP
      StringTypeCLOB
      BinaryTypeBLOB
      CharType(n)CHAR(n)The maximum value for 'n' is 255 in DB2, while it is unlimited in Spark.
      VarcharType(n)VARCHAR(n)The maximum value for 'n' is 255 in DB2, while it is unlimited in Spark.
      + +The Spark Catalyst data types below are not supported with suitable DB2 types. + +- DayTimeIntervalType +- YearMonthIntervalType +- CalendarIntervalType +- ArrayType +- MapType +- StructType +- UserDefinedType +- NullType +- ObjectType +- VariantType + +### Mapping Spark SQL Data Types from Teradata + +The below table describes the data type conversions from Teradata data types to Spark SQL Data Types, +when reading data from a Teradata table using the built-in jdbc data source with the [Teradata JDBC Driver](https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc) +as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Teradata Data TypeSpark SQL Data TypeRemarks
      BYTEINTByteType
      SMALLINTShortType
      INTEGER, INTIntegerType
      BIGINTLongType
      REAL, DOUBLE PRECISION, FLOATDoubleType
      DECIMAL, NUMERIC, NUMBERDecimalType
      DATEDateType
      TIMESTAMP, TIMESTAMP WITH TIME ZONETimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      TIMESTAMP, TIMESTAMP WITH TIME ZONETimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      TIME, TIME WITH TIME ZONETimestampType(Default)preferTimestampNTZ=false or spark.sql.timestampType=TIMESTAMP_LTZ
      TIME, TIME WITH TIME ZONETimestampNTZTypepreferTimestampNTZ=true or spark.sql.timestampType=TIMESTAMP_NTZ
      CHARACTER(n), CHAR(n), GRAPHIC(n)CharType(n)
      VARCHAR(n), VARGRAPHIC(n)VarcharType(n)
      BYTE(n), VARBYTE(n)BinaryType
      CLOBStringType
      BLOBBinaryType
      INTERVAL Data Types-The INTERVAL data types are unknown yet
      Period Data Types, ARRAY, UDT-Not Supported
      + +### Mapping Spark SQL Data Types to Teradata + +The below table describes the data type conversions from Spark SQL Data Types to Teradata data types, +when creating, altering, or writing data to a Teradata table using the built-in jdbc data source with +the [Teradata JDBC Driver](https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc) as the activated JDBC Driver. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Spark SQL Data TypeTeradata Data TypeRemarks
      BooleanTypeCHAR(1)
      ByteTypeBYTEINT
      ShortTypeSMALLINT
      IntegerTypeINTEGER
      LongTypeBIGINT
      FloatTypeREAL
      DoubleTypeDOUBLE PRECISION
      DecimalType(p, s)DECIMAL(p,s)
      DateTypeDATE
      TimestampTypeTIMESTAMP
      TimestampNTZTypeTIMESTAMP
      StringTypeVARCHAR(255)
      BinaryTypeBLOB
      CharType(n)CHAR(n)
      VarcharType(n)VARCHAR(n)
      + +The Spark Catalyst data types below are not supported with suitable Teradata types. + +- DayTimeIntervalType +- YearMonthIntervalType +- CalendarIntervalType +- ArrayType +- MapType +- StructType +- UserDefinedType +- NullType +- ObjectType +- VariantType diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md index b42f6e84076d2..70105c22e583c 100644 --- a/docs/sql-data-sources-load-save-functions.md +++ b/docs/sql-data-sources-load-save-functions.md @@ -109,7 +109,7 @@ For example, you can control bloom filters and dictionary encodings for ORC data The following ORC example will create bloom filter and use dictionary encoding only for `favorite_color`. For Parquet, there exists `parquet.bloom.filter.enabled` and `parquet.enable.dictionary`, too. To find more detailed information about the extra ORC/Parquet options, -visit the official Apache [ORC](https://orc.apache.org/docs/spark-config.html) / [Parquet](https://github.com/apache/parquet-mr/tree/master/parquet-hadoop) websites. +visit the official Apache [ORC](https://orc.apache.org/docs/spark-config.html) / [Parquet](https://github.com/apache/parquet-java/tree/master/parquet-hadoop) websites. ORC data source: diff --git a/docs/sql-data-sources-parquet.md b/docs/sql-data-sources-parquet.md index f5c5ccd3b89a1..5a0ca595fabbc 100644 --- a/docs/sql-data-sources-parquet.md +++ b/docs/sql-data-sources-parquet.md @@ -350,7 +350,7 @@ Dataset df2 = spark.read().parquet("/path/to/table.parquet.encrypted"); #### KMS Client -The InMemoryKMS class is provided only for illustration and simple demonstration of Parquet encryption functionality. **It should not be used in a real deployment**. The master encryption keys must be kept and managed in a production-grade KMS system, deployed in user's organization. Rollout of Spark with Parquet encryption requires implementation of a client class for the KMS server. Parquet provides a plug-in [interface](https://github.com/apache/parquet-mr/blob/apache-parquet-1.13.1/parquet-hadoop/src/main/java/org/apache/parquet/crypto/keytools/KmsClient.java) for development of such classes, +The InMemoryKMS class is provided only for illustration and simple demonstration of Parquet encryption functionality. **It should not be used in a real deployment**. The master encryption keys must be kept and managed in a production-grade KMS system, deployed in user's organization. Rollout of Spark with Parquet encryption requires implementation of a client class for the KMS server. Parquet provides a plug-in [interface](https://github.com/apache/parquet-java/blob/apache-parquet-1.13.1/parquet-hadoop/src/main/java/org/apache/parquet/crypto/keytools/KmsClient.java) for development of such classes,
      {% highlight java %} @@ -371,9 +371,9 @@ public interface KmsClient {
      -An [example](https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/src/test/java/org/apache/parquet/crypto/keytools/samples/VaultClient.java) of such class for an open source [KMS](https://www.vaultproject.io/api/secret/transit) can be found in the parquet-mr repository. The production KMS client should be designed in cooperation with organization's security administrators, and built by developers with an experience in access control management. Once such class is created, it can be passed to applications via the `parquet.encryption.kms.client.class` parameter and leveraged by general Spark users as shown in the encrypted dataframe write/read sample above. +An [example](https://github.com/apache/parquet-java/blob/master/parquet-hadoop/src/test/java/org/apache/parquet/crypto/keytools/samples/VaultClient.java) of such class for an open source [KMS](https://www.vaultproject.io/api/secret/transit) can be found in the parquet-java repository. The production KMS client should be designed in cooperation with organization's security administrators, and built by developers with an experience in access control management. Once such class is created, it can be passed to applications via the `parquet.encryption.kms.client.class` parameter and leveraged by general Spark users as shown in the encrypted dataframe write/read sample above. -Note: By default, Parquet implements a "double envelope encryption" mode, that minimizes the interaction of Spark executors with a KMS server. In this mode, the DEKs are encrypted with "key encryption keys" (KEKs, randomly generated by Parquet). The KEKs are encrypted with MEKs in KMS; the result and the KEK itself are cached in Spark executor memory. Users interested in regular envelope encryption, can switch to it by setting the `parquet.encryption.double.wrapping` parameter to `false`. For more details on Parquet encryption parameters, visit the parquet-hadoop configuration [page](https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md#class-propertiesdrivencryptofactory). +Note: By default, Parquet implements a "double envelope encryption" mode, that minimizes the interaction of Spark executors with a KMS server. In this mode, the DEKs are encrypted with "key encryption keys" (KEKs, randomly generated by Parquet). The KEKs are encrypted with MEKs in KMS; the result and the KEK itself are cached in Spark executor memory. Users interested in regular envelope encryption, can switch to it by setting the `parquet.encryption.double.wrapping` parameter to `false`. For more details on Parquet encryption parameters, visit the parquet-hadoop configuration [page](https://github.com/apache/parquet-java/blob/master/parquet-hadoop/README.md#class-propertiesdrivencryptofactory). ## Data Source Option diff --git a/docs/sql-error-conditions-as-of-join-error-class.md b/docs/sql-error-conditions-as-of-join-error-class.md deleted file mode 100644 index df122c22616e5..0000000000000 --- a/docs/sql-error-conditions-as-of-join-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: AS_OF_JOIN error class -displayTitle: AS_OF_JOIN error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42604](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid as-of join. - -This error class has the following derived error classes: - -## TOLERANCE_IS_NON_NEGATIVE - -The input argument `tolerance` must be non-negative. - -## TOLERANCE_IS_UNFOLDABLE - -The input argument `tolerance` must be a constant. - - diff --git a/docs/sql-error-conditions-cannot-create-data-source-table-error-class.md b/docs/sql-error-conditions-cannot-create-data-source-table-error-class.md deleted file mode 100644 index a5bc5c0dc094b..0000000000000 --- a/docs/sql-error-conditions-cannot-create-data-source-table-error-class.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: global -title: CANNOT_CREATE_DATA_SOURCE_TABLE error class -displayTitle: CANNOT_CREATE_DATA_SOURCE_TABLE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42KDE](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to create data source table ``: - -This error class has the following derived error classes: - -## EXTERNAL_METADATA_UNSUPPORTED - -provider '``' does not support external metadata but a schema is provided. Please remove the schema when creating the table. - - diff --git a/docs/sql-error-conditions-cannot-load-state-store-error-class.md b/docs/sql-error-conditions-cannot-load-state-store-error-class.md deleted file mode 100644 index 1f44e5592eba8..0000000000000 --- a/docs/sql-error-conditions-cannot-load-state-store-error-class.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -layout: global -title: CANNOT_LOAD_STATE_STORE error class -displayTitle: CANNOT_LOAD_STATE_STORE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. ---- - - - -SQLSTATE: 58030 - -An error occurred during loading state. - -This error class has the following derived error classes: - -## CANNOT_READ_CHECKPOINT - -Cannot read RocksDB checkpoint metadata. Expected ``, but found ``. - -## CANNOT_READ_DELTA_FILE_KEY_SIZE - -Error reading delta file `` of ``: key size cannot be ``. - -## CANNOT_READ_DELTA_FILE_NOT_EXISTS - -Error reading delta file `` of ``: `` does not exist. - -## CANNOT_READ_SNAPSHOT_FILE_KEY_SIZE - -Error reading snapshot file `` of ``: key size cannot be ``. - -## CANNOT_READ_SNAPSHOT_FILE_VALUE_SIZE - -Error reading snapshot file `` of ``: value size cannot be ``. - -## CANNOT_READ_STREAMING_STATE_FILE - -Error reading streaming state file of `` does not exist. If the stream job is restarted with a new or updated state operation, please create a new checkpoint location or clear the existing checkpoint location. - -## UNCATEGORIZED - - - -## UNEXPECTED_FILE_SIZE - -Copied `` to ``, expected `` bytes, found `` bytes. - -## UNEXPECTED_VERSION - -Version cannot be `` because it is less than 0. - -## UNRELEASED_THREAD_ERROR - -``: RocksDB instance could not be acquired by `` for operationType=`` as it was not released by `` after `` ms. -Thread holding the lock has trace: `` - - diff --git a/docs/sql-error-conditions-cannot-update-field-error-class.md b/docs/sql-error-conditions-cannot-update-field-error-class.md deleted file mode 100644 index fe27ab90d149d..0000000000000 --- a/docs/sql-error-conditions-cannot-update-field-error-class.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -layout: global -title: CANNOT_UPDATE_FIELD error class -displayTitle: CANNOT_UPDATE_FIELD error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot update `` field `` type: - -This error class has the following derived error classes: - -## ARRAY_TYPE - -Update the element by updating ``.element. - -## INTERVAL_TYPE - -Update an interval by updating its fields. - -## MAP_TYPE - -Update a map by updating ``.key or ``.value. - -## STRUCT_TYPE - -Update a struct by updating its fields. - -## USER_DEFINED_TYPE - -Update a UserDefinedType[``] by updating its fields. 
- - diff --git a/docs/sql-error-conditions-cannot-write-state-store-error-class.md b/docs/sql-error-conditions-cannot-write-state-store-error-class.md deleted file mode 100644 index 0bed5755bdb83..0000000000000 --- a/docs/sql-error-conditions-cannot-write-state-store-error-class.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: global -title: CANNOT_WRITE_STATE_STORE error class -displayTitle: CANNOT_WRITE_STATE_STORE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -SQLSTATE: 58030 - -Error writing state store files for provider ``. - -This error class has the following derived error classes: - -## CANNOT_COMMIT - -Cannot perform commit during state checkpoint. - - diff --git a/docs/sql-error-conditions-collection-size-limit-exceeded-error-class.md b/docs/sql-error-conditions-collection-size-limit-exceeded-error-class.md deleted file mode 100644 index 0d502245459e7..0000000000000 --- a/docs/sql-error-conditions-collection-size-limit-exceeded-error-class.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: global -title: COLLECTION_SIZE_LIMIT_EXCEEDED error class -displayTitle: COLLECTION_SIZE_LIMIT_EXCEEDED error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 54000](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -Can't create array with `` elements which exceeding the array size limit ``, - -This error class has the following derived error classes: - -## FUNCTION - -unsuccessful try to create arrays in the function ``. - -## INITIALIZE - -cannot initialize an array with specified parameters. - -## PARAMETER - -the value of parameter(s) `` in the function `` is invalid. 
- - diff --git a/docs/sql-error-conditions-complex-expression-unsupported-input-error-class.md b/docs/sql-error-conditions-complex-expression-unsupported-input-error-class.md deleted file mode 100644 index e73499ffabd51..0000000000000 --- a/docs/sql-error-conditions-complex-expression-unsupported-input-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: COMPLEX_EXPRESSION_UNSUPPORTED_INPUT error class -displayTitle: COMPLEX_EXPRESSION_UNSUPPORTED_INPUT error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot process input data types for the expression: ``. - -This error class has the following derived error classes: - -## MISMATCHED_TYPES - -All input types must be the same except nullable, containsNull, valueContainsNull flags, but found the input types ``. - -## NO_INPUTS - -The collection of input data types must not be empty. - - diff --git a/docs/sql-error-conditions-connect-error-class.md b/docs/sql-error-conditions-connect-error-class.md deleted file mode 100644 index c6d2057b09836..0000000000000 --- a/docs/sql-error-conditions-connect-error-class.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -layout: global -title: CONNECT error class -displayTitle: CONNECT error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -SQLSTATE: 56K00 - -Generic Spark Connect error. - -This error class has the following derived error classes: - -## INTERCEPTOR_CTOR_MISSING - -Cannot instantiate GRPC interceptor because `` is missing a default constructor without arguments. - -## INTERCEPTOR_RUNTIME_ERROR - -Error instantiating GRPC interceptor: `` - -## PLUGIN_CTOR_MISSING - -Cannot instantiate Spark Connect plugin because `` is missing a default constructor without arguments. - -## PLUGIN_RUNTIME_ERROR - -Error instantiating Spark Connect plugin: `` - -## SESSION_NOT_SAME - -Both Datasets must belong to the same SparkSession. 
- - diff --git a/docs/sql-error-conditions-create-view-column-arity-mismatch-error-class.md b/docs/sql-error-conditions-create-view-column-arity-mismatch-error-class.md deleted file mode 100644 index a11449954ee0a..0000000000000 --- a/docs/sql-error-conditions-create-view-column-arity-mismatch-error-class.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: global -title: CREATE_VIEW_COLUMN_ARITY_MISMATCH error class -displayTitle: CREATE_VIEW_COLUMN_ARITY_MISMATCH error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 21S01](sql-error-conditions-sqlstates.html#class-21-cardinality-violation) - -Cannot create view ``, the reason is - -This error class has the following derived error classes: - -## NOT_ENOUGH_DATA_COLUMNS - -not enough data columns: -View columns: ``. -Data columns: ``. - -## TOO_MANY_DATA_COLUMNS - -too many data columns: -View columns: ``. -Data columns: ``. - - diff --git a/docs/sql-error-conditions-datatype-mismatch-error-class.md b/docs/sql-error-conditions-datatype-mismatch-error-class.md deleted file mode 100644 index 1d18836ac9e77..0000000000000 --- a/docs/sql-error-conditions-datatype-mismatch-error-class.md +++ /dev/null @@ -1,254 +0,0 @@ ---- -layout: global -title: DATATYPE_MISMATCH error class -displayTitle: DATATYPE_MISMATCH error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve `` due to data type mismatch: - -This error class has the following derived error classes: - -## ARRAY_FUNCTION_DIFF_TYPES - -Input to `` should have been `` followed by a value with same element type, but it's [``, ``]. - -## BINARY_ARRAY_DIFF_TYPES - -Input to function `` should have been two `` with same element type, but it's [``, ``]. - -## BINARY_OP_DIFF_TYPES - -the left and right operands of the binary operator have incompatible types (`` and ``). - -## BINARY_OP_WRONG_TYPE - -the binary operator requires the input type ``, not ``. 
- -## BLOOM_FILTER_BINARY_OP_WRONG_TYPE - -The Bloom filter binary input to `` should be either a constant value or a scalar subquery expression, but it's ``. - -## BLOOM_FILTER_WRONG_TYPE - -Input to function `` should have been `` followed by value with ``, but it's [``]. - -## CANNOT_CONVERT_TO_JSON - -Unable to convert column `` of type `` to JSON. - -## CANNOT_DROP_ALL_FIELDS - -Cannot drop all fields in struct. - -## CAST_WITHOUT_SUGGESTION - -cannot cast `` to ``. - -## CAST_WITH_CONF_SUGGESTION - -cannot cast `` to `` with ANSI mode on. -If you have to cast `` to ``, you can set `` as ``. - -## CAST_WITH_FUNC_SUGGESTION - -cannot cast `` to ``. -To convert values from `` to ``, you can use the functions `` instead. - -## CREATE_MAP_KEY_DIFF_TYPES - -The given keys of function `` should all be the same type, but they are ``. - -## CREATE_MAP_VALUE_DIFF_TYPES - -The given values of function `` should all be the same type, but they are ``. - -## CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING - -Only foldable `STRING` expressions are allowed to appear at odd position, but they are ``. - -## DATA_DIFF_TYPES - -Input to `` should all be the same type, but it's ``. - -## FILTER_NOT_BOOLEAN - -Filter expression `` of type `` is not a boolean. - -## HASH_MAP_TYPE - -Input to the function `` cannot contain elements of the "MAP" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on "MAP" elements. To restore previous behavior set "spark.sql.legacy.allowHashOnMapType" to "true". - -## INPUT_SIZE_NOT_ONE - -Length of `` should be 1. - -## INVALID_ARG_VALUE - -The `` value must to be a `` literal of ``, but got ``. - -## INVALID_JSON_MAP_KEY_TYPE - -Input schema `` can only contain STRING as a key type for a MAP. - -## INVALID_JSON_SCHEMA - -Input schema `` must be a struct, an array or a map. - -## INVALID_MAP_KEY_TYPE - -The key of map cannot be/contain ``. - -## INVALID_ORDERING_TYPE - -The `` does not support ordering on type ``. - -## INVALID_ROW_LEVEL_OPERATION_ASSIGNMENTS - -`` - -## INVALID_XML_MAP_KEY_TYPE - -Input schema `` can only contain STRING as a key type for a MAP. - -## IN_SUBQUERY_DATA_TYPE_MISMATCH - -The data type of one or more elements in the left hand side of an IN subquery is not compatible with the data type of the output of the subquery. Mismatched columns: [``], left side: [``], right side: [``]. - -## IN_SUBQUERY_LENGTH_MISMATCH - -The number of columns in the left hand side of an IN subquery does not match the number of columns in the output of subquery. Left hand side columns(length: ``): [``], right hand side columns(length: ``): [``]. - -## MAP_CONCAT_DIFF_TYPES - -The `` should all be of type map, but it's ``. - -## MAP_FUNCTION_DIFF_TYPES - -Input to `` should have been `` followed by a value with same key type, but it's [``, ``]. - -## MAP_ZIP_WITH_DIFF_TYPES - -Input to the `` should have been two maps with compatible key types, but it's [``, ``]. - -## NON_FOLDABLE_INPUT - -the input `` should be a foldable `` expression; however, got ``. - -## NON_STRING_TYPE - -all arguments must be strings. - -## NULL_TYPE - -Null typed values cannot be used as arguments of ``. - -## PARAMETER_CONSTRAINT_VIOLATION - -The ``(``) must be `` the ``(``). - -## RANGE_FRAME_INVALID_TYPE - -The data type `` used in the order specification does not match the data type `` which is used in the range frame. 
- -## RANGE_FRAME_MULTI_ORDER - -A range window frame with value boundaries cannot be used in a window specification with multiple order by expressions: ``. - -## RANGE_FRAME_WITHOUT_ORDER - -A range window frame cannot be used in an unordered window specification. - -## SEQUENCE_WRONG_INPUT_TYPES - -`` uses the wrong parameter type. The parameter type must conform to: -1. The start and stop expressions must resolve to the same type. -2. If start and stop expressions resolve to the `` type, then the step expression must resolve to the `` type. -3. Otherwise, if start and stop expressions resolve to the `` type, then the step expression must resolve to the same type. - -## SPECIFIED_WINDOW_FRAME_DIFF_TYPES - -Window frame bounds `` and `` do not have the same type: `` <> ``. - -## SPECIFIED_WINDOW_FRAME_INVALID_BOUND - -Window frame upper bound `` does not follow the lower bound ``. - -## SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE - -The data type of the `` bound `` does not match the expected data type ``. - -## SPECIFIED_WINDOW_FRAME_WITHOUT_FOLDABLE - -Window frame `` bound `` is not a literal. - -## SPECIFIED_WINDOW_FRAME_WRONG_COMPARISON - -The lower bound of a window frame must be `` to the upper bound. - -## STACK_COLUMN_DIFF_TYPES - -The data type of the column (``) do not have the same type: `` (``) <> `` (``). - -## TYPE_CHECK_FAILURE_WITH_HINT - -````. - -## UNEXPECTED_CLASS_TYPE - -class `` not found. - -## UNEXPECTED_INPUT_TYPE - -The `` parameter requires the `` type, however `` has the type ``. - -## UNEXPECTED_NULL - -The `` must not be null. - -## UNEXPECTED_RETURN_TYPE - -The `` requires return `` type, but the actual is `` type. - -## UNEXPECTED_STATIC_METHOD - -cannot find a static method `` that matches the argument types in ``. - -## UNSUPPORTED_INPUT_TYPE - -The input of `` can't be `` type data. - -## VALUE_OUT_OF_RANGE - -The `` must be between `` (current value = ``). - -## WRONG_NUM_ARG_TYPES - -The expression requires `` argument types but the actual number is ``. - -## WRONG_NUM_ENDPOINTS - -The number of endpoints must be >= 2 to construct intervals but the actual number is ``. - - diff --git a/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md b/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md deleted file mode 100644 index 288088e57e7c4..0000000000000 --- a/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT error class -displayTitle: DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---- - - - -[SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Call to function `` is invalid because it includes multiple argument assignments to the same parameter name ``. - -This error class has the following derived error classes: - -## BOTH_POSITIONAL_AND_NAMED - -A positional argument and named argument both referred to the same parameter. Please remove the named argument referring to this parameter. - -## DOUBLE_NAMED_ARGUMENT_REFERENCE - -More than one named argument referred to the same parameter. Please assign a value only once. - - diff --git a/docs/sql-error-conditions-expect-table-not-view-error-class.md b/docs/sql-error-conditions-expect-table-not-view-error-class.md deleted file mode 100644 index 0ab99ce33fa83..0000000000000 --- a/docs/sql-error-conditions-expect-table-not-view-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: EXPECT_TABLE_NOT_VIEW error class -displayTitle: EXPECT_TABLE_NOT_VIEW error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -'``' expects a table but `` is a view. - -This error class has the following derived error classes: - -## NO_ALTERNATIVE - - - -## USE_ALTER_VIEW - -Please use ALTER VIEW instead. - - diff --git a/docs/sql-error-conditions-expect-view-not-table-error-class.md b/docs/sql-error-conditions-expect-view-not-table-error-class.md deleted file mode 100644 index 97a6f59e13f42..0000000000000 --- a/docs/sql-error-conditions-expect-view-not-table-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: EXPECT_VIEW_NOT_TABLE error class -displayTitle: EXPECT_VIEW_NOT_TABLE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The table `` does not support ``. 
- -This error class has the following derived error classes: - -## NO_ALTERNATIVE - - - -## USE_ALTER_TABLE - -Please use ALTER TABLE instead. - - diff --git a/docs/sql-error-conditions-failed-jdbc-error-class.md b/docs/sql-error-conditions-failed-jdbc-error-class.md deleted file mode 100644 index bc8464c188d7c..0000000000000 --- a/docs/sql-error-conditions-failed-jdbc-error-class.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -layout: global -title: FAILED_JDBC error class -displayTitle: FAILED_JDBC error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -SQLSTATE: HV000 - -Failed JDBC `` on the operation: - -This error class has the following derived error classes: - -## ALTER_TABLE - -Alter the table ``. - -## CREATE_INDEX - -Create the index `` in the `` table. - -## CREATE_NAMESPACE - -Create the namespace ``. - -## CREATE_NAMESPACE_COMMENT - -Create a comment on the namespace: ``. - -## CREATE_TABLE - -Create the table ``. - -## DROP_INDEX - -Drop the index `` in the `` table. - -## DROP_NAMESPACE - -Drop the namespace ``. - -## GET_TABLES - -Get tables from the namespace: ``. - -## LIST_NAMESPACES - -List namespaces. - -## NAMESPACE_EXISTS - -Check that the namespace `` exists. - -## REMOVE_NAMESPACE_COMMENT - -Remove a comment on the namespace: ``. - -## RENAME_TABLE - -Rename the table `` to ``. - -## TABLE_EXISTS - -Check that the table `` exists. - -## UNCLASSIFIED - -`` - - diff --git a/docs/sql-error-conditions-incompatible-data-for-table-error-class.md b/docs/sql-error-conditions-incompatible-data-for-table-error-class.md deleted file mode 100644 index 2f84dc90b6536..0000000000000 --- a/docs/sql-error-conditions-incompatible-data-for-table-error-class.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -layout: global -title: INCOMPATIBLE_DATA_FOR_TABLE error class -displayTitle: INCOMPATIBLE_DATA_FOR_TABLE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---- - - - -SQLSTATE: KD000 - -Cannot write incompatible data for the table ``: - -This error class has the following derived error classes: - -## AMBIGUOUS_COLUMN_NAME - -Ambiguous column name in the input data ``. - -## CANNOT_FIND_DATA - -Cannot find data for the output column ``. - -## CANNOT_SAFELY_CAST - -Cannot safely cast `` `` to ``. - -## EXTRA_COLUMNS - -Cannot write extra columns ``. - -## EXTRA_STRUCT_FIELDS - -Cannot write extra fields `` to the struct ``. - -## NULLABLE_ARRAY_ELEMENTS - -Cannot write nullable elements to array of non-nulls: ``. - -## NULLABLE_COLUMN - -Cannot write nullable values to non-null column ``. - -## NULLABLE_MAP_VALUES - -Cannot write nullable values to map of non-nulls: ``. - -## STRUCT_MISSING_FIELDS - -Struct `` missing fields: ``. - -## UNEXPECTED_COLUMN_NAME - -Struct `` ``-th field name does not match (may be out of order): expected ``, found ``. - - diff --git a/docs/sql-error-conditions-incomplete-type-definition-error-class.md b/docs/sql-error-conditions-incomplete-type-definition-error-class.md deleted file mode 100644 index b84d4c37b7f03..0000000000000 --- a/docs/sql-error-conditions-incomplete-type-definition-error-class.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: global -title: INCOMPLETE_TYPE_DEFINITION error class -displayTitle: INCOMPLETE_TYPE_DEFINITION error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42K01](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Incomplete complex type: - -This error class has the following derived error classes: - -## ARRAY - -The definition of "ARRAY" type is incomplete. You must provide an element type. For example: "ARRAY``". - -## MAP - -The definition of "MAP" type is incomplete. You must provide a key type and a value type. For example: "MAP". - -## STRUCT - -The definition of "STRUCT" type is incomplete. You must provide at least one field type. For example: "STRUCT". - - diff --git a/docs/sql-error-conditions-inconsistent-behavior-cross-version-error-class.md b/docs/sql-error-conditions-inconsistent-behavior-cross-version-error-class.md deleted file mode 100644 index 15027d5575c88..0000000000000 --- a/docs/sql-error-conditions-inconsistent-behavior-cross-version-error-class.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -layout: global -title: INCONSISTENT_BEHAVIOR_CROSS_VERSION error class -displayTitle: INCONSISTENT_BEHAVIOR_CROSS_VERSION error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42K0B](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -You may get a different result due to the upgrading to - -This error class has the following derived error classes: - -## DATETIME_PATTERN_RECOGNITION - -Spark >= 3.0: -Fail to recognize `` pattern in the DateTimeFormatter. -1) You can set `` to "LEGACY" to restore the behavior before Spark 3.0. -2) You can form a valid datetime pattern with the guide from '``/sql-ref-datetime-pattern.html'. - -## DATETIME_WEEK_BASED_PATTERN - -Spark >= 3.0: -All week-based patterns are unsupported since Spark 3.0, detected week-based character: ``. -Please use the SQL function EXTRACT instead. - -## PARSE_DATETIME_BY_NEW_PARSER - -Spark >= 3.0: -Fail to parse `` in the new parser. -You can set `` to "LEGACY" to restore the behavior before Spark 3.0, or set to "CORRECTED" and treat it as an invalid datetime string. - -## READ_ANCIENT_DATETIME - -Spark >= 3.0: reading dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z from `` files can be ambiguous, as the files may be written by -Spark 2.x or legacy versions of Hive, which uses a legacy hybrid calendar that is different from Spark 3.0+'s Proleptic Gregorian calendar. -See more details in SPARK-31404. -You can set the SQL config `` or the datasource option `
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 0A000 | feature not supported | INVALID_PANDAS_UDF_PLACEMENT, STAR_GROUP_BY_POS, UNSUPPORTED_ARROWTYPE, UNSUPPORTED_DATATYPE, UNSUPPORTED_DESERIALIZER, UNSUPPORTED_FEATURE, UNSUPPORTED_GENERATOR, UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY, UNSUPPORTED_TYPED_LITERAL |
-
-## Class `21`: cardinality violation
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 21000 | cardinality violation | SCALAR_SUBQUERY_TOO_MANY_ROWS |
-
-## Class `22`: data exception
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 22003 | numeric value out of range | ARITHMETIC_OVERFLOW, CAST_OVERFLOW, CAST_OVERFLOW_IN_TABLE_INSERT, DECIMAL_PRECISION_EXCEEDS_MAX_PRECISION, INVALID_INDEX_OF_ZERO, INCORRECT_RAMP_UP_RATE, INVALID_ARRAY_INDEX, INVALID_ARRAY_INDEX_IN_ELEMENT_AT, NUMERIC_OUT_OF_SUPPORTED_RANGE, NUMERIC_VALUE_OUT_OF_RANGE |
-| 22007 | invalid datetime format | CANNOT_PARSE_TIMESTAMP |
-| 22008 | datetime field overflow | DATETIME_OVERFLOW |
-| 2200E | null value in array target | NULL_MAP_KEY |
-| 22012 | division by zero | DIVIDE_BY_ZERO, INTERVAL_DIVIDED_BY_ZERO |
-| 22015 | interval field overflow | INTERVAL_ARITHMETIC_OVERFLOW |
-| 22018 | invalid character value for cast | CANNOT_PARSE_DECIMAL, CAST_INVALID_INPUT, CONVERSION_INVALID_INPUT |
-| 22023 | invalid parameter value | INVALID_FRACTION_OF_SECOND, INVALID_PARAMETER_VALUE, SECOND_FUNCTION_ARGUMENT_NOT_INTEGER |
-| 22032 | invalid JSON text | INVALID_JSON_ROOT_FIELD, INVALID_JSON_SCHEMA_MAP_TYPE |
-| 2203G | sql_json_item_cannot_be_cast_to_target_type | CANNOT_PARSE_JSON_FIELD |
-| 22546 | The value for a routine argument is not valid. | CANNOT_DECODE_URL |
-
-## Class `23`: integrity constraint violation
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 23505 | A violation of the constraint imposed by a unique index or a unique constraint occurred. | DUPLICATED_MAP_KEY, DUPLICATE_KEY |
-
-## Class `2B`: dependent privilege descriptors still exist
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 2BP01 | dependent_objects_still_exist | SCHEMA_NOT_EMPTY |
-
-## Class `38`: external routine exception
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 38000 | external routine exception | FAILED_FUNCTION_CALL |
-
-## Class `39`: external routine invocation exception
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 39000 | external routine invocation exception | FAILED_EXECUTE_UDF |
-
-## Class `42`: syntax error or access rule violation
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 42000 | syntax error or access rule violation | AMBIGUOUS_REFERENCE_TO_FIELDS, INVALID_COLUMN_OR_FIELD_DATA_TYPE, INVALID_EXTRACT_BASE_FIELD_TYPE, INVALID_EXTRACT_FIELD_TYPE, INVALID_FIELD_NAME, INVALID_SET_SYNTAX, INVALID_SQL_SYNTAX, NON_PARTITION_COLUMN, NOT_NULL_CONSTRAINT_VIOLATION, NULLABLE_COLUMN_OR_FIELD, NULLABLE_ROW_ID_ATTRIBUTES |
-| 42001 | Invalid encoder error | INVALID_EXPRESSION_ENCODER |
-| 42601 | A character, token, or clause is invalid or missing. | IDENTIFIER_TOO_MANY_NAME_PARTS, INVALID_EXTRACT_FIELD, INVALID_FORMAT, PARSE_SYNTAX_ERROR, UNCLOSED_BRACKETED_COMMENT |
-| 42602 | A character that is invalid in a name has been detected. | INVALID_IDENTIFIER, INVALID_PROPERTY_KEY, INVALID_PROPERTY_VALUE |
-| 42604 | An invalid numeric or string constant has been detected. | EMPTY_JSON_FIELD_VALUE, INVALID_TYPED_LITERAL |
-| 42605 | The number of arguments specified for a scalar function is invalid. | WRONG_NUM_ARGS |
-| 42607 | An operand of an aggregate function or CONCAT operator is invalid. | NESTED_AGGREGATE_FUNCTION |
-| 42613 | Clauses are mutually exclusive. | INCOMPATIBLE_JOIN_TYPES, INVALID_LATERAL_JOIN_TYPE, NON_LAST_MATCHED_CLAUSE_OMIT_CONDITION, NON_LAST_NOT_MATCHED_BY_SOURCE_CLAUSE_OMIT_CONDITION, NON_LAST_NOT_MATCHED_BY_TARGET_CLAUSE_OMIT_CONDITION |
-| 42614 | A duplicate keyword or clause is invalid. | REPEATED_CLAUSE |
-| 42617 | The statement string is blank or empty. | PARSE_EMPTY_STATEMENT |
-| 42702 | A column reference is ambiguous, because of duplicate names. | AMBIGUOUS_COLUMN_OR_FIELD, AMBIGUOUS_LATERAL_COLUMN_ALIAS |
-| 42703 | An undefined column or parameter name was detected. | COLUMN_NOT_FOUND, UNRESOLVED_COLUMN, UNRESOLVED_FIELD, UNRESOLVED_MAP_KEY, UNRESOLVED_USING_COLUMN_FOR_JOIN |
-| 42704 | An undefined object or constraint name was detected. | AMBIGUOUS_REFERENCE, DEFAULT_DATABASE_NOT_EXISTS, FIELD_NOT_FOUND, INDEX_NOT_FOUND, SCHEMA_NOT_FOUND, UNRECOGNIZED_SQL_TYPE |
-| 42710 | A duplicate object or constraint name was detected. | CREATE_TABLE_COLUMN_OPTION_DUPLICATE, INDEX_ALREADY_EXISTS, LOCATION_ALREADY_EXISTS |
-| 42711 | A duplicate column name was detected in the object definition or ALTER TABLE statement. | COLUMN_ALREADY_EXISTS |
-| 42723 | A routine with the same signature already exists in the schema, module, or compound block where it is defined. | ROUTINE_ALREADY_EXISTS |
-| 42803 | A column reference in the SELECT or HAVING clause is invalid, because it is not a grouping column; or a column reference in the GROUP BY clause is invalid. | GROUPING_COLUMN_MISMATCH, GROUPING_ID_COLUMN_MISMATCH, MISSING_AGGREGATION, MISSING_GROUP_BY, UNRESOLVED_ALL_IN_GROUP_BY |
-| 42805 | An integer in the ORDER BY clause does not identify a column of the result table. | GROUP_BY_POS_OUT_OF_RANGE, ORDER_BY_POS_OUT_OF_RANGE |
-| 42809 | The identified object is not the type of object to which the statement applies. | FORBIDDEN_OPERATION |
-| 42818 | The operands of an operator or function are not compatible or comparable. | INCOMPARABLE_PIVOT_COLUMN |
-| 42823 | Multiple columns are returned from a subquery that only allows one column. | INVALID_SUBQUERY_EXPRESSION |
-| 42825 | The rows of UNION, INTERSECT, EXCEPT, or VALUES do not have compatible columns. | CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE, INCOMPATIBLE_COLUMN_TYPE |
-| 42826 | The rows of UNION, INTERSECT, EXCEPT, or VALUES do not have the same number of columns. | NUM_COLUMNS_MISMATCH |
-| 42846 | Cast from source type to target type is not supported. | CANNOT_CAST_DATATYPE |
-| 42883 | No routine was found with a matching signature. | ROUTINE_NOT_FOUND, UNRESOLVED_ROUTINE |
-| 428C4 | The number of elements on each side of the predicate operator is not the same. | UNPIVOT_VALUE_SIZE_MISMATCH |
-| 428EK | The schema qualifier is not valid. | TEMP_VIEW_NAME_TOO_MANY_NAME_PARTS |
-| 428FT | The partitioning clause specified on CREATE or ALTER is not valid. | PARTITIONS_ALREADY_EXIST, PARTITIONS_NOT_FOUND |
-| 42903 | Invalid use of an aggregate function or OLAP function. | GROUP_BY_AGGREGATE, GROUP_BY_POS_AGGREGATE, INVALID_WHERE_CONDITION |
-| 429BB | The data type of a column, parameter, or SQL variable is not supported. | CANNOT_RECOGNIZE_HIVE_TYPE |
-| 42K01 | data type not fully specified | DATATYPE_MISSING_SIZE, INCOMPLETE_TYPE_DEFINITION |
-| 42K02 | data source not found | DATA_SOURCE_NOT_FOUND |
-| 42K03 | File not found | PATH_NOT_FOUND, RENAME_SRC_PATH_NOT_FOUND |
-| 42K04 | Duplicate file | FAILED_RENAME_PATH, PATH_ALREADY_EXISTS |
-| 42K05 | Name is not valid | INVALID_EMPTY_LOCATION, REQUIRES_SINGLE_PART_NAMESPACE |
-| 42K06 | Invalid type for options | INVALID_OPTIONS |
-| 42K07 | Not a valid schema literal | INVALID_SCHEMA |
-| 42K08 | Not a constant | NON_LITERAL_PIVOT_VALUES |
-| 42K09 | Data type mismatch | DATATYPE_MISMATCH, PIVOT_VALUE_DATA_TYPE_MISMATCH, UNEXPECTED_INPUT_TYPE, UNPIVOT_VALUE_DATA_TYPE_MISMATCH |
-| 42K0A | Invalid UNPIVOT clause | UNPIVOT_REQUIRES_ATTRIBUTES, UNPIVOT_REQUIRES_VALUE_COLUMNS |
-| 42K0B | Legacy feature blocked | INCONSISTENT_BEHAVIOR_CROSS_VERSION |
-| 42KD9 | Cannot infer table schema. | UNABLE_TO_INFER_SCHEMA |
-| 42P01 | undefined_table | TABLE_OR_VIEW_NOT_FOUND, VIEW_NOT_FOUND |
-| 42P02 | undefined_parameter | UNBOUND_SQL_PARAMETER |
-| 42P06 | duplicate_schema | SCHEMA_ALREADY_EXISTS |
-| 42P07 | duplicate_table | TABLE_OR_VIEW_ALREADY_EXISTS, TEMP_TABLE_OR_VIEW_ALREADY_EXISTS, VIEW_ALREADY_EXISTS |
-| 42P20 | windowing_error | UNSUPPORTED_EXPR_FOR_WINDOW |
-
-## Class `46`: java ddl 1
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 46110 | unsupported feature | CANNOT_MODIFY_CONFIG |
-| 46121 | invalid column name | INVALID_COLUMN_NAME_AS_PATH |
-
-## Class `53`: insufficient resources
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 53200 | out_of_memory | UNABLE_TO_ACQUIRE_MEMORY |
-
-## Class `54`: program limit exceeded
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| 54000 | program limit exceeded | GROUPING_SIZE_LIMIT_EXCEEDED, TOO_MANY_ARRAY_ELEMENTS |
-
-## Class `HY`: CLI-specific condition
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| HY008 | operation canceled | OPERATION_CANCELED |
-
-## Class `XX`: internal error
-
-| SQLSTATE | Description | Issuing error classes |
-|----------|-------------|------------------------|
-| XX000 | internal error | INTERNAL_ERROR |
      diff --git a/docs/sql-error-conditions-stds-invalid-option-value-error-class.md b/docs/sql-error-conditions-stds-invalid-option-value-error-class.md deleted file mode 100644 index 7cc72417d752f..0000000000000 --- a/docs/sql-error-conditions-stds-invalid-option-value-error-class.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: global -title: STDS_INVALID_OPTION_VALUE error class -displayTitle: STDS_INVALID_OPTION_VALUE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42616](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid value for source option '``': - -This error class has the following derived error classes: - -## IS_EMPTY - -cannot be empty. - -## IS_NEGATIVE - -cannot be negative. - -## WITH_MESSAGE - -`` - - diff --git a/docs/sql-error-conditions-invalid-partition-operation-error-class.md b/docs/sql-error-conditions-syntax-discontinued-error-class.md similarity index 71% rename from docs/sql-error-conditions-invalid-partition-operation-error-class.md rename to docs/sql-error-conditions-syntax-discontinued-error-class.md index 2f8a017803887..966e11004364e 100644 --- a/docs/sql-error-conditions-invalid-partition-operation-error-class.md +++ b/docs/sql-error-conditions-syntax-discontinued-error-class.md @@ -1,7 +1,7 @@ --- layout: global -title: INVALID_PARTITION_OPERATION error class -displayTitle: INVALID_PARTITION_OPERATION error class +title: SYNTAX_DISCONTINUED error class +displayTitle: SYNTAX_DISCONTINUED error class license: | Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with @@ -26,16 +26,14 @@ license: | [SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) -The partition command is invalid. +Support of the clause or keyword: `` has been discontinued in this context. This error class has the following derived error classes: -## PARTITION_MANAGEMENT_IS_UNSUPPORTED +## BANG_EQUALS_NOT -Table `` does not support partition management. - -## PARTITION_SCHEMA_IS_EMPTY - -Table `` is not partitioned. +The '!' keyword is supported as a prefix operator in a logical operation only. +Use the 'NOT' keyword instead for clauses such as `NOT LIKE`, `NOT IN`, `NOT BETWEEN`, etc. +To re-enable the '!' keyword, set "spark.sql.legacy.bangEqualsNot" to "true". 
diff --git a/docs/sql-error-conditions-unresolved-column-error-class.md b/docs/sql-error-conditions-unresolved-column-error-class.md deleted file mode 100644 index 89b1daf0128df..0000000000000 --- a/docs/sql-error-conditions-unresolved-column-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNRESOLVED_COLUMN error class -displayTitle: UNRESOLVED_COLUMN error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A column, variable, or function parameter with name `` cannot be resolved. - -This error class has the following derived error classes: - -## WITHOUT_SUGGESTION - - - -## WITH_SUGGESTION - -Did you mean one of the following? [``]. - - diff --git a/docs/sql-error-conditions-unresolved-field-error-class.md b/docs/sql-error-conditions-unresolved-field-error-class.md deleted file mode 100644 index 83f008139af43..0000000000000 --- a/docs/sql-error-conditions-unresolved-field-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNRESOLVED_FIELD error class -displayTitle: UNRESOLVED_FIELD error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A field with name `` cannot be resolved with the struct-type column ``. - -This error class has the following derived error classes: - -## WITHOUT_SUGGESTION - - - -## WITH_SUGGESTION - -Did you mean one of the following? [``]. 
- - diff --git a/docs/sql-error-conditions-unresolved-map-key-error-class.md b/docs/sql-error-conditions-unresolved-map-key-error-class.md deleted file mode 100644 index 9c0268240154a..0000000000000 --- a/docs/sql-error-conditions-unresolved-map-key-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNRESOLVED_MAP_KEY error class -displayTitle: UNRESOLVED_MAP_KEY error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve column `` as a map key. If the key is a string literal, add the single quotes '' around it. - -This error class has the following derived error classes: - -## WITHOUT_SUGGESTION - - - -## WITH_SUGGESTION - -Otherwise did you mean one of the following column(s)? [``]. - - diff --git a/docs/sql-error-conditions-unsupported-add-file-error-class.md b/docs/sql-error-conditions-unsupported-add-file-error-class.md deleted file mode 100644 index 482d753fb53b0..0000000000000 --- a/docs/sql-error-conditions-unsupported-add-file-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_ADD_FILE error class -displayTitle: UNSUPPORTED_ADD_FILE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Don't support add file. - -This error class has the following derived error classes: - -## DIRECTORY - -The file `` is a directory, consider to set "spark.sql.legacy.addSingleFileInAddFile" to "false". - -## LOCAL_DIRECTORY - -The local directory `` is not supported in a non-local master mode. 
- - diff --git a/docs/sql-error-conditions-unsupported-default-value-error-class.md b/docs/sql-error-conditions-unsupported-default-value-error-class.md deleted file mode 100644 index c6ca78f606bd0..0000000000000 --- a/docs/sql-error-conditions-unsupported-default-value-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_DEFAULT_VALUE error class -displayTitle: UNSUPPORTED_DEFAULT_VALUE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -DEFAULT column values is not supported. - -This error class has the following derived error classes: - -## WITHOUT_SUGGESTION - - - -## WITH_SUGGESTION - -Enable it by setting "spark.sql.defaultColumn.enabled" to "true". - - diff --git a/docs/sql-error-conditions-unsupported-deserializer-error-class.md b/docs/sql-error-conditions-unsupported-deserializer-error-class.md deleted file mode 100644 index 11b58f9386d05..0000000000000 --- a/docs/sql-error-conditions-unsupported-deserializer-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_DESERIALIZER error class -displayTitle: UNSUPPORTED_DESERIALIZER error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The deserializer is not supported: - -This error class has the following derived error classes: - -## DATA_TYPE_MISMATCH - -need a(n) `` field but got ``. - -## FIELD_NUMBER_MISMATCH - -try to map `` to Tuple``, but failed as the number of fields does not line up. 
- - diff --git a/docs/sql-error-conditions-unsupported-feature-error-class.md b/docs/sql-error-conditions-unsupported-feature-error-class.md deleted file mode 100644 index f67d7caff63de..0000000000000 --- a/docs/sql-error-conditions-unsupported-feature-error-class.md +++ /dev/null @@ -1,229 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_FEATURE error class -displayTitle: UNSUPPORTED_FEATURE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The feature is not supported: - -This error class has the following derived error classes: - -## AES_MODE - -AES-`` with the padding `` by the `` function. - -## AES_MODE_AAD - -`` with AES-`` does not support additional authenticate data (AAD). - -## AES_MODE_IV - -`` with AES-`` does not support initialization vectors (IVs). - -## ANALYZE_UNCACHED_TEMP_VIEW - -The ANALYZE TABLE FOR COLUMNS command can operate on temporary views that have been cached already. Consider to cache the view ``. - -## ANALYZE_UNSUPPORTED_COLUMN_TYPE - -The ANALYZE TABLE FOR COLUMNS command does not support the type `` of the column `` in the table ``. - -## ANALYZE_VIEW - -The ANALYZE TABLE command does not support views. - -## CATALOG_OPERATION - -Catalog `` does not support ``. - -## COLLATION - -Collation is not yet supported. - -## COMBINATION_QUERY_RESULT_CLAUSES - -Combination of ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY. - -## COMMENT_NAMESPACE - -Attach a comment to the namespace ``. - -## DESC_TABLE_COLUMN_PARTITION - -DESC TABLE COLUMN for a specific partition. - -## DROP_DATABASE - -Drop the default database ``. - -## DROP_NAMESPACE - -Drop the namespace ``. - -## HIVE_TABLE_TYPE - -The `` is hive ``. - -## HIVE_WITH_ANSI_INTERVALS - -Hive table `` with ANSI intervals. - -## INSERT_PARTITION_SPEC_IF_NOT_EXISTS - -INSERT INTO `` with IF NOT EXISTS in the PARTITION spec. - -## LATERAL_COLUMN_ALIAS_IN_AGGREGATE_FUNC - -Referencing a lateral column alias `` in the aggregate function ``. - -## LATERAL_COLUMN_ALIAS_IN_AGGREGATE_WITH_WINDOW_AND_HAVING - -Referencing lateral column alias `` in the aggregate query both with window expressions and with having clause. Please rewrite the aggregate query by removing the having clause or removing lateral alias reference in the SELECT list. - -## LATERAL_COLUMN_ALIAS_IN_GROUP_BY - -Referencing a lateral column alias via GROUP BY alias/ALL is not supported yet. - -## LATERAL_COLUMN_ALIAS_IN_WINDOW - -Referencing a lateral column alias `` in window expression ``. - -## LATERAL_JOIN_USING - -JOIN USING with LATERAL correlation. - -## LITERAL_TYPE - -Literal for '``' of ``. - -## MULTIPLE_BUCKET_TRANSFORMS - -Multiple bucket TRANSFORMs. 
- -## MULTI_ACTION_ALTER - -The target JDBC server hosting table `` does not support ALTER TABLE with multiple actions. Split the ALTER TABLE up into individual actions to avoid this error. - -## ORC_TYPE_CAST - -Unable to convert `` of Orc to data type ``. - -## OVERWRITE_BY_SUBQUERY - -INSERT OVERWRITE with a subquery condition. - -## PANDAS_UDAF_IN_PIVOT - -Pandas user defined aggregate function in the PIVOT clause. - -## PARAMETER_MARKER_IN_UNEXPECTED_STATEMENT - -Parameter markers are not allowed in ``. - -## PARTITION_WITH_NESTED_COLUMN_IS_UNSUPPORTED - -Invalid partitioning: `` is missing or is in a map or array. - -## PIVOT_AFTER_GROUP_BY - -PIVOT clause following a GROUP BY clause. Consider pushing the GROUP BY into a subquery. - -## PIVOT_TYPE - -Pivoting by the value '``' of the column data type ``. - -## PURGE_PARTITION - -Partition purge. - -## PURGE_TABLE - -Purge table. - -## PYTHON_UDF_IN_ON_CLAUSE - -Python UDF in the ON clause of a `` JOIN. In case of an INNER JOIN consider rewriting to a CROSS JOIN with a WHERE clause. - -## REMOVE_NAMESPACE_COMMENT - -Remove a comment from the namespace ``. - -## REPLACE_NESTED_COLUMN - -The replace function does not support nested column ``. - -## SET_NAMESPACE_PROPERTY - -`` is a reserved namespace property, ``. - -## SET_OPERATION_ON_MAP_TYPE - -Cannot have MAP type columns in DataFrame which calls set operations (INTERSECT, EXCEPT, etc.), but the type of column `` is ``. - -## SET_PROPERTIES_AND_DBPROPERTIES - -set PROPERTIES and DBPROPERTIES at the same time. - -## SET_TABLE_PROPERTY - -`` is a reserved table property, ``. - -## SET_VARIABLE_USING_SET - -`` is a VARIABLE and cannot be updated using the SET statement. Use SET VARIABLE `` = ... instead. - -## STATE_STORE_MULTIPLE_COLUMN_FAMILIES - -Creating multiple column families with `` is not supported. - -## STATE_STORE_REMOVING_COLUMN_FAMILIES - -Removing column families with `` is not supported. - -## STATE_STORE_TTL - -State TTL with `` is not supported. Please use RocksDBStateStoreProvider. - -## TABLE_OPERATION - -Table `` does not support ``. Please check the current catalog and namespace to make sure the qualified table name is expected, and also check the catalog implementation which is configured by "spark.sql.catalog". - -## TIME_TRAVEL - -Time travel on the relation: ``. - -## TOO_MANY_TYPE_ARGUMENTS_FOR_UDF_CLASS - -UDF class with `` type arguments. - -## TRANSFORM_DISTINCT_ALL - -TRANSFORM with the DISTINCT/ALL clause. - -## TRANSFORM_NON_HIVE - -TRANSFORM with SERDE is only supported in hive mode. - - diff --git a/docs/sql-error-conditions-unsupported-generator-error-class.md b/docs/sql-error-conditions-unsupported-generator-error-class.md deleted file mode 100644 index 4e42d6b43bca4..0000000000000 --- a/docs/sql-error-conditions-unsupported-generator-error-class.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_GENERATOR error class -displayTitle: UNSUPPORTED_GENERATOR error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The generator is not supported: - -This error class has the following derived error classes: - -## MULTI_GENERATOR - -only one generator allowed per SELECT clause but found ``: ``. - -## NESTED_IN_EXPRESSIONS - -nested in expressions ``. - -## NOT_GENERATOR - -`` is expected to be a generator. However, its class is ``, which is not a generator. - -## OUTSIDE_SELECT - -outside the SELECT clause, found: ``. - - diff --git a/docs/sql-error-conditions-unsupported-insert-error-class.md b/docs/sql-error-conditions-unsupported-insert-error-class.md deleted file mode 100644 index 3f679589fd3af..0000000000000 --- a/docs/sql-error-conditions-unsupported-insert-error-class.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_INSERT error class -displayTitle: UNSUPPORTED_INSERT error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Can't insert into the target. - -This error class has the following derived error classes: - -## MULTI_PATH - -Can only write data to relations with a single path but given paths are ``. - -## NOT_ALLOWED - -The target relation `` does not allow insertion. - -## NOT_PARTITIONED - -The target relation `` is not partitioned. - -## RDD_BASED - -An RDD-based table is not allowed. - -## READ_FROM - -The target relation `` is also being read from. - - diff --git a/docs/sql-error-conditions-unsupported-merge-condition-error-class.md b/docs/sql-error-conditions-unsupported-merge-condition-error-class.md deleted file mode 100644 index 070782395d3bc..0000000000000 --- a/docs/sql-error-conditions-unsupported-merge-condition-error-class.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_MERGE_CONDITION error class -displayTitle: UNSUPPORTED_MERGE_CONDITION error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -MERGE operation contains unsupported `` condition. - -This error class has the following derived error classes: - -## AGGREGATE - -Aggregates are not allowed: ``. - -## NON_DETERMINISTIC - -Non-deterministic expressions are not allowed: ``. - -## SUBQUERY - -Subqueries are not allowed: ``. - - diff --git a/docs/sql-error-conditions-unsupported-overwrite-error-class.md b/docs/sql-error-conditions-unsupported-overwrite-error-class.md deleted file mode 100644 index dd7de62cd06d4..0000000000000 --- a/docs/sql-error-conditions-unsupported-overwrite-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_OVERWRITE error class -displayTitle: UNSUPPORTED_OVERWRITE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42902](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Can't overwrite the target that is also being read from. - -This error class has the following derived error classes: - -## PATH - -The target path is ``. - -## TABLE - -The target table is ``. - - diff --git a/docs/sql-error-conditions-unsupported-save-mode-error-class.md b/docs/sql-error-conditions-unsupported-save-mode-error-class.md deleted file mode 100644 index dbe210360fcb1..0000000000000 --- a/docs/sql-error-conditions-unsupported-save-mode-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_SAVE_MODE error class -displayTitle: UNSUPPORTED_SAVE_MODE error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The save mode `` is not supported for: - -This error class has the following derived error classes: - -## EXISTENT_PATH - -an existent path. - -## NON_EXISTENT_PATH - -a non-existent path. - - diff --git a/docs/sql-error-conditions-unsupported-subquery-expression-category-error-class.md b/docs/sql-error-conditions-unsupported-subquery-expression-category-error-class.md deleted file mode 100644 index 59a34d6a01695..0000000000000 --- a/docs/sql-error-conditions-unsupported-subquery-expression-category-error-class.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -layout: global -title: UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY error class -displayTitle: UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Unsupported subquery expression: - -This error class has the following derived error classes: - -## ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED - -Accessing outer query column is not allowed in this location: -`` - -## AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES - -Found an aggregate function in a correlated predicate that has both outer and local references, which is not supported: ``. - -## CORRELATED_COLUMN_IS_NOT_ALLOWED_IN_PREDICATE - -Correlated column is not allowed in predicate: -`` - -## CORRELATED_COLUMN_NOT_FOUND - -A correlated outer name reference within a subquery expression body was not found in the enclosing query: ``. - -## CORRELATED_REFERENCE - -Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses: ``. - -## HIGHER_ORDER_FUNCTION - -Subquery expressions are not supported within higher-order functions. Please remove all subquery expressions from higher-order functions and then try the query again. - -## LATERAL_JOIN_CONDITION_NON_DETERMINISTIC - -Lateral join condition cannot be non-deterministic: ``. - -## MUST_AGGREGATE_CORRELATED_SCALAR_SUBQUERY - -Correlated scalar subqueries must be aggregated to return at most one row. - -## NON_CORRELATED_COLUMNS_IN_GROUP_BY - -A GROUP BY clause in a scalar correlated subquery cannot contain non-correlated columns: ``. - -## NON_DETERMINISTIC_LATERAL_SUBQUERIES - -Non-deterministic lateral subqueries are not supported when joining with outer relations that produce more than one row: -`` - -## UNSUPPORTED_CORRELATED_REFERENCE_DATA_TYPE - -Correlated column reference '``' cannot be `` type. 
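A sketch of the MUST_AGGREGATE_CORRELATED_SCALAR_SUBQUERY condition listed above; `orders` and `items` are hypothetical tables and an active `spark` session is assumed:

```scala
// A correlated scalar subquery that may return several rows per outer row is rejected:
// spark.sql("SELECT o.id, (SELECT i.amount FROM items i WHERE i.order_id = o.id) FROM orders o")
// Aggregating the subquery so it yields at most one row per outer row is accepted:
spark.sql(
  "SELECT o.id, (SELECT max(i.amount) FROM items i WHERE i.order_id = o.id) AS max_amount FROM orders o")
```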
- -## UNSUPPORTED_CORRELATED_SCALAR_SUBQUERY - -Correlated scalar subqueries can only be used in filters, aggregations, projections, and UPDATE/MERGE/DELETE commands: -`` - -## UNSUPPORTED_IN_EXISTS_SUBQUERY - -IN/EXISTS predicate subqueries can only be used in filters, joins, aggregations, window functions, projections, and UPDATE/MERGE/DELETE commands: -`` - -## UNSUPPORTED_TABLE_ARGUMENT - -Table arguments are used in a function where they are not supported: -`` - - diff --git a/docs/sql-error-conditions-wrong-num-args-error-class.md b/docs/sql-error-conditions-wrong-num-args-error-class.md deleted file mode 100644 index 652037bae6789..0000000000000 --- a/docs/sql-error-conditions-wrong-num-args-error-class.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: global -title: WRONG_NUM_ARGS error class -displayTitle: WRONG_NUM_ARGS error class -license: | - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---- - - - -[SQLSTATE: 42605](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The `` requires `` parameters but the actual number is ``. - -This error class has the following derived error classes: - -## WITHOUT_SUGGESTION - -Please, refer to '``/sql-ref-functions.html' for a fix. - -## WITH_SUGGESTION - -If you have to call this function with `` parameters, set the legacy configuration `` to ``. - - diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index 36d7cf58f09f2..0c1953ea8f468 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -19,2879 +19,19 @@ license: | limitations under the License. --- - +{% comment %} +Don't discuss error classes (e.g. `42`) or sub-classes (e.g. `K01`) with users. It's not helpful. +Keep this documentation focused on error states (e.g. `58002`) and conditions (e.g. +`AMBIGUOUS_COLUMN_REFERENCE`), which is what users see and what they will typically be searching +for when they encounter an error. -This is a list of common, named error conditions returned by Spark SQL. +To update this information, edit `error-conditions.json`. The table below will be automatically +derived from that file via `docs/util/build-error-docs.py`. -Also see [SQLSTATE Codes](sql-error-conditions-sqlstates.html). +Also note that this is a Jekyll comment and not an HTML comment so that this comment does not show +up in the generated HTML to end users. :-) +{% endcomment %} -### AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION +This is a list of error states and conditions that may be returned by Spark SQL. -[SQLSTATE: 42845](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Non-deterministic expression `` should not appear in the arguments of an aggregate function. 
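A minimal sketch of AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION above, assuming an active `spark` session:

```scala
// A non-deterministic expression inside an aggregate's arguments is rejected:
// spark.sql("SELECT sum(rand(0) * value) FROM VALUES (1), (2) AS t(value)")
// Projecting the non-deterministic value first, then aggregating it, is accepted:
spark.sql("SELECT sum(r) FROM (SELECT rand(0) * value AS r FROM VALUES (1), (2) AS t(value))")
```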
- -### ALL_PARAMETERS_MUST_BE_NAMED - -SQLSTATE: 07001 - -Using name parameterized queries requires all parameters to be named. Parameters missing names: ``. - -### ALL_PARTITION_COLUMNS_NOT_ALLOWED - -SQLSTATE: KD005 - -Cannot use all columns for partition columns. - -### ALTER_TABLE_COLUMN_DESCRIPTOR_DUPLICATE - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -ALTER TABLE `` column `` specifies descriptor "``" more than once, which is invalid. - -### AMBIGUOUS_ALIAS_IN_NESTED_CTE - -[SQLSTATE: 42KD0](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Name `` is ambiguous in nested CTE. -Please set `` to "CORRECTED" so that name defined in inner CTE takes precedence. If set it to "LEGACY", outer CTE definitions will take precedence. -See '``/sql-migration-guide.html#query-engine'. - -### AMBIGUOUS_COLUMN_OR_FIELD - -[SQLSTATE: 42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column or field `` is ambiguous and has `` matches. - -### AMBIGUOUS_COLUMN_REFERENCE - -[SQLSTATE: 42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column `` is ambiguous. It's because you joined several DataFrame together, and some of these DataFrames are the same. -This column points to one of the DataFrames but Spark is unable to figure out which one. -Please alias the DataFrames with different names via `DataFrame.alias` before joining them, -and specify the column using qualified name, e.g. `df.alias("a").join(df.alias("b"), col("a.id") > col("b.id"))`. - -### AMBIGUOUS_LATERAL_COLUMN_ALIAS - -[SQLSTATE: 42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Lateral column alias `` is ambiguous and has `` matches. - -### AMBIGUOUS_REFERENCE - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Reference `` is ambiguous, could be: ``. - -### AMBIGUOUS_REFERENCE_TO_FIELDS - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Ambiguous reference to the field ``. It appears `` times in the schema. - -### ARITHMETIC_OVERFLOW - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -``.`` If necessary set `` to "false" to bypass this error. - -### ASSIGNMENT_ARITY_MISMATCH - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The number of columns or variables assigned or aliased: `` does not match the number of source expressions: ``. - -### [AS_OF_JOIN](sql-error-conditions-as-of-join-error-class.html) - -[SQLSTATE: 42604](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid as-of join. - -For more details see [AS_OF_JOIN](sql-error-conditions-as-of-join-error-class.html) - -### AVRO_INCOMPATIBLE_READ_TYPE - -[SQLSTATE: 22KD3](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot convert Avro `` to SQL `` because the original encoded data type is ``, however you're trying to read the field as ``, which would lead to an incorrect answer. -To allow reading this field, enable the SQL configuration: "spark.sql.legacy.avro.allowIncompatibleSchema". - -### BATCH_METADATA_NOT_FOUND - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unable to find batch ``. 
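The AMBIGUOUS_COLUMN_REFERENCE entry above suggests aliasing before a self-join; a small sketch of that fix, assuming an active `spark` session:

```scala
import org.apache.spark.sql.functions.col

val df = spark.range(5).toDF("id")
// Both join inputs come from the same DataFrame, so df("id") is ambiguous:
// df.join(df, df("id") > df("id"))
// Aliasing each side and using qualified names removes the ambiguity:
val joined = df.alias("a").join(df.alias("b"), col("a.id") > col("b.id"))
```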
- -### BINARY_ARITHMETIC_OVERFLOW - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -`` `` `` caused overflow. - -### CALL_ON_STREAMING_DATASET_UNSUPPORTED - -[SQLSTATE: 42KDE](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The method `` can not be called on streaming Dataset/DataFrame. - -### CANNOT_ALTER_PARTITION_COLUMN - -[SQLSTATE: 428FR](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -ALTER TABLE (ALTER|CHANGE) COLUMN is not supported for partition columns, but found the partition column `` in the table ``. - -### CANNOT_CAST_DATATYPE - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot cast `` to ``. - -### CANNOT_CONVERT_PROTOBUF_FIELD_TYPE_TO_SQL_TYPE - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot convert Protobuf `` to SQL `` because schema is incompatible (protobufType = ``, sqlType = ``). - -### CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unable to convert `` of Protobuf to SQL type ``. - -### CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_FIELD_TYPE - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot convert SQL `` to Protobuf `` because schema is incompatible (protobufType = ``, sqlType = ``). - -### CANNOT_CONVERT_SQL_VALUE_TO_PROTOBUF_ENUM_TYPE - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot convert SQL `` to Protobuf `` because `` is not in defined values for enum: ``. - -### [CANNOT_CREATE_DATA_SOURCE_TABLE](sql-error-conditions-cannot-create-data-source-table-error-class.html) - -[SQLSTATE: 42KDE](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to create data source table ``: - -For more details see [CANNOT_CREATE_DATA_SOURCE_TABLE](sql-error-conditions-cannot-create-data-source-table-error-class.html) - -### CANNOT_DECODE_URL - -[SQLSTATE: 22546](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The provided URL cannot be decoded: ``. Please ensure that the URL is properly formatted and try again. - -### CANNOT_INVOKE_IN_TRANSFORMATIONS - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Dataset transformations and actions can only be invoked by the driver, not inside of other Dataset transformations; for example, dataset1.map(x => dataset2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the dataset1.map transformation. For more information, see SPARK-28702. - -### CANNOT_LOAD_FUNCTION_CLASS - -[SQLSTATE: 46103](sql-error-conditions-sqlstates.html#class-46-java-ddl-1) - -Cannot load class `` when registering the function ``, please make sure it is on the classpath. - -### CANNOT_LOAD_PROTOBUF_CLASS - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Could not load Protobuf class with name ``. ``. - -### [CANNOT_LOAD_STATE_STORE](sql-error-conditions-cannot-load-state-store-error-class.html) - -SQLSTATE: 58030 - -An error occurred during loading state. 
- -For more details see [CANNOT_LOAD_STATE_STORE](sql-error-conditions-cannot-load-state-store-error-class.html) - -### CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE - -[SQLSTATE: 42825](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to merge incompatible data types `` and ``. Please check the data types of the columns being merged and ensure that they are compatible. If necessary, consider casting the columns to compatible data types before attempting the merge. - -### CANNOT_MERGE_SCHEMAS - -[SQLSTATE: 42KD9](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed merging schemas: -Initial schema: -`` -Schema that cannot be merged with the initial schema: -``. - -### CANNOT_MODIFY_CONFIG - -[SQLSTATE: 46110](sql-error-conditions-sqlstates.html#class-46-java-ddl-1) - -Cannot modify the value of the Spark config: ``. -See also '``/sql-migration-guide.html#ddl-statements'. - -### CANNOT_PARSE_DECIMAL - -[SQLSTATE: 22018](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot parse decimal. Please ensure that the input is a valid number with optional decimal point or comma separators. - -### CANNOT_PARSE_INTERVAL - -[SQLSTATE: 22006](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Unable to parse ``. Please ensure that the value provided is in a valid format for defining an interval. You can reference the documentation for the correct format. If the issue persists, please double check that the input value is not null or empty and try again. - -### CANNOT_PARSE_JSON_FIELD - -[SQLSTATE: 2203G](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot parse the field name `` and the value `` of the JSON token type `` to target Spark data type ``. - -### CANNOT_PARSE_PROTOBUF_DESCRIPTOR - -[SQLSTATE: 22018](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Error parsing descriptor bytes into Protobuf FileDescriptorSet. - -### CANNOT_PARSE_TIMESTAMP - -[SQLSTATE: 22007](sql-error-conditions-sqlstates.html#class-22-data-exception) - -``. If necessary set `` to "false" to bypass this error. - -### CANNOT_RECOGNIZE_HIVE_TYPE - -[SQLSTATE: 429BB](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot recognize hive type string: ``, column: ``. The specified data type for the field cannot be recognized by Spark SQL. Please check the data type of the specified field and ensure that it is a valid Spark SQL data type. Refer to the Spark SQL documentation for a list of valid data types and their format. If the data type is correct, please ensure that you are using a supported version of Spark SQL. - -### CANNOT_RENAME_ACROSS_SCHEMA - -[SQLSTATE: 0AKD0](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Renaming a `` across schemas is not allowed. - -### CANNOT_RESOLVE_DATAFRAME_COLUMN - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve dataframe column ``. It's probably because of illegal references like `df1.select(df2.col("a"))`. - -### CANNOT_RESOLVE_STAR_EXPAND - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve ``.* given input columns ``. Please check that the specified table or struct exists and is accessible in the input columns. - -### CANNOT_RESTORE_PERMISSIONS_FOR_PATH - -SQLSTATE: 58030 - -Failed to set permissions on created path `` back to ``. 
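A sketch of the illegal reference described under CANNOT_RESOLVE_DATAFRAME_COLUMN above, assuming an active `spark` session:

```scala
val df1 = spark.range(3).toDF("a")
val df2 = spark.range(3).toDF("a")
// A column bound to df2 cannot be resolved against a plan rooted at df1:
// df1.select(df2.col("a"))
// Reference columns through the DataFrame that owns them, or join the two frames first:
df1.select(df1.col("a"))
df1.join(df2, df1.col("a") === df2.col("a"))
```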
- -### CANNOT_SAVE_VARIANT - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot save variant data type into external storage. - -### [CANNOT_UPDATE_FIELD](sql-error-conditions-cannot-update-field-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot update `
      ` field `` type: - -For more details see [CANNOT_UPDATE_FIELD](sql-error-conditions-cannot-update-field-error-class.html) - -### CANNOT_UP_CAST_DATATYPE - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot up cast `` from `` to ``. -`
      ` - -### [CANNOT_WRITE_STATE_STORE](sql-error-conditions-cannot-write-state-store-error-class.html) - -SQLSTATE: 58030 - -Error writing state store files for provider ``. - -For more details see [CANNOT_WRITE_STATE_STORE](sql-error-conditions-cannot-write-state-store-error-class.html) - -### CAST_INVALID_INPUT - -[SQLSTATE: 22018](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value `` of the type `` cannot be cast to `` because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. If necessary set `` to "false" to bypass this error. - -### CAST_OVERFLOW - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value `` of the type `` cannot be cast to `` due to an overflow. Use `try_cast` to tolerate overflow and return NULL instead. If necessary set `` to "false" to bypass this error. - -### CAST_OVERFLOW_IN_TABLE_INSERT - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Fail to assign a value of `` type to the `` type column or variable `` due to an overflow. Use `try_cast` on the input value to tolerate overflow and return NULL instead. - -### CATALOG_NOT_FOUND - -[SQLSTATE: 42P08](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The catalog `` not found. Consider to set the SQL config `` to a catalog plugin. - -### CHECKPOINT_RDD_BLOCK_ID_NOT_FOUND - -SQLSTATE: 56000 - -Checkpoint block `` not found! -Either the executor that originally checkpointed this partition is no longer alive, or the original RDD is unpersisted. -If this problem persists, you may consider using `rdd.checkpoint()` instead, which is slower than local checkpointing but more fault-tolerant. - -### CLASS_NOT_OVERRIDE_EXPECTED_METHOD - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -`` must override either `` or ``. - -### CLASS_UNSUPPORTED_BY_MAP_OBJECTS - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -`MapObjects` does not support the class `` as resulting collection. - -### [CODEC_NOT_AVAILABLE](sql-error-conditions-codec-not-available-error-class.html) - -SQLSTATE: 56038 - -The codec `` is not available. - -For more details see [CODEC_NOT_AVAILABLE](sql-error-conditions-codec-not-available-error-class.html) - -### CODEC_SHORT_NAME_NOT_FOUND - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot find a short name for the codec ``. - -### COLLATION_INVALID_NAME - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The value `` does not represent a correct collation name. Suggested valid collation name: [``]. - -### [COLLATION_MISMATCH](sql-error-conditions-collation-mismatch-error-class.html) - -[SQLSTATE: 42P21](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Could not determine which collation to use for string functions and operators. 
- -For more details see [COLLATION_MISMATCH](sql-error-conditions-collation-mismatch-error-class.html) - -### [COLLECTION_SIZE_LIMIT_EXCEEDED](sql-error-conditions-collection-size-limit-exceeded-error-class.html) - -[SQLSTATE: 54000](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -Can't create array with `` elements which exceeding the array size limit ``, - -For more details see [COLLECTION_SIZE_LIMIT_EXCEEDED](sql-error-conditions-collection-size-limit-exceeded-error-class.html) - -### COLUMN_ALIASES_NOT_ALLOWED - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column aliases are not allowed in ``. - -### COLUMN_ALREADY_EXISTS - -[SQLSTATE: 42711](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The column `` already exists. Choose another name or rename the existing column. - -### COLUMN_NOT_DEFINED_IN_TABLE - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` column `` is not defined in table ``, defined table columns are: ``. - -### COLUMN_NOT_FOUND - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The column `` cannot be found. Verify the spelling and correctness of the column name according to the SQL config ``. - -### COMPARATOR_RETURNS_NULL - -[SQLSTATE: 22004](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The comparator has returned a NULL for a comparison between `` and ``. -It should return a positive integer for "greater than", 0 for "equal" and a negative integer for "less than". -To revert to deprecated behavior where NULL is treated as 0 (equal), you must set "spark.sql.legacy.allowNullComparisonResultInArraySort" to "true". - -### [COMPLEX_EXPRESSION_UNSUPPORTED_INPUT](sql-error-conditions-complex-expression-unsupported-input-error-class.html) - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot process input data types for the expression: ``. - -For more details see [COMPLEX_EXPRESSION_UNSUPPORTED_INPUT](sql-error-conditions-complex-expression-unsupported-input-error-class.html) - -### CONCURRENT_QUERY - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Another instance of this query was just started by a concurrent session. - -### CONCURRENT_STREAM_LOG_UPDATE - -SQLSTATE: 40000 - -Concurrent update to the log. Multiple streaming jobs detected for ``. -Please make sure only one streaming job runs on a specific checkpoint location at a time. - -### [CONNECT](sql-error-conditions-connect-error-class.html) - -SQLSTATE: 56K00 - -Generic Spark Connect error. - -For more details see [CONNECT](sql-error-conditions-connect-error-class.html) - -### CONVERSION_INVALID_INPUT - -[SQLSTATE: 22018](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value `` (``) cannot be converted to `` because it is malformed. Correct the value as per the syntax, or change its format. Use `` to tolerate malformed input and return NULL instead. - -### CREATE_PERMANENT_VIEW_WITHOUT_ALIAS - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Not allowed to create the permanent view `` without explicitly assigning an alias for the expression ``. 
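A hedged sketch of the CREATE_PERMANENT_VIEW_WITHOUT_ALIAS entry above, assuming default settings, an active `spark` session, and an illustrative view name `v1`:

```scala
// An unaliased expression would rely on an auto-generated column name,
// which a permanent view rejects:
// spark.sql("CREATE VIEW v1 AS SELECT value + 1 FROM VALUES (1) AS t(value)")
// Giving the expression an explicit alias satisfies the check:
spark.sql("CREATE VIEW v1 AS SELECT value + 1 AS value_plus_one FROM VALUES (1) AS t(value)")
```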
- -### CREATE_TABLE_COLUMN_DESCRIPTOR_DUPLICATE - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -CREATE TABLE column `` specifies descriptor "``" more than once, which is invalid. - -### [CREATE_VIEW_COLUMN_ARITY_MISMATCH](sql-error-conditions-create-view-column-arity-mismatch-error-class.html) - -[SQLSTATE: 21S01](sql-error-conditions-sqlstates.html#class-21-cardinality-violation) - -Cannot create view ``, the reason is - -For more details see [CREATE_VIEW_COLUMN_ARITY_MISMATCH](sql-error-conditions-create-view-column-arity-mismatch-error-class.html) - -### [DATATYPE_MISMATCH](sql-error-conditions-datatype-mismatch-error-class.html) - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve `` due to data type mismatch: - -For more details see [DATATYPE_MISMATCH](sql-error-conditions-datatype-mismatch-error-class.html) - -### DATATYPE_MISSING_SIZE - -[SQLSTATE: 42K01](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -DataType `` requires a length parameter, for example ``(10). Please specify the length. - -### DATA_SOURCE_ALREADY_EXISTS - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Data source '``' already exists. Please choose a different name for the new data source. - -### DATA_SOURCE_NOT_EXIST - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Data source '``' not found. Please make sure the data source is registered. - -### DATA_SOURCE_NOT_FOUND - -[SQLSTATE: 42K02](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to find the data source: ``. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. - -### DATA_SOURCE_TABLE_SCHEMA_MISMATCH - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The schema of the data source table does not match the expected schema. If you are using the DataFrameReader.schema API or creating a table, avoid specifying the schema. -Data Source schema: `` -Expected schema: `` - -### DATETIME_OVERFLOW - -[SQLSTATE: 22008](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Datetime operation overflow: ``. - -### DECIMAL_PRECISION_EXCEEDS_MAX_PRECISION - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Decimal precision `` exceeds max precision ``. - -### DEFAULT_DATABASE_NOT_EXISTS - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Default database `` does not exist, please create it first or change default database to ````. - -### DEFAULT_PLACEMENT_INVALID - -[SQLSTATE: 42608](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A DEFAULT keyword in a MERGE, INSERT, UPDATE, or SET VARIABLE command could not be directly assigned to a target column because it was part of an expression. -For example: `UPDATE SET c1 = DEFAULT` is allowed, but `UPDATE T SET c1 = DEFAULT + 1` is not allowed. - -### DISTINCT_WINDOW_FUNCTION_UNSUPPORTED - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Distinct window functions are not supported: ``. - -### DIVIDE_BY_ZERO - -[SQLSTATE: 22012](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Division by zero. 
Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set `` to "false" to bypass this error. - -### DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT - -[SQLSTATE: 42713](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Duplicated field names in Arrow Struct are not allowed, got ``. - -### DUPLICATED_MAP_KEY - -[SQLSTATE: 23505](sql-error-conditions-sqlstates.html#class-23-integrity-constraint-violation) - -Duplicate map key `` was found, please check the input data. -If you want to remove the duplicated keys, you can set `` to "LAST_WIN" so that the key inserted at last takes precedence. - -### DUPLICATED_METRICS_NAME - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The metric name is not unique: ``. The same name cannot be used for metrics with different results. -However multiple instances of metrics with with same result and name are allowed (e.g. self-joins). - -### DUPLICATE_ASSIGNMENTS - -[SQLSTATE: 42701](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The columns or variables `` appear more than once as assignment targets. - -### DUPLICATE_CLAUSES - -[SQLSTATE: 42614](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found duplicate clauses: ``. Please, remove one of them. - -### DUPLICATE_KEY - -[SQLSTATE: 23505](sql-error-conditions-sqlstates.html#class-23-integrity-constraint-violation) - -Found duplicate keys ``. - -### [DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT](sql-error-conditions-duplicate-routine-parameter-assignment-error-class.html) - -[SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Call to function `` is invalid because it includes multiple argument assignments to the same parameter name ``. - -For more details see [DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT](sql-error-conditions-duplicate-routine-parameter-assignment-error-class.html) - -### EMPTY_JSON_FIELD_VALUE - -[SQLSTATE: 42604](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to parse an empty string for data type ``. - -### ENCODER_NOT_FOUND - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Not found an encoder of the type `` to Spark SQL internal representation. -Consider to change the input type to one of supported at '``/sql-ref-datatypes.html'. - -### ERROR_READING_AVRO_UNKNOWN_FINGERPRINT - -SQLSTATE: KD00B - -Error reading avro data -- encountered an unknown fingerprint: ``, not sure what schema to use. -This could happen if you registered additional schemas after starting your spark context. - -### EVENT_TIME_IS_NOT_ON_TIMESTAMP_TYPE - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The event time `` has the invalid type ``, but expected "TIMESTAMP". - -### EXCEED_LIMIT_LENGTH - -[SQLSTATE: 54006](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -Exceeds char/varchar type length limitation: ``. - -### EXCEPT_NESTED_COLUMN_INVALID_TYPE - -[SQLSTATE: 428H2](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -EXCEPT column `` was resolved and expected to be StructType, but found type ``. 
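A minimal sketch of the `try_divide` remedy mentioned under DIVIDE_BY_ZERO above, assuming ANSI mode is what raises the error:

```scala
// Under ANSI mode, 10 / 0 raises DIVIDE_BY_ZERO; try_divide returns NULL instead.
spark.sql("SELECT try_divide(10, 0) AS q").show()   // q is NULL
```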
- -### EXCEPT_OVERLAPPING_COLUMNS - -[SQLSTATE: 42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Columns in an EXCEPT list must be distinct and non-overlapping, but got (``). - -### EXEC_IMMEDIATE_DUPLICATE_ARGUMENT_ALIASES - -[SQLSTATE: 42701](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The USING clause of this EXECUTE IMMEDIATE command contained multiple arguments with same alias (``), which is invalid; please update the command to specify unique aliases and then try it again. - -### EXPECT_PERMANENT_VIEW_NOT_TEMP - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -'``' expects a permanent view but `` is a temp view. - -### [EXPECT_TABLE_NOT_VIEW](sql-error-conditions-expect-table-not-view-error-class.html) - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -'``' expects a table but `` is a view. - -For more details see [EXPECT_TABLE_NOT_VIEW](sql-error-conditions-expect-table-not-view-error-class.html) - -### [EXPECT_VIEW_NOT_TABLE](sql-error-conditions-expect-view-not-table-error-class.html) - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The table `` does not support ``. - -For more details see [EXPECT_VIEW_NOT_TABLE](sql-error-conditions-expect-view-not-table-error-class.html) - -### EXPRESSION_DECODING_FAILED - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to decode a row to a value of the expressions: ``. - -### EXPRESSION_ENCODING_FAILED - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to encode a value of the expressions: `` to a row. - -### EXPRESSION_TYPE_IS_NOT_ORDERABLE - -[SQLSTATE: 42822](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column expression `` cannot be sorted because its type `` is not orderable. - -### FAILED_EXECUTE_UDF - -[SQLSTATE: 39000](sql-error-conditions-sqlstates.html#class-39-external-routine-invocation-exception) - -User defined function (``: (``) => ``) failed due to: ``. - -### FAILED_FUNCTION_CALL - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -Failed preparing of the function `` for call. Please, double check function's arguments. - -### [FAILED_JDBC](sql-error-conditions-failed-jdbc-error-class.html) - -SQLSTATE: HV000 - -Failed JDBC `` on the operation: - -For more details see [FAILED_JDBC](sql-error-conditions-failed-jdbc-error-class.html) - -### FAILED_PARSE_STRUCT_TYPE - -[SQLSTATE: 22018](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Failed parsing struct: ``. - -### [FAILED_READ_FILE](sql-error-conditions-failed-read-file-error-class.html) - -SQLSTATE: KD001 - -Encountered error while reading file ``. - -For more details see [FAILED_READ_FILE](sql-error-conditions-failed-read-file-error-class.html) - -### FAILED_REGISTER_CLASS_WITH_KRYO - -SQLSTATE: KD000 - -Failed to register classes with Kryo. - -### FAILED_RENAME_PATH - -[SQLSTATE: 42K04](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to rename `` to `` as destination already exists. - -### FAILED_RENAME_TEMP_FILE - -SQLSTATE: 58030 - -Failed to rename temp file `` to `` as FileSystem.rename returned false. 
- -### FAILED_ROW_TO_JSON - -[SQLSTATE: 2203G](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Failed to convert the row value `` of the class `` to the target SQL type `` in the JSON format. - -### FIELDS_ALREADY_EXISTS - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot `` column, because `` already exists in ``. - -### FIELD_NOT_FOUND - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -No such struct field `` in ``. - -### FORBIDDEN_OPERATION - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The operation `` is not allowed on the ``: ``. - -### FOREACH_BATCH_USER_FUNCTION_ERROR - -[SQLSTATE: 39000](sql-error-conditions-sqlstates.html#class-39-external-routine-invocation-exception) - -An error occurred in the user provided function in foreach batch sink. Reason: `` - -### FOUND_MULTIPLE_DATA_SOURCES - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Detected multiple data sources with the name '``'. Please check the data source isn't simultaneously registered and located in the classpath. - -### GENERATED_COLUMN_WITH_DEFAULT_VALUE - -[SQLSTATE: 42623](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A column cannot have both a default value and a generation expression but column `` has default value: (``) and generation expression: (``). - -### GET_TABLES_BY_TYPE_UNSUPPORTED_BY_HIVE_VERSION - -SQLSTATE: 56038 - -Hive 2.2 and lower versions don't support getTablesByType. Please use Hive 2.3 or higher version. - -### GRAPHITE_SINK_INVALID_PROTOCOL - -SQLSTATE: KD000 - -Invalid Graphite protocol: ``. - -### GRAPHITE_SINK_PROPERTY_MISSING - -SQLSTATE: KD000 - -Graphite sink requires '``' property. - -### GROUPING_COLUMN_MISMATCH - -[SQLSTATE: 42803](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column of grouping (``) can't be found in grouping columns ``. - -### GROUPING_ID_COLUMN_MISMATCH - -[SQLSTATE: 42803](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Columns of grouping_id (``) does not match grouping columns (``). - -### GROUPING_SIZE_LIMIT_EXCEEDED - -[SQLSTATE: 54000](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -Grouping sets size cannot be greater than ``. - -### GROUP_BY_AGGREGATE - -[SQLSTATE: 42903](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Aggregate functions are not allowed in GROUP BY, but found ``. - -### GROUP_BY_POS_AGGREGATE - -[SQLSTATE: 42903](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -GROUP BY `` refers to an expression `` that contains an aggregate function. Aggregate functions are not allowed in GROUP BY. - -### GROUP_BY_POS_OUT_OF_RANGE - -[SQLSTATE: 42805](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -GROUP BY position `` is not in select list (valid range is [1, ``]). - -### GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE - -[SQLSTATE: 42822](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The expression `` cannot be used as a grouping expression because its data type `` is not an orderable data type. 
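A sketch of GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE above: a MAP-typed expression cannot serve as a grouping key, but an extracted entry can (active `spark` session assumed):

```scala
// A MAP column is not an orderable grouping key:
// spark.sql("SELECT m, count(*) FROM VALUES (map('k', 1)) AS t(m) GROUP BY m")
// Grouping by an extracted map entry works:
spark.sql("SELECT m['k'] AS k, count(*) AS cnt FROM VALUES (map('k', 1)) AS t(m) GROUP BY m['k']")
```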
- -### HLL_INVALID_INPUT_SKETCH_BUFFER - -[SQLSTATE: 22546](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Invalid call to ``; only valid HLL sketch buffers are supported as inputs (such as those produced by the `hll_sketch_agg` function). - -### HLL_INVALID_LG_K - -[SQLSTATE: 22546](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Invalid call to ``; the `lgConfigK` value must be between `` and ``, inclusive: ``. - -### HLL_UNION_DIFFERENT_LG_K - -[SQLSTATE: 22000](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Sketches have different `lgConfigK` values: `` and ``. Set the `allowDifferentLgConfigK` parameter to true to call `` with different `lgConfigK` values. - -### IDENTIFIER_TOO_MANY_NAME_PARTS - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` is not a valid identifier as it has more than 2 name parts. - -### [ILLEGAL_STATE_STORE_VALUE](sql-error-conditions-illegal-state-store-value-error-class.html) - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Illegal value provided to the State Store - -For more details see [ILLEGAL_STATE_STORE_VALUE](sql-error-conditions-illegal-state-store-value-error-class.html) - -### INCOMPARABLE_PIVOT_COLUMN - -[SQLSTATE: 42818](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid pivot column ``. Pivot columns must be comparable. - -### INCOMPATIBLE_COLUMN_TYPE - -[SQLSTATE: 42825](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` can only be performed on tables with compatible column types. The `` column of the `` table is `` type which is not compatible with `` at the same column of the first table.``. - -### INCOMPATIBLE_DATASOURCE_REGISTER - -SQLSTATE: 56038 - -Detected an incompatible DataSourceRegister. Please remove the incompatible library from classpath or upgrade it. Error: `` - -### [INCOMPATIBLE_DATA_FOR_TABLE](sql-error-conditions-incompatible-data-for-table-error-class.html) - -SQLSTATE: KD000 - -Cannot write incompatible data for the table ``: - -For more details see [INCOMPATIBLE_DATA_FOR_TABLE](sql-error-conditions-incompatible-data-for-table-error-class.html) - -### INCOMPATIBLE_JOIN_TYPES - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The join types `` and `` are incompatible. - -### INCOMPATIBLE_VIEW_SCHEMA_CHANGE - -SQLSTATE: 51024 - -The SQL query of view `` has an incompatible schema change and column `` cannot be resolved. Expected `` columns named `` but got ``. -Please try to re-create the view by running: ``. 
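A small sketch of the HLL sketch functions referenced above; `hll_sketch_agg` produces the only buffers that `hll_sketch_estimate` accepts (anything else is the HLL_INVALID_INPUT_SKETCH_BUFFER case):

```scala
spark.sql("""
  SELECT hll_sketch_estimate(hll_sketch_agg(col)) AS approx_distinct
  FROM VALUES (1), (1), (2), (3) AS tab(col)
""").show()   // approx_distinct = 3
```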
- -### [INCOMPLETE_TYPE_DEFINITION](sql-error-conditions-incomplete-type-definition-error-class.html) - -[SQLSTATE: 42K01](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Incomplete complex type: - -For more details see [INCOMPLETE_TYPE_DEFINITION](sql-error-conditions-incomplete-type-definition-error-class.html) - -### [INCONSISTENT_BEHAVIOR_CROSS_VERSION](sql-error-conditions-inconsistent-behavior-cross-version-error-class.html) - -[SQLSTATE: 42K0B](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -You may get a different result due to the upgrading to - -For more details see [INCONSISTENT_BEHAVIOR_CROSS_VERSION](sql-error-conditions-inconsistent-behavior-cross-version-error-class.html) - -### INCORRECT_RAMP_UP_RATE - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Max offset with `` rowsPerSecond is ``, but 'rampUpTimeSeconds' is ``. - -### INDETERMINATE_COLLATION - -[SQLSTATE: 42P22](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Function called requires knowledge of the collation it should apply, but indeterminate collation was found. Use COLLATE function to set the collation explicitly. - -### INDEX_ALREADY_EXISTS - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create the index `` on table `` because it already exists. - -### INDEX_NOT_FOUND - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot find the index `` on table ``. - -### [INSERT_COLUMN_ARITY_MISMATCH](sql-error-conditions-insert-column-arity-mismatch-error-class.html) - -[SQLSTATE: 21S01](sql-error-conditions-sqlstates.html#class-21-cardinality-violation) - -Cannot write to ``, the reason is - -For more details see [INSERT_COLUMN_ARITY_MISMATCH](sql-error-conditions-insert-column-arity-mismatch-error-class.html) - -### INSERT_PARTITION_COLUMN_ARITY_MISMATCH - -[SQLSTATE: 21S01](sql-error-conditions-sqlstates.html#class-21-cardinality-violation) - -Cannot write to '``', ``: -Table columns: ``. -Partition columns with static values: ``. -Data columns: ``. 
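A sketch of INSERT_COLUMN_ARITY_MISMATCH above; `target` is a hypothetical table and an active `spark` session is assumed:

```scala
spark.sql("CREATE TABLE target(a INT, b INT, c INT) USING parquet")
// A two-value row cannot be written into a three-column table:
// spark.sql("INSERT INTO target VALUES (1, 2)")
// Matching the table's arity is accepted:
spark.sql("INSERT INTO target VALUES (1, 2, 3)")
```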
- -### [INSUFFICIENT_TABLE_PROPERTY](sql-error-conditions-insufficient-table-property-error-class.html) - -[SQLSTATE: XXKUC](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Can't find table property: - -For more details see [INSUFFICIENT_TABLE_PROPERTY](sql-error-conditions-insufficient-table-property-error-class.html) - -### INTERNAL_ERROR - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERNAL_ERROR_BROADCAST - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERNAL_ERROR_EXECUTOR - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERNAL_ERROR_MEMORY - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### [INTERNAL_ERROR_METADATA_CATALOG](sql-error-conditions-internal-error-metadata-catalog-error-class.html) - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -An object in the metadata catalog has been corrupted: - -For more details see [INTERNAL_ERROR_METADATA_CATALOG](sql-error-conditions-internal-error-metadata-catalog-error-class.html) - -### INTERNAL_ERROR_NETWORK - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERNAL_ERROR_SHUFFLE - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERNAL_ERROR_STORAGE - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERNAL_ERROR_TWS - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` - -### INTERVAL_ARITHMETIC_OVERFLOW - -[SQLSTATE: 22015](sql-error-conditions-sqlstates.html#class-22-data-exception) - -``.`` - -### INTERVAL_DIVIDED_BY_ZERO - -[SQLSTATE: 22012](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. - -### [INVALID_AGGREGATE_FILTER](sql-error-conditions-invalid-aggregate-filter-error-class.html) - -[SQLSTATE: 42903](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The FILTER expression `` in an aggregate function is invalid. - -For more details see [INVALID_AGGREGATE_FILTER](sql-error-conditions-invalid-aggregate-filter-error-class.html) - -### INVALID_ARRAY_INDEX - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The index `` is out of bounds. The array has `` elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set `` to "false" to bypass this error. - -### INVALID_ARRAY_INDEX_IN_ELEMENT_AT - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The index `` is out of bounds. The array has `` elements. Use `try_element_at` to tolerate accessing element at invalid index and return NULL instead. If necessary set `` to "false" to bypass this error. - -### INVALID_BITMAP_POSITION - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The 0-indexed bitmap position `` is out of bounds. The bitmap has `` bits (`` bytes). - -### [INVALID_BOUNDARY](sql-error-conditions-invalid-boundary-error-class.html) - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The boundary `` is invalid: ``. 
- -For more details see [INVALID_BOUNDARY](sql-error-conditions-invalid-boundary-error-class.html) - -### INVALID_BUCKET_COLUMN_DATA_TYPE - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot use `` for bucket column. Collated data types are not supported for bucketing. - -### INVALID_BUCKET_FILE - -SQLSTATE: 58030 - -Invalid bucket file: ``. - -### INVALID_BYTE_STRING - -[SQLSTATE: 22P03](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The expected format is ByteString, but was `` (``). - -### INVALID_COLUMN_NAME_AS_PATH - -[SQLSTATE: 46121](sql-error-conditions-sqlstates.html#class-46-java-ddl-1) - -The datasource `` cannot save the column `` because its name contains some characters that are not allowed in file paths. Please, use an alias to rename it. - -### INVALID_COLUMN_OR_FIELD_DATA_TYPE - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column or field `` is of type `` while it's required to be ``. - -### [INVALID_CONF_VALUE](sql-error-conditions-invalid-conf-value-error-class.html) - -[SQLSTATE: 22022](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value '``' in the config "``" is invalid. - -For more details see [INVALID_CONF_VALUE](sql-error-conditions-invalid-conf-value-error-class.html) - -### [INVALID_CURSOR](sql-error-conditions-invalid-cursor-error-class.html) - -[SQLSTATE: HY109](sql-error-conditions-sqlstates.html#class-HY-cli-specific-condition) - -The cursor is invalid. - -For more details see [INVALID_CURSOR](sql-error-conditions-invalid-cursor-error-class.html) - -### [INVALID_DATETIME_PATTERN](sql-error-conditions-invalid-datetime-pattern-error-class.html) - -[SQLSTATE: 22007](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Unrecognized datetime pattern: ``. - -For more details see [INVALID_DATETIME_PATTERN](sql-error-conditions-invalid-datetime-pattern-error-class.html) - -### [INVALID_DEFAULT_VALUE](sql-error-conditions-invalid-default-value-error-class.html) - -[SQLSTATE: 42623](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to execute `` command because the destination column or variable `` has a DEFAULT value ``, - -For more details see [INVALID_DEFAULT_VALUE](sql-error-conditions-invalid-default-value-error-class.html) - -### [INVALID_DELIMITER_VALUE](sql-error-conditions-invalid-delimiter-value-error-class.html) - -[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid value for delimiter. - -For more details see [INVALID_DELIMITER_VALUE](sql-error-conditions-invalid-delimiter-value-error-class.html) - -### INVALID_DRIVER_MEMORY - -SQLSTATE: F0000 - -System memory `` must be at least ``. -Please increase heap size using the --driver-memory option or "``" in Spark configuration. - -### INVALID_EMPTY_LOCATION - -[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The location name cannot be empty string, but ```` was given. - -### INVALID_ESC - -[SQLSTATE: 42604](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found an invalid escape string: ``. The escape string must contain only one character. - -### INVALID_ESCAPE_CHAR - -[SQLSTATE: 42604](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`EscapeChar` should be a string literal of length one, but got ``. 
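The `try_*` and `get()` remedies suggested for the array-index and division-by-zero conditions above can be sketched in Spark SQL; the literals are made up and the exact behaviour of the failing forms depends on `spark.sql.ansi.enabled`:

```sql
-- Out-of-bounds access can raise INVALID_ARRAY_INDEX_IN_ELEMENT_AT under ANSI mode...
SELECT element_at(array(1, 2, 3), 5);
-- ...while the tolerant variants return NULL instead of failing.
SELECT try_element_at(array(1, 2, 3), 5);  -- NULL (1-based index)
SELECT get(array(1, 2, 3), 5);             -- NULL (0-based index)
-- Likewise, try_divide returns NULL instead of a divide-by-zero error;
-- the same applies when the dividend is an interval.
SELECT try_divide(10, 0);                  -- NULL
```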
- -### INVALID_EXECUTOR_MEMORY - -SQLSTATE: F0000 - -Executor memory `` must be at least ``. -Please increase executor memory using the --executor-memory option or "``" in Spark configuration. - -### INVALID_EXPRESSION_ENCODER - -[SQLSTATE: 42001](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found an invalid expression encoder. Expects an instance of ExpressionEncoder but got ``. For more information consult '``/api/java/index.html?org/apache/spark/sql/Encoder.html'. - -### INVALID_EXTRACT_BASE_FIELD_TYPE - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Can't extract a value from ``. Need a complex type [STRUCT, ARRAY, MAP] but got ``. - -### INVALID_EXTRACT_FIELD - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot extract `` from ``. - -### INVALID_EXTRACT_FIELD_TYPE - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Field name should be a non-null string literal, but it's ``. - -### INVALID_FIELD_NAME - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Field name `` is invalid: `` is not a struct. - -### [INVALID_FORMAT](sql-error-conditions-invalid-format-error-class.html) - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The format is invalid: ``. - -For more details see [INVALID_FORMAT](sql-error-conditions-invalid-format-error-class.html) - -### INVALID_FRACTION_OF_SECOND - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The fraction of second must be zero. Valid range is [0, 60]. If necessary set `` to "false" to bypass this error. - -### [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html) - -[SQLSTATE: HY000](sql-error-conditions-sqlstates.html#class-HY-cli-specific-condition) - -The handle `` is invalid. - -For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html) - -### INVALID_IDENTIFIER - -[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The unquoted identifier `` is invalid and must be back quoted as: ````. -Unquoted identifiers can only contain ASCII letters ('a' - 'z', 'A' - 'Z'), digits ('0' - '9'), and underbar ('_'). -Unquoted identifiers must also not start with a digit. -Different data sources and meta stores may impose additional restrictions on valid identifiers. - -### INVALID_INDEX_OF_ZERO - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The index 0 is invalid. An index shall be either `< 0` or `> 0` (the first element has index 1). - -### [INVALID_INLINE_TABLE](sql-error-conditions-invalid-inline-table-error-class.html) - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid inline table. - -For more details see [INVALID_INLINE_TABLE](sql-error-conditions-invalid-inline-table-error-class.html) - -### [INVALID_INTERVAL_FORMAT](sql-error-conditions-invalid-interval-format-error-class.html) - -[SQLSTATE: 22006](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Error parsing '``' to interval. Please ensure that the value provided is in a valid format for defining an interval. You can reference the documentation for the correct format.
- -For more details see [INVALID_INTERVAL_FORMAT](sql-error-conditions-invalid-interval-format-error-class.html) - -### [INVALID_INVERSE_DISTRIBUTION_FUNCTION](sql-error-conditions-invalid-inverse-distribution-function-error-class.html) - -[SQLSTATE: 42K0K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid inverse distribution function ``. - -For more details see [INVALID_INVERSE_DISTRIBUTION_FUNCTION](sql-error-conditions-invalid-inverse-distribution-function-error-class.html) - -### INVALID_JSON_DATA_TYPE - -[SQLSTATE: 2203G](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Failed to convert the JSON string '``' to a data type. Please enter a valid data type. - -### INVALID_JSON_ROOT_FIELD - -[SQLSTATE: 22032](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot convert JSON root field to target Spark type. - -### INVALID_JSON_SCHEMA_MAP_TYPE - -[SQLSTATE: 22032](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Input schema `` can only contain STRING as a key type for a MAP. - -### INVALID_KRYO_SERIALIZER_BUFFER_SIZE - -SQLSTATE: F0000 - -The value of the config "``" must be less than 2048 MiB, but got `` MiB. - -### [INVALID_LAMBDA_FUNCTION_CALL](sql-error-conditions-invalid-lambda-function-call-error-class.html) - -[SQLSTATE: 42K0D](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid lambda function call. - -For more details see [INVALID_LAMBDA_FUNCTION_CALL](sql-error-conditions-invalid-lambda-function-call-error-class.html) - -### INVALID_LATERAL_JOIN_TYPE - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The `` JOIN with LATERAL correlation is not allowed because an OUTER subquery cannot correlate to its join partner. Remove the LATERAL correlation or use an INNER JOIN, or LEFT OUTER JOIN instead. - -### [INVALID_LIMIT_LIKE_EXPRESSION](sql-error-conditions-invalid-limit-like-expression-error-class.html) - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The limit like expression `` is invalid. - -For more details see [INVALID_LIMIT_LIKE_EXPRESSION](sql-error-conditions-invalid-limit-like-expression-error-class.html) - -### INVALID_NON_DETERMINISTIC_EXPRESSIONS - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The operator expects a deterministic expression, but the actual expression is ``. - -### INVALID_NUMERIC_LITERAL_RANGE - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Numeric literal `` is outside the valid range for `` with minimum value of `` and maximum value of ``. Please adjust the value accordingly. - -### [INVALID_OBSERVED_METRICS](sql-error-conditions-invalid-observed-metrics-error-class.html) - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid observed metrics. 
- -For more details see [INVALID_OBSERVED_METRICS](sql-error-conditions-invalid-observed-metrics-error-class.html) - -### [INVALID_OPTIONS](sql-error-conditions-invalid-options-error-class.html) - -[SQLSTATE: 42K06](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid options: - -For more details see [INVALID_OPTIONS](sql-error-conditions-invalid-options-error-class.html) - -### INVALID_PANDAS_UDF_PLACEMENT - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The group aggregate pandas UDF `` cannot be invoked together with other, non-pandas aggregate functions. - -### [INVALID_PARAMETER_VALUE](sql-error-conditions-invalid-parameter-value-error-class.html) - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value of parameter(s) `` in `` is invalid: - -For more details see [INVALID_PARAMETER_VALUE](sql-error-conditions-invalid-parameter-value-error-class.html) - -### INVALID_PARTITION_COLUMN_DATA_TYPE - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot use `` for partition column. - -### [INVALID_PARTITION_OPERATION](sql-error-conditions-invalid-partition-operation-error-class.html) - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The partition command is invalid. - -For more details see [INVALID_PARTITION_OPERATION](sql-error-conditions-invalid-partition-operation-error-class.html) - -### INVALID_PROPERTY_KEY - -[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` is an invalid property key, please use quotes, e.g. SET ``=``. - -### INVALID_PROPERTY_VALUE - -[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` is an invalid property value, please use quotes, e.g. SET ``=``. - -### INVALID_QUERY_MIXED_QUERY_PARAMETERS - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A parameterized query must use either positional or named parameters, but not both. - -### INVALID_SAVE_MODE - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The specified save mode `` is invalid. Valid save modes include "append", "overwrite", "ignore", "error", "errorifexists", and "default". - -### [INVALID_SCHEMA](sql-error-conditions-invalid-schema-error-class.html) - -[SQLSTATE: 42K07](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The input schema `` is not a valid schema string. - -For more details see [INVALID_SCHEMA](sql-error-conditions-invalid-schema-error-class.html) - -### INVALID_SCHEMA_OR_RELATION_NAME - -[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` is not a valid name for tables/schemas. Valid names contain only alphabetic characters, numbers, and _. - -### INVALID_SET_SYNTAX - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Expected format is 'SET', 'SET key', or 'SET key=value'. If you want to include special characters in the key, or include a semicolon in the value, please use backquotes, e.g., SET `key`=`value`. - -### INVALID_SQL_ARG - -[SQLSTATE: 42K08](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The argument `` of `sql()` is invalid.
Consider replacing it with either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`. - -### [INVALID_SQL_SYNTAX](sql-error-conditions-invalid-sql-syntax-error-class.html) - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid SQL syntax: - -For more details see [INVALID_SQL_SYNTAX](sql-error-conditions-invalid-sql-syntax-error-class.html) - -### INVALID_STATEMENT_FOR_EXECUTE_INTO - -SQLSTATE: 07501 - -The INTO clause of EXECUTE IMMEDIATE is only valid for queries, but the given statement is not a query: ``. - -### INVALID_STATEMENT_OR_CLAUSE - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The statement or clause: `` is not valid. - -### [INVALID_SUBQUERY_EXPRESSION](sql-error-conditions-invalid-subquery-expression-error-class.html) - -[SQLSTATE: 42823](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid subquery: - -For more details see [INVALID_SUBQUERY_EXPRESSION](sql-error-conditions-invalid-subquery-expression-error-class.html) - -### INVALID_TEMP_OBJ_REFERENCE - -[SQLSTATE: 42K0F](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create the persistent object `` of the type `` because it references the temporary object `` of the type ``. Please make the temporary object `` persistent, or make the persistent object `` temporary. - -### INVALID_TIME_TRAVEL_SPEC - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot specify both version and timestamp when time travelling the table. - -### [INVALID_TIME_TRAVEL_TIMESTAMP_EXPR](sql-error-conditions-invalid-time-travel-timestamp-expr-error-class.html) - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The time travel timestamp expression `` is invalid. - -For more details see [INVALID_TIME_TRAVEL_TIMESTAMP_EXPR](sql-error-conditions-invalid-time-travel-timestamp-expr-error-class.html) - -### INVALID_TYPED_LITERAL - -[SQLSTATE: 42604](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The value of the typed literal `` is invalid: ``. - -### INVALID_UDF_IMPLEMENTATION - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -Function `` does not implement a ScalarFunction or AggregateFunction. - -### INVALID_URL - -[SQLSTATE: 22P02](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The URL is invalid: ``. If necessary set `` to "false" to bypass this error. - -### INVALID_USAGE_OF_STAR_OR_REGEX - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid usage of `` in ``. - -### INVALID_VARIABLE_TYPE_FOR_QUERY_EXECUTE_IMMEDIATE - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Variable type must be string type, but got ``. - -### INVALID_VARIANT_CAST - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The variant value ```` cannot be cast into ````. Please use `try_variant_get` instead. - -### INVALID_VARIANT_GET_PATH - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The path ```` is not a valid variant extraction path in ````.
-A valid path should start with `$` and is followed by zero or more segments like `[123]`, `.name`, `['name']`, or `["name"]`. - -### INVALID_VIEW_TEXT - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -The view `` cannot be displayed due to invalid view text: ``. This may be caused by an unauthorized modification of the view or an incorrect query syntax. Please check your query syntax and verify that the view has not been tampered with. - -### INVALID_WHERE_CONDITION - -[SQLSTATE: 42903](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The WHERE condition `` contains invalid expressions: ``. -Rewrite the query to avoid window functions, aggregate functions, and generator functions in the WHERE clause. - -### INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot specify ORDER BY or a window frame for ``. - -### INVALID_WRITER_COMMIT_MESSAGE - -[SQLSTATE: 42KDE](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The data source writer has generated an invalid number of commit messages. Expected exactly one writer commit message from each task, but received ``. - -### [INVALID_WRITE_DISTRIBUTION](sql-error-conditions-invalid-write-distribution-error-class.html) - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The requested write distribution is invalid. - -For more details see [INVALID_WRITE_DISTRIBUTION](sql-error-conditions-invalid-write-distribution-error-class.html) - -### JOIN_CONDITION_IS_NOT_BOOLEAN_TYPE - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The join condition `` has the invalid type ``, expected "BOOLEAN". - -### KRYO_BUFFER_OVERFLOW - -[SQLSTATE: 54006](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -Kryo serialization failed: ``. To avoid this, increase "``" value. - -### LOAD_DATA_PATH_NOT_EXISTS - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -LOAD DATA input path does not exist: ``. - -### LOCAL_MUST_WITH_SCHEMA_FILE - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -LOCAL must be used together with the schema of `file`, but got: ````. - -### LOCATION_ALREADY_EXISTS - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot name the managed table as ``, as its associated location `` already exists. Please pick a different table name, or remove the existing location first. - -### MALFORMED_CSV_RECORD - -SQLSTATE: KD000 - -Malformed CSV record: `` - -### MALFORMED_PROTOBUF_MESSAGE - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Malformed Protobuf messages are detected in message deserialization. Parse Mode: ``. To process malformed protobuf message as null result, try setting the option 'mode' as 'PERMISSIVE'. - -### [MALFORMED_RECORD_IN_PARSING](sql-error-conditions-malformed-record-in-parsing-error-class.html) - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Malformed records are detected in record parsing: ``. -Parse Mode: ``. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'. 
- -For more details see [MALFORMED_RECORD_IN_PARSING](sql-error-conditions-malformed-record-in-parsing-error-class.html) - -### MALFORMED_VARIANT - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Variant binary is malformed. Please check the data source is valid. - -### MERGE_CARDINALITY_VIOLATION - -[SQLSTATE: 23K01](sql-error-conditions-sqlstates.html#class-23-integrity-constraint-violation) - -The ON search condition of the MERGE statement matched a single row from the target table with multiple rows of the source table. -This could result in the target row being operated on more than once with an update or delete operation and is not allowed. - -### MISSING_AGGREGATION - -[SQLSTATE: 42803](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The non-aggregating expression `` is based on columns which are not participating in the GROUP BY clause. -Add the columns or the expression to the GROUP BY, aggregate the expression, or use `` if you do not care which of the values within a group is returned. - -### [MISSING_ATTRIBUTES](sql-error-conditions-missing-attributes-error-class.html) - -[SQLSTATE: XX000](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Resolved attribute(s) `` missing from `` in operator ``. - -For more details see [MISSING_ATTRIBUTES](sql-error-conditions-missing-attributes-error-class.html) - -### MISSING_GROUP_BY - -[SQLSTATE: 42803](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The query does not include a GROUP BY clause. Add GROUP BY or turn it into the window functions using OVER clauses. - -### MULTIPLE_TIME_TRAVEL_SPEC - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot specify time travel in both the time travel clause and options. - -### MULTIPLE_XML_DATA_SOURCE - -[SQLSTATE: 42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Detected multiple data sources with the name `` (``). Please specify the fully qualified class name or remove `` from the classpath. - -### MULTI_SOURCES_UNSUPPORTED_FOR_EXPRESSION - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The expression `` does not support more than one source. - -### MULTI_UDF_INTERFACE_ERROR - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Not allowed to implement multiple UDF interfaces, UDF class ``. - -### NAMED_PARAMETERS_NOT_SUPPORTED - -[SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Named parameters are not supported for function ``; please retry the query with positional arguments to the function call instead. - -### NAMED_PARAMETER_SUPPORT_DISABLED - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot call function `` because named argument references are not enabled here. -In this case, the named argument reference was ``. -Set "spark.sql.allowNamedFunctionArguments" to "true" to turn on feature. - -### NESTED_AGGREGATE_FUNCTION - -[SQLSTATE: 42607](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query. 
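A minimal, hypothetical sketch of the MISSING_AGGREGATION condition above and two common fixes (the `emp` table and its columns are invented for illustration):

```sql
-- Selecting a non-grouped, non-aggregated column raises MISSING_AGGREGATION.
SELECT dept, name, count(*) FROM emp GROUP BY dept;
-- Fix 1: add the column to the GROUP BY clause.
SELECT dept, name, count(*) FROM emp GROUP BY dept, name;
-- Fix 2: aggregate it, e.g. with any_value(), if any value per group will do.
SELECT dept, any_value(name), count(*) FROM emp GROUP BY dept;
```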
- -### NESTED_EXECUTE_IMMEDIATE - -SQLSTATE: 07501 - -Nested EXECUTE IMMEDIATE commands are not allowed. Please ensure that the SQL query provided (``) does not contain another EXECUTE IMMEDIATE command. - -### NONEXISTENT_FIELD_NAME_IN_LIST - -SQLSTATE: HV091 - -Field(s) `` do(es) not exist. Available fields: `` - -### NON_FOLDABLE_ARGUMENT - -[SQLSTATE: 42K08](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The function `` requires the parameter `` to be a foldable expression of the type ``, but the actual argument is non-foldable. - -### NON_LAST_MATCHED_CLAUSE_OMIT_CONDITION - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -When there is more than one MATCHED clause in a MERGE statement, only the last MATCHED clause can omit the condition. - -### NON_LAST_NOT_MATCHED_BY_SOURCE_CLAUSE_OMIT_CONDITION - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -When there is more than one NOT MATCHED BY SOURCE clause in a MERGE statement, only the last NOT MATCHED BY SOURCE clause can omit the condition. - -### NON_LAST_NOT_MATCHED_BY_TARGET_CLAUSE_OMIT_CONDITION - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -When there is more than one NOT MATCHED [BY TARGET] clause in a MERGE statement, only the last NOT MATCHED [BY TARGET] clause can omit the condition. - -### NON_LITERAL_PIVOT_VALUES - -[SQLSTATE: 42K08](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Literal expressions are required for pivot values, found ``. - -### NON_PARTITION_COLUMN - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The PARTITION clause cannot contain the non-partition column: ``. - -### NON_TIME_WINDOW_NOT_SUPPORTED_IN_STREAMING - -[SQLSTATE: 42KDE](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Window function is not supported in `` (as column ``) on streaming DataFrames/Datasets. -Structured Streaming only supports time-window aggregation using the WINDOW function. (window specification: ``) - -### [NOT_ALLOWED_IN_FROM](sql-error-conditions-not-allowed-in-from-error-class.html) - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Not allowed in the FROM clause: - -For more details see [NOT_ALLOWED_IN_FROM](sql-error-conditions-not-allowed-in-from-error-class.html) - -### [NOT_A_CONSTANT_STRING](sql-error-conditions-not-a-constant-string-error-class.html) - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The expression `` used for the routine or clause `` must be a constant STRING which is NOT NULL. - -For more details see [NOT_A_CONSTANT_STRING](sql-error-conditions-not-a-constant-string-error-class.html) - -### NOT_A_PARTITIONED_TABLE - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Operation `` is not allowed for `` because it is not a partitioned table. - -### [NOT_NULL_CONSTRAINT_VIOLATION](sql-error-conditions-not-null-constraint-violation-error-class.html) - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Assigning a NULL is not allowed here.
- -For more details see [NOT_NULL_CONSTRAINT_VIOLATION](sql-error-conditions-not-null-constraint-violation-error-class.html) - -### NOT_SUPPORTED_CHANGE_COLUMN - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -ALTER TABLE ALTER/CHANGE COLUMN is not supported for changing `
      `'s column `` with type `` to `` with type ``. - -### NOT_SUPPORTED_COMMAND_FOR_V2_TABLE - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -`` is not supported for v2 tables. - -### NOT_SUPPORTED_COMMAND_WITHOUT_HIVE_SUPPORT - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -`` is not supported, if you want to enable it, please set "spark.sql.catalogImplementation" to "hive". - -### [NOT_SUPPORTED_IN_JDBC_CATALOG](sql-error-conditions-not-supported-in-jdbc-catalog-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Not supported command in JDBC catalog: - -For more details see [NOT_SUPPORTED_IN_JDBC_CATALOG](sql-error-conditions-not-supported-in-jdbc-catalog-error-class.html) - -### NOT_UNRESOLVED_ENCODER - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unresolved encoder expected, but `` was found. - -### NO_DEFAULT_COLUMN_VALUE_AVAILABLE - -[SQLSTATE: 42608](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Can't determine the default value for `` since it is not nullable and it has no default value. - -### NO_HANDLER_FOR_UDAF - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -No handler for UDAF '``'. Use sparkSession.udf.register(...) instead. - -### NO_MERGE_ACTION_SPECIFIED - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -df.mergeInto needs to be followed by at least one of whenMatched/whenNotMatched/whenNotMatchedBySource. - -### NO_SQL_TYPE_IN_PROTOBUF_SCHEMA - -[SQLSTATE: 42S22](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot find `` in Protobuf schema. - -### NO_UDF_INTERFACE - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -UDF class `` doesn't implement any UDF interface. - -### NULLABLE_COLUMN_OR_FIELD - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Column or field `` is nullable while it's required to be non-nullable. - -### NULLABLE_ROW_ID_ATTRIBUTES - -[SQLSTATE: 42000](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Row ID attributes cannot be nullable: ``. - -### NULL_MAP_KEY - -[SQLSTATE: 2200E](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot use null as map key. - -### NULL_QUERY_STRING_EXECUTE_IMMEDIATE - -[SQLSTATE: 22004](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Execute immediate requires a non-null variable as the query string, but the provided variable `` is null. - -### NUMERIC_OUT_OF_SUPPORTED_RANGE - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The value `` cannot be interpreted as a numeric since it has more than 38 digits. 
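A short, assumed example of the NULL_MAP_KEY condition listed above (the key and value literals are invented):

```sql
-- Building a map with a NULL key raises NULL_MAP_KEY.
SELECT map(NULL, 1);
-- Map keys must be non-null; coalesce or filter the key first.
SELECT map(coalesce(NULL, 'unknown'), 1);
```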
- -### [NUMERIC_VALUE_OUT_OF_RANGE](sql-error-conditions-numeric-value-out-of-range-error-class.html) - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - - - -For more details see [NUMERIC_VALUE_OUT_OF_RANGE](sql-error-conditions-numeric-value-out-of-range-error-class.html) - -### NUM_COLUMNS_MISMATCH - -[SQLSTATE: 42826](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` can only be performed on inputs with the same number of columns, but the first input has `` columns and the `` input has `` columns. - -### NUM_TABLE_VALUE_ALIASES_MISMATCH - -[SQLSTATE: 42826](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Number of given aliases does not match number of output columns. -Function name: ``; number of aliases: ``; number of output columns: ``. - -### OPERATION_CANCELED - -[SQLSTATE: HY008](sql-error-conditions-sqlstates.html#class-HY-cli-specific-condition) - -Operation has been canceled. - -### ORDER_BY_POS_OUT_OF_RANGE - -[SQLSTATE: 42805](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -ORDER BY position `` is not in select list (valid range is [1, ``]). - -### PARSE_EMPTY_STATEMENT - -[SQLSTATE: 42617](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Syntax error, unexpected empty statement. - -### PARSE_SYNTAX_ERROR - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Syntax error at or near ````. - -### PARTITIONS_ALREADY_EXIST - -[SQLSTATE: 428FT](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot ADD or RENAME TO partition(s) `` in table `` because they already exist. -Choose a different name, drop the existing partition, or add the IF NOT EXISTS clause to tolerate a pre-existing partition. - -### PARTITIONS_NOT_FOUND - -[SQLSTATE: 428FT](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The partition(s) `` cannot be found in table ``. -Verify the partition specification and table name. -To tolerate the error on drop use ALTER TABLE … DROP IF EXISTS PARTITION. - -### PATH_ALREADY_EXISTS - -[SQLSTATE: 42K04](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Path `` already exists. Set mode as "overwrite" to overwrite the existing path. - -### PATH_NOT_FOUND - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Path does not exist: ``. - -### PIVOT_VALUE_DATA_TYPE_MISMATCH - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid pivot value '``': value data type `` does not match pivot column data type ``. - -### PLAN_VALIDATION_FAILED_RULE_EXECUTOR - -[SQLSTATE: XXKD0](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -The input plan of `` is invalid: `` - -### PLAN_VALIDATION_FAILED_RULE_IN_BATCH - -[SQLSTATE: XXKD0](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Rule `` in batch `` generated an invalid plan: `` - -### PROTOBUF_DEPENDENCY_NOT_FOUND - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Could not find dependency: ``. - -### PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Error reading Protobuf descriptor file at path: ``. 
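The IF NOT EXISTS remedy mentioned for PARTITIONS_ALREADY_EXIST above, sketched against a hypothetical partitioned `sales` table:

```sql
-- Adding a partition that already exists raises PARTITIONS_ALREADY_EXIST...
ALTER TABLE sales ADD PARTITION (dt = '2024-01-01');
-- ...while IF NOT EXISTS tolerates the pre-existing partition.
ALTER TABLE sales ADD IF NOT EXISTS PARTITION (dt = '2024-01-01');
```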
- -### PROTOBUF_FIELD_MISSING - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Searching for `` in Protobuf schema at `` gave `` matches. Candidates: ``. - -### PROTOBUF_FIELD_MISSING_IN_SQL_SCHEMA - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found `` in the Protobuf schema, but there is no match in the SQL schema. - -### PROTOBUF_FIELD_TYPE_MISMATCH - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Type mismatch encountered for field: ``. - -### PROTOBUF_MESSAGE_NOT_FOUND - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unable to locate Message `` in Descriptor. - -### PROTOBUF_TYPE_NOT_SUPPORT - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Protobuf type not yet supported: ``. - -### PYTHON_DATA_SOURCE_ERROR - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -Failed to `` Python data source ``: `` - -### PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -Failed when the Python streaming data source performs ``: `` - -### RECURSIVE_PROTOBUF_SCHEMA - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found a recursive reference in the Protobuf schema, which cannot be processed by Spark by default: ``. Try setting the option `recursive.fields.max.depth` to a value between 0 and 10. Going beyond 10 levels of recursion is not allowed. - -### RECURSIVE_VIEW - -[SQLSTATE: 42K0H](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Recursive view `` detected (cycle: ``). - -### REF_DEFAULT_VALUE_IS_NOT_ALLOWED_IN_PARTITION - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -References to DEFAULT column values are not allowed within the PARTITION clause. - -### RENAME_SRC_PATH_NOT_FOUND - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to rename because `` was not found. - -### REPEATED_CLAUSE - -[SQLSTATE: 42614](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The `` clause may be used at most once per `` operation. - -### REQUIRED_PARAMETER_NOT_FOUND - -[SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot invoke function `` because the parameter named `` is required, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally at index `` or by name) and retry the query. - -### REQUIRES_SINGLE_PART_NAMESPACE - -[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` requires a single-part namespace, but got ``. - -### ROUTINE_ALREADY_EXISTS - -[SQLSTATE: 42723](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create the function `` because it already exists. -Choose a different name, drop or replace the existing function, or add the IF NOT EXISTS clause to tolerate a pre-existing function.
- -### ROUTINE_NOT_FOUND - -[SQLSTATE: 42883](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The function `` cannot be found. Verify the spelling and correctness of the schema and catalog. -If you did not qualify the name with a schema and catalog, verify the current_schema() output, or qualify the name with the correct schema and catalog. -To tolerate the error on drop use DROP FUNCTION IF EXISTS. - -### ROW_SUBQUERY_TOO_MANY_ROWS - -[SQLSTATE: 21000](sql-error-conditions-sqlstates.html#class-21-cardinality-violation) - -More than one row returned by a subquery used as a row. - -### RULE_ID_NOT_FOUND - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Not found an id for the rule name "``". Please modify RuleIdCollection.scala if you are adding a new rule. - -### SCALAR_SUBQUERY_IS_IN_GROUP_BY_OR_AGGREGATE_FUNCTION - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The correlated scalar subquery '``' is neither present in GROUP BY, nor in an aggregate function. -Add it to GROUP BY using ordinal position or wrap it in `first()` (or `first_value`) if you don't care which value you get. - -### SCALAR_SUBQUERY_TOO_MANY_ROWS - -[SQLSTATE: 21000](sql-error-conditions-sqlstates.html#class-21-cardinality-violation) - -More than one row returned by a subquery used as an expression. - -### SCHEMA_ALREADY_EXISTS - -[SQLSTATE: 42P06](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create schema `` because it already exists. -Choose a different name, drop the existing schema, or add the IF NOT EXISTS clause to tolerate pre-existing schema. - -### SCHEMA_NOT_EMPTY - -[SQLSTATE: 2BP01](sql-error-conditions-sqlstates.html#class-2B-dependent-privilege-descriptors-still-exist) - -Cannot drop a schema `` because it contains objects. -Use DROP SCHEMA ... CASCADE to drop the schema and all its objects. - -### SCHEMA_NOT_FOUND - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The schema `` cannot be found. Verify the spelling and correctness of the schema and catalog. -If you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog. -To tolerate the error on drop use DROP SCHEMA IF EXISTS. - -### SECOND_FUNCTION_ARGUMENT_NOT_INTEGER - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The second argument of `` function needs to be an integer. - -### SEED_EXPRESSION_IS_UNFOLDABLE - -[SQLSTATE: 42K08](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The seed expression `` of the expression `` must be foldable. - -### SORT_BY_WITHOUT_BUCKETING - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -sortBy must be used together with bucketBy. - -### SPARK_JOB_CANCELLED - -[SQLSTATE: XXKDA](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Job `` cancelled `` - -### SPECIFY_BUCKETING_IS_NOT_ALLOWED - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A CREATE TABLE without explicit column list cannot specify bucketing information. -Please use the form with explicit column list and specify bucketing information. -Alternatively, allow bucketing information to be inferred by omitting the clause. 
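The drop-side remedies suggested for ROUTINE_NOT_FOUND and SCHEMA_NOT_FOUND above, as a hedged sketch (the object names are invented):

```sql
-- Tolerate a missing function or schema on drop instead of raising
-- ROUTINE_NOT_FOUND / SCHEMA_NOT_FOUND.
DROP FUNCTION IF EXISTS my_udf;
DROP SCHEMA IF EXISTS staging CASCADE;
```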
- -### SPECIFY_CLUSTER_BY_WITH_BUCKETING_IS_NOT_ALLOWED - -[SQLSTATE: 42908](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot specify both CLUSTER BY and CLUSTERED BY INTO BUCKETS. - -### SPECIFY_CLUSTER_BY_WITH_PARTITIONED_BY_IS_NOT_ALLOWED - -[SQLSTATE: 42908](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot specify both CLUSTER BY and PARTITIONED BY. - -### SPECIFY_PARTITION_IS_NOT_ALLOWED - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A CREATE TABLE without explicit column list cannot specify PARTITIONED BY. -Please use the form with explicit column list and specify PARTITIONED BY. -Alternatively, allow partitioning to be inferred by omitting the PARTITION BY clause. - -### SQL_CONF_NOT_FOUND - -[SQLSTATE: 42K0I](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The SQL config `` cannot be found. Please verify that the config exists. - -### STAR_GROUP_BY_POS - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Star (*) is not allowed in a select list when GROUP BY an ordinal position is used. - -### STATEFUL_PROCESSOR_CANNOT_PERFORM_OPERATION_WITH_INVALID_HANDLE_STATE - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to perform stateful processor operation=`` with invalid handle state=``. - -### STATEFUL_PROCESSOR_CANNOT_PERFORM_OPERATION_WITH_INVALID_TIME_MODE - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to perform stateful processor operation=`` with invalid timeMode=`` - -### STATEFUL_PROCESSOR_CANNOT_REINITIALIZE_STATE_ON_KEY - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot re-initialize state on the same grouping key during initial state handling for stateful processor. Invalid grouping key=``. - -### STATEFUL_PROCESSOR_INCORRECT_TIME_MODE_TO_ASSIGN_TTL - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot use TTL for state=`` in timeMode=``, use TimeMode.ProcessingTime() instead. - -### STATEFUL_PROCESSOR_TTL_DURATION_MUST_BE_POSITIVE - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -TTL duration must be greater than zero for State store operation=`` on state=``. - -### STATE_STORE_CANNOT_CREATE_COLUMN_FAMILY_WITH_RESERVED_CHARS - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to create column family with unsupported starting character and name=``. - -### STATE_STORE_CANNOT_USE_COLUMN_FAMILY_WITH_INVALID_NAME - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to perform column family operation=`` with invalid name=``. Column family name cannot be empty or include leading/trailing spaces or use the reserved keyword=default - -### STATE_STORE_HANDLE_NOT_INITIALIZED - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The handle has not been initialized for this StatefulProcessor. -Please only use the StatefulProcessor within the transformWithState operator. 
- -### STATE_STORE_INCORRECT_NUM_ORDERING_COLS_FOR_RANGE_SCAN - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Incorrect number of ordering ordinals=`` for range scan encoder. The number of ordering ordinals cannot be zero or greater than the number of schema columns. - -### STATE_STORE_INCORRECT_NUM_PREFIX_COLS_FOR_PREFIX_SCAN - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Incorrect number of prefix columns=`` for prefix scan encoder. Prefix columns cannot be zero or greater than or equal to the number of schema columns. - -### STATE_STORE_NULL_TYPE_ORDERING_COLS_NOT_SUPPORTED - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Null type ordering column with name=`` at index=`` is not supported for range scan encoder. - -### STATE_STORE_UNSUPPORTED_OPERATION - -[SQLSTATE: XXKST](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -`` operation not supported with `` - -### STATE_STORE_UNSUPPORTED_OPERATION_BINARY_INEQUALITY - -[SQLSTATE: XXKST](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Binary inequality column is not supported with state store. Provided schema: ``. - -### STATE_STORE_UNSUPPORTED_OPERATION_ON_MISSING_COLUMN_FAMILY - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -State store operation=`` not supported on missing column family=``. - -### STATE_STORE_VARIABLE_SIZE_ORDERING_COLS_NOT_SUPPORTED - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Variable size ordering column with name=`` at index=`` is not supported for range scan encoder. - -### STATIC_PARTITION_COLUMN_IN_INSERT_COLUMN_LIST - -[SQLSTATE: 42713](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Static partition column `` is also specified in the column list. - -### STDS_COMMITTED_BATCH_UNAVAILABLE - -SQLSTATE: KD006 - -No committed batch found, checkpoint location: ``. Ensure that the query has run and committed any microbatch before stopping. - -### STDS_CONFLICT_OPTIONS - -[SQLSTATE: 42613](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The options `` cannot be specified together. Please specify only one of them. - -### STDS_FAILED_TO_READ_STATE_SCHEMA - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to read the state schema. Either the file does not exist, or the file is corrupted. Options: ``. -Rerun the streaming query to construct the state schema, and report to the corresponding communities or vendors if the error persists. - -### STDS_INTERNAL_ERROR - -[SQLSTATE: XXKST](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Internal error: `` -Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. - -### [STDS_INVALID_OPTION_VALUE](sql-error-conditions-stds-invalid-option-value-error-class.html) - -[SQLSTATE: 42616](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Invalid value for source option '``': - -For more details see [STDS_INVALID_OPTION_VALUE](sql-error-conditions-stds-invalid-option-value-error-class.html) - -### STDS_NO_PARTITION_DISCOVERED_IN_STATE_STORE - -SQLSTATE: KD006 - -The state does not have any partition.
Please double check that the query points to a valid state. Options: `` - -### STDS_OFFSET_LOG_UNAVAILABLE - -SQLSTATE: KD006 - -The offset log for `` does not exist, checkpoint location: ``. -Please specify a batch ID that is available for querying - you can list the available batch IDs by using the state metadata data source. - -### STDS_OFFSET_METADATA_LOG_UNAVAILABLE - -SQLSTATE: KD006 - -Metadata is not available for offset log for ``, checkpoint location: ``. -The checkpoint seems to have been run only with older Spark version(s). Run the streaming query with a recent Spark version, so that Spark constructs the state metadata. - -### STDS_REQUIRED_OPTION_UNSPECIFIED - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -'``' must be specified. - -### STREAMING_STATEFUL_OPERATOR_NOT_MATCH_IN_STATE_METADATA - -[SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The streaming stateful operator name does not match the operator in the state metadata. This is likely to happen when the user adds, removes, or changes the stateful operators of an existing streaming query. -Stateful operators in the metadata: [``]; Stateful operators in current batch: [``]. - -### STREAM_FAILED - -[SQLSTATE: XXKST](sql-error-conditions-sqlstates.html#class-XX-internal-error) - -Query [id = ``, runId = ``] terminated with exception: `` - -### SUM_OF_LIMIT_AND_OFFSET_EXCEEDS_MAX_INT - -[SQLSTATE: 22003](sql-error-conditions-sqlstates.html#class-22-data-exception) - -The sum of the LIMIT clause and the OFFSET clause must not be greater than the maximum 32-bit integer value (2,147,483,647), but found limit = ``, offset = ``. - -### TABLE_OR_VIEW_ALREADY_EXISTS - -[SQLSTATE: 42P07](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create table or view `` because it already exists. -Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects. - -### TABLE_OR_VIEW_NOT_FOUND - -[SQLSTATE: 42P01](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The table or view `` cannot be found. Verify the spelling and correctness of the schema and catalog. -If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog. -To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS. - -### TABLE_VALUED_FUNCTION_FAILED_TO_ANALYZE_IN_PYTHON - -[SQLSTATE: 38000](sql-error-conditions-sqlstates.html#class-38-external-routine-exception) - -Failed to analyze the Python user-defined table function: `` - -### TABLE_VALUED_FUNCTION_REQUIRED_METADATA_INCOMPATIBLE_WITH_CALL - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Failed to evaluate the table function `` because its table metadata ``, but the function call ``. - -### TABLE_VALUED_FUNCTION_REQUIRED_METADATA_INVALID - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Failed to evaluate the table function `` because its table metadata was invalid; ``. - -### TABLE_VALUED_FUNCTION_TOO_MANY_TABLE_ARGUMENTS - -[SQLSTATE: 54023](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -There are too many table arguments for the table-valued function. -It allows one table argument, but got: ``.
-If you want to allow it, please set "spark.sql.allowMultipleTableArguments.enabled" to "true" - -### TASK_WRITE_FAILED - -SQLSTATE: 58030 - -Task failed while writing rows to ``. - -### TEMP_TABLE_OR_VIEW_ALREADY_EXISTS - -[SQLSTATE: 42P07](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create the temporary view `` because it already exists. -Choose a different name, drop or replace the existing view, or add the IF NOT EXISTS clause to tolerate pre-existing views. - -### TEMP_VIEW_NAME_TOO_MANY_NAME_PARTS - -[SQLSTATE: 428EK](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -CREATE TEMPORARY VIEW or the corresponding Dataset APIs only accept single-part view names, but got: ``. - -### UDTF_ALIAS_NUMBER_MISMATCH - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The number of aliases supplied in the AS clause does not match the number of columns output by the UDTF. -Expected `` aliases, but got ``. -Please ensure that the number of aliases provided matches the number of columns output by the UDTF. - -### UDTF_INVALID_ALIAS_IN_REQUESTED_ORDERING_STRING_FROM_ANALYZE_METHOD - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to evaluate the user-defined table function because its 'analyze' method returned a requested OrderingColumn whose column name expression included an unnecessary alias ``; please remove this alias and then try the query again. - -### UDTF_INVALID_REQUESTED_SELECTED_EXPRESSION_FROM_ANALYZE_METHOD_REQUIRES_ALIAS - -[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Failed to evaluate the user-defined table function because its 'analyze' method returned a requested 'select' expression (``) that does not include a corresponding alias; please update the UDTF to specify an alias there and then try the query again. - -### UNABLE_TO_ACQUIRE_MEMORY - -[SQLSTATE: 53200](sql-error-conditions-sqlstates.html#class-53-insufficient-resources) - -Unable to acquire `` bytes of memory, got ``. - -### UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unable to convert SQL type `` to Protobuf type ``. - -### UNABLE_TO_FETCH_HIVE_TABLES - -SQLSTATE: 58030 - -Unable to fetch tables of Hive database: ``. - -### UNABLE_TO_INFER_SCHEMA - -[SQLSTATE: 42KD9](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unable to infer schema for ``. It must be specified manually. - -### UNBOUND_SQL_PARAMETER - -[SQLSTATE: 42P02](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found the unbound parameter: ``. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`. - -### UNCLOSED_BRACKETED_COMMENT - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Found an unclosed bracketed comment. Please, append */ at the end of the comment. - -### UNEXPECTED_INPUT_TYPE - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Parameter `` of function `` requires the `` type, however `` has the type ``. 
- -### UNEXPECTED_POSITIONAL_ARGUMENT - -[SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot invoke function `` because it contains positional argument(s) following the named argument assigned to ``; please rearrange them so the positional arguments come first and then retry the query again. - -### UNEXPECTED_SERIALIZER_FOR_CLASS - -[SQLSTATE: 42846](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The class `` has an unexpected expression serializer. Expects "STRUCT" or "IF" which returns "STRUCT" but found ``. - -### UNKNOWN_PROTOBUF_MESSAGE_TYPE - -[SQLSTATE: 42K0G](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Attempting to treat `` as a Message, but it was ``. - -### UNPIVOT_REQUIRES_ATTRIBUTES - -[SQLSTATE: 42K0A](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -UNPIVOT requires all given `` expressions to be columns when no `` expressions are given. These are not columns: [``]. - -### UNPIVOT_REQUIRES_VALUE_COLUMNS - -[SQLSTATE: 42K0A](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -At least one value column needs to be specified for UNPIVOT, all columns specified as ids. - -### UNPIVOT_VALUE_DATA_TYPE_MISMATCH - -[SQLSTATE: 42K09](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unpivot value columns must share a least common type, some types do not: [``]. - -### UNPIVOT_VALUE_SIZE_MISMATCH - -[SQLSTATE: 428C4](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -All unpivot value columns must have the same size as there are value column names (``). - -### UNRECOGNIZED_PARAMETER_NAME - -[SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot invoke function `` because the function call included a named argument reference for the argument named ``, but this function does not include any signature containing an argument with this name. Did you mean one of the following? [``]. - -### UNRECOGNIZED_SQL_TYPE - -[SQLSTATE: 42704](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Unrecognized SQL type - name: ``, id: ``. - -### UNRESOLVABLE_TABLE_VALUED_FUNCTION - -[SQLSTATE: 42883](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Could not resolve `` to a table-valued function. -Please make sure that `` is defined as a table-valued function and that all required parameters are provided correctly. -If `` is not defined, please create the table-valued function before using it. -For more information about defining table-valued functions, please refer to the Apache Spark documentation. - -### UNRESOLVED_ALL_IN_GROUP_BY - -[SQLSTATE: 42803](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot infer grouping columns for GROUP BY ALL based on the select clause. Please explicitly specify the grouping columns. - -### [UNRESOLVED_COLUMN](sql-error-conditions-unresolved-column-error-class.html) - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A column, variable, or function parameter with name `` cannot be resolved. 
- -For more details see [UNRESOLVED_COLUMN](sql-error-conditions-unresolved-column-error-class.html) - -### [UNRESOLVED_FIELD](sql-error-conditions-unresolved-field-error-class.html) - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A field with name `` cannot be resolved with the struct-type column ``. - -For more details see [UNRESOLVED_FIELD](sql-error-conditions-unresolved-field-error-class.html) - -### [UNRESOLVED_MAP_KEY](sql-error-conditions-unresolved-map-key-error-class.html) - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve column `` as a map key. If the key is a string literal, add the single quotes '' around it. - -For more details see [UNRESOLVED_MAP_KEY](sql-error-conditions-unresolved-map-key-error-class.html) - -### UNRESOLVED_ROUTINE - -[SQLSTATE: 42883](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve function `` on search path ``. - -### UNRESOLVED_USING_COLUMN_FOR_JOIN - -[SQLSTATE: 42703](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -USING column `` cannot be resolved on the `` side of the join. The ``-side columns: [``]. - -### UNRESOLVED_VARIABLE - -[SQLSTATE: 42883](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot resolve variable `` on search path ``. - -### UNSET_NONEXISTENT_PROPERTIES - -[SQLSTATE: 42K0J](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Attempted to unset non-existent properties [``] in table `
      `. - -### [UNSUPPORTED_ADD_FILE](sql-error-conditions-unsupported-add-file-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Don't support add file. - -For more details see [UNSUPPORTED_ADD_FILE](sql-error-conditions-unsupported-add-file-error-class.html) - -### UNSUPPORTED_ARROWTYPE - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Unsupported arrow type ``. - -### [UNSUPPORTED_CALL](sql-error-conditions-unsupported-call-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Cannot call the method "``" of the class "``". - -For more details see [UNSUPPORTED_CALL](sql-error-conditions-unsupported-call-error-class.html) - -### UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The char/varchar type can't be used in the table schema. -If you want Spark treat them as string type as same as Spark 3.0 and earlier, please set "spark.sql.legacy.charVarcharAsString" to "true". - -### [UNSUPPORTED_COLLATION](sql-error-conditions-unsupported-collation-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Collation `` is not supported for: - -For more details see [UNSUPPORTED_COLLATION](sql-error-conditions-unsupported-collation-error-class.html) - -### UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Unsupported data source type for direct query on files: `` - -### UNSUPPORTED_DATATYPE - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Unsupported data type ``. - -### UNSUPPORTED_DATA_SOURCE_SAVE_MODE - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The data source "``" cannot be written in the `` mode. Please use either the "Append" or "Overwrite" mode instead. - -### UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The `` datasource doesn't support the column `` of the type ``. - -### [UNSUPPORTED_DEFAULT_VALUE](sql-error-conditions-unsupported-default-value-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -DEFAULT column values is not supported. - -For more details see [UNSUPPORTED_DEFAULT_VALUE](sql-error-conditions-unsupported-default-value-error-class.html) - -### [UNSUPPORTED_DESERIALIZER](sql-error-conditions-unsupported-deserializer-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The deserializer is not supported: - -For more details see [UNSUPPORTED_DESERIALIZER](sql-error-conditions-unsupported-deserializer-error-class.html) - -### UNSUPPORTED_EXPRESSION_GENERATED_COLUMN - -[SQLSTATE: 42621](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create generated column `` with generation expression `` because ``. - -### UNSUPPORTED_EXPR_FOR_OPERATOR - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A query operator contains one or more unsupported expressions. -Consider to rewrite it to avoid window functions, aggregate functions, and generator functions in the WHERE clause. 
-Invalid expressions: [``] - -### UNSUPPORTED_EXPR_FOR_PARAMETER - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -A query parameter contains unsupported expression. -Parameters can either be variables or literals. -Invalid expression: [``] - -### UNSUPPORTED_EXPR_FOR_WINDOW - -[SQLSTATE: 42P20](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Expression `` not supported within a window function. - -### [UNSUPPORTED_FEATURE](sql-error-conditions-unsupported-feature-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The feature is not supported: - -For more details see [UNSUPPORTED_FEATURE](sql-error-conditions-unsupported-feature-error-class.html) - -### [UNSUPPORTED_GENERATOR](sql-error-conditions-unsupported-generator-error-class.html) - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The generator is not supported: - -For more details see [UNSUPPORTED_GENERATOR](sql-error-conditions-unsupported-generator-error-class.html) - -### UNSUPPORTED_GROUPING_EXPRESSION - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup. - -### [UNSUPPORTED_INSERT](sql-error-conditions-unsupported-insert-error-class.html) - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Can't insert into the target. - -For more details see [UNSUPPORTED_INSERT](sql-error-conditions-unsupported-insert-error-class.html) - -### [UNSUPPORTED_MERGE_CONDITION](sql-error-conditions-unsupported-merge-condition-error-class.html) - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -MERGE operation contains unsupported `` condition. - -For more details see [UNSUPPORTED_MERGE_CONDITION](sql-error-conditions-unsupported-merge-condition-error-class.html) - -### [UNSUPPORTED_OVERWRITE](sql-error-conditions-unsupported-overwrite-error-class.html) - -[SQLSTATE: 42902](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Can't overwrite the target that is also being read from. - -For more details see [UNSUPPORTED_OVERWRITE](sql-error-conditions-unsupported-overwrite-error-class.html) - -### [UNSUPPORTED_SAVE_MODE](sql-error-conditions-unsupported-save-mode-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -The save mode `` is not supported for: - -For more details see [UNSUPPORTED_SAVE_MODE](sql-error-conditions-unsupported-save-mode-error-class.html) - -### [UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY](sql-error-conditions-unsupported-subquery-expression-category-error-class.html) - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Unsupported subquery expression: - -For more details see [UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY](sql-error-conditions-unsupported-subquery-expression-category-error-class.html) - -### UNSUPPORTED_TYPED_LITERAL - -[SQLSTATE: 0A000](sql-error-conditions-sqlstates.html#class-0A-feature-not-supported) - -Literals of the type `` are not supported. Supported types are ``. 
- -### UNTYPED_SCALA_UDF - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -You're using untyped Scala UDF, which does not have the input type information. Spark may blindly pass null to the Scala closure with primitive-type argument, and the closure will see the default value of the Java type for the null argument, e.g. `udf((x: Int) => x, IntegerType)`, the result is 0 for null input. To get rid of this error, you could: -1. use typed Scala UDF APIs(without return type parameter), e.g. `udf((x: Int) => x)`. -2. use Java UDF APIs, e.g. `udf(new UDF1[String, Integer] { override def call(s: String): Integer = s.length() }, IntegerType)`, if input types are all non primitive. -3. set "spark.sql.legacy.allowUntypedScalaUDF" to "true" and use this API with caution. - -### USER_RAISED_EXCEPTION - -SQLSTATE: P0001 - -`` - -### USER_RAISED_EXCEPTION_PARAMETER_MISMATCH - -SQLSTATE: P0001 - -The `raise_error()` function was used to raise error class: `` which expects parameters: ``. -The provided parameters `` do not match the expected parameters. -Please make sure to provide all expected parameters. - -### USER_RAISED_EXCEPTION_UNKNOWN_ERROR_CLASS - -SQLSTATE: P0001 - -The `raise_error()` function was used to raise an unknown error class: `` - -### VARIABLE_ALREADY_EXISTS - -[SQLSTATE: 42723](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create the variable `` because it already exists. -Choose a different name, or drop or replace the existing variable. - -### VARIABLE_NOT_FOUND - -[SQLSTATE: 42883](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The variable `` cannot be found. Verify the spelling and correctness of the schema and catalog. -If you did not qualify the name with a schema and catalog, verify the current_schema() output, or qualify the name with the correct schema and catalog. -To tolerate the error on drop use DROP VARIABLE IF EXISTS. - -### VARIANT_CONSTRUCTOR_SIZE_LIMIT - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot construct a Variant larger than 16 MiB. The maximum allowed size of a Variant value is 16 MiB. - -### VARIANT_DUPLICATE_KEY - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Failed to build variant because of a duplicate object key ````. - -### VARIANT_SIZE_LIMIT - -[SQLSTATE: 22023](sql-error-conditions-sqlstates.html#class-22-data-exception) - -Cannot build variant bigger than `` in ``. -Please avoid large input strings to this expression (for example, add function calls(s) to check the expression size and convert it to NULL first if it is too big). - -### VIEW_ALREADY_EXISTS - -[SQLSTATE: 42P07](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create view `` because it already exists. -Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects. - -### VIEW_EXCEED_MAX_NESTED_DEPTH - -[SQLSTATE: 54K00](sql-error-conditions-sqlstates.html#class-54-program-limit-exceeded) - -The depth of view `` exceeds the maximum view resolution depth (``). -Analysis is aborted to avoid errors. If you want to work around this, please try to increase the value of "spark.sql.view.maxNestedViewDepth". - -### VIEW_NOT_FOUND - -[SQLSTATE: 42P01](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The view `` cannot be found. 
Verify the spelling and correctness of the schema and catalog. -If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog. -To tolerate the error on drop use DROP VIEW IF EXISTS. - -### WINDOW_FUNCTION_AND_FRAME_MISMATCH - -[SQLSTATE: 42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` function can only be evaluated in an ordered row-based window frame with a single offset: ``. - -### WINDOW_FUNCTION_WITHOUT_OVER_CLAUSE - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Window function `` requires an OVER clause. - -### WRITE_STREAM_NOT_ALLOWED - -[SQLSTATE: 42601](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`writeStream` can be called only on streaming Dataset/DataFrame. - -### WRONG_COMMAND_FOR_OBJECT_TYPE - -[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The operation `` requires a ``. But `` is a ``. Use `` instead. - -### [WRONG_NUM_ARGS](sql-error-conditions-wrong-num-args-error-class.html) - -[SQLSTATE: 42605](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -The `` requires `` parameters but the actual number is ``. - -For more details see [WRONG_NUM_ARGS](sql-error-conditions-wrong-num-args-error-class.html) - -### XML_ROW_TAG_MISSING - -[SQLSTATE: 42KDF](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -`` option is required for reading files in XML format. +{% include_api_gen _generated/error-conditions.html %} diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 3004008b8ec78..4707e491fa674 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -24,13 +24,15 @@ license: | ## Upgrading from Spark SQL 3.5 to 4.0 +- Since Spark 4.0, `spark.sql.ansi.enabled` is on by default. To restore the previous behavior, set `spark.sql.ansi.enabled` to `false` or `SPARK_ANSI_SQL_MODE` to `false`. +- Since Spark 4.0, `CREATE TABLE` syntax without `USING` and `STORED AS` will use the value of `spark.sql.sources.default` as the table provider instead of `Hive`. To restore the previous behavior, set `spark.sql.legacy.createHiveTableByDefault` to `true` or `SPARK_SQL_LEGACY_CREATE_HIVE_TABLE` to `true`. - Since Spark 4.0, the default behaviour when inserting elements in a map is changed to first normalize keys -0.0 to 0.0. The affected SQL functions are `create_map`, `map_from_arrays`, `map_from_entries`, and `map_concat`. To restore the previous behaviour, set `spark.sql.legacy.disableMapKeyNormalization` to `true`. - Since Spark 4.0, the default value of `spark.sql.maxSinglePartitionBytes` is changed from `Long.MaxValue` to `128m`. To restore the previous behavior, set `spark.sql.maxSinglePartitionBytes` to `9223372036854775807`(`Long.MaxValue`). - Since Spark 4.0, any read of SQL tables takes into consideration the SQL configs `spark.sql.files.ignoreCorruptFiles`/`spark.sql.files.ignoreMissingFiles` instead of the core config `spark.files.ignoreCorruptFiles`/`spark.files.ignoreMissingFiles`. - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. 
- Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead.
- Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value.
-- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.
+- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.
- Since Spark 4.0, the legacy datetime rebasing SQL configs with the prefix `spark.sql.legacy` are removed. To restore the previous behavior, use the following configs:
  - `spark.sql.parquet.int96RebaseModeInWrite` instead of `spark.sql.legacy.parquet.int96RebaseModeInWrite`
  - `spark.sql.parquet.datetimeRebaseModeInWrite` instead of `spark.sql.legacy.parquet.datetimeRebaseModeInWrite`
@@ -39,15 +41,25 @@ license: |
  - `spark.sql.avro.datetimeRebaseModeInRead` instead of `spark.sql.legacy.avro.datetimeRebaseModeInRead`
- Since Spark 4.0, the default value of `spark.sql.orc.compression.codec` is changed from `snappy` to `zstd`. To restore the previous behavior, set `spark.sql.orc.compression.codec` to `snappy`.
- Since Spark 4.0, the SQL config `spark.sql.legacy.allowZeroIndexInFormatString` is deprecated. Consider to change `strfmt` of the `format_string` function to use 1-based indexes. The first argument must be referenced by "1$", the second by "2$", etc.
-- Since Spark 4.0, JDBC read option `preferTimestampNTZ=true` will not convert Postgres TIMESTAMP WITH TIME ZONE and TIME WITH TIME ZONE data types to TimestampNTZType, which is available in Spark 3.5.
-- Since Spark 4.0, JDBC read option `preferTimestampNTZ=true` will not convert MySQL TIMESTAMP to TimestampNTZType, which is available in Spark 3.5. MySQL DATETIME is not affected.
+- Since Spark 4.0, Postgres JDBC datasource will read TIMESTAMP WITH TIME ZONE as TimestampType regardless of the JDBC read option `preferTimestampNTZ`, while in 3.5 and previous it was read as TimestampNTZType when `preferTimestampNTZ=true`. To restore the previous behavior, set `spark.sql.legacy.postgres.datetimeMapping.enabled` to `true`.
+- Since Spark 4.0, Postgres JDBC datasource will write TimestampType as TIMESTAMP WITH TIME ZONE, while in 3.5 and previous, it wrote as TIMESTAMP a.k.a. TIMESTAMP WITHOUT TIME ZONE. To restore the previous behavior, set `spark.sql.legacy.postgres.datetimeMapping.enabled` to `true`.
+- Since Spark 4.0, MySQL JDBC datasource will read TIMESTAMP as TimestampType regardless of the JDBC read option `preferTimestampNTZ`, while in 3.5 and previous it was read as TimestampNTZType when `preferTimestampNTZ=true`. To restore the previous behavior, set `spark.sql.legacy.mysql.timestampNTZMapping.enabled` to `true`. MySQL DATETIME is not affected.
- Since Spark 4.0, MySQL JDBC datasource will read SMALLINT as ShortType, while in Spark 3.5 and previous, it was read as IntegerType. MEDIUMINT UNSIGNED is read as IntegerType, while in Spark 3.5 and previous, it was read as LongType. To restore the previous behavior, you can cast the column to the old type.
- Since Spark 4.0, MySQL JDBC datasource will read FLOAT as FloatType, while in Spark 3.5 and previous, it was read as DoubleType. To restore the previous behavior, you can cast the column to the old type.
- Since Spark 4.0, MySQL JDBC datasource will read BIT(n > 1) as BinaryType, while in Spark 3.5 and previous, read as LongType. To restore the previous behavior, set `spark.sql.legacy.mysql.bitArrayMapping.enabled` to `true`.
- Since Spark 4.0, MySQL JDBC datasource will write ShortType as SMALLINT, while in Spark 3.5 and previous, write as INTEGER. To restore the previous behavior, you can replace the column with IntegerType whenever before writing.
+- Since Spark 4.0, MySQL JDBC datasource will write TimestampNTZType as MySQL DATETIME because they both represent TIMESTAMP WITHOUT TIME ZONE, while in 3.5 and previous, it wrote as MySQL TIMESTAMP. To restore the previous behavior, set `spark.sql.legacy.mysql.timestampNTZMapping.enabled` to `true`.
- Since Spark 4.0, Oracle JDBC datasource will write TimestampType as TIMESTAMP WITH LOCAL TIME ZONE, while in Spark 3.5 and previous, write as TIMESTAMP. To restore the previous behavior, set `spark.sql.legacy.oracle.timestampMapping.enabled` to `true`.
+- Since Spark 4.0, MsSQL Server JDBC datasource will read TINYINT as ShortType, while in Spark 3.5 and previous, read as IntegerType. To restore the previous behavior, set `spark.sql.legacy.mssqlserver.numericMapping.enabled` to `true`.
+- Since Spark 4.0, MsSQL Server JDBC datasource will read DATETIMEOFFSET as TimestampType, while in Spark 3.5 and previous, read as StringType. To restore the previous behavior, set `spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled` to `true`.
+- Since Spark 4.0, DB2 JDBC datasource will read SMALLINT as ShortType, while in Spark 3.5 and previous, it was read as IntegerType. To restore the previous behavior, set `spark.sql.legacy.db2.numericMapping.enabled` to `true`.
+- Since Spark 4.0, DB2 JDBC datasource will write BooleanType as BOOLEAN, while in Spark 3.5 and previous, write as CHAR(1). To restore the previous behavior, set `spark.sql.legacy.db2.booleanMapping.enabled` to `true`.
- Since Spark 4.0, The default value for `spark.sql.legacy.ctePrecedencePolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an error, inner CTE definitions take precedence over outer definitions.
- Since Spark 4.0, The default value for `spark.sql.legacy.timeParserPolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an `INCONSISTENT_BEHAVIOR_CROSS_VERSION` error, `CANNOT_PARSE_TIMESTAMP` will be raised if ANSI mode is enable. `NULL` will be returned if ANSI mode is disabled. See [Datetime Patterns for Formatting and Parsing](sql-ref-datetime-pattern.html).
+- Since Spark 4.0, a bug falsely allowing `!` instead of `NOT` when `!` is not a prefix operator has been fixed. Clauses such as `expr ! IN (...)`, `expr ! BETWEEN ...`, or `col ! NULL` now raise syntax errors. To restore the previous behavior, set `spark.sql.legacy.bangEqualsNot` to `true`.
+- Since Spark 4.0, by default views tolerate column type changes in the query and compensate with casts. To restore the previous behavior, allowing up-casts only, set `spark.sql.legacy.viewSchemaCompensation` to `false`.
+- Since Spark 4.0, views allow control over how they react to underlying query changes. By default views tolerate column type changes in the query and compensate with casts. To disable this feature, set `spark.sql.legacy.viewSchemaBindingMode` to `false`.
This also removes the clause from `DESCRIBE EXTENDED` and `SHOW CREATE TABLE`. +- Since Spark 4.0, The Storage-Partitioned Join feature flag `spark.sql.sources.v2.bucketing.pushPartValues.enabled` is set to `true`. To restore the previous behavior, set `spark.sql.sources.v2.bucketing.pushPartValues.enabled` to `false`. ## Upgrading from Spark SQL 3.5.1 to 3.5.2 @@ -85,6 +97,7 @@ license: | - Since Spark 3.4, `BinaryType` is not supported in CSV datasource. In Spark 3.3 or earlier, users can write binary columns in CSV datasource, but the output content in CSV files is `Object.toString()` which is meaningless; meanwhile, if users read CSV tables with binary columns, Spark will throw an `Unsupported type: binary` exception. - Since Spark 3.4, bloom filter joins are enabled by default. To restore the legacy behavior, set `spark.sql.optimizer.runtime.bloomFilter.enabled` to `false`. - Since Spark 3.4, when schema inference on external Parquet files, INT64 timestamps with annotation `isAdjustedToUTC=false` will be inferred as TimestampNTZ type instead of Timestamp type. To restore the legacy behavior, set `spark.sql.parquet.inferTimestampNTZ.enabled` to `false`. + - Since Spark 3.4, the behavior for `CREATE TABLE AS SELECT ...` is changed from OVERWRITE to APPEND when `spark.sql.legacy.allowNonEmptyLocationInCTAS` is set to `true`. Users are recommended to avoid CTAS with a non-empty table location. ## Upgrading from Spark SQL 3.2 to 3.3 @@ -1062,7 +1075,7 @@ Python UDF registration is unchanged. Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs. Currently, Hive SerDes and UDFs are based on built-in Hive, and Spark SQL can be connected to different versions of Hive Metastore -(from 0.12.0 to 2.3.9 and 3.0.0 to 3.1.3. Also see [Interacting with Different Versions of Hive Metastore](sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)). +(from 2.0.0 to 2.3.10 and 3.0.0 to 3.1.3. Also see [Interacting with Different Versions of Hive Metastore](sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)). #### Deploying in Existing Hive Warehouses {:.no_toc} diff --git a/docs/sql-performance-tuning.md b/docs/sql-performance-tuning.md index b443e3d9c5f59..12b79828e44cb 100644 --- a/docs/sql-performance-tuning.md +++ b/docs/sql-performance-tuning.md @@ -428,3 +428,122 @@ You can control the details of how AQE works by providing your own cost evaluato
      3.2.0
+
+## Storage Partition Join
+
+Storage Partition Join (SPJ) is an optimization technique in Spark SQL that makes use of the existing storage layout to avoid the shuffle phase.
+
+This is a generalization of the concept of Bucket Joins, which applies only to [bucketed](sql-data-sources-load-save-functions.html#bucketing-sorting-and-partitioning) tables, to tables partitioned by functions registered in FunctionCatalog. Storage Partition Join is currently supported for compatible V2 DataSources.
+
+The following SQL properties enable Storage Partition Join in different join queries with various optimizations.
+
+<table class="table">
+  <thead>
+    <tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>
+  </thead>
+  <tr>
+    <td>spark.sql.sources.v2.bucketing.enabled</td>
+    <td>false</td>
+    <td>When true, try to eliminate shuffle by using the partitioning reported by a compatible V2 data source.</td>
+    <td>3.3.0</td>
+  </tr>
+  <tr>
+    <td>spark.sql.sources.v2.bucketing.pushPartValues.enabled</td>
+    <td>true</td>
+    <td>When enabled, try to eliminate shuffle if one side of the join has missing partition values from the other side. This config requires spark.sql.sources.v2.bucketing.enabled to be true.</td>
+    <td>3.4.0</td>
+  </tr>
+  <tr>
+    <td>spark.sql.requireAllClusterKeysForCoPartition</td>
+    <td>true</td>
+    <td>When true, require the join or MERGE keys to be the same and in the same order as the partition keys to eliminate shuffle. Hence, set this to false to eliminate shuffle when the join keys are not an exact, ordered match of the partition keys.</td>
+    <td>3.4.0</td>
+  </tr>
+  <tr>
+    <td>spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled</td>
+    <td>false</td>
+    <td>When true, and when the join is not a full outer join, enable skew optimizations to handle partitions with large amounts of data when avoiding shuffle. One side will be chosen as the big table based on table statistics, and the splits on this side will be partially-clustered. The splits of the other side will be grouped and replicated to match. This config requires both spark.sql.sources.v2.bucketing.enabled and spark.sql.sources.v2.bucketing.pushPartValues.enabled to be true.</td>
+    <td>3.4.0</td>
+  </tr>
+  <tr>
+    <td>spark.sql.sources.v2.bucketing.allowJoinKeysSubsetOfPartitionKeys.enabled</td>
+    <td>false</td>
+    <td>When enabled, try to avoid shuffle if the join or MERGE condition does not include all partition columns. This config requires both spark.sql.sources.v2.bucketing.enabled and spark.sql.sources.v2.bucketing.pushPartValues.enabled to be true, and spark.sql.requireAllClusterKeysForCoPartition to be false.</td>
+    <td>4.0.0</td>
+  </tr>
+  <tr>
+    <td>spark.sql.sources.v2.bucketing.allowCompatibleTransforms.enabled</td>
+    <td>false</td>
+    <td>When enabled, try to avoid shuffle if partition transforms are compatible but not identical. This config requires both spark.sql.sources.v2.bucketing.enabled and spark.sql.sources.v2.bucketing.pushPartValues.enabled to be true.</td>
+    <td>4.0.0</td>
+  </tr>
+  <tr>
+    <td>spark.sql.sources.v2.bucketing.shuffle.enabled</td>
+    <td>false</td>
+    <td>When enabled, try to avoid shuffle on one side of the join by recognizing the partitioning reported by a V2 data source on the other side.</td>
+    <td>4.0.0</td>
+  </tr>
+</table>
      + +If Storage Partition Join is performed, the query plan will not contain Exchange nodes prior to the join. + +The following example uses Iceberg ([https://iceberg.apache.org/docs/latest/spark-getting-started/](https://iceberg.apache.org/docs/latest/spark-getting-started/)), a Spark V2 DataSource that supports Storage Partition Join. +```sql +CREATE TABLE prod.db.target (id INT, salary INT, dep STRING) +USING iceberg +PARTITIONED BY (dep, bucket(8, id)) + +CREATE TABLE prod.db.source (id INT, salary INT, dep STRING) +USING iceberg +PARTITIONED BY (dep, bucket(8, id)) + +EXPLAIN SELECT * FROM target t INNER JOIN source s +ON t.dep = s.dep AND t.id = s.id + +-- Plan without Storage Partition Join +== Physical Plan == +* Project (12) ++- * SortMergeJoin Inner (11) + :- * Sort (5) + : +- Exchange (4) // DATA SHUFFLE + : +- * Filter (3) + : +- * ColumnarToRow (2) + : +- BatchScan (1) + +- * Sort (10) + +- Exchange (9) // DATA SHUFFLE + +- * Filter (8) + +- * ColumnarToRow (7) + +- BatchScan (6) + + +SET 'spark.sql.sources.v2.bucketing.enabled' 'true' +SET 'spark.sql.iceberg.planning.preserve-data-grouping' 'true' +SET 'spark.sql.sources.v2.bucketing.pushPartValues.enabled' 'true' +SET 'spark.sql.requireAllClusterKeysForCoPartition' 'false' +SET 'spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled' 'true' + +-- Plan with Storage Partition Join +== Physical Plan == +* Project (10) ++- * SortMergeJoin Inner (9) + :- * Sort (4) + : +- * Filter (3) + : +- * ColumnarToRow (2) + : +- BatchScan (1) + +- * Sort (8) + +- * Filter (7) + +- * ColumnarToRow (6) + +- BatchScan (5) +``` \ No newline at end of file diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index bf1819b9767b0..920b3392854c9 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -21,18 +21,18 @@ license: | In Spark SQL, there are two options to comply with the SQL standard: `spark.sql.ansi.enabled` and `spark.sql.storeAssignmentPolicy` (See a table below for details). -When `spark.sql.ansi.enabled` is set to `true`, Spark SQL uses an ANSI compliant dialect instead of being Hive compliant. For example, Spark will throw an exception at runtime instead of returning null results if the inputs to a SQL operator/function are invalid. Some ANSI dialect features may be not from the ANSI SQL standard directly, but their behaviors align with ANSI SQL's style. +By default, `spark.sql.ansi.enabled` is `true` and Spark SQL uses an ANSI compliant dialect instead of being Hive compliant. For example, Spark will throw an exception at runtime instead of returning null results if the inputs to a SQL operator/function are invalid. Some ANSI dialect features may be not from the ANSI SQL standard directly, but their behaviors align with ANSI SQL's style. Moreover, Spark SQL has an independent option to control implicit casting behaviours when inserting rows in a table. The casting behaviours are defined as store assignment rules in the standard. -When `spark.sql.storeAssignmentPolicy` is set to `ANSI`, Spark SQL complies with the ANSI store assignment rules. This is a separate configuration because its default value is `ANSI`, while the configuration `spark.sql.ansi.enabled` is disabled by default. +By default, `spark.sql.storeAssignmentPolicy` is `ANSI` and Spark SQL complies with the ANSI store assignment rules. 
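A minimal sketch of the behavior change described above, using a plain SQL session (illustrative only; the exact error message text depends on the Spark build):

```sql
-- With the Spark 4.0 default (spark.sql.ansi.enabled=true), an overflowing
-- integer addition raises an arithmetic overflow error at runtime.
SELECT 2147483647 + 1;

-- Restoring the pre-4.0 behavior for the session: the same query silently
-- wraps around and returns -2147483648.
SET spark.sql.ansi.enabled=false;
SELECT 2147483647 + 1;
```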
- + + + + + + """ + .format( + anchor=anchor_name(condition_name), + sql_state=condition_details["sqlState"], + # This inserts soft break opportunities so that if a long name needs to be wrapped + # it will wrap in a visually pleasing manner. + # See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr + condition_name=condition_name.replace("_", "_"), + message=condition_details["message"], + ) + ] + sub_condition_rows = [] + if "subClass" in condition_details: + for sub_condition_name in sorted(condition_details["subClass"]): + sub_condition_rows.append( + """ + + + + + + """ + .format( + anchor=anchor_name(condition_name, sub_condition_name), + # See comment above for explanation of ``. + sub_condition_name=sub_condition_name.replace("_", "_"), + message=condition_details["subClass"][sub_condition_name]["message"], + ) + ) + doc_rows = condition_row + sub_condition_rows + return [ + dedent(row).strip() + for row in doc_rows + ] + + +def generate_doc_table(error_conditions): + doc_rows = chain.from_iterable([ + generate_doc_rows(condition_name, condition_details) + for condition_name, condition_details + in sorted( + error_conditions.items(), + key=lambda x: (x[1]["sqlState"], x[0]), + ) + ]) + table_html = ( + """ +
      Property NameDefaultMeaningSince Version
      spark.sql.ansi.enabledfalsetrue When true, Spark tries to conform to the ANSI SQL specification:
      1. Spark SQL will throw runtime exceptions on invalid operations, including integer overflow @@ -67,10 +67,8 @@ The following subsections present behaviour changes in arithmetic operations, ty ### Arithmetic Operations -In Spark SQL, arithmetic operations performed on numeric types (with the exception of decimal) are not checked for overflows by default. -This means that in case an operation causes overflows, the result is the same with the corresponding operation in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number). -On the other hand, Spark SQL returns null for decimal overflows. -When `spark.sql.ansi.enabled` is set to `true` and an overflow occurs in numeric and interval arithmetic operations, it throws an arithmetic exception at runtime. +In Spark SQL, by default, Spark throws an arithmetic exception at runtime for both interval and numeric type overflows. +If `spark.sql.ansi.enabled` is `false`, then the decimal type will produce `null` values and other numeric types will behave in the same way as the corresponding operation in a Java/Scala program (e.g., if the sum of 2 integers is higher than the maximum value representable, the result is a negative number) which is the behavior of Spark 3 or older. ```sql -- `spark.sql.ansi.enabled=true` @@ -141,7 +139,7 @@ In the table above, all the `CAST`s with new syntax are marked as red ANSI Mode|Spark SQL
      Default Mode|SQL-2016| +|Keyword|Spark SQL
      ANSI Mode|Spark SQL
      NonANSI Mode|SQL-2016| |--|----------------------|-------------------------|--------| |ADD|non-reserved|non-reserved|non-reserved| |AFTER|non-reserved|non-reserved|non-reserved| @@ -415,9 +414,11 @@ Below is a list of all the keywords in Spark SQL. |ASC|non-reserved|non-reserved|non-reserved| |AT|non-reserved|non-reserved|reserved| |AUTHORIZATION|reserved|non-reserved|reserved| +|BEGIN|non-reserved|non-reserved|non-reserved| |BETWEEN|non-reserved|non-reserved|reserved| |BIGINT|non-reserved|non-reserved|reserved| |BINARY|non-reserved|non-reserved|reserved| +|BINDING|non-reserved|non-reserved|non-reserved| |BOOLEAN|non-reserved|non-reserved|reserved| |BOTH|reserved|non-reserved|reserved| |BUCKET|non-reserved|non-reserved|non-reserved| @@ -425,6 +426,7 @@ Below is a list of all the keywords in Spark SQL. |BY|non-reserved|non-reserved|reserved| |BYTE|non-reserved|non-reserved|non-reserved| |CACHE|non-reserved|non-reserved|non-reserved| +|CALLED|non-reserved|non-reserved|non-reserved| |CASCADE|non-reserved|non-reserved|non-reserved| |CASE|reserved|non-reserved|reserved| |CAST|reserved|non-reserved|reserved| @@ -447,9 +449,11 @@ Below is a list of all the keywords in Spark SQL. |COMMIT|non-reserved|non-reserved|reserved| |COMPACT|non-reserved|non-reserved|non-reserved| |COMPACTIONS|non-reserved|non-reserved|non-reserved| +|COMPENSATION|non-reserved|non-reserved|non-reserved| |COMPUTE|non-reserved|non-reserved|non-reserved| |CONCATENATE|non-reserved|non-reserved|non-reserved| |CONSTRAINT|reserved|non-reserved|reserved| +|CONTAINS|non-reserved|non-reserved|non-reserved| |COST|non-reserved|non-reserved|non-reserved| |CREATE|reserved|non-reserved|reserved| |CROSS|reserved|strict-non-reserved|reserved| @@ -476,10 +480,12 @@ Below is a list of all the keywords in Spark SQL. |DECLARE|non-reserved|non-reserved|non-reserved| |DEFAULT|non-reserved|non-reserved|non-reserved| |DEFINED|non-reserved|non-reserved|non-reserved| +|DEFINER|non-reserved|non-reserved|non-reserved| |DELETE|non-reserved|non-reserved|reserved| |DELIMITED|non-reserved|non-reserved|non-reserved| |DESC|non-reserved|non-reserved|non-reserved| |DESCRIBE|non-reserved|non-reserved|reserved| +|DETERMINISTIC|non-reserved|non-reserved|reserved| |DFS|non-reserved|non-reserved|non-reserved| |DIRECTORIES|non-reserved|non-reserved|non-reserved| |DIRECTORY|non-reserved|non-reserved|non-reserved| @@ -492,6 +498,7 @@ Below is a list of all the keywords in Spark SQL. |END|reserved|non-reserved|reserved| |ESCAPE|reserved|non-reserved|reserved| |ESCAPED|non-reserved|non-reserved|non-reserved| +|EVOLUTION|non-reserved|non-reserved|non-reserved| |EXCEPT|reserved|strict-non-reserved|reserved| |EXCHANGE|non-reserved|non-reserved|non-reserved| |EXCLUDE|non-reserved|non-reserved|non-reserved| @@ -537,6 +544,7 @@ Below is a list of all the keywords in Spark SQL. |INDEXES|non-reserved|non-reserved|non-reserved| |INNER|reserved|strict-non-reserved|reserved| |INPATH|non-reserved|non-reserved|non-reserved| +|INPUT|non-reserved|non-reserved|non-reserved| |INPUTFORMAT|non-reserved|non-reserved|non-reserved| |INSERT|non-reserved|non-reserved|reserved| |INT|non-reserved|non-reserved|reserved| @@ -544,10 +552,12 @@ Below is a list of all the keywords in Spark SQL. 
|INTERSECT|reserved|strict-non-reserved|reserved| |INTERVAL|non-reserved|non-reserved|reserved| |INTO|reserved|non-reserved|reserved| +|INVOKER|non-reserved|non-reserved|non-reserved| |IS|reserved|non-reserved|reserved| |ITEMS|non-reserved|non-reserved|non-reserved| |JOIN|reserved|strict-non-reserved|reserved| |KEYS|non-reserved|non-reserved|non-reserved| +|LANGUAGE|non-reserved|non-reserved|reserved| |LAST|non-reserved|non-reserved|non-reserved| |LATERAL|reserved|strict-non-reserved|reserved| |LAZY|non-reserved|non-reserved|non-reserved| @@ -576,6 +586,7 @@ Below is a list of all the keywords in Spark SQL. |MINUTE|non-reserved|non-reserved|non-reserved| |MINUTES|non-reserved|non-reserved|non-reserved| |MINUS|non-reserved|strict-non-reserved|non-reserved| +|MODIFIES|non-reserved|non-reserved|non-reserved| |MONTH|non-reserved|non-reserved|non-reserved| |MONTHS|non-reserved|non-reserved|non-reserved| |MSCK|non-reserved|non-reserved|non-reserved| @@ -609,8 +620,6 @@ Below is a list of all the keywords in Spark SQL. |PARTITIONED|non-reserved|non-reserved|non-reserved| |PARTITIONS|non-reserved|non-reserved|non-reserved| |PERCENT|non-reserved|non-reserved|non-reserved| -|PERCENTILE_CONT|reserved|non-reserved|non-reserved| -|PERCENTILE_DISC|reserved|non-reserved|non-reserved| |PIVOT|non-reserved|non-reserved|non-reserved| |PLACING|non-reserved|non-reserved|non-reserved| |POSITION|non-reserved|non-reserved|reserved| @@ -622,6 +631,7 @@ Below is a list of all the keywords in Spark SQL. |QUARTER|non-reserved|non-reserved|non-reserved| |QUERY|non-reserved|non-reserved|non-reserved| |RANGE|non-reserved|non-reserved|reserved| +|READS|non-reserved|non-reserved|non-reserved| |REAL|non-reserved|non-reserved|reserved| |RECORDREADER|non-reserved|non-reserved|non-reserved| |RECORDWRITER|non-reserved|non-reserved|non-reserved| @@ -637,6 +647,8 @@ Below is a list of all the keywords in Spark SQL. |RESET|non-reserved|non-reserved|non-reserved| |RESPECT|non-reserved|non-reserved|non-reserved| |RESTRICT|non-reserved|non-reserved|non-reserved| +|RETURN|non-reserved|non-reserved|reserved| +|RETURNS|non-reserved|non-reserved|reserved| |REVOKE|non-reserved|non-reserved|reserved| |RIGHT|reserved|strict-non-reserved|reserved| |RLIKE|non-reserved|non-reserved|non-reserved| @@ -650,6 +662,7 @@ Below is a list of all the keywords in Spark SQL. |SCHEMAS|non-reserved|non-reserved|non-reserved| |SECOND|non-reserved|non-reserved|non-reserved| |SECONDS|non-reserved|non-reserved|non-reserved| +|SECURITY|non-reserved|non-reserved|non-reserved| |SELECT|reserved|non-reserved|reserved| |SEMI|non-reserved|strict-non-reserved|non-reserved| |SEPARATED|non-reserved|non-reserved|non-reserved| @@ -667,6 +680,8 @@ Below is a list of all the keywords in Spark SQL. |SORT|non-reserved|non-reserved|non-reserved| |SORTED|non-reserved|non-reserved|non-reserved| |SOURCE|non-reserved|non-reserved|non-reserved| +|SPECIFIC|non-reserved|non-reserved|reserved| +|SQL|reserved|non-reserved|reserved| |START|non-reserved|non-reserved|reserved| |STATISTICS|non-reserved|non-reserved|non-reserved| |STORED|non-reserved|non-reserved|non-reserved| diff --git a/docs/sql-ref-datatypes.md b/docs/sql-ref-datatypes.md index 8d75b4a175ab7..3a4530dcecaef 100644 --- a/docs/sql-ref-datatypes.md +++ b/docs/sql-ref-datatypes.md @@ -126,7 +126,9 @@ from pyspark.sql.types import * |**FloatType**|float
      **Note:** Numbers will be converted to 4-byte single-precision floating point numbers at runtime.|FloatType()| |**DoubleType**|float|DoubleType()| |**DecimalType**|decimal.Decimal|DecimalType()| -|**StringType**|string|StringType()| +|**StringType**|str|StringType()| +|**CharType(length)**|str|CharType(length)| +|**VarcharType(length)**|str|VarcharType(length)| |**BinaryType**|bytearray|BinaryType()| |**BooleanType**|bool|BooleanType()| |**TimestampType**|datetime.datetime|TimestampType()| @@ -157,6 +159,8 @@ You can access them by doing |**DoubleType**|Double|DoubleType| |**DecimalType**|java.math.BigDecimal|DecimalType| |**StringType**|String|StringType| +|**CharType(length)**|String|CharType(length)| +|**VarcharType(length)**|String|VarcharType(length)| |**BinaryType**|Array[Byte]|BinaryType| |**BooleanType**|Boolean|BooleanType| |**TimestampType**|java.time.Instant or java.sql.Timestamp|TimestampType| @@ -188,6 +192,8 @@ please use factory methods provided in |**DoubleType**|double or Double|DataTypes.DoubleType| |**DecimalType**|java.math.BigDecimal|DataTypes.createDecimalType()
      DataTypes.createDecimalType(*precision*, *scale*).| |**StringType**|String|DataTypes.StringType| +|**CharType(length)**|String|DataTypes.createCharType(length)| +|**VarcharType(length)**|String|DataTypes.createVarcharType(length)| |**BinaryType**|byte[]|DataTypes.BinaryType| |**BooleanType**|boolean or Boolean|DataTypes.BooleanType| |**TimestampType**|java.time.Instant or java.sql.Timestamp|DataTypes.TimestampType| @@ -242,6 +248,8 @@ The following table shows the type names as well as aliases used in Spark SQL pa |**TimestampType**|TIMESTAMP, TIMESTAMP_LTZ| |**TimestampNTZType**|TIMESTAMP_NTZ| |**StringType**|STRING| +|**CharType(length)**|CHAR(length)| +|**VarcharType(length)**|VARCHAR(length)| |**BinaryType**|BINARY| |**DecimalType**|DECIMAL, DEC, NUMERIC| |**YearMonthIntervalType**|INTERVAL YEAR, INTERVAL YEAR TO MONTH, INTERVAL MONTH| diff --git a/docs/sql-ref-identifier.md b/docs/sql-ref-identifier.md index e4d9727c09b7e..7aca08ea9fd8d 100644 --- a/docs/sql-ref-identifier.md +++ b/docs/sql-ref-identifier.md @@ -30,7 +30,7 @@ An identifier is a string used to identify a database object such as a table, vi ```sql { letter | digit | '_' } [ , ... ] ``` -**Note:** If `spark.sql.ansi.enabled` is set to true, ANSI SQL reserved keywords cannot be used as identifiers. For more details, please refer to [ANSI Compliance](sql-ref-ansi-compliance.html). +**Note:** If `spark.sql.ansi.enforceReservedKeywords` is set to true, ANSI SQL reserved keywords cannot be used as identifiers. For more details, please refer to [ANSI Compliance](sql-ref-ansi-compliance.html). #### Delimited Identifier diff --git a/docs/sql-ref-operators.md b/docs/sql-ref-operators.md new file mode 100644 index 0000000000000..102e45fba8d20 --- /dev/null +++ b/docs/sql-ref-operators.md @@ -0,0 +1,124 @@ +--- +layout: global +title: Operators +displayTitle: Operators +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +An SQL operator is a symbol specifying an action that is performed on one or more expressions. Operators are represented by special characters or by keywords. + +### Operator Precedence + +When a complex expression has multiple operators, operator precedence determines the sequence of operations in the expression, +e.g. in expression `1 + 2 * 3`, `*` has higher precedence than `+`, so the expression is evaluated as `1 + (2 * 3) = 7`. +The order of execution can significantly affect the resulting value. + +Operators have the precedence levels shown in the following table. +An operator on higher precedence is evaluated before an operator on a lower level. +In the following table, the operators in descending order of precedence, a.k.a. 1 is the highest level. 
+Operators listed in the same table cell have the same precedence and are evaluated from left to right or right to left based on their associativity.
+
+<table class="table">
+  <thead>
+    <tr><th>Precedence</th><th>Operator</th><th>Operation</th><th>Associativity</th></tr>
+  </thead>
+  <tr><td>1</td><td>.<br>[]<br>::</td><td>member access<br>element access<br>cast</td><td>Left to right</td></tr>
+  <tr><td>2</td><td>+<br>-<br>~</td><td>unary plus<br>unary minus<br>bitwise NOT</td><td>Right to left</td></tr>
+  <tr><td>3</td><td>*<br>/<br>%<br>DIV</td><td>multiplication<br>division, modulo<br>integral division</td><td>Left to right</td></tr>
+  <tr><td>4</td><td>+<br>-<br>||</td><td>addition<br>subtraction<br>concatenation</td><td>Left to right</td></tr>
+  <tr><td>5</td><td>&lt;&lt;<br>&gt;&gt;<br>&gt;&gt;&gt;</td><td>bitwise shift left<br>bitwise shift right<br>bitwise shift right unsigned</td><td>Left to right</td></tr>
+  <tr><td>6</td><td>&amp;</td><td>bitwise AND</td><td>Left to right</td></tr>
+  <tr><td>7</td><td>^</td><td>bitwise XOR (exclusive or)</td><td>Left to right</td></tr>
+  <tr><td>8</td><td>|</td><td>bitwise OR (inclusive or)</td><td>Left to right</td></tr>
+  <tr><td>9</td><td>=, ==<br>&lt;&gt;, !=<br>&lt;, &lt;=<br>&gt;, &gt;=</td><td>comparison operators</td><td>Left to right</td></tr>
+  <tr><td>10</td><td>NOT, !<br>EXISTS</td><td>logical NOT<br>existence</td><td>Right to left</td></tr>
+  <tr><td>11</td><td>BETWEEN<br>IN<br>RLIKE, REGEXP<br>ILIKE<br>LIKE<br>IS [NULL, TRUE, FALSE]<br>IS DISTINCT FROM</td><td>other predicates</td><td>Left to right</td></tr>
+  <tr><td>12</td><td>AND</td><td>conjunction</td><td>Left to right</td></tr>
+  <tr><td>13</td><td>OR</td><td>disjunction</td><td>Left to right</td></tr>
+</table>
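A small worked example of these precedence levels (illustrative queries only):

```sql
SELECT 1 + 2 * 3;               -- 7: `*` (level 3) binds tighter than `+` (level 4)
SELECT (1 + 2) * 3;             -- 9: parentheses override operator precedence
SELECT true OR false AND false; -- true: AND (level 12) is evaluated before OR (level 13)
```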
      diff --git a/docs/sql-ref-syntax-ddl-alter-database.md b/docs/sql-ref-syntax-ddl-alter-database.md index 0ac0038236433..727fedb19e237 100644 --- a/docs/sql-ref-syntax-ddl-alter-database.md +++ b/docs/sql-ref-syntax-ddl-alter-database.md @@ -25,7 +25,7 @@ license: | `DATABASE`, `SCHEMA` and `NAMESPACE` are interchangeable and one can be used in place of the others. An error message is issued if the database is not found in the system. -### ALTER PROPERTIES +### SET PROPERTIES `ALTER DATABASE SET DBPROPERTIES` statement changes the properties associated with a database. The specified property values override any existing value with the same property name. This command is mostly used to record the metadata for a database and may be used for auditing purposes. @@ -43,7 +43,25 @@ ALTER { DATABASE | SCHEMA | NAMESPACE } database_name Specifies the name of the database to be altered. -### ALTER LOCATION +### UNSET PROPERTIES +`ALTER DATABASE UNSET DBPROPERTIES` statement unsets the properties associated with a database. +If the specified property key does not exist, the command will ignore it and finally succeed. +(available since Spark 4.0.0). + +#### Syntax + +```sql +ALTER { DATABASE | SCHEMA | NAMESPACE } database_name + UNSET { DBPROPERTIES | PROPERTIES } ( property_name [ , ... ] ) +``` + +#### Parameters + +* **database_name** + + Specifies the name of the database to be altered. + +### SET LOCATION `ALTER DATABASE SET LOCATION` statement changes the default parent-directory where new tables will be added for a database. Please note that it does not move the contents of the database's current directory to the newly specified location or change the locations associated with any tables/partitions under the specified database @@ -95,6 +113,24 @@ DESCRIBE DATABASE EXTENDED inventory; | Location|file:/temp/spark-warehouse/new_inventory.db| | Properties| ((Edit-date,01/01/2001), (Edited-by,John))| +-------------------------+-------------------------------------------+ + +-- Alters the database to unset the property `Edited-by` +ALTER DATABASE inventory UNSET DBPROPERTIES ('Edited-by'); + +-- Verify that the property `Edited-by` has been unset. +DESCRIBE DATABASE EXTENDED inventory; ++-------------------------+-------------------------------------------+ +|database_description_item| database_description_value| ++-------------------------+-------------------------------------------+ +| Database Name| inventory| +| Description| | +| Location|file:/temp/spark-warehouse/new_inventory.db| +| Properties| ((Edit-date,01/01/2001)) | ++-------------------------+-------------------------------------------+ + +-- Alters the database to unset a non-existent property `non-existent` +-- Note: The command will ignore 'non-existent' and finally succeed +ALTER DATABASE inventory UNSET DBPROPERTIES ('non-existent'); ``` ### Related Statements diff --git a/docs/sql-ref-syntax-ddl-alter-view.md b/docs/sql-ref-syntax-ddl-alter-view.md index d69f24677116d..ca272d27bc7e6 100644 --- a/docs/sql-ref-syntax-ddl-alter-view.md +++ b/docs/sql-ref-syntax-ddl-alter-view.md @@ -109,6 +109,32 @@ Note that `ALTER VIEW` statement does not support `SET SERDE` or `SET SERDEPROPE Specifies the definition of the view. Check [select_statement](sql-ref-syntax-qry-select.html) for details. +#### ALTER View WITH SCHEMA + +Changes the view's schema binding behavior. + +If the view is cached, the command clears cached data of the view and all its dependents that refer to it. 
View's cache will be lazily filled when the next time the view is accessed. The command leaves view's dependents as uncached. + +This statement is not supported for `TEMPORARY` views. + +#### Syntax +```sql +ALTER VIEW view_identifier WITH SCHEMA { BINDING | COMPENSATION | [ TYPE ] EVOLUTION } +``` + +#### Parameters +* **view_identifier** + + Specifies a view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] view_name` + +* **BINDING** - The view can tolerate only type changes in the underlying schema requiring safe up-casts. +* **COMPENSATION** - The view can tolerate type changes in the underlying schema requiring casts. Runtime casting errors may occur. +* **TYPE EVOLUTION** - The view will adapt to any type changes in the underlying schema. +* **EVOLUTION** - For views defined without a column lists any schema changes are adapted by the view, including, for queries with `SELECT *` dropped or added columns. + If the view is defined with a column list, the clause is interpreted as `TYPE EVOLUTION`. + ### Examples ```sql @@ -196,6 +222,24 @@ DESC TABLE EXTENDED tempdb1.v2; | View Text| select * from tempdb1.v1| | | View Original Text| select * from tempdb1.v1| | +----------------------------+---------------------------+-------+ + +CREATE OR REPLACE VIEW open_orders AS SELECT * FROM orders WHERE status = 'open'; +ALTER VIEW open_orders WITH SCHEMA EVOLUTION; +DESC TABLE EXTENDED open_orders; ++----------------------------+---------------------------+-------+ +| col_name| data_type|comment| ++----------------------------+---------------------------+-------+ +| order_no| int| null| +| order_date| date| null| +| | | | +|# Detailed Table Information| | | +| Database| mydb| | +| Table| open_orders| | +| Type| VIEW| | +| View Text| select * from orders| | +| View Original Text| select * from orders| | +| View Schema Mode | EVOLUTION| | ++----------------------------+---------------------------+-------+ ``` ### Related Statements diff --git a/docs/sql-ref-syntax-ddl-create-view.md b/docs/sql-ref-syntax-ddl-create-view.md index 1a9c1f62728e7..21174f12300e3 100644 --- a/docs/sql-ref-syntax-ddl-create-view.md +++ b/docs/sql-ref-syntax-ddl-create-view.md @@ -61,6 +61,17 @@ CREATE [ OR REPLACE ] [ [ GLOBAL ] TEMPORARY ] VIEW [ IF NOT EXISTS ] view_ident * `[ ( column_name [ COMMENT column_comment ], ... ) ]` to specify column-level comments. * `[ COMMENT view_comment ]` to specify view-level comments. * `[ TBLPROPERTIES ( property_name = property_value [ , ... ] ) ]` to add metadata key-value pairs. + * `[ WITH SCHEMA { BINDING | COMPENSATION | [ TYPE ] EVOLUTION } ]` to specify how the view reacts to schema changes + + This clause is not supported for `TEMPORARY` views. + + * **BINDING** - The view can tolerate only type changes in the underlying schema requiring safe up-casts. + * **COMPENSATION** - The view can tolerate type changes in the underlying schema requiring casts. Runtime casting errors may occur. + * **TYPE EVOLUTION** - The view will adapt to any type changes in the underlying schema. + * **EVOLUTION** - For views defined without a column lists any schema changes are adapted by the view, including, for queries with `SELECT *` dropped or added columns. + If the view is defined with a column list, the clause is interpreted as `TYPE EVOLUTION`. + + The default is `WITH SCHEMA COMPENSATION`. * **query** A [SELECT](sql-ref-syntax-qry-select.html) statement that constructs the view from base tables or other views. 
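As a rough sketch of how the schema modes above can be combined, assuming a hypothetical `orders` table with `order_no` and `order_date` columns (names chosen only for illustration):

```sql
-- Create a view that tolerates only safe up-casts in the underlying table.
CREATE OR REPLACE VIEW recent_orders
    WITH SCHEMA BINDING
    AS SELECT order_no, order_date FROM orders;

-- Later, relax the view so that other type changes in `orders` are compensated with casts.
ALTER VIEW recent_orders WITH SCHEMA COMPENSATION;
```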
@@ -80,6 +91,10 @@ CREATE GLOBAL TEMPORARY VIEW IF NOT EXISTS subscribed_movies AS SELECT mo.member_id, mb.full_name, mo.movie_title FROM movies AS mo INNER JOIN members AS mb ON mo.member_id = mb.id; + +-- Create a view filtering the `orders` table which will adjust to schema changes in `orders`. +CREATE OR REPLACE VIEW open_orders WITH SCHEMA EVOLUTION + AS SELECT * FROM orders WHERE status = 'open'; ``` ### Related Statements diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index f3a8a0a40694b..fabe7f17b78b3 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -574,6 +574,10 @@ Here are the details of all the sources in Spark.
      maxFileAge: Maximum age of a file that can be found in this directory, before it is ignored. For the first batch all files will be considered valid. If latestFirst is set to `true` and maxFilesPerTrigger or maxBytesPerTrigger is set, then this parameter will be ignored, because old files that are valid, and should be processed, may be ignored. The max age is specified with respect to the timestamp of the latest file, and not the timestamp of the current system.(default: 1 week)
+        maxCachedFiles: maximum number of files to cache to be processed in subsequent batches (default: 10000). If files are available in the cache, they will be read first, before listing new files from the input source.
+
+        discardCachedInputRatio: ratio of remaining cached files/bytes to maxFilesPerTrigger/maxBytesPerTrigger below which the cached input is discarded and a new listing from the input source is performed instead (default: 0.2). For example, if only 10 cached files remain for a batch but maxFilesPerTrigger is set to 100, the 10 cached files are discarded and a new listing is performed. Similarly, if 10 MB of cached files remain for a batch but maxBytesPerTrigger is set to 100 MB, the cached files are discarded.
+
      cleanSource: option to clean up completed files after processing.
      Available options are "archive", "delete", "off". If the option is not provided, the default value is "off".
      When "archive" is provided, additional option sourceArchiveDir must be provided as well. The value of "sourceArchiveDir" must not match with source pattern in depth (the number of directories from the root directory), where the depth is minimum of depth on both paths. This will ensure archived files are never included as new source files.
      diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index bf02ec137e200..071fbf5549398 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -91,7 +91,7 @@ run it with `--help`. Here are a few examples of common options: # Run application locally on 8 cores ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ - --master local[8] \ + --master "local[8]" \ /path/to/examples.jar \ 100 @@ -178,8 +178,13 @@ The master URL passed to Spark can be in one of the following formats: # Loading Configuration from a File The `spark-submit` script can load default [Spark configuration values](configuration.html) from a -properties file and pass them on to your application. By default, it will read options -from `conf/spark-defaults.conf` in the `SPARK_HOME` directory. +properties file and pass them on to your application. The file can be specified via the `--properties-file` +parameter. When this is not specified, by default Spark will read options from `conf/spark-defaults.conf` +in the `SPARK_HOME` directory. + +An additional flag `--load-spark-defaults` can be used to tell Spark to load configurations from `conf/spark-defaults.conf` +even when a property file is provided via `--properties-file`. This is useful, for instance, when users +want to put system-wide default settings in the former while user/cluster specific settings in the latter. Loading default Spark configurations this way can obviate the need for certain flags to `spark-submit`. For instance, if the `spark.master` property is set, you can safely omit the diff --git a/docs/util/build-error-docs.py b/docs/util/build-error-docs.py new file mode 100644 index 0000000000000..df6b9e3c05270 --- /dev/null +++ b/docs/util/build-error-docs.py @@ -0,0 +1,152 @@ +""" +Generate a unified page of documentation for all error conditions. +""" +import json +import os +import re +from itertools import chain +from pathlib import Path +from textwrap import dedent + +# To avoid adding new direct dependencies, we import from within mkdocs. +# This is not ideal as unrelated updates to mkdocs may break this script. +from mkdocs.structure.pages import markdown + +THIS_DIR = Path(__file__).parent +SPARK_PROJECT_ROOT = THIS_DIR.parents[1] +DOCS_ROOT = SPARK_PROJECT_ROOT / "docs" +ERROR_CONDITIONS_PATH = ( + SPARK_PROJECT_ROOT / "common/utils/src/main/resources/error/error-conditions.json" +) + + +def assemble_message(message_parts): + message = " ".join(message_parts) + cleaned_message = re.sub(r"(<.*?>)", lambda x: f"`{x.group(1)}`", message) + return markdown.markdown(cleaned_message) + + +def load_error_conditions(path): + with open(path) as f: + raw_error_conditions = json.load(f) + error_conditions = dict() + for name, details in raw_error_conditions.items(): + if name.startswith("_LEGACY_ERROR") or name.startswith("INTERNAL_ERROR"): + continue + if "subClass" in details: + for sub_name in details["subClass"]: + details["subClass"][sub_name]["message"] = ( + assemble_message(details["subClass"][sub_name]["message"]) + ) + details["message"] = assemble_message(details["message"]) + error_conditions[name] = details + return error_conditions + + +def anchor_name(condition_name: str, sub_condition_name: str = None): + """ + URLs can, in practice, be up to 2,000 characters long without causing any issues. So we preserve + the condition name mostly as-is for use in the anchor, even when that name is very long. 
+ See: https://stackoverflow.com/a/417184 + """ + parts = [ + part for part in (condition_name, sub_condition_name) + if part + ] + anchor = "-".join(parts).lower().replace("_", "-") + return anchor + + +def generate_doc_rows(condition_name, condition_details): + condition_row = [ + """ +
        <tr id="{anchor}">
+            <td>{sql_state}</td>
+            <td>
+                <a href="#{anchor}">#</a> {condition_name}
+            </td>
+            <td>{message}</td>
+        </tr>
+        """
+        .format(
+            anchor=anchor_name(condition_name),
+            sql_state=condition_details.get("sqlState", ""),
+            condition_name=condition_name,
+            message=condition_details["message"],
+        )
+    ]
+    sub_condition_rows = [
+        """
+        <tr id="{anchor}">
+            <td></td>
+            <td>
+                <a href="#{anchor}">#</a> {sub_condition_name}
+            </td>
+            <td>{message}</td>
+        </tr>
+        """
+        .format(
+            anchor=anchor_name(condition_name, sub_condition_name),
+            sub_condition_name=sub_condition_name,
+            message=sub_condition_details["message"],
+        )
+        for sub_condition_name, sub_condition_details
+        in sorted(condition_details.get("subClass", {}).items())
+    ]
+    return [dedent(row).strip() for row in condition_row + sub_condition_rows]
+
+
+def generate_doc_table(error_conditions):
+    doc_rows = chain.from_iterable(
+        generate_doc_rows(condition_name, condition_details)
+        for condition_name, condition_details in sorted(error_conditions.items())
+    )
+    table_html = (
+        """
+        <table id="error-conditions">
+        <tr>
+            <th>Error State / SQLSTATE</th>
+            <th>Error Condition & Sub-Condition</th>
+            <th>Message</th>
+        </tr>
+        {rows}
+        </table>
      + """ + ) + # We dedent here rather than above so that the interpolated rows (which are not + # indented) don't prevent the dedent from working. + table_html = dedent(table_html).strip().format(rows="\n".join(list(doc_rows))) + return table_html + + +if __name__ == "__main__": + error_conditions = load_error_conditions(ERROR_CONDITIONS_PATH) + doc_table = generate_doc_table(error_conditions) + (DOCS_ROOT / "_generated").mkdir(exist_ok=True) + html_table_path = DOCS_ROOT / "_generated" / "error-conditions.html" + with open(html_table_path, "w") as f: + f.write(doc_table) + print("Generated:", os.path.relpath(html_table_path, start=SPARK_PROJECT_ROOT)) diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index 03daf18eadbf3..0200d094185d5 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -33,6 +33,25 @@ require_minimum_pyarrow_version() +def dataframe_to_from_arrow_table_example(spark: SparkSession) -> None: + import pyarrow as pa + import numpy as np + + # Create a PyArrow Table + table = pa.table([pa.array(np.random.rand(100)) for i in range(3)], names=["a", "b", "c"]) + + # Create a Spark DataFrame from the PyArrow Table + df = spark.createDataFrame(table) + + # Convert the Spark DataFrame to a PyArrow Table + result_table = df.select("*").toArrow() + + print(result_table.schema) + # a: double + # b: double + # c: double + + def dataframe_with_arrow_example(spark: SparkSession) -> None: import numpy as np import pandas as pd @@ -302,6 +321,8 @@ def arrow_slen(s): # type: ignore[no-untyped-def] .appName("Python Arrow-in-Spark example") \ .getOrCreate() + print("Running Arrow conversion example: DataFrame to Table") + dataframe_to_from_arrow_table_example(spark) print("Running Pandas to/from conversion example") dataframe_with_arrow_example(spark) print("Running pandas_udf example: Series to Frame") diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala index 193d6551a3666..045149922c8ed 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala @@ -22,7 +22,7 @@ import java.util.concurrent.TimeUnit import org.apache.spark.SparkContext import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.TOTAL_TIME +import org.apache.spark.internal.LogKeys.TOTAL_TIME import org.apache.spark.storage.StorageLevel /** diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala index e1b9b2fe5ae64..1493d8114c699 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala @@ -21,7 +21,7 @@ import scala.reflect.ClassTag import org.apache.spark.graphx.util.PeriodicGraphCheckpointer import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.NUM_ITERATIONS +import org.apache.spark.internal.LogKeys.NUM_ITERATIONS import org.apache.spark.rdd.RDD import org.apache.spark.rdd.util.PeriodicRDDCheckpointer diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index aa5898fb585c9..4fe010bfce785 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ 
b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -23,7 +23,7 @@ import breeze.linalg.{Vector => BV} import org.apache.spark.graphx._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.NUM_ITERATIONS +import org.apache.spark.internal.LogKeys.NUM_ITERATIONS import org.apache.spark.ml.linalg.{Vector, Vectors} /** diff --git a/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/AbortableStreamBasedCheckpointFileManager.scala b/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/AbortableStreamBasedCheckpointFileManager.scala index 2afab01ec7b03..599361009fcc5 100644 --- a/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/AbortableStreamBasedCheckpointFileManager.scala +++ b/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/AbortableStreamBasedCheckpointFileManager.scala @@ -24,7 +24,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.execution.streaming.AbstractFileContextBasedCheckpointFileManager import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream @@ -36,7 +36,7 @@ class AbortableStreamBasedCheckpointFileManager(path: Path, hadoopConf: Configur s" an fs (path: $path) with abortable stream support") } - logInfo(s"Writing atomically to $path based on abortable stream") + logInfo(log"Writing atomically to ${MDC(LogKeys.PATH, path)} based on abortable stream") class AbortableStreamBasedFSDataOutputStream( fsDataOutputStream: FSDataOutputStream, @@ -53,7 +53,8 @@ class AbortableStreamBasedCheckpointFileManager(path: Path, hadoopConf: Configur fsDataOutputStream.close() } catch { case NonFatal(e) => - logWarning(s"Error cancelling write to $path (stream: $fsDataOutputStream)", e) + logWarning(log"Error cancelling write to ${MDC(LogKeys.PATH, path)} " + + log"(stream: ${MDC(LogKeys.FS_DATA_OUTPUT_STREAM, fsDataOutputStream)})", e) } finally { terminated = true } @@ -71,7 +72,8 @@ class AbortableStreamBasedCheckpointFileManager(path: Path, hadoopConf: Configur fsDataOutputStream.close() } catch { case NonFatal(e) => - logWarning(s"Error closing $path (stream: $fsDataOutputStream)", e) + logWarning(log"Error closing ${MDC(LogKeys.PATH, path)} " + + log"(stream: ${MDC(LogKeys.FS_DATA_OUTPUT_STREAM, fsDataOutputStream)})", e) } finally { terminated = true } diff --git a/launcher/src/main/java/org/apache/spark/launcher/JavaModuleOptions.java b/launcher/src/main/java/org/apache/spark/launcher/JavaModuleOptions.java index dc5840185d629..7b23db052e8b8 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/JavaModuleOptions.java +++ b/launcher/src/main/java/org/apache/spark/launcher/JavaModuleOptions.java @@ -27,6 +27,7 @@ public class JavaModuleOptions { private static final String[] DEFAULT_MODULE_OPTIONS = { "-XX:+IgnoreUnrecognizedVMOptions", + "--add-modules=jdk.incubator.vector", "--add-opens=java.base/java.lang=ALL-UNNAMED", "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED", diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java index df4fccd0f01e7..e4511421cd13c 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java +++ 
b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java @@ -55,6 +55,7 @@ class SparkSubmitOptionParser { protected final String PACKAGES = "--packages"; protected final String PACKAGES_EXCLUDE = "--exclude-packages"; protected final String PROPERTIES_FILE = "--properties-file"; + protected final String LOAD_SPARK_DEFAULTS = "--load-spark-defaults"; protected final String PROXY_USER = "--proxy-user"; protected final String PY_FILES = "--py-files"; protected final String REPOSITORIES = "--repositories"; @@ -130,6 +131,7 @@ class SparkSubmitOptionParser { { USAGE_ERROR }, { VERBOSE, "-v" }, { VERSION }, + { LOAD_SPARK_DEFAULTS }, }; /** diff --git a/licenses-binary/LICENSE-check-qual.txt b/licenses-binary/LICENSE-check-qual.txt new file mode 100644 index 0000000000000..d542ab3ec3ed8 --- /dev/null +++ b/licenses-binary/LICENSE-check-qual.txt @@ -0,0 +1,413 @@ +The Checker Framework +Copyright 2004-present by the Checker Framework developers + + +Most of the Checker Framework is licensed under the GNU General Public +License, version 2 (GPL2), with the classpath exception. The text of this +license appears below. This is the same license used for OpenJDK. + +A few parts of the Checker Framework have more permissive licenses, notably +the parts that you might want to include with your own program. + + * The annotations and utility files are licensed under the MIT License. + (The text of this license also appears below.) This applies to + checker-qual*.jar and checker-util.jar and all the files that appear in + them, which is all files in checker-qual and checker-util directories. + It also applies to the cleanroom implementations of + third-party annotations (in checker/src/testannotations/, + framework/src/main/java/org/jmlspecs/, and + framework/src/main/java/com/google/). + +The Checker Framework includes annotations for some libraries. Those in +.astub files use the MIT License. Those in https://github.com/typetools/jdk +(which appears in the annotated-jdk directory of file checker.jar) use the +GPL2 license. + +Some external libraries that are included with the Checker Framework +distribution have different licenses. Here are some examples. + + * JavaParser is dual licensed under the LGPL or the Apache license -- you + may use it under whichever one you want. (The JavaParser source code + contains a file with the text of the GPL, but it is not clear why, since + JavaParser does not use the GPL.) See + https://github.com/typetools/stubparser . + + * Annotation Tools (https://github.com/typetools/annotation-tools) uses + the MIT license. + + * Libraries in plume-lib (https://github.com/plume-lib/) are licensed + under the MIT License. + +=========================================================================== + +The GNU General Public License (GPL) + +Version 2, June 1991 + +Copyright (C) 1989, 1991 Free Software Foundation, Inc. +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +Preamble + +The licenses for most software are designed to take away your freedom to share +and change it. By contrast, the GNU General Public License is intended to +guarantee your freedom to share and change free software--to make sure the +software is free for all its users. This General Public License applies to +most of the Free Software Foundation's software and to any other program whose +authors commit to using it. 
(Some other Free Software Foundation software is +covered by the GNU Library General Public License instead.) You can apply it to +your programs, too. + +When we speak of free software, we are referring to freedom, not price. Our +General Public Licenses are designed to make sure that you have the freedom to +distribute copies of free software (and charge for this service if you wish), +that you receive source code or can get it if you want it, that you can change +the software or use pieces of it in new free programs; and that you know you +can do these things. + +To protect your rights, we need to make restrictions that forbid anyone to deny +you these rights or to ask you to surrender the rights. These restrictions +translate to certain responsibilities for you if you distribute copies of the +software, or if you modify it. + +For example, if you distribute copies of such a program, whether gratis or for +a fee, you must give the recipients all the rights that you have. You must +make sure that they, too, receive or can get the source code. And you must +show them these terms so they know their rights. + +We protect your rights with two steps: (1) copyright the software, and (2) +offer you this license which gives you legal permission to copy, distribute +and/or modify the software. + +Also, for each author's protection and ours, we want to make certain that +everyone understands that there is no warranty for this free software. If the +software is modified by someone else and passed on, we want its recipients to +know that what they have is not the original, so that any problems introduced +by others will not reflect on the original authors' reputations. + +Finally, any free program is threatened constantly by software patents. We +wish to avoid the danger that redistributors of a free program will +individually obtain patent licenses, in effect making the program proprietary. +To prevent this, we have made it clear that any patent must be licensed for +everyone's free use or not licensed at all. + +The precise terms and conditions for copying, distribution and modification +follow. + +TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + +0. This License applies to any program or other work which contains a notice +placed by the copyright holder saying it may be distributed under the terms of +this General Public License. The "Program", below, refers to any such program +or work, and a "work based on the Program" means either the Program or any +derivative work under copyright law: that is to say, a work containing the +Program or a portion of it, either verbatim or with modifications and/or +translated into another language. (Hereinafter, translation is included +without limitation in the term "modification".) Each licensee is addressed as +"you". + +Activities other than copying, distribution and modification are not covered by +this License; they are outside its scope. The act of running the Program is +not restricted, and the output from the Program is covered only if its contents +constitute a work based on the Program (independent of having been made by +running the Program). Whether that is true depends on what the Program does. + +1. 
You may copy and distribute verbatim copies of the Program's source code as +you receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice and +disclaimer of warranty; keep intact all the notices that refer to this License +and to the absence of any warranty; and give any other recipients of the +Program a copy of this License along with the Program. + +You may charge a fee for the physical act of transferring a copy, and you may +at your option offer warranty protection in exchange for a fee. + +2. You may modify your copy or copies of the Program or any portion of it, thus +forming a work based on the Program, and copy and distribute such modifications +or work under the terms of Section 1 above, provided that you also meet all of +these conditions: + + a) You must cause the modified files to carry prominent notices stating + that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in whole or + in part contains or is derived from the Program or any part thereof, to be + licensed as a whole at no charge to all third parties under the terms of + this License. + + c) If the modified program normally reads commands interactively when run, + you must cause it, when started running for such interactive use in the + most ordinary way, to print or display an announcement including an + appropriate copyright notice and a notice that there is no warranty (or + else, saying that you provide a warranty) and that users may redistribute + the program under these conditions, and telling the user how to view a copy + of this License. (Exception: if the Program itself is interactive but does + not normally print such an announcement, your work based on the Program is + not required to print an announcement.) + +These requirements apply to the modified work as a whole. If identifiable +sections of that work are not derived from the Program, and can be reasonably +considered independent and separate works in themselves, then this License, and +its terms, do not apply to those sections when you distribute them as separate +works. But when you distribute the same sections as part of a whole which is a +work based on the Program, the distribution of the whole must be on the terms +of this License, whose permissions for other licensees extend to the entire +whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest your +rights to work written entirely by you; rather, the intent is to exercise the +right to control the distribution of derivative or collective works based on +the Program. + +In addition, mere aggregation of another work not based on the Program with the +Program (or with a work based on the Program) on a volume of a storage or +distribution medium does not bring the other work under the scope of this +License. + +3. 
You may copy and distribute the Program (or a work based on it, under +Section 2) in object code or executable form under the terms of Sections 1 and +2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable source + code, which must be distributed under the terms of Sections 1 and 2 above + on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three years, to + give any third party, for a charge no more than your cost of physically + performing source distribution, a complete machine-readable copy of the + corresponding source code, to be distributed under the terms of Sections 1 + and 2 above on a medium customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer to + distribute corresponding source code. (This alternative is allowed only + for noncommercial distribution and only if you received the program in + object code or executable form with such an offer, in accord with + Subsection b above.) + +The source code for a work means the preferred form of the work for making +modifications to it. For an executable work, complete source code means all +the source code for all modules it contains, plus any associated interface +definition files, plus the scripts used to control compilation and installation +of the executable. However, as a special exception, the source code +distributed need not include anything that is normally distributed (in either +source or binary form) with the major components (compiler, kernel, and so on) +of the operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the source +code from the same place counts as distribution of the source code, even though +third parties are not compelled to copy the source along with the object code. + +4. You may not copy, modify, sublicense, or distribute the Program except as +expressly provided under this License. Any attempt otherwise to copy, modify, +sublicense or distribute the Program is void, and will automatically terminate +your rights under this License. However, parties who have received copies, or +rights, from you under this License will not have their licenses terminated so +long as such parties remain in full compliance. + +5. You are not required to accept this License, since you have not signed it. +However, nothing else grants you permission to modify or distribute the Program +or its derivative works. These actions are prohibited by law if you do not +accept this License. Therefore, by modifying or distributing the Program (or +any work based on the Program), you indicate your acceptance of this License to +do so, and all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + +6. Each time you redistribute the Program (or any work based on the Program), +the recipient automatically receives a license from the original licensor to +copy, distribute or modify the Program subject to these terms and conditions. +You may not impose any further restrictions on the recipients' exercise of the +rights granted herein. You are not responsible for enforcing compliance by +third parties to this License. + +7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), conditions +are imposed on you (whether by court order, agreement or otherwise) that +contradict the conditions of this License, they do not excuse you from the +conditions of this License. If you cannot distribute so as to satisfy +simultaneously your obligations under this License and any other pertinent +obligations, then as a consequence you may not distribute the Program at all. +For example, if a patent license would not permit royalty-free redistribution +of the Program by all those who receive copies directly or indirectly through +you, then the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply and +the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any patents or +other property right claims or to contest validity of any such claims; this +section has the sole purpose of protecting the integrity of the free software +distribution system, which is implemented by public license practices. Many +people have made generous contributions to the wide range of software +distributed through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing to +distribute software through any other system and a licensee cannot impose that +choice. + +This section is intended to make thoroughly clear what is believed to be a +consequence of the rest of this License. + +8. If the distribution and/or use of the Program is restricted in certain +countries either by patents or by copyrighted interfaces, the original +copyright holder who places the Program under this License may add an explicit +geographical distribution limitation excluding those countries, so that +distribution is permitted only in or among countries not thus excluded. In +such case, this License incorporates the limitation as if written in the body +of this License. + +9. The Free Software Foundation may publish revised and/or new versions of the +General Public License from time to time. Such new versions will be similar in +spirit to the present version, but may differ in detail to address new problems +or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any later +version", you have the option of following the terms and conditions either of +that version or of any later version published by the Free Software Foundation. +If the Program does not specify a version number of this License, you may +choose any version ever published by the Free Software Foundation. + +10. If you wish to incorporate parts of the Program into other free programs +whose distribution conditions are different, write to the author to ask for +permission. For software which is copyrighted by the Free Software Foundation, +write to the Free Software Foundation; we sometimes make exceptions for this. +Our decision will be guided by the two goals of preserving the free status of +all derivatives of our free software and of promoting the sharing and reuse of +software generally. + +NO WARRANTY + +11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR +THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE +STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE +PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, +YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL +ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE +PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR +INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA +BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER +OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +END OF TERMS AND CONDITIONS + +How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest possible +use to the public, the best way to achieve this is to make it free software +which everyone can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest to attach +them to the start of each source file to most effectively convey the exclusion +of warranty; and each file should have at least the "copyright" line and a +pointer to where the full notice is found. + + One line to give the program's name and a brief idea of what it does. + + Copyright (C) + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., 59 + Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this when it +starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author Gnomovision comes + with ABSOLUTELY NO WARRANTY; for details type 'show w'. This is free + software, and you are welcome to redistribute it under certain conditions; + type 'show c' for details. + +The hypothetical commands 'show w' and 'show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may be +called something other than 'show w' and 'show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your school, +if any, to sign a "copyright disclaimer" for the program, if necessary. 
Here +is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + 'Gnomovision' (which makes passes at compilers) written by James Hacker. + + signature of Ty Coon, 1 April 1989 + + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General Public +License instead of this License. + + +"CLASSPATH" EXCEPTION TO THE GPL + +Certain source files distributed by Oracle America and/or its affiliates are +subject to the following clarification and special exception to the GPL, but +only where Oracle has expressly included in the particular source file's header +the words "Oracle designates this particular file as subject to the "Classpath" +exception as provided by Oracle in the LICENSE file that accompanied this code." + + Linking this library statically or dynamically with other modules is making + a combined work based on this library. Thus, the terms and conditions of + the GNU General Public License cover the whole combination. + + As a special exception, the copyright holders of this library give you + permission to link this library with independent modules to produce an + executable, regardless of the license terms of these independent modules, + and to copy and distribute the resulting executable under terms of your + choice, provided that you also meet, for each linked independent module, + the terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. If + you modify this library, you may extend this exception to your version of + the library, but you are not obligated to do so. If you do not wish to do + so, delete this exception statement from your version. + +=========================================================================== + +MIT License: + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +=========================================================================== \ No newline at end of file diff --git a/licenses-binary/LICENSE-icu4j.txt b/licenses-binary/LICENSE-icu4j.txt new file mode 100644 index 0000000000000..80b587723a67f --- /dev/null +++ b/licenses-binary/LICENSE-icu4j.txt @@ -0,0 +1,519 @@ +UNICODE, INC. 
LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +---------------------------------------------------------------------- + +Third-Party Software Licenses + +This section contains third-party software notices and/or additional +terms for licensed third-party software components included within ICU +libraries. + +---------------------------------------------------------------------- + +ICU License - ICU 1.8.1 to ICU 57.1 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1995-2016 International Business Machines Corporation and others +All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, provided that the above +copyright notice(s) and this permission notice appear in all copies of +the Software and that both the above copyright notice(s) and this +permission notice appear in supporting documentation. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY +SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF +CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, use +or other dealings in this Software without prior written authorization +of the copyright holder. + +All trademarks and registered trademarks mentioned herein are the +property of their respective owners. + +---------------------------------------------------------------------- + +Chinese/Japanese Word Break Dictionary Data (cjdict.txt) + + # The Google Chrome software developed by Google is licensed under + # the BSD license. Other software included in this distribution is + # provided under other licenses, as set forth below. + # + # The BSD License + # http://opensource.org/licenses/bsd-license.php + # Copyright (C) 2006-2008, Google Inc. + # + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions are met: + # + # Redistributions of source code must retain the above copyright notice, + # this list of conditions and the following disclaimer. + # Redistributions in binary form must reproduce the above + # copyright notice, this list of conditions and the following + # disclaimer in the documentation and/or other materials provided with + # the distribution. + # Neither the name of Google Inc. nor the names of its + # contributors may be used to endorse or promote products derived from + # this software without specific prior written permission. + # + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # + # + # The word list in cjdict.txt are generated by combining three word lists + # listed below with further processing for compound word breaking. The + # frequency is generated with an iterative training against Google web + # corpora. + # + # * Libtabe (Chinese) + # - https://sourceforge.net/project/?group_id=1519 + # - Its license terms and conditions are shown below. + # + # * IPADIC (Japanese) + # - http://chasen.aist-nara.ac.jp/chasen/distribution.html + # - Its license terms and conditions are shown below. 
+ # + # ---------COPYING.libtabe ---- BEGIN-------------------- + # + # /* + # * Copyright (c) 1999 TaBE Project. + # * Copyright (c) 1999 Pai-Hsiang Hsiao. + # * All rights reserved. + # * + # * Redistribution and use in source and binary forms, with or without + # * modification, are permitted provided that the following conditions + # * are met: + # * + # * . Redistributions of source code must retain the above copyright + # * notice, this list of conditions and the following disclaimer. + # * . Redistributions in binary form must reproduce the above copyright + # * notice, this list of conditions and the following disclaimer in + # * the documentation and/or other materials provided with the + # * distribution. + # * . Neither the name of the TaBE Project nor the names of its + # * contributors may be used to endorse or promote products derived + # * from this software without specific prior written permission. + # * + # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + # * OF THE POSSIBILITY OF SUCH DAMAGE. + # */ + # + # /* + # * Copyright (c) 1999 Computer Systems and Communication Lab, + # * Institute of Information Science, Academia + # * Sinica. All rights reserved. + # * + # * Redistribution and use in source and binary forms, with or without + # * modification, are permitted provided that the following conditions + # * are met: + # * + # * . Redistributions of source code must retain the above copyright + # * notice, this list of conditions and the following disclaimer. + # * . Redistributions in binary form must reproduce the above copyright + # * notice, this list of conditions and the following disclaimer in + # * the documentation and/or other materials provided with the + # * distribution. + # * . Neither the name of the Computer Systems and Communication Lab + # * nor the names of its contributors may be used to endorse or + # * promote products derived from this software without specific + # * prior written permission. + # * + # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + # * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ # */ + # + # Copyright 1996 Chih-Hao Tsai @ Beckman Institute, + # University of Illinois + # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4 + # + # ---------------COPYING.libtabe-----END-------------------------------- + # + # + # ---------------COPYING.ipadic-----BEGIN------------------------------- + # + # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science + # and Technology. All Rights Reserved. + # + # Use, reproduction, and distribution of this software is permitted. + # Any copy of this software, whether in its original form or modified, + # must include both the above copyright notice and the following + # paragraphs. + # + # Nara Institute of Science and Technology (NAIST), + # the copyright holders, disclaims all warranties with regard to this + # software, including all implied warranties of merchantability and + # fitness, in no event shall NAIST be liable for + # any special, indirect or consequential damages or any damages + # whatsoever resulting from loss of use, data or profits, whether in an + # action of contract, negligence or other tortuous action, arising out + # of or in connection with the use or performance of this software. + # + # A large portion of the dictionary entries + # originate from ICOT Free Software. The following conditions for ICOT + # Free Software applies to the current dictionary as well. + # + # Each User may also freely distribute the Program, whether in its + # original form or modified, to any third party or parties, PROVIDED + # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear + # on, or be attached to, the Program, which is distributed substantially + # in the same form as set out herein and that such intended + # distribution, if actually made, will neither violate or otherwise + # contravene any of the laws and regulations of the countries having + # jurisdiction over the User or the intended distribution itself. + # + # NO WARRANTY + # + # The program was produced on an experimental basis in the course of the + # research and development conducted during the project and is provided + # to users as so produced on an experimental basis. Accordingly, the + # program is provided without any warranty whatsoever, whether express, + # implied, statutory or otherwise. The term "warranty" used herein + # includes, but is not limited to, any warranty of the quality, + # performance, merchantability and fitness for a particular purpose of + # the program and the nonexistence of any infringement or violation of + # any right of any third party. + # + # Each user of the program will agree and understand, and be deemed to + # have agreed and understood, that there is no warranty whatsoever for + # the program and, accordingly, the entire risk arising from or + # otherwise connected with the program is assumed by the user. 
+ # + # Therefore, neither ICOT, the copyright holder, or any other + # organization that participated in or was otherwise related to the + # development of the program and their respective officials, directors, + # officers and other employees shall be held liable for any and all + # damages, including, without limitation, general, special, incidental + # and consequential damages, arising out of or otherwise in connection + # with the use or inability to use the program or any product, material + # or result produced or otherwise obtained by using the program, + # regardless of whether they have been advised of, or otherwise had + # knowledge of, the possibility of such damages at any time during the + # project or thereafter. Each user will be deemed to have agreed to the + # foregoing by his or her commencement of use of the program. The term + # "use" as used herein includes, but is not limited to, the use, + # modification, copying and distribution of the program and the + # production of secondary products from the program. + # + # In the case where the program, whether in its original form or + # modified, was distributed or delivered to or received by a user from + # any person, organization or entity other than ICOT, unless it makes or + # grants independently of ICOT any specific warranty to the user in + # writing, such person, organization or entity, will also be exempted + # from and not be held liable to the user for any such damages as noted + # above as far as the program is concerned. + # + # ---------------COPYING.ipadic-----END---------------------------------- + +---------------------------------------------------------------------- + +Lao Word Break Dictionary Data (laodict.txt) + + # Copyright (C) 2016 and later: Unicode, Inc. and others. + # License & terms of use: http://www.unicode.org/copyright.html + # Copyright (c) 2015 International Business Machines Corporation + # and others. All Rights Reserved. + # + # Project: https://github.com/rober42539/lao-dictionary + # Dictionary: https://github.com/rober42539/lao-dictionary/laodict.txt + # License: https://github.com/rober42539/lao-dictionary/LICENSE.txt + # (copied below) + # + # This file is derived from the above dictionary version of Nov 22, 2020 + # ---------------------------------------------------------------------- + # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell. + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions are met: + # + # Redistributions of source code must retain the above copyright notice, this + # list of conditions and the following disclaimer. Redistributions in binary + # form must reproduce the above copyright notice, this list of conditions and + # the following disclaimer in the documentation and/or other materials + # provided with the distribution. + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + # OF THE POSSIBILITY OF SUCH DAMAGE. + # -------------------------------------------------------------------------- + +---------------------------------------------------------------------- + +Burmese Word Break Dictionary Data (burmesedict.txt) + + # Copyright (c) 2014 International Business Machines Corporation + # and others. All Rights Reserved. + # + # This list is part of a project hosted at: + # github.com/kanyawtech/myanmar-karen-word-lists + # + # -------------------------------------------------------------------------- + # Copyright (c) 2013, LeRoy Benjamin Sharon + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions + # are met: Redistributions of source code must retain the above + # copyright notice, this list of conditions and the following + # disclaimer. Redistributions in binary form must reproduce the + # above copyright notice, this list of conditions and the following + # disclaimer in the documentation and/or other materials provided + # with the distribution. + # + # Neither the name Myanmar Karen Word Lists, nor the names of its + # contributors may be used to endorse or promote products derived + # from this software without specific prior written permission. + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS + # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + # SUCH DAMAGE. + # -------------------------------------------------------------------------- + +---------------------------------------------------------------------- + +Time Zone Database + + ICU uses the public domain data and code derived from Time Zone +Database for its time zone support. The ownership of the TZ database +is explained in BCP 175: Procedure for Maintaining the Time Zone +Database section 7. + + # 7. Database Ownership + # + # The TZ database itself is not an IETF Contribution or an IETF + # document. Rather it is a pre-existing and regularly updated work + # that is in the public domain, and is intended to remain in the + # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do + # not apply to the TZ Database or contributions that individuals make + # to it. 
Should any claims be made and substantiated against the TZ + # Database, the organization that is providing the IANA + # Considerations defined in this RFC, under the memorandum of + # understanding with the IETF, currently ICANN, may act in accordance + # with all competent court orders. No ownership claims will be made + # by ICANN or the IETF Trust on the database or the code. Any person + # making a contribution to the database or code waives all rights to + # future claims in that contribution or in the TZ Database. + +---------------------------------------------------------------------- + +Google double-conversion + +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +File: aclocal.m4 (only for ICU4C) +Section: pkg.m4 - Macros to locate and utilise pkg-config. + + +Copyright © 2004 Scott James Remnant . +Copyright © 2012-2015 Dan Nicholson + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. + +As a special exception to the GNU General Public License, if you +distribute this file as part of a program that contains a +configuration script generated by Autoconf, you may include it under +the same distribution terms that you use for the rest of that +program. + + +(The condition for the exception is fulfilled because +ICU4C includes a configuration script generated by Autoconf, +namely the `configure` script.) 
+ +---------------------------------------------------------------------- + +File: config.guess (only for ICU4C) + + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, see . + +As a special exception to the GNU General Public License, if you +distribute this file as part of a program that contains a +configuration script generated by Autoconf, you may include it under +the same distribution terms that you use for the rest of that +program. This Exception is an additional permission under section 7 +of the GNU General Public License, version 3 ("GPLv3"). + + +(The condition for the exception is fulfilled because +ICU4C includes a configuration script generated by Autoconf, +namely the `configure` script.) + +---------------------------------------------------------------------- + +File: install-sh (only for ICU4C) + + +Copyright 1991 by the Massachusetts Institute of Technology + +Permission to use, copy, modify, distribute, and sell this software and its +documentation for any purpose is hereby granted without fee, provided that +the above copyright notice appear in all copies and that both that +copyright notice and this permission notice appear in supporting +documentation, and that the name of M.I.T. not be used in advertising or +publicity pertaining to distribution of the software without specific, +written prior permission. M.I.T. makes no representations about the +suitability of this software for any purpose. It is provided "as is" +without express or implied warranty. diff --git a/licenses-binary/LICENSE-jakarta-servlet-api.txt b/licenses-binary/LICENSE-jakarta-servlet-api.txt new file mode 100644 index 0000000000000..e23ece2c85241 --- /dev/null +++ b/licenses-binary/LICENSE-jakarta-servlet-api.txt @@ -0,0 +1,277 @@ +Eclipse Public License - v 2.0 + + THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE + PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION + OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial content + Distributed under this Agreement, and + + b) in the case of each subsequent Contributor: + i) changes to the Program, and + ii) additions to the Program; + where such changes and/or additions to the Program originate from + and are Distributed by that particular Contributor. A Contribution + "originates" from a Contributor if it was added to the Program by + such Contributor itself or anyone acting on such Contributor's behalf. + Contributions do not include changes or additions to the Program that + are not Modified Works. + +"Contributor" means any person or entity that Distributes the Program. + +"Licensed Patents" mean patent claims licensable by a Contributor which +are necessarily infringed by the use or sale of its Contribution alone +or when combined with the Program. + +"Program" means the Contributions Distributed in accordance with this +Agreement. 
+ +"Recipient" means anyone who receives the Program under this Agreement +or any Secondary License (as applicable), including Contributors. + +"Derivative Works" shall mean any work, whether in Source Code or other +form, that is based on (or derived from) the Program and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. + +"Modified Works" shall mean any work in Source Code or other form that +results from an addition to, deletion from, or modification of the +contents of the Program, including, for purposes of clarity any new file +in Source Code form that contains any contents of the Program. Modified +Works shall not include works that contain only declarations, +interfaces, types, classes, structures, or files of the Program solely +in each case in order to link to, bind by name, or subclass the Program +or Modified Works thereof. + +"Distribute" means the acts of a) distributing or b) making available +in any manner that enables the transfer of a copy. + +"Source Code" means the form of a Program preferred for making +modifications, including but not limited to software source code, +documentation source, and configuration files. + +"Secondary License" means either the GNU General Public License, +Version 2.0, or any later versions of that license, including any +exceptions or additional permissions as identified by the initial +Contributor. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free copyright + license to reproduce, prepare Derivative Works of, publicly display, + publicly perform, Distribute and sublicense the Contribution of such + Contributor, if any, and such Derivative Works. + + b) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free patent + license under Licensed Patents to make, use, sell, offer to sell, + import and otherwise transfer the Contribution of such Contributor, + if any, in Source Code or other form. This patent license shall + apply to the combination of the Contribution and the Program if, at + the time the Contribution is added by the Contributor, such addition + of the Contribution causes such combination to be covered by the + Licensed Patents. The patent license shall not apply to any other + combinations which include the Contribution. No hardware per se is + licensed hereunder. + + c) Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the + patent or other intellectual property rights of any other entity. + Each Contributor disclaims any liability to Recipient for claims + brought by any other entity based on infringement of intellectual + property rights or otherwise. As a condition to exercising the + rights and licenses granted hereunder, each Recipient hereby + assumes sole responsibility to secure any other intellectual + property rights needed, if any. For example, if a third party + patent license is required to allow Recipient to Distribute the + Program, it is Recipient's responsibility to acquire that license + before distributing the Program. + + d) Each Contributor represents that to its knowledge it has + sufficient copyright rights in its Contribution, if any, to grant + the copyright license set forth in this Agreement. 
+ + e) Notwithstanding the terms of any Secondary License, no + Contributor makes additional grants to any Recipient (other than + those set forth in this Agreement) as a result of such Recipient's + receipt of the Program under the terms of a Secondary License + (if permitted under the terms of Section 3). + +3. REQUIREMENTS + +3.1 If a Contributor Distributes the Program in any form, then: + + a) the Program must also be made available as Source Code, in + accordance with section 3.2, and the Contributor must accompany + the Program with a statement that the Source Code for the Program + is available under this Agreement, and informs Recipients how to + obtain it in a reasonable manner on or through a medium customarily + used for software exchange; and + + b) the Contributor may Distribute the Program under a license + different than this Agreement, provided that such license: + i) effectively disclaims on behalf of all other Contributors all + warranties and conditions, express and implied, including + warranties or conditions of title and non-infringement, and + implied warranties or conditions of merchantability and fitness + for a particular purpose; + + ii) effectively excludes on behalf of all other Contributors all + liability for damages, including direct, indirect, special, + incidental and consequential damages, such as lost profits; + + iii) does not attempt to limit or alter the recipients' rights + in the Source Code under section 3.2; and + + iv) requires any subsequent distribution of the Program by any + party to be under a license that satisfies the requirements + of this section 3. + +3.2 When the Program is Distributed as Source Code: + + a) it must be made available under this Agreement, or if the + Program (i) is combined with other material in a separate file or + files made available under a Secondary License, and (ii) the initial + Contributor attached to the Source Code the notice described in + Exhibit A of this Agreement, then the Program may be made available + under the terms of such Secondary Licenses, and + + b) a copy of this Agreement must be included with each copy of + the Program. + +3.3 Contributors may not remove or alter any copyright, patent, +trademark, attribution notices, disclaimers of warranty, or limitations +of liability ("notices") contained within the Program from any copy of +the Program which they Distribute, provided that Contributors may add +their own appropriate notices. + +4. COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities +with respect to end users, business partners and the like. While this +license is intended to facilitate the commercial use of the Program, +the Contributor who includes the Program in a commercial product +offering should do so in a manner which does not create potential +liability for other Contributors. Therefore, if a Contributor includes +the Program in a commercial product offering, such Contributor +("Commercial Contributor") hereby agrees to defend and indemnify every +other Contributor ("Indemnified Contributor") against any losses, +damages and costs (collectively "Losses") arising from claims, lawsuits +and other legal actions brought by a third party against the Indemnified +Contributor to the extent caused by the acts or omissions of such +Commercial Contributor in connection with its distribution of the Program +in a commercial product offering. 
The obligations in this section do not +apply to any claims or Losses relating to any actual or alleged +intellectual property infringement. In order to qualify, an Indemnified +Contributor must: a) promptly notify the Commercial Contributor in +writing of such claim, and b) allow the Commercial Contributor to control, +and cooperate with the Commercial Contributor in, the defense and any +related settlement negotiations. The Indemnified Contributor may +participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial +product offering, Product X. That Contributor is then a Commercial +Contributor. If that Commercial Contributor then makes performance +claims, or offers warranties related to Product X, those performance +claims and warranties are such Commercial Contributor's responsibility +alone. Under this section, the Commercial Contributor would have to +defend claims against the other Contributors related to those performance +claims and warranties, and if a court requires any other Contributor to +pay any damages as a result, the Commercial Contributor must pay +those damages. + +5. NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" +BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF +TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR +PURPOSE. Each Recipient is solely responsible for determining the +appropriateness of using and distributing the Program and assumes all +risks associated with its exercise of rights under this Agreement, +including but not limited to the risks and costs of program errors, +compliance with applicable laws, damage to or loss of data, programs +or equipment, and unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS +SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST +PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE +EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +7. GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of +the remainder of the terms of this Agreement, and without further +action by the parties hereto, such provision shall be reformed to the +minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the +Program itself (excluding combinations of the Program with other software +or hardware) infringes such Recipient's patent(s), then such Recipient's +rights granted under Section 2(b) shall terminate as of the date such +litigation is filed. 
+ +All Recipient's rights under this Agreement shall terminate if it +fails to comply with any of the material terms or conditions of this +Agreement and does not cure such failure in a reasonable period of +time after becoming aware of such noncompliance. If all Recipient's +rights under this Agreement terminate, Recipient agrees to cease use +and distribution of the Program as soon as reasonably practicable. +However, Recipient's obligations under this Agreement and any licenses +granted by Recipient relating to the Program shall continue and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, +but in order to avoid inconsistency the Agreement is copyrighted and +may only be modified in the following manner. The Agreement Steward +reserves the right to publish new versions (including revisions) of +this Agreement from time to time. No one other than the Agreement +Steward has the right to modify this Agreement. The Eclipse Foundation +is the initial Agreement Steward. The Eclipse Foundation may assign the +responsibility to serve as the Agreement Steward to a suitable separate +entity. Each new version of the Agreement will be given a distinguishing +version number. The Program (including Contributions) may always be +Distributed subject to the version of the Agreement under which it was +received. In addition, after a new version of the Agreement is published, +Contributor may elect to Distribute the Program (including its +Contributions) under the new version. + +Except as expressly stated in Sections 2(a) and 2(b) above, Recipient +receives no rights or licenses to the intellectual property of any +Contributor under this Agreement, whether expressly, by implication, +estoppel or otherwise. All rights in the Program not expressly granted +under this Agreement are reserved. Nothing in this Agreement is intended +to be enforceable by any entity that is not a Contributor or Recipient. +No third-party beneficiary rights are created under this Agreement. + +Exhibit A - Form of Secondary Licenses Notice + +"This Source Code may also be made available under the following +Secondary Licenses when the conditions for such availability set forth +in the Eclipse Public License, v. 2.0 are satisfied: {name license(s), +version(s), and exceptions or additional permissions here}." + + Simply including a copy of this Agreement, including this Exhibit A + is not sufficient to license the Source Code under Secondary Licenses. + + If it is not possible or desirable to put the notice in a particular + file, then You may include the notice in a location (such as a LICENSE + file in a relevant directory) where a recipient would be likely to + look for such a notice. + + You may add additional accurate notices of copyright ownership. \ No newline at end of file diff --git a/licenses-binary/LICENSE-jline3.txt b/licenses-binary/LICENSE-jline3.txt new file mode 100644 index 0000000000000..ed9503f23c239 --- /dev/null +++ b/licenses-binary/LICENSE-jline3.txt @@ -0,0 +1,34 @@ +Copyright (c) 2002-2023, the original author or authors. +All rights reserved. + +https://opensource.org/licenses/BSD-3-Clause + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the following +conditions are met: + +Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with +the distribution. + +Neither the name of JLine nor the names of its contributors +may be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, +BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-loose-version.txt b/licenses-binary/LICENSE-loose-version.txt new file mode 100644 index 0000000000000..c96142b38228f --- /dev/null +++ b/licenses-binary/LICENSE-loose-version.txt @@ -0,0 +1,279 @@ +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. + +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. + +All Python releases are Open Source (see https://opensource.org for +the Open Source Definition). Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. + + Release Derived Year Owner GPL- + from compatible? (1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. 
According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. + +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. 
This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. 
Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. +Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. 
+ +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/licenses-binary/LICENSE-txw2.txt b/licenses-binary/LICENSE-txw2.txt new file mode 100644 index 0000000000000..da1c1cea70215 --- /dev/null +++ b/licenses-binary/LICENSE-txw2.txt @@ -0,0 +1,28 @@ +Copyright (c) 2018 Oracle and/or its affiliates. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + - Neither the name of the Eclipse Foundation, Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/licenses/LICENSE-loose-version.txt b/licenses/LICENSE-loose-version.txt new file mode 100644 index 0000000000000..c96142b38228f --- /dev/null +++ b/licenses/LICENSE-loose-version.txt @@ -0,0 +1,279 @@ +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. + +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. + +All Python releases are Open Source (see https://opensource.org for +the Open Source Definition). Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. + + Release Derived Year Owner GPL- + from compatible? (1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. + +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. 
Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. 
BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. 
CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. +Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt b/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt index 98e8b9592a2f5..b599123685236 100644 --- a/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt +++ b/mllib-local/benchmarks/BLASBenchmark-jdk21-results.txt @@ -2,337 +2,337 @@ daxpy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 136 141 3 733.9 1.4 1.0X -java 142 146 3 706.3 1.4 1.0X -native 136 141 2 734.3 1.4 1.0X +f2j 166 169 2 603.4 1.7 1.0X +java 155 157 1 646.0 1.5 1.1X +native 165 170 1 605.0 1.7 1.0X ================================================================================================ saxpy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 79 82 2 1267.0 0.8 1.0X -java 89 92 1 1118.6 0.9 0.9X -native 80 83 2 1248.2 0.8 1.0X +f2j 90 92 1 1110.4 0.9 1.0X +java 78 80 1 1277.7 0.8 1.2X +native 91 93 1 1098.6 0.9 1.0X ================================================================================================ dcopy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dcopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 136 140 3 736.7 1.4 1.0X -java 120 135 12 833.0 1.2 1.1X -native 133 141 7 750.1 1.3 1.0X +f2j 163 168 3 614.6 1.6 1.0X +java 154 162 4 649.8 1.5 1.1X +native 158 162 2 632.7 1.6 1.0X ================================================================================================ scopy ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 69 77 6 1441.4 0.7 1.0X -java 67 69 1 1498.5 0.7 1.0X -native 67 74 6 1485.8 0.7 1.0X +f2j 78 86 6 1275.7 0.8 1.0X +java 72 81 6 1391.8 0.7 1.1X +native 77 86 6 1291.6 0.8 1.0X 
================================================================================================ ddot ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 95 0 1052.1 1.0 1.0X -java 47 49 1 2107.8 0.5 2.0X -native 95 95 0 1054.5 0.9 1.0X +f2j 95 95 0 1052.3 1.0 1.0X +java 51 54 1 1954.2 0.5 1.9X +native 95 95 0 1055.1 0.9 1.0X ================================================================================================ sdot ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1075.5 0.9 1.0X -java 37 38 1 2677.6 0.4 2.5X -native 93 93 0 1076.6 0.9 1.0X +f2j 93 93 0 1074.3 0.9 1.0X +java 26 27 0 3891.3 0.3 3.6X +native 93 93 0 1075.0 0.9 1.0X ================================================================================================ dnrm2 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dnrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 143 144 1 698.1 1.4 1.0X -java 35 37 1 2822.8 0.4 4.0X -native 94 95 1 1065.1 0.9 1.5X +f2j 142 143 2 702.7 1.4 1.0X +java 36 37 0 2791.0 0.4 4.0X +native 94 95 1 1059.9 0.9 1.5X ================================================================================================ snrm2 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor snrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 107 108 1 932.0 1.1 1.0X -java 31 32 0 3231.1 0.3 3.5X -native 91 93 2 1104.9 0.9 1.2X +f2j 123 124 1 813.3 1.2 1.0X +java 18 18 0 5598.9 0.2 6.9X +native 93 93 1 1074.8 0.9 1.3X ================================================================================================ dscal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 127 130 2 789.7 1.3 1.0X 
-java 116 119 2 863.3 1.2 1.1X -native 122 126 4 819.4 1.2 1.0X +f2j 153 157 2 655.5 1.5 1.0X +java 135 139 2 740.6 1.4 1.1X +native 150 153 5 667.4 1.5 1.0X ================================================================================================ sscal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 75 83 7 1328.3 0.8 1.0X -java 52 59 5 1911.0 0.5 1.4X -native 67 74 7 1502.7 0.7 1.1X +f2j 83 91 7 1204.0 0.8 1.0X +java 63 70 4 1593.5 0.6 1.3X +native 77 84 7 1304.7 0.8 1.1X ================================================================================================ dgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 96 96 1 1041.2 1.0 1.0X -java 31 32 1 3241.1 0.3 3.1X -native 43 44 1 2303.4 0.4 2.2X +f2j 97 97 1 1031.7 1.0 1.0X +java 25 26 0 3958.2 0.3 3.8X +native 55 57 1 1814.1 0.6 1.8X ================================================================================================ dgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 92 93 0 1085.6 0.9 1.0X -java 31 33 0 3187.8 0.3 2.9X -native 92 93 0 1086.1 0.9 1.0X +f2j 93 94 0 1070.0 0.9 1.0X +java 25 26 0 3995.9 0.3 3.7X +native 94 94 0 1068.7 0.9 1.0X ================================================================================================ sgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 96 0 1074.4 0.9 1.0X -java 32 33 0 3131.9 0.3 2.9X -native 33 35 1 3028.8 0.3 2.8X +f2j 95 96 0 1049.7 1.0 1.0X +java 13 14 0 7739.8 0.1 7.4X +native 36 38 1 2745.4 0.4 2.6X ================================================================================================ sgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1079.9 0.9 1.0X -java 23 24 0 4315.5 0.2 4.0X -native 89 93 0 1121.6 0.9 1.0X +f2j 93 93 0 1078.6 0.9 1.0X +java 13 14 0 7912.9 0.1 7.3X +native 93 93 0 1079.6 0.9 1.0X ================================================================================================ dger ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dger: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 130 133 1 768.3 1.3 1.0X -java 139 144 9 718.8 1.4 0.9X -native 130 135 4 766.6 1.3 1.0X +f2j 158 161 1 633.4 1.6 1.0X +java 131 134 2 763.1 1.3 1.2X +native 158 162 2 634.7 1.6 1.0X ================================================================================================ dspmv[U] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 89 89 0 561.3 1.8 1.0X -java 49 50 1 1022.3 1.0 1.8X -native 46 47 0 1076.9 0.9 1.9X +f2j 89 90 0 560.1 1.8 1.0X +java 13 14 0 3730.5 0.3 6.7X +native 47 47 0 1063.5 0.9 1.9X ================================================================================================ dspr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 95 2 535.3 1.9 1.0X -java 83 91 8 601.4 1.7 1.1X -native 59 65 6 853.9 1.2 1.6X +f2j 93 100 8 537.7 1.9 1.0X +java 92 100 8 541.3 1.8 1.0X +native 73 80 8 683.4 1.5 1.3X ================================================================================================ dsyr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 107 109 1 466.5 2.1 1.0X -java 107 109 1 468.7 2.1 1.0X -native 108 111 1 464.7 2.2 1.0X +f2j 148 151 1 337.0 3.0 1.0X +java 149 151 1 336.2 3.0 1.0X +native 128 132 1 389.5 2.6 1.2X ================================================================================================ dgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 652 666 2 1534.0 0.7 1.0X -java 421 424 1 2372.7 0.4 1.5X -native 629 634 10 1589.5 0.6 1.0X +f2j 668 669 3 1497.4 0.7 1.0X +java 63 69 4 15802.2 0.1 10.6X +native 631 633 3 1584.1 0.6 1.1X ================================================================================================ dgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 663 668 2 1508.2 0.7 1.0X -java 418 424 2 2392.8 0.4 1.6X -native 374 376 1 2673.7 0.4 1.8X +f2j 661 663 1 1512.6 0.7 1.0X +java 64 68 4 15730.6 0.1 10.4X +native 374 376 1 2672.3 0.4 1.8X ================================================================================================ dgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 900 901 2 1111.7 0.9 1.0X -java 421 424 3 2377.0 0.4 2.1X -native 901 903 1 1109.6 0.9 1.0X +f2j 899 901 1 1111.9 0.9 1.0X +java 63 68 4 15890.7 0.1 14.3X +native 902 903 1 1108.7 0.9 1.0X ================================================================================================ dgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 945 948 6 1058.3 0.9 1.0X -java 421 424 1 2377.0 0.4 2.2X -native 907 914 2 1102.1 0.9 1.0X +f2j 940 943 4 1063.9 0.9 1.0X +java 63 68 5 15828.7 0.1 14.9X +native 914 916 1 1094.3 0.9 1.0X ================================================================================================ sgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 640 647 2 1562.1 0.6 1.0X -java 415 421 2 2411.2 0.4 1.5X -native 362 370 2 2758.9 0.4 1.8X +f2j 649 650 1 1541.4 0.6 1.0X +java 40 41 1 25057.3 0.0 16.3X 
+native 371 372 1 2696.7 0.4 1.7X ================================================================================================ sgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 641 649 2 1560.0 0.6 1.0X -java 414 422 2 2415.8 0.4 1.5X -native 371 372 1 2696.9 0.4 1.7X +f2j 650 651 1 1538.5 0.6 1.0X +java 40 41 1 24717.0 0.0 16.1X +native 371 372 1 2692.9 0.4 1.8X ================================================================================================ sgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 898 899 2 1113.0 0.9 1.0X -java 418 421 1 2390.6 0.4 2.1X -native 913 918 1 1095.4 0.9 1.0X +f2j 900 901 0 1111.2 0.9 1.0X +java 40 42 1 25076.9 0.0 22.6X +native 917 920 2 1090.1 0.9 1.0X ================================================================================================ sgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 935 938 6 1070.0 0.9 1.0X -java 419 422 1 2387.2 0.4 2.2X -native 898 913 2 1113.1 0.9 1.0X +f2j 940 943 1 1063.6 0.9 1.0X +java 40 42 1 24825.6 0.0 23.3X +native 914 916 1 1094.0 0.9 1.0X diff --git a/mllib-local/benchmarks/BLASBenchmark-results.txt b/mllib-local/benchmarks/BLASBenchmark-results.txt index 8bafcdd8fe702..8fde701d5b2b9 100644 --- a/mllib-local/benchmarks/BLASBenchmark-results.txt +++ b/mllib-local/benchmarks/BLASBenchmark-results.txt @@ -2,337 +2,337 @@ daxpy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 145 153 5 691.6 1.4 1.0X -java 154 158 2 647.5 1.5 0.9X -native 148 152 5 676.5 1.5 1.0X +f2j 152 157 2 656.7 1.5 1.0X +java 146 152 3 686.0 1.5 1.0X +native 154 160 2 647.8 1.5 1.0X ================================================================================================ saxpy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 
17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 85 89 3 1177.3 0.8 1.0X -java 92 94 1 1084.0 0.9 0.9X -native 85 88 2 1173.3 0.9 1.0X +f2j 86 88 1 1167.6 0.9 1.0X +java 73 76 1 1367.1 0.7 1.2X +native 87 89 1 1150.2 0.9 1.0X ================================================================================================ dcopy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dcopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 141 147 5 707.3 1.4 1.0X -java 138 143 3 723.3 1.4 1.0X -native 143 150 4 697.8 1.4 1.0X +f2j 149 156 3 673.1 1.5 1.0X +java 148 154 3 676.5 1.5 1.0X +native 149 154 2 668.9 1.5 1.0X ================================================================================================ scopy ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scopy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 82 84 1 1221.3 0.8 1.0X -java 71 74 1 1402.9 0.7 1.1X -native 79 82 1 1259.0 0.8 1.0X +f2j 83 86 1 1199.9 0.8 1.0X +java 75 78 2 1337.1 0.7 1.1X +native 81 83 1 1240.3 0.8 1.0X ================================================================================================ ddot ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 96 1 1049.5 1.0 1.0X -java 47 50 2 2111.5 0.5 2.0X -native 95 95 2 1055.4 0.9 1.0X +f2j 95 96 0 1048.1 1.0 1.0X +java 45 48 2 2208.4 0.5 2.1X +native 95 96 1 1053.9 0.9 1.0X ================================================================================================ sdot ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 1 1072.9 0.9 1.0X -java 36 37 0 2741.2 0.4 2.6X -native 93 93 1 1074.6 0.9 1.0X +f2j 93 93 0 1073.1 0.9 1.0X +java 22 23 1 4467.0 0.2 4.2X +native 93 93 0 1075.0 0.9 1.0X ================================================================================================ dnrm2 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dnrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 143 144 3 699.2 1.4 1.0X -java 37 40 2 2696.5 0.4 3.9X -native 94 95 3 1060.8 0.9 1.5X +f2j 143 143 1 699.6 1.4 1.0X +java 32 33 0 3090.1 0.3 4.4X +native 94 95 1 1059.3 0.9 1.5X ================================================================================================ snrm2 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor snrm2: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 116 116 1 863.5 1.2 1.0X -java 32 32 0 3134.9 0.3 3.6X -native 93 93 1 1074.7 0.9 1.2X +f2j 116 116 1 862.5 1.2 1.0X +java 16 16 0 6189.1 0.2 7.2X +native 93 93 0 1074.4 0.9 1.2X ================================================================================================ dscal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 137 141 2 732.4 1.4 1.0X -java 125 129 2 799.9 1.3 1.1X -native 135 138 2 743.4 1.3 1.0X +f2j 144 148 3 696.6 1.4 1.0X +java 132 136 2 757.4 1.3 1.1X +native 139 145 2 718.7 1.4 1.0X ================================================================================================ sscal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 88 91 1 1131.6 0.9 1.0X -java 65 68 3 1537.5 0.7 1.4X -native 80 82 1 1255.9 0.8 1.1X +f2j 88 90 1 1130.9 0.9 1.0X +java 66 69 1 1506.1 0.7 1.3X +native 80 83 1 1244.9 0.8 1.1X ================================================================================================ dgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 96 97 2 1036.4 1.0 1.0X -java 33 33 1 3047.4 0.3 2.9X -native 44 47 2 2250.1 0.4 2.2X +f2j 96 97 0 1038.0 
1.0 1.0X +java 23 24 0 4285.5 0.2 4.1X +native 46 48 1 2175.7 0.5 2.1X ================================================================================================ dgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 94 94 1 1061.4 0.9 1.0X -java 32 33 0 3078.4 0.3 2.9X -native 93 94 1 1071.3 0.9 1.0X +f2j 94 95 1 1061.5 0.9 1.0X +java 23 24 0 4279.5 0.2 4.0X +native 93 94 0 1072.6 0.9 1.0X ================================================================================================ sgemv[N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 95 1 1052.9 0.9 1.0X -java 38 39 0 2602.1 0.4 2.5X -native 34 36 1 2932.4 0.3 2.8X +f2j 95 95 0 1053.1 0.9 1.0X +java 12 13 0 8517.5 0.1 8.1X +native 34 36 1 2909.1 0.3 2.8X ================================================================================================ sgemv[T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 93 93 0 1078.0 0.9 1.0X -java 28 28 0 3609.5 0.3 3.3X -native 93 93 0 1078.9 0.9 1.0X +f2j 93 93 0 1077.2 0.9 1.0X +java 12 12 0 8423.1 0.1 7.8X +native 93 93 0 1078.4 0.9 1.0X ================================================================================================ dger ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dger: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 142 146 2 705.7 1.4 1.0X -java 147 151 4 681.2 1.5 1.0X -native 139 144 5 721.4 1.4 1.0X +f2j 149 154 2 672.9 1.5 1.0X +java 130 134 2 768.8 1.3 1.1X +native 146 150 2 686.0 1.5 1.0X ================================================================================================ dspmv[U] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -f2j 100 100 1 501.6 2.0 1.0X -java 48 48 0 1038.9 1.0 2.1X -native 47 47 0 1065.8 0.9 2.1X +f2j 100 100 0 500.5 2.0 1.0X +java 12 13 0 4225.7 0.2 8.4X +native 47 47 0 1066.5 0.9 2.1X ================================================================================================ dspr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 95 98 2 526.0 1.9 1.0X -java 96 98 1 523.0 1.9 1.0X -native 71 76 2 701.6 1.4 1.3X +f2j 97 99 1 515.9 1.9 1.0X +java 97 99 1 517.1 1.9 1.0X +native 74 78 1 677.5 1.5 1.3X ================================================================================================ dsyr[U] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 118 122 2 424.8 2.4 1.0X -java 119 123 3 420.6 2.4 1.0X -native 117 122 3 426.4 2.3 1.0X +f2j 145 149 2 345.3 2.9 1.0X +java 146 150 2 342.4 2.9 1.0X +native 122 128 2 409.9 2.4 1.2X ================================================================================================ dgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 670 671 2 1492.7 0.7 1.0X -java 422 424 3 2370.0 0.4 1.6X -native 631 633 4 1586.0 0.6 1.1X +f2j 670 673 4 1491.8 0.7 1.0X +java 65 72 1 15466.7 0.1 10.4X +native 632 634 3 1583.1 0.6 1.1X ================================================================================================ dgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 672 677 5 1487.0 0.7 1.0X -java 422 424 3 2368.6 0.4 1.6X -native 375 377 3 2666.7 0.4 1.8X +f2j 673 675 2 1486.3 0.7 1.0X +java 72 73 2 13912.8 0.1 9.4X +native 376 377 1 2662.5 0.4 1.8X ================================================================================================ dgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 
17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 923 925 6 1083.4 0.9 1.0X -java 421 423 4 2372.8 0.4 2.2X -native 902 906 8 1108.1 0.9 1.0X +f2j 923 925 1 1082.9 0.9 1.0X +java 64 72 1 15595.7 0.1 14.4X +native 902 904 1 1108.7 0.9 1.0X ================================================================================================ dgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 955 957 5 1047.1 1.0 1.0X -java 422 423 3 2370.8 0.4 2.3X -native 915 917 3 1092.7 0.9 1.0X +f2j 953 957 2 1049.0 1.0 1.0X +java 65 73 1 15430.1 0.1 14.7X +native 915 917 1 1092.5 0.9 1.0X ================================================================================================ sgemm[N,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 649 651 2 1539.7 0.6 1.0X -java 420 421 3 2381.8 0.4 1.5X -native 371 372 1 2693.9 0.4 1.7X +f2j 649 651 2 1540.5 0.6 1.0X +java 41 42 1 24371.9 0.0 15.8X +native 371 373 5 2695.0 0.4 1.7X ================================================================================================ sgemm[N,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 650 652 4 1538.8 0.6 1.0X -java 420 421 3 2381.1 0.4 1.5X -native 372 373 4 2689.5 0.4 1.7X +f2j 651 653 3 1535.9 0.7 1.0X +java 41 42 1 24106.8 0.0 15.7X +native 372 373 1 2688.0 0.4 1.8X ================================================================================================ sgemm[T,N] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 902 903 1 1108.5 0.9 1.0X -java 419 420 3 2386.9 0.4 2.2X -native 918 920 4 1089.6 0.9 1.0X +f2j 903 904 1 1107.6 0.9 1.0X +java 40 41 1 24712.8 0.0 22.3X 
+native 919 921 1 1088.1 0.9 1.0X ================================================================================================ sgemm[T,T] ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -f2j 949 952 8 1053.6 0.9 1.0X -java 419 420 2 2384.2 0.4 2.3X -native 914 916 5 1094.3 0.9 1.0X +f2j 950 953 5 1052.9 0.9 1.0X +java 41 42 1 24365.2 0.0 23.1X +native 915 916 1 1093.0 0.9 1.0X diff --git a/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt b/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt index c66e79500f4d3..af675fe609fe2 100644 --- a/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt +++ b/mllib/benchmarks/UDTSerializationBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ VectorUDT de/serialization ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor VectorUDT de/serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -serialize 102 106 3 0.0 101512.2 1.0X -deserialize 75 78 1 0.0 75331.6 1.3X +serialize 95 97 1 0.0 94974.9 1.0X +deserialize 67 69 1 0.0 66631.2 1.4X diff --git a/mllib/benchmarks/UDTSerializationBenchmark-results.txt b/mllib/benchmarks/UDTSerializationBenchmark-results.txt index c54679dfa205f..d22630df30dd8 100644 --- a/mllib/benchmarks/UDTSerializationBenchmark-results.txt +++ b/mllib/benchmarks/UDTSerializationBenchmark-results.txt @@ -2,11 +2,11 @@ VectorUDT de/serialization ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor VectorUDT de/serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -serialize 109 113 3 0.0 109208.6 1.0X -deserialize 77 80 1 0.0 77116.0 1.4X +serialize 90 99 3 0.0 89569.0 1.0X +deserialize 68 72 3 0.0 68026.4 1.3X diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala index 41f39461f71a6..83b77510602b2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ @@ -192,8 +193,8 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, if ($(predictionCol).nonEmpty) { transformImpl(dataset) } else { - this.logWarning(s"$uid: Predictor.transform() does nothing" + - " because no output 
columns were set.") + logWarning(log"${MDC(LogKeys.UUID, uid)}: Predictor.transform() does nothing because " + + log"no output columns were set.") dataset.toDF() } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index e12c68f31099e..7883a0dea54f1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.{PredictionModel, Predictor, PredictorParams} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.ParamMap @@ -149,8 +150,8 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur } if (numColsOutput == 0) { - logWarning(s"$uid: ClassificationModel.transform() does nothing" + - " because no output columns were set.") + logWarning(log"${MDC(LogKeys.UUID, uid)}: ClassificationModel.transform() does nothing " + + log"because no output columns were set.") } outputData.toDF() } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 024693ba06f20..4bcc7877658d1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{COUNT, RANGE} +import org.apache.spark.internal.LogKeys.{COUNT, RANGE} import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg._ import org.apache.spark.ml.optim.aggregator._ @@ -179,8 +179,8 @@ class LinearSVC @Since("2.2.0") ( maxBlockSizeInMB) if (dataset.storageLevel != StorageLevel.NONE) { - instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + - s"then cached during training. Be careful of double caching!") + instr.logWarning("Input instances will be standardized, blockified to blocks, and " + + "then cached during training. 
Be careful of double caching!") } val instances = dataset.select( diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0d487377b9319..b3c48f13591fd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -27,8 +27,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.Since -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{COUNT, RANGE} +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{COUNT, RANGE} import org.apache.spark.ml.feature._ import org.apache.spark.ml.impl.Utils import org.apache.spark.ml.linalg._ @@ -503,8 +503,8 @@ class LogisticRegression @Since("1.2.0") ( tol, fitIntercept, maxBlockSizeInMB) if (dataset.storageLevel != StorageLevel.NONE) { - instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + - s"then cached during training. Be careful of double caching!") + instr.logWarning("Input instances will be standardized, blockified to blocks, and " + + "then cached during training. Be careful of double caching!") } val instances = dataset.select( @@ -569,8 +569,8 @@ class LogisticRegression @Since("1.2.0") ( val isConstantLabel = histogram.count(_ != 0.0) == 1 if ($(fitIntercept) && isConstantLabel && !usingBoundConstrainedOptimization) { - instr.logWarning(s"All labels are the same value and fitIntercept=true, so the " + - s"coefficients will be zeros. Training is not needed.") + instr.logWarning("All labels are the same value and fitIntercept=true, so the " + + "coefficients will be zeros. Training is not needed.") val constantLabelIndex = Vectors.dense(histogram).argmax val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures, new Array[Int](numCoefficientSets + 1), Array.emptyIntArray, Array.emptyDoubleArray, @@ -584,8 +584,8 @@ class LogisticRegression @Since("1.2.0") ( } if (!$(fitIntercept) && isConstantLabel) { - instr.logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " + - s"dangerous ground, so the algorithm may not converge.") + instr.logWarning("All labels belong to a single class and fitIntercept=false. It's a " + + "dangerous ground, so the algorithm may not converge.") } val featuresMean = summarizer.mean.toArray @@ -847,9 +847,11 @@ class LogisticRegression @Since("1.2.0") ( (_initialModel.interceptVector.size == numCoefficientSets) && (_initialModel.getFitIntercept == $(fitIntercept)) if (!modelIsValid) { - instr.logWarning(s"Initial coefficients will be ignored! Its dimensions " + - s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " + - s"expected size ($numCoefficientSets, $numFeatures)") + instr.logWarning(log"Initial coefficients will be ignored! 
Its dimensions " + + log"(${MDC(LogKeys.NUM_ROWS, providedCoefs.numRows)}}, " + + log"${MDC(LogKeys.NUM_COLUMNS, providedCoefs.numCols)}) did not match the " + + log"expected size (${MDC(LogKeys.NUM_COEFFICIENTS, numCoefficientSets)}, " + + log"${MDC(LogKeys.NUM_FEATURES, numFeatures)})") } modelIsValid case None => false diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index b70f3ddd4c14d..18643f74b700f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -30,6 +30,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml._ import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.{Vector, Vectors} @@ -180,8 +181,8 @@ final class OneVsRestModel private[ml] ( val outputSchema = transformSchema(dataset.schema, logging = true) if (getPredictionCol.isEmpty && getRawPredictionCol.isEmpty) { - logWarning(s"$uid: OneVsRestModel.transform() does nothing" + - " because no output columns were set.") + logWarning(log"${MDC(LogKeys.UUID, uid)}: OneVsRestModel.transform() does nothing " + + log"because no output columns were set.") return dataset.toDF() } @@ -400,7 +401,8 @@ final class OneVsRest @Since("1.4.0") ( getClassifier match { case _: HasWeightCol => true case c => - instr.logWarning(s"weightCol is ignored, as it is not supported by $c now.") + instr.logWarning(log"weightCol is ignored, as it is not supported by " + + log"${MDC(LogKeys.CLASSIFIER, c)} now.") false } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 460f2398a4628..61fab02cb4518 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.linalg.{DenseVector, Vector, VectorUDT} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ @@ -154,8 +155,8 @@ abstract class ProbabilisticClassificationModel[ } if (numColsOutput == 0) { - this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: ProbabilisticClassificationModel.transform()" + + log" does nothing because no output columns were set.") } outputData.toDF() } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 19ae8359b9a37..a68b2fc0dec83 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -21,6 +21,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Since import org.apache.spark.broadcast.Broadcast +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.impl.Utils.{unpackUpperTriangular, EPSILON} 
import org.apache.spark.ml.linalg._ @@ -142,8 +143,8 @@ class GaussianMixtureModel private[ml] ( } if (numColsOutput == 0) { - this.logWarning(s"$uid: GaussianMixtureModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: GaussianMixtureModel.transform() does " + + log"nothing because no output columns were set.") } outputData.toDF() } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 519978a0733b9..04f76660aee6a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -22,6 +22,8 @@ import scala.collection.mutable import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Since +import org.apache.spark.internal.LogKeys.{COST, INIT_MODE, NUM_ITERATIONS, TOTAL_TIME} +import org.apache.spark.internal.MDC import org.apache.spark.ml.{Estimator, Model, PipelineStage} import org.apache.spark.ml.feature.{Instance, InstanceBlock} import org.apache.spark.ml.linalg._ @@ -449,14 +451,15 @@ class KMeans @Since("1.5.0") ( private def trainWithBlock(dataset: Dataset[_], instr: Instrumentation) = { if (dataset.storageLevel != StorageLevel.NONE) { - instr.logWarning(s"Input vectors will be blockified to blocks, and " + - s"then cached during training. Be careful of double caching!") + instr.logWarning("Input vectors will be blockified to blocks, and " + + "then cached during training. Be careful of double caching!") } - val initStartTime = System.nanoTime + val initStartTime = System.currentTimeMillis val centers = initialize(dataset) - val initTimeInSeconds = (System.nanoTime - initStartTime) / 1e9 - instr.logInfo(f"Initialization with ${$(initMode)} took $initTimeInSeconds%.3f seconds.") + val initTimeMs = System.currentTimeMillis - initStartTime + instr.logInfo(log"Initialization with ${MDC(INIT_MODE, $(initMode))} took " + + log"${MDC(TOTAL_TIME, initTimeMs)} ms.") val numFeatures = centers.head.size instr.logNumFeatures(numFeatures) @@ -492,7 +495,7 @@ class KMeans @Since("1.5.0") ( val distanceFunction = getDistanceFunction val sc = dataset.sparkSession.sparkContext - val iterationStartTime = System.nanoTime + val iterationStartTime = System.currentTimeMillis var converged = false var cost = 0.0 var iteration = 0 @@ -549,15 +552,16 @@ class KMeans @Since("1.5.0") ( } blocks.unpersist() - val iterationTimeInSeconds = (System.nanoTime() - iterationStartTime) / 1e9 - instr.logInfo(f"Iterations took $iterationTimeInSeconds%.3f seconds.") + val iterationTimeMs = System.currentTimeMillis - iterationStartTime + instr.logInfo(log"Iterations took ${MDC(TOTAL_TIME, iterationTimeMs)} ms.") if (iteration == $(maxIter)) { - instr.logInfo(s"KMeans reached the max number of iterations: ${$(maxIter)}.") + instr.logInfo(log"KMeans reached the max number of iterations: " + + log"${MDC(NUM_ITERATIONS, $(maxIter))}.") } else { - instr.logInfo(s"KMeans converged in $iteration iterations.") + instr.logInfo(log"KMeans converged in ${MDC(NUM_ITERATIONS, iteration)} iterations.") } - instr.logInfo(s"The cost is $cost.") + instr.logInfo(log"The cost is ${MDC(COST, cost)}.") new MLlibKMeansModel(centers.map(OldVectors.fromML), $(distanceMeasure), cost, iteration) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 3727eb17dcd0a..c726aed14ee51 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg._ @@ -139,8 +140,9 @@ final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) }.apply(col(colName)) case _: VectorUDT if td < 0 => - this.logWarning(s"Binarization operations on sparse dataset with negative threshold " + - s"$td will build a dense output, so take care when applying to sparse input.") + logWarning(log"Binarization operations on sparse dataset with negative threshold " + + log"${MDC(LogKeys.THRESHOLD, td)} will build a dense output, so take care when " + + log"applying to sparse input.") udf { vector: Vector => val values = Array.fill(vector.size)(1.0) var nnz = vector.size diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 5862a60a407d4..93956fc1811ef 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.feature import java.util.Locale import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasInputCols, HasOutputCol, HasOutputCols} @@ -129,9 +130,9 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String if (Locale.getAvailableLocales.contains(Locale.getDefault)) { Locale.getDefault } else { - logWarning(s"Default locale set was [${Locale.getDefault.toString}]; however, it was " + - "not found in available locales in JVM, falling back to en_US locale. Set param `locale` " + - "in order to respect another locale.") + logWarning(log"Default locale set was [${MDC(LogKeys.LOCALE, Locale.getDefault)}]; " + + log"however, it was not found in available locales in JVM, falling back to en_US locale. " + + log"Set param `locale` in order to respect another locale.") Locale.US } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index f52f56174ed23..60dc4d0240716 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -21,6 +21,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.{Estimator, Model, Transformer} import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param._ @@ -431,8 +432,8 @@ class StringIndexerModel ( val labels = labelsArray(i) if (!dataset.schema.fieldNames.contains(inputColName)) { - logWarning(s"Input column ${inputColName} does not exist during transformation. " + - "Skip StringIndexerModel for this column.") + logWarning(log"Input column ${MDC(LogKeys.COLUMN_NAME, inputColName)} does not exist " + + log"during transformation. 
Skip StringIndexerModel for this column.") outputColNames(i) = null } else { val filteredLabels = getHandleInvalid match { diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala index c2fe001d4048d..17adf61e46e9d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala @@ -17,10 +17,13 @@ package org.apache.spark.ml.optim +import org.apache.spark.internal.LogKeys.{NUM_ITERATIONS, RELATIVE_TOLERANCE} +import org.apache.spark.internal.MDC import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.linalg._ import org.apache.spark.ml.util.OptionalInstrumentation import org.apache.spark.rdd.RDD +import org.apache.spark.util.MavenUtils.LogStringContext /** * Model fitted by [[IterativelyReweightedLeastSquares]]. @@ -101,14 +104,15 @@ private[ml] class IterativelyReweightedLeastSquares( if (maxTol < tol) { converged = true - instr.logInfo(s"IRLS converged in $iter iterations.") + instr.logInfo(log"IRLS converged in ${MDC(NUM_ITERATIONS, iter)} iterations.") } - instr.logInfo(s"Iteration $iter : relative tolerance = $maxTol") + instr.logInfo(log"Iteration ${MDC(NUM_ITERATIONS, iter)}: " + + log"relative tolerance = ${MDC(RELATIVE_TOLERANCE, maxTol)}") iter = iter + 1 if (iter == maxIter) { - instr.logInfo(s"IRLS reached the max number of iterations: $maxIter.") + instr.logInfo(log"IRLS reached the max number of iterations: ${MDC(NUM_ITERATIONS, iter)}.") } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 9acc20b8eb2e8..eff100cc3ae3a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -17,10 +17,13 @@ package org.apache.spark.ml.optim +import org.apache.spark.internal.LogKeys.COUNT +import org.apache.spark.internal.MDC import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg._ import org.apache.spark.ml.util.OptionalInstrumentation import org.apache.spark.rdd.RDD +import org.apache.spark.util.MavenUtils.LogStringContext /** * Model fitted by [[WeightedLeastSquares]]. 
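
Editor's note: the MLlib hunks above all perform the same migration, replacing plain s-interpolated log strings with Spark's structured log interpolator and MDC-tagged keys from org.apache.spark.internal.LogKeys. The following is a minimal sketch of that pattern, not part of the patch itself: the class LoggingPatternExample, its method warnMissingColumn, and the message text are hypothetical, and the file is placed under org.apache.spark.ml only because the internal Logging trait is package-private.

package org.apache.spark.ml

import org.apache.spark.internal.{Logging, LogKeys, MDC}

// Hypothetical transformer used only to illustrate the structured-logging
// pattern adopted in the hunks above. Mixing in the internal Logging trait
// provides both the logWarning overloads and the log"..." interpolator.
class LoggingPatternExample(uid: String) extends Logging {

  def warnMissingColumn(inputColName: String): Unit = {
    // Each interpolated value is wrapped in MDC(<LogKey>, value): the rendered
    // message stays human readable while the key/value pair is also emitted
    // as structured metadata for downstream log aggregation.
    logWarning(log"Input column ${MDC(LogKeys.COLUMN_NAME, inputColName)} does not " +
      log"exist during transformation. ${MDC(LogKeys.UUID, uid)} will skip it.")
  }
}

As in the hunks above, long messages are built by concatenating several log"..." fragments with +, so each MDC key stays attached to the fragment that mentions it.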
@@ -106,7 +109,7 @@ private[ml] class WeightedLeastSquares( val summary = instances.treeAggregate(new Aggregator)(_.add(_), _.merge(_), depth) summary.validate() - instr.logInfo(s"Number of instances: ${summary.count}.") + instr.logInfo(log"Number of instances: ${MDC(COUNT, summary.count)}.") val k = if (fitIntercept) summary.k + 1 else summary.k val numFeatures = summary.k val triK = summary.triK @@ -121,13 +124,13 @@ private[ml] class WeightedLeastSquares( if (rawBStd == 0) { if (fitIntercept || rawBBar == 0.0) { if (rawBBar == 0.0) { - instr.logWarning(s"Mean and standard deviation of the label are zero, so the " + - s"coefficients and the intercept will all be zero; as a result, training is not " + - s"needed.") + instr.logWarning("Mean and standard deviation of the label are zero, so the " + + "coefficients and the intercept will all be zero; as a result, training is not " + + "needed.") } else { - instr.logWarning(s"The standard deviation of the label is zero, so the coefficients " + - s"will be zeros and the intercept will be the mean of the label; as a result, " + - s"training is not needed.") + instr.logWarning("The standard deviation of the label is zero, so the coefficients " + + "will be zeros and the intercept will be the mean of the label; as a result, " + + "training is not needed.") } val coefficients = new DenseVector(Array.ofDim(numFeatures)) val intercept = rawBBar @@ -136,8 +139,8 @@ private[ml] class WeightedLeastSquares( } else { require(!(regParam > 0.0 && standardizeLabel), "The standard deviation of the label is " + "zero. Model cannot be regularized when labels are standardized.") - instr.logWarning(s"The standard deviation of the label is zero. Consider setting " + - s"fitIntercept=true.") + instr.logWarning("The standard deviation of the label is zero. 
Consider setting " + + "fitIntercept=true.") } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala index 665e50af67d46..9e66647ef35fb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala @@ -17,7 +17,8 @@ package org.apache.spark.ml.r -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{FEATURE_COLUMN, LABEL_COLUMN, NEW_FEATURE_COLUMN_NAME, NEW_LABEL_COLUMN_NAME} import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.ml.util.Identifiable @@ -37,15 +38,15 @@ private[r] object RWrapperUtils extends Logging { def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = { if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) { val newFeaturesName = s"${Identifiable.randomUID(rFormula.getFeaturesCol)}" - logInfo(s"data containing ${rFormula.getFeaturesCol} column, " + - s"using new name $newFeaturesName instead") + logInfo(log"data containing ${MDC(FEATURE_COLUMN, rFormula.getFeaturesCol)} column, " + + log"using new name ${MDC(NEW_FEATURE_COLUMN_NAME, newFeaturesName)} instead") rFormula.setFeaturesCol(newFeaturesName) } if (rFormula.getForceIndexLabel && data.schema.fieldNames.contains(rFormula.getLabelCol)) { val newLabelName = s"${Identifiable.randomUID(rFormula.getLabelCol)}" - logInfo(s"data containing ${rFormula.getLabelCol} column and we force to index label, " + - s"using new name $newLabelName instead") + logInfo(log"data containing ${MDC(LABEL_COLUMN, rFormula.getLabelCol)} column and we force " + + log"to index label, using new name ${MDC(NEW_LABEL_COLUMN_NAME, newLabelName)} instead") rFormula.setLabelCol(newLabelName) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 1e6be16ef62b7..50f94a5799444 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -33,7 +33,8 @@ import org.json4s.JsonDSL._ import org.apache.spark.{Partitioner, SparkException} import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.BLAS import org.apache.spark.ml.param._ @@ -1027,7 +1028,7 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { checkpointFile.getFileSystem(sc.hadoopConfiguration).delete(checkpointFile, true) } catch { case e: IOException => - logWarning(s"Cannot delete checkpoint file $file:", e) + logWarning(log"Cannot delete checkpoint file ${MDC(PATH, file)}:", e) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 57d20bcd6f49d..788ad65497dfc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -24,7 +24,7 @@ import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS} import org.apache.hadoop.fs.Path import 
org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg._ @@ -206,8 +206,8 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S instr.logNamedValue("quantileProbabilities.size", $(quantileProbabilities).length) if (dataset.storageLevel != StorageLevel.NONE) { - instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + - s"then cached during training. Be careful of double caching!") + instr.logWarning("Input instances will be standardized, blockified to blocks, and " + + "then cached during training. Be careful of double caching!") } val validatedCensorCol = { @@ -441,8 +441,8 @@ class AFTSurvivalRegressionModel private[ml] ( if (predictionColNames.nonEmpty) { dataset.withColumns(predictionColNames, predictionColumns) } else { - this.logWarning(s"$uid: AFTSurvivalRegressionModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: AFTSurvivalRegressionModel.transform() " + + log"does nothing because no output columns were set.") dataset.toDF() } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 6c0089b689499..481e8c8357f16 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -22,6 +22,7 @@ import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap @@ -238,8 +239,8 @@ class DecisionTreeRegressionModel private[ml] ( if (predictionColNames.nonEmpty) { dataset.withColumns(predictionColNames, predictionColumns) } else { - this.logWarning(s"$uid: DecisionTreeRegressionModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: DecisionTreeRegressionModel.transform() " + + log"does nothing because no output columns were set.") dataset.toDF() } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 0c58cc2449b99..732bfcbd671ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -21,7 +21,7 @@ import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.ml.linalg.{BLAS, Vector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree._ @@ -288,8 +288,8 @@ class GBTRegressionModel private[ml]( if (predictionColNames.nonEmpty) { dataset.withColumns(predictionColNames, predictionColumns) } else { - this.logWarning(s"$uid: GBTRegressionModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: GBTRegressionModel.transform() " + + log"does nothing because no output columns 
were set.") dataset.toDF() } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index aa39a3e177eeb..4ded2f8d7bf5c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.attribute._ import org.apache.spark.ml.feature.{Instance, OffsetInstance} @@ -1074,8 +1074,8 @@ class GeneralizedLinearRegressionModel private[ml] ( } if (numColsOutput == 0) { - this.logWarning(s"$uid: GeneralizedLinearRegressionModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: GeneralizedLinearRegressionModel.transform()" + + log" does nothing because no output columns were set.") } outputData.toDF() } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index d53b8b270f2d6..23e536ce45eb5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -333,8 +333,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String epsilon, maxBlockSizeInMB) if (dataset.storageLevel != StorageLevel.NONE) { - instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + - s"then cached during training. Be careful of double caching!") + instr.logWarning("Input instances will be standardized, blockified to blocks, and " + + "then cached during training. Be careful of double caching!") } // Extract the number of features before deciding optimization solver. @@ -377,7 +377,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } else { require($(regParam) == 0.0, "The standard deviation of the label is zero. " + "Model cannot be regularized.") - instr.logWarning(s"The standard deviation of the label is zero. " + + instr.logWarning("The standard deviation of the label is zero. " + "Consider setting fitIntercept=true.") } } @@ -472,13 +472,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String // Also, if rawYStd==0 and yMean==0, all the coefficients are zero regardless of // the fitIntercept. 
if (yMean == 0.0) { - instr.logWarning(s"Mean and standard deviation of the label are zero, so the " + - s"coefficients and the intercept will all be zero; as a result, training is not " + - s"needed.") + instr.logWarning("Mean and standard deviation of the label are zero, so the " + + "coefficients and the intercept will all be zero; as a result, training is not " + + "needed.") } else { - instr.logWarning(s"The standard deviation of the label is zero, so the coefficients " + - s"will be zeros and the intercept will be the mean of the label; as a result, " + - s"training is not needed.") + instr.logWarning("The standard deviation of the label is zero, so the coefficients " + + "will be zeros and the intercept will be the mean of the label; as a result, " + + "training is not needed.") } val coefficients = Vectors.sparse(numFeatures, Seq.empty) val intercept = yMean diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index f241ff3e41153..4135afb5ed0b2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -21,6 +21,7 @@ import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ import org.apache.spark.annotation.Since +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap @@ -161,7 +162,7 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S trees.foreach(copyValues(_)) val numFeatures = trees.head.numFeatures - instr.logNamedValue(Instrumentation.loggerTags.numFeatures, numFeatures) + instr.logNumFeatures(numFeatures) new RandomForestRegressionModel(uid, trees, numFeatures) } @@ -254,8 +255,8 @@ class RandomForestRegressionModel private[ml] ( if (predictionColNames.nonEmpty) { dataset.withColumns(predictionColNames, predictionColumns) } else { - this.logWarning(s"$uid: RandomForestRegressionModel.transform() does nothing" + - " because no output columns were set.") + this.logWarning(log"${MDC(LogKeys.UUID, uid)}: RandomForestRegressionModel.transform() " + + log"does nothing because no output columns were set.") dataset.toDF() } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index 4697bfbe4b092..7a27b32aa24c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggreg import org.apache.spark.sql.catalyst.trees.BinaryLike import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils /** * A builder object that provides summary statistics about a given column. 
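The next Summarizer.scala hunk swaps a hand-written ObjectOutputStream/ObjectInputStream round-trip for the Utils.serialize and Utils.deserialize helpers in org.apache.spark.util.Utils. A minimal sketch (not part of the patch) of the plain JDK round-trip those removed lines performed; the Payload type and object name below are illustrative only:

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

    // Illustrative payload; case classes are Java-serializable by default.
    case class Payload(values: Array[Double])

    object SerializationRoundTripSketch {
      def main(args: Array[String]): Unit = {
        val original = Payload(Array(1.0, 2.0, 3.0))
        // Serialize: write the object graph into an in-memory byte array,
        // as the removed inline code did.
        val bos = new ByteArrayOutputStream()
        val oos = new ObjectOutputStream(bos)
        oos.writeObject(original)
        oos.close()
        val bytes = bos.toByteArray
        // Deserialize: read the bytes back and cast to the expected type.
        val ois = new ObjectInputStream(new ByteArrayInputStream(bytes))
        val restored = ois.readObject().asInstanceOf[Payload]
        println(restored.values.mkString(", "))
      }
    }

Centralizing this round-trip in the shared helpers keeps the buffer handling, and the remaining TODO about a ByteBuffer-based path, in one place.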
@@ -397,17 +398,12 @@ private[spark] object SummaryBuilderImpl extends Logging { override def serialize(state: SummarizerBuffer): Array[Byte] = { // TODO: Use ByteBuffer to optimize - val bos = new ByteArrayOutputStream() - val oos = new ObjectOutputStream(bos) - oos.writeObject(state) - bos.toByteArray + Utils.serialize(state) } override def deserialize(bytes: Array[Byte]): SummarizerBuffer = { // TODO: Use ByteBuffer to optimize - val bis = new ByteArrayInputStream(bytes) - val ois = new ObjectInputStream(bis) - ois.readObject().asInstanceOf[SummarizerBuffer] + Utils.deserialize(bytes) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): MetricsAggregate = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index a9c2941ef3a53..2f63f4ae073e5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.tree.impl import scala.collection.mutable import scala.util.Try -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.tree.TreeEnsembleParams import org.apache.spark.mllib.tree.configuration.Algo._ @@ -134,8 +134,10 @@ private[spark] object DecisionTreeMetadata extends Logging { val maxPossibleBins = math.min(strategy.maxBins, numExamples).toInt if (maxPossibleBins < strategy.maxBins) { - logWarning(s"DecisionTree reducing maxBins from ${strategy.maxBins} to $maxPossibleBins" + - s" (= number of training instances)") + logWarning(log"DecisionTree reducing maxBins from " + + log"${MDC(LogKeys.MAX_NUM_BINS, strategy.maxBins)} to " + + log"${MDC(LogKeys.MAX_NUM_POSSIBLE_BINS, maxPossibleBins)} " + + log"(= number of training instances)") } // We check the number of bins here against maxPossibleBins. 
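The DecisionTreeMetadata hunk above shows the logging migration applied throughout these MLlib files: s-interpolated messages become log-interpolated entries whose variables are wrapped in MDC with a LogKeys constant, so each value is also emitted as a structured field. A minimal sketch (not part of the patch) of that pattern, assuming the internal Logging trait, MDC, and LogKeys imports already used in these hunks; the object and method names below are illustrative only:

    import org.apache.spark.internal.{Logging, MDC}
    import org.apache.spark.internal.LogKeys.{NUM_CLASSES, NUM_FEATURES}

    object StructuredLoggingSketch extends Logging {
      def logMetadata(numFeatures: Int, numClasses: Int): Unit = {
        // Old style, removed by this patch: logInfo(s"numFeatures: $numFeatures")
        // New style: the log interpolator builds a message with context, and MDC
        // tags each value with a LogKeys entry for structured output.
        logInfo(log"numFeatures: ${MDC(NUM_FEATURES, numFeatures)}")
        logInfo(log"numClasses: ${MDC(NUM_CLASSES, numClasses)}")
      }
    }

Longer messages are built by concatenating log segments with +, as the RandomForest and RowMatrix hunks below do.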
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala index 8cf19f27cbbf9..5184109bd3a52 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala @@ -18,7 +18,8 @@ package org.apache.spark.ml.tree.impl import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.TIMER import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.regression.DecisionTreeRegressionModel @@ -477,7 +478,7 @@ private[spark] object GradientBoostedTrees extends Logging { timer.stop("total") logInfo("Internal timing for DecisionTree:") - logInfo(s"$timer") + logInfo(log"${MDC(TIMER, timer)}") bcSplits.destroy() treePoints.unpersist() diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala index 440b6635a52db..452532df5a2b6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala @@ -21,7 +21,8 @@ import scala.collection.mutable import scala.util.Random import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{MAX_MEMORY_SIZE, MEMORY_SIZE, NUM_CLASSES, NUM_EXAMPLES, NUM_FEATURES, NUM_NODES, NUM_WEIGHTED_EXAMPLES, TIMER} import org.apache.spark.ml.classification.DecisionTreeClassificationModel import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.impl.Utils @@ -131,10 +132,11 @@ private[spark] object RandomForest extends Logging with Serializable { instrumentation.logNumExamples(metadata.numExamples) instrumentation.logSumOfWeights(metadata.weightedNumExamples) case None => - logInfo(s"numFeatures: ${metadata.numFeatures}") - logInfo(s"numClasses: ${metadata.numClasses}") - logInfo(s"numExamples: ${metadata.numExamples}") - logInfo(s"weightedNumExamples: ${metadata.weightedNumExamples}") + logInfo(log"numFeatures: ${MDC(NUM_FEATURES, metadata.numFeatures)}") + logInfo(log"numClasses: ${MDC(NUM_CLASSES, metadata.numClasses)}") + logInfo(log"numExamples: ${MDC(NUM_EXAMPLES, metadata.numExamples)}") + logInfo(log"weightedNumExamples: " + + log"${MDC(NUM_WEIGHTED_EXAMPLES, metadata.weightedNumExamples)}") } timer.start("init") @@ -217,7 +219,7 @@ private[spark] object RandomForest extends Logging with Serializable { timer.stop("total") logInfo("Internal timing for DecisionTree:") - logInfo(s"$timer") + logInfo(log"${MDC(TIMER, timer)}") if (strategy.useNodeIdCache) { // Delete any remaining checkpoints used for node Id cache. @@ -1286,9 +1288,10 @@ private[spark] object RandomForest extends Logging with Serializable { } if (memUsage > maxMemoryUsage) { // If maxMemoryUsage is 0, we should still allow splitting 1 node. - logWarning(s"Tree learning is using approximately $memUsage bytes per iteration, which" + - s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. 
This allows splitting" + - s" $numNodesInGroup nodes in this iteration.") + logWarning(log"Tree learning is using approximately ${MDC(MEMORY_SIZE, memUsage)} " + + log"bytes per iteration, which exceeds requested limit " + + log"maxMemoryUsage=${MDC(MAX_MEMORY_SIZE, maxMemoryUsage)}. This allows splitting " + + log"${MDC(NUM_NODES, numNodesInGroup)} nodes in this iteration.") } // Convert mutable maps to immutable ones. val nodesForGroup: Map[Int, Array[LearningNode]] = diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 33b7963788fa5..867f35a5d2b80 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -27,7 +27,8 @@ import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CROSS_VALIDATION_METRIC, CROSS_VALIDATION_METRICS, ESTIMATOR_PARAM_MAP} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.evaluation.Evaluator import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} @@ -192,12 +193,13 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) foldMetrics }.transpose.map(_.sum / $(numFolds)) // Calculate average metric over all splits - instr.logInfo(s"Average cross-validation metrics: ${metrics.toImmutableArraySeq}") + instr.logInfo(log"Average cross-validation metrics: ${MDC( + CROSS_VALIDATION_METRICS, metrics.mkString("[", ", ", "]"))}") val (bestMetric, bestIndex) = if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) else metrics.zipWithIndex.minBy(_._1) - instr.logInfo(s"Best set of parameters:\n${epm(bestIndex)}") - instr.logInfo(s"Best cross-validation metric: $bestMetric.") + instr.logInfo(log"Best set of parameters:\n${MDC(ESTIMATOR_PARAM_MAP, epm(bestIndex))}") + instr.logInfo(log"Best cross-validation metric: ${MDC(CROSS_VALIDATION_METRIC, bestMetric)}.") val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]] copyValues(new CrossValidatorModel(uid, bestModel, metrics) .setSubModels(subModels).setParent(this)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index 58487b6ccbb8e..8e33ae6aad28b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{ESTIMATOR_PARAM_MAP, TRAIN_VALIDATION_SPLIT_METRIC, TRAIN_VALIDATION_SPLIT_METRICS} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.evaluation.Evaluator import org.apache.spark.ml.param.{DoubleParam, ParamMap, ParamValidators} @@ -168,12 +169,14 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St trainingDataset.unpersist() validationDataset.unpersist() - instr.logInfo(s"Train validation split metrics: ${metrics.toImmutableArraySeq}") + instr.logInfo(log"Train validation split metrics: ${MDC( + 
TRAIN_VALIDATION_SPLIT_METRICS, metrics.mkString("[", ", ", "]"))}") val (bestMetric, bestIndex) = if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1) else metrics.zipWithIndex.minBy(_._1) - instr.logInfo(s"Best set of parameters:\n${epm(bestIndex)}") - instr.logInfo(s"Best train validation split metric: $bestMetric.") + instr.logInfo(log"Best set of parameters:\n${MDC(ESTIMATOR_PARAM_MAP, epm(bestIndex))}") + instr.logInfo(log"Best train validation split metric: " + + log"${MDC(TRAIN_VALIDATION_SPLIT_METRIC, bestMetric)}.") val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]] copyValues(new TrainValidationSplitModel(uid, bestModel, metrics) .setSubModels(subModels).setParent(this)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/DatasetUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/DatasetUtils.scala index b3cb9c7f2dd12..d5b6396e5ba80 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/DatasetUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/DatasetUtils.scala @@ -18,7 +18,8 @@ package org.apache.spark.ml.util import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, LABEL_COLUMN, NUM_CLASSES} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.classification.ClassifierParams import org.apache.spark.ml.feature.Instance @@ -195,8 +196,9 @@ private[spark] object DatasetUtils extends Logging { s" to be inferred from values. To avoid this error for labels with > $maxNumClasses" + s" classes, specify numClasses explicitly in the metadata; this can be done by applying" + s" StringIndexer to the label column.") - logInfo(this.getClass.getCanonicalName + s" inferred $numClasses classes for" + - s" labelCol=$labelCol since numClasses was not specified in the column metadata.") + logInfo(log"${MDC(CLASS_NAME, this.getClass.getCanonicalName)} inferred ${MDC( + NUM_CLASSES, numClasses)} classes for labelCol=${MDC(LABEL_COLUMN, labelCol)}" + + log" since numClasses was not specified in the column metadata.") numClasses } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala index bfc6465c58bd1..9413605a31ced 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala @@ -27,7 +27,8 @@ import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ -import org.apache.spark.internal.{LogEntry, Logging} +import org.apache.spark.internal.{LogEntry, Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, NUM_PARTITIONS, PIPELINE_STAGE_UID, STORAGE_LEVEL} import org.apache.spark.ml.{MLEvents, PipelineStage} import org.apache.spark.ml.param.{Param, Params} import org.apache.spark.rdd.RDD @@ -53,8 +54,8 @@ private[spark] class Instrumentation private () extends Logging with MLEvents { // estimator.getClass.getSimpleName can cause Malformed class name error, // call safer `Utils.getSimpleName` instead val className = Utils.getSimpleName(stage.getClass) - logInfo(s"Stage class: $className") - logInfo(s"Stage uid: ${stage.uid}") + logInfo(log"Stage class: ${MDC(CLASS_NAME, className)}") + logInfo(log"Stage uid: ${MDC(PIPELINE_STAGE_UID, stage.uid)}") } /** @@ -66,8 +67,8 @@ private[spark] class Instrumentation private () extends Logging with MLEvents { * Log some data about 
the dataset being fit. */ def logDataset(dataset: RDD[_]): Unit = { - logInfo(s"training: numPartitions=${dataset.partitions.length}" + - s" storageLevel=${dataset.getStorageLevel}") + logInfo(log"training: numPartitions=${MDC(NUM_PARTITIONS, dataset.partitions.length)}" + + log" storageLevel=${MDC(STORAGE_LEVEL, dataset.getStorageLevel)}") } /** @@ -253,6 +254,13 @@ private[spark] class OptionalInstrumentation private( } } + override def logInfo(logEntry: LogEntry): Unit = { + instrumentation match { + case Some(instr) => instr.logInfo(logEntry) + case None => super.logInfo(logEntry) + } + } + override def logWarning(msg: => String): Unit = { instrumentation match { case Some(instr) => instr.logWarning(msg) diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 2083a07e2cb5a..9b26d0a911aca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -32,7 +32,8 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.annotation.{Since, Unstable} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.ml._ import org.apache.spark.ml.classification.{OneVsRest, OneVsRestModel} import org.apache.spark.ml.feature.RFormulaModel @@ -674,7 +675,7 @@ private[ml] class FileSystemOverwrite extends Logging { val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory) if (fs.exists(qualifiedOutputPath)) { if (shouldOverwrite) { - logInfo(s"Path $path already exists. It will be overwritten.") + logInfo(log"Path ${MDC(PATH, path)} already exists. It will be overwritten.") // TODO: Revert back to the original content if save is not successful. 
fs.delete(qualifiedOutputPath, true) } else { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index 6e5c026cd0143..c826654f0893c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -24,7 +24,8 @@ import scala.collection.mutable import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLUSTER_LEVEL, COST, DIVISIBLE_CLUSTER_INDICES_SIZE, FEATURE_DIMENSION, MIN_POINT_PER_CLUSTER, NUM_POINT} import org.apache.spark.ml.util.Instrumentation import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.axpy @@ -158,7 +159,7 @@ class BisectingKMeans private ( handlePersistence: Boolean, instr: Option[Instrumentation]): BisectingKMeansModel = { val d = instances.map(_._1.size).first() - logInfo(s"Feature dimension: $d.") + logInfo(log"Feature dimension: ${MDC(FEATURE_DIMENSION, d)}.") val dMeasure = DistanceMeasure.decodeFromString(this.distanceMeasure) val norms = instances.map(d => Vectors.norm(d._1, 2.0)) @@ -178,14 +179,15 @@ class BisectingKMeans private ( instr.foreach(_.logSumOfWeights(activeClusters.values.map(_.weightSum).sum)) val rootSummary = activeClusters(ROOT_INDEX) val n = rootSummary.size - logInfo(s"Number of points: $n.") - logInfo(s"Initial cost: ${rootSummary.cost}.") + logInfo(log"Number of points: ${MDC(NUM_POINT, n)}.") + logInfo(log"Initial cost: ${MDC(COST, rootSummary.cost)}.") val minSize = if (minDivisibleClusterSize >= 1.0) { math.ceil(minDivisibleClusterSize).toLong } else { math.ceil(minDivisibleClusterSize * n).toLong } - logInfo(s"The minimum number of points of a divisible cluster is $minSize.") + logInfo(log"The minimum number of points of a divisible cluster is " + + log"${MDC(MIN_POINT_PER_CLUSTER, minSize)}.") var inactiveClusters = mutable.Seq.empty[(Long, ClusterSummary)] val random = new Random(seed) var numLeafClustersNeeded = k - 1 @@ -206,7 +208,8 @@ class BisectingKMeans private ( } if (divisibleClusters.nonEmpty) { val divisibleIndices = divisibleClusters.keys.toSet - logInfo(s"Dividing ${divisibleIndices.size} clusters on level $level.") + logInfo(log"Dividing ${MDC(DIVISIBLE_CLUSTER_INDICES_SIZE, divisibleIndices.size)}" + + log" clusters on level ${MDC(CLUSTER_LEVEL, level)}.") var newClusterCenters = divisibleClusters.flatMap { case (index, summary) => val (left, right) = splitCenter(summary.center, random, dMeasure) Iterator((leftChildIndex(index), left), (rightChildIndex(index), right)) @@ -233,7 +236,8 @@ class BisectingKMeans private ( activeClusters = newClusters numLeafClustersNeeded -= divisibleClusters.size } else { - logInfo(s"None active and divisible clusters left on level $level. Stop iterations.") + logInfo(log"None active and divisible clusters left " + + log"on level ${MDC(CLUSTER_LEVEL, level)}. 
Stop iterations.") inactiveClusters ++= activeClusters activeClusters = Map.empty } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index f6c73a88e3634..52fbc7a5a47f5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -21,7 +21,8 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.annotation.Since import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{COST, INIT_MODE, NUM_ITERATIONS, TOTAL_TIME} import org.apache.spark.ml.util.Instrumentation import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.axpy @@ -259,7 +260,7 @@ class KMeans private ( val sc = data.sparkContext - val initStartTime = System.nanoTime() + val initStartTime = System.currentTimeMillis() val distanceMeasureInstance = DistanceMeasure.decodeFromString(this.distanceMeasure) @@ -274,14 +275,15 @@ class KMeans private ( } } val numFeatures = centers.head.vector.size - val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9 - logInfo(f"Initialization with $initializationMode took $initTimeInSeconds%.3f seconds.") + val initTimeMs = System.currentTimeMillis() - initStartTime + logInfo(log"Initialization with ${MDC(INIT_MODE, initializationMode)}" + + log" took ${MDC(TOTAL_TIME, initTimeMs)} ms.") var converged = false var cost = 0.0 var iteration = 0 - val iterationStartTime = System.nanoTime() + val iterationStartTime = System.currentTimeMillis() instr.foreach(_.logNumFeatures(numFeatures)) @@ -357,16 +359,17 @@ class KMeans private ( iteration += 1 } - val iterationTimeInSeconds = (System.nanoTime() - iterationStartTime) / 1e9 - logInfo(f"Iterations took $iterationTimeInSeconds%.3f seconds.") + val iterationTimeMs = System.currentTimeMillis() - iterationStartTime + logInfo(log"Iterations took ${MDC(TOTAL_TIME, iterationTimeMs)} ms") if (iteration == maxIterations) { - logInfo(s"KMeans reached the max number of iterations: $maxIterations.") + logInfo(log"KMeans reached the max number of" + + log" iterations: ${MDC(NUM_ITERATIONS, maxIterations)}.") } else { - logInfo(s"KMeans converged in $iteration iterations.") + logInfo(log"KMeans converged in ${MDC(NUM_ITERATIONS, iteration)} iterations.") } - logInfo(s"The cost is $cost.") + logInfo(log"The cost is ${MDC(COST, cost)}.") new KMeansModel(centers.map(_.vector), distanceMeasure, cost, iteration) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala index 9e2113f1c0fc9..ea83be1237298 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala @@ -19,7 +19,8 @@ package org.apache.spark.mllib.clustering import scala.util.Random -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{NUM_ITERATIONS, POINT_OF_CENTER} import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} import org.apache.spark.mllib.linalg.Vectors @@ -58,8 +59,8 @@ private[mllib] object LocalKMeans extends Logging { j += 1 } if (j == 0) { - logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." 
+ - s" Using duplicate point for center k = $i.") + logWarning(log"kMeansPlusPlus initialization ran out of distinct points for centers." + + log" Using duplicate point for center k = ${MDC(POINT_OF_CENTER, i)}.") centers(i) = points(0).toDense } else { centers(i) = points(j - 1).toDense @@ -112,9 +113,10 @@ private[mllib] object LocalKMeans extends Logging { } if (iteration == maxIterations) { - logInfo(s"Local KMeans++ reached the max number of iterations: $maxIterations.") + logInfo(log"Local KMeans++ reached the max number of " + + log"iterations: ${MDC(NUM_ITERATIONS, maxIterations)}.") } else { - logInfo(s"Local KMeans++ converged in $iteration iterations.") + logInfo(log"Local KMeans++ converged in ${MDC(NUM_ITERATIONS, iteration)} iterations.") } centers diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index 12c7ae5066c82..9150bb305876b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -25,7 +25,8 @@ import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{DELTA, DIFF_DELTA, NORM, NUM_ITERATIONS} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.{Loader, MLUtils, Saveable} import org.apache.spark.rdd.RDD @@ -368,7 +369,7 @@ object PowerIterationClustering extends Logging { var diffDelta = Double.MaxValue var curG = g for (iter <- 0 until maxIterations if math.abs(diffDelta) > tol) { - val msgPrefix = s"Iteration $iter" + val msgPrefix = log"Iteration ${MDC(NUM_ITERATIONS, iter)}:" // multiply W by vt val v = curG.aggregateMessages[Double]( sendMsg = ctx => ctx.sendToSrc(ctx.attr * ctx.dstAttr), @@ -378,15 +379,15 @@ object PowerIterationClustering extends Logging { /* useEdge */ true)).cache() // normalize v val norm = v.values.map(math.abs).sum() - logInfo(s"$msgPrefix: norm(v) = $norm.") + logInfo(msgPrefix + log" norm(v) = ${MDC(NORM, norm)}.") val v1 = v.mapValues(x => x / norm) // compare difference val delta = curG.joinVertices(v1) { case (_, x, y) => math.abs(x - y) }.vertices.values.sum() - logInfo(s"$msgPrefix: delta = $delta.") + logInfo(msgPrefix + log" delta = ${MDC(DELTA, delta)}.") diffDelta = math.abs(delta - prevDelta) - logInfo(s"$msgPrefix: diff(delta) = $diffDelta.") + logInfo(msgPrefix + log" diff(delta) = ${MDC(DIFF_DELTA, diffDelta)}.") if (math.abs(diffDelta) < tol) { /** @@ -404,8 +405,8 @@ object PowerIterationClustering extends Logging { val rayleigh = xTAx / xTx if (math.abs(norm - math.abs(rayleigh)) > tol) { - logWarning(s"Power Iteration fail to converge. delta = ${delta}," + - s" difference delta = ${diffDelta} and norm = ${norm}") + logWarning(log"Power Iteration fail to converge. 
delta = ${MDC(DELTA, delta)}," + + log" difference delta = ${MDC(DIFF_DELTA, diffDelta)} and norm = ${MDC(NORM, norm)}") } } curG.vertices.unpersist() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index ba14dc739a235..85a7350078101 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -21,7 +21,8 @@ import scala.reflect.ClassTag import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaSparkContext._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLUSTER_CENTROIDS, CLUSTER_LABEL, CLUSTER_WEIGHT, LARGEST_CLUSTER_INDEX, SMALLEST_CLUSTER_INDEX} import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.api.java.{JavaDStream, JavaPairDStream} @@ -129,14 +130,16 @@ class StreamingKMeansModel @Since("1.2.0") ( case _ => centroid.toArray.mkString("[", ",", "]") } - logInfo(s"Cluster $label updated with weight $updatedWeight and centroid: $display") + logInfo(log"Cluster ${MDC(CLUSTER_LABEL, label)} updated with weight " + + log"${MDC(CLUSTER_WEIGHT, updatedWeight)} and centroid: ${MDC(CLUSTER_CENTROIDS, display)}") } // Check whether the smallest cluster is dying. If so, split the largest cluster. val (maxWeight, largest) = clusterWeights.iterator.zipWithIndex.maxBy(_._1) val (minWeight, smallest) = clusterWeights.iterator.zipWithIndex.minBy(_._1) if (minWeight < 1e-8 * maxWeight) { - logInfo(s"Cluster $smallest is dying. Split the largest cluster $largest into two.") + logInfo(log"Cluster ${MDC(SMALLEST_CLUSTER_INDEX, smallest)} is dying. 
" + + log"Split the largest cluster ${MDC(LARGEST_CLUSTER_INDEX, largest)} into two.") val weight = (maxWeight + minWeight) / 2.0 clusterWeights(largest) = weight clusterWeights(smallest) = weight diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 869fe7155a268..a203d3fc73537 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -18,7 +18,8 @@ package org.apache.spark.mllib.evaluation import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{COUNT, NUM_BIN} import org.apache.spark.mllib.evaluation.binary._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} @@ -201,7 +202,8 @@ class BinaryClassificationMetrics @Since("3.0.0") ( val grouping = countsSize / numBins if (grouping < 2) { // numBins was more than half of the size; no real point in down-sampling to bins - logInfo(s"Curve is too small ($countsSize) for $numBins bins to be useful") + logInfo(log"Curve is too small (${MDC(COUNT, countsSize)}) " + + log"for ${MDC(NUM_BIN, numBins)} bins to be useful") counts } else { counts.mapPartitions { iter => @@ -243,7 +245,7 @@ class BinaryClassificationMetrics @Since("3.0.0") ( val partitionwiseCumulativeCounts = agg.scanLeft(new BinaryLabelCounter())((agg, c) => agg.clone() += c) val totalCount = partitionwiseCumulativeCounts.last - logInfo(s"Total counts: $totalCount") + logInfo(log"Total counts: ${MDC(COUNT, totalCount)}") val cumulativeCounts = binnedCounts.mapPartitionsWithIndex( (index: Int, iter: Iterator[(Double, BinaryLabelCounter)]) => { val cumCount = partitionwiseCumulativeCounts(index) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 0dddbec8a7ed8..499dc09b86211 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -31,7 +31,8 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{ALPHA, COUNT, NUM_TRAIN_WORD, VOCAB_SIZE} import org.apache.spark.internal.config.Kryo.KRYO_SERIALIZER_MAX_BUFFER_SIZE import org.apache.spark.ml.linalg.BLAS import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -208,7 +209,8 @@ class Word2Vec extends Serializable with Logging { trainWordsCount += vocab(a).cn a += 1 } - logInfo(s"vocabSize = $vocabSize, trainWordsCount = $trainWordsCount") + logInfo(log"vocabSize = ${MDC(VOCAB_SIZE, vocabSize)}," + + log" trainWordsCount = ${MDC(NUM_TRAIN_WORD, trainWordsCount)}") } private def createExpTable(): Array[Float] = { @@ -379,8 +381,9 @@ class Word2Vec extends Serializable with Logging { (1 - (numPartitions * wordCount.toDouble + numWordsProcessedInPreviousIterations) / totalWordsCounts) if (alpha < learningRate * 0.0001) alpha = learningRate * 0.0001 - logInfo(s"wordCount = ${wordCount + numWordsProcessedInPreviousIterations}, " + - s"alpha = $alpha") + logInfo(log"wordCount =" + + 
log" ${MDC(COUNT, wordCount + numWordsProcessedInPreviousIterations)}," + + log" alpha = ${MDC(ALPHA, alpha)}") } wc += sentence.length var pos = 0 diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index 59d22f0eac991..3c648f34c6100 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -33,7 +33,8 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{MIN_NUM_FREQUENT_PATTERN, NUM_FREQUENT_ITEMS, NUM_LOCAL_FREQUENT_PATTERN, NUM_PREFIXES, NUM_SEQUENCES} import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SparkSession} @@ -139,13 +140,13 @@ class PrefixSpan private ( } val totalCount = data.count() - logInfo(s"number of sequences: $totalCount") + logInfo(log"number of sequences: ${MDC(NUM_SEQUENCES, totalCount)}") val minCount = math.ceil(minSupport * totalCount).toLong - logInfo(s"minimum count for a frequent pattern: $minCount") + logInfo(log"minimum count for a frequent pattern: ${MDC(MIN_NUM_FREQUENT_PATTERN, minCount)}") // Find frequent items. val freqItems = findFrequentItems(data, minCount) - logInfo(s"number of frequent items: ${freqItems.length}") + logInfo(log"number of frequent items: ${MDC(NUM_FREQUENT_ITEMS, freqItems.length)}") // Keep only frequent items from input sequences and convert them to internal storage. val itemToInt = Utils.toMapWithIndex(freqItems) @@ -298,18 +299,20 @@ object PrefixSpan extends Logging { var largePrefixes = mutable.Map(emptyPrefix.id -> emptyPrefix) while (largePrefixes.nonEmpty) { val numLocalFreqPatterns = localFreqPatterns.length - logInfo(s"number of local frequent patterns: $numLocalFreqPatterns") + logInfo(log"number of local frequent patterns: " + + log"${MDC(NUM_LOCAL_FREQUENT_PATTERN, numLocalFreqPatterns)}") if (numLocalFreqPatterns > 1000000) { logWarning( - s""" - | Collected $numLocalFreqPatterns local frequent patterns. You may want to consider: + log""" + | Collected ${MDC(NUM_LOCAL_FREQUENT_PATTERN, numLocalFreqPatterns)} + | local frequent patterns. You may want to consider: | 1. increase minSupport, | 2. decrease maxPatternLength, | 3. increase maxLocalProjDBSize. """.stripMargin) } - logInfo(s"number of small prefixes: ${smallPrefixes.size}") - logInfo(s"number of large prefixes: ${largePrefixes.size}") + logInfo(log"number of small prefixes: ${MDC(NUM_PREFIXES, smallPrefixes.size)}") + logInfo(log"number of large prefixes: ${MDC(NUM_PREFIXES, largePrefixes.size)}") val largePrefixArray = largePrefixes.values.toArray val freqPrefixes = postfixes.flatMap { postfix => largePrefixArray.flatMap { prefix => @@ -339,7 +342,8 @@ object PrefixSpan extends Logging { var freqPatterns = sc.parallelize(localFreqPatterns.toSeq, 1) val numSmallPrefixes = smallPrefixes.size - logInfo(s"number of small prefixes for local processing: $numSmallPrefixes") + logInfo(log"number of small prefixes for local processing: " + + log"${MDC(NUM_PREFIXES, numSmallPrefixes)}") if (numSmallPrefixes > 0) { // Switch to local processing. 
val bcSmallPrefixes = sc.broadcast(smallPrefixes) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 4e9952e6d768f..3329682d3b550 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -23,7 +23,7 @@ import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, Matrix => BM} import org.apache.spark.{Partitioner, PartitionIdPassthrough, SparkException} import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.mllib.linalg._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -322,7 +322,10 @@ class BlockMatrix @Since("1.3.0") ( val m = numRows().toInt val n = numCols().toInt val mem = m * n / 125000 - if (mem > 500) logWarning(s"Storing this matrix will require $mem MiB of memory!") + if (mem > 500) { + logWarning(log"Storing this matrix will require ${MDC(LogKeys.MEMORY_SIZE, mem)} " + + log"MiB of memory!") + } val localBlocks = blocks.collect() val values = new Array[Double](m * n) localBlocks.foreach { case ((blockRowIndex, blockColIndex), submat) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 882872709ac35..63cd41439054e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -26,7 +26,7 @@ import breeze.linalg.{axpy => brzAxpy, inv, svd => brzSvd, DenseMatrix => BDM, D import breeze.numerics.{sqrt => brzSqrt} import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.MAX_RESULT_SIZE import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat._ @@ -251,7 +251,8 @@ class RowMatrix @Since("1.0.0") ( } if (cols > 10000) { val memMB = (cols.toLong * cols) / 125000 - logWarning(s"$cols columns will require at least $memMB megabytes of memory!") + logWarning(log"${MDC(LogKeys.NUM_COLUMNS, cols)} columns will require at least " + + log"${MDC(LogKeys.MEMORY_SIZE, memMB)} megabytes of memory!") } } @@ -342,7 +343,8 @@ class RowMatrix @Since("1.0.0") ( val computeMode = mode match { case "auto" => if (k > 5000) { - logWarning(s"computing svd with k=$k and n=$n, please check necessity") + logWarning(log"computing svd with k=${MDC(LogKeys.NUM_LEADING_SINGULAR_VALUES, k)} and " + + log"n=${MDC(LogKeys.NUM_COLUMNS, n)}, please check necessity") } // TODO: The conditions below are not fully tested. @@ -395,7 +397,8 @@ class RowMatrix @Since("1.0.0") ( // criterion specified by tol after max number of iterations. // Thus use i < min(k, sigmas.length) instead of i < k. 
if (sigmas.length < k) { - logWarning(s"Requested $k singular values but only found ${sigmas.length} converged.") + logWarning(log"Requested ${MDC(LogKeys.NUM_LEADING_SINGULAR_VALUES, k)} singular " + + log"values but only found ${MDC(LogKeys.SIGMAS_LENGTH, sigmas.length)} converged.") } while (i < math.min(k, sigmas.length) && sigmas(i) >= threshold) { i += 1 @@ -403,7 +406,8 @@ class RowMatrix @Since("1.0.0") ( val sk = i if (sk < k) { - logWarning(s"Requested $k singular values but only found $sk nonzeros.") + logWarning(log"Requested ${MDC(LogKeys.NUM_LEADING_SINGULAR_VALUES, k)} singular " + + log"values but only found ${MDC(LogKeys.COUNT, sk)} nonzeros.") } // Warn at the end of the run as well, for increased visibility. @@ -625,9 +629,9 @@ class RowMatrix @Since("1.0.0") ( require(threshold >= 0, s"Threshold cannot be negative: $threshold") if (threshold > 1) { - logWarning(s"Threshold is greater than 1: $threshold " + - "Computation will be more efficient with promoted sparsity, " + - " however there is no correctness guarantee.") + logWarning(log"Threshold is greater than 1: ${MDC(LogKeys.THRESHOLD, threshold)} " + + log"Computation will be more efficient with promoted sparsity, " + + log"however there is no correctness guarantee.") } val gamma = if (threshold < 1e-6) { @@ -828,9 +832,9 @@ class RowMatrix @Since("1.0.0") ( val desiredTreeDepth = math.ceil(numerator / denominator) if (desiredTreeDepth > 4) { - logWarning( - s"Desired tree depth for treeAggregation is big ($desiredTreeDepth)." - + "Consider increasing driver max result size or reducing number of partitions") + logWarning(log"Desired tree depth for treeAggregation is big " + + log"(${MDC(LogKeys.DESIRED_TREE_DEPTH, desiredTreeDepth)}). " + + log"Consider increasing driver max result size or reducing number of partitions") } math.min(math.max(1, desiredTreeDepth), 10).toInt diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index d40e640a33d6f..a288d13e57f7b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.ArrayBuffer import breeze.linalg.{norm, DenseVector => BDV} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD @@ -203,8 +203,9 @@ object GradientDescent extends Logging { } if (numIterations * miniBatchFraction < 1.0) { - logWarning("Not all examples will be used if numIterations * miniBatchFraction < 1.0: " + - s"numIterations=$numIterations and miniBatchFraction=$miniBatchFraction") + logWarning(log"Not all examples will be used if numIterations * miniBatchFraction < 1.0: " + + log"numIterations=${MDC(LogKeys.NUM_ITERATIONS, numIterations)} and " + + log"miniBatchFraction=${MDC(LogKeys.MINI_BATCH_FRACTION, miniBatchFraction)}") } val stochasticLossHistory = new ArrayBuffer[Double](numIterations + 1) @@ -291,7 +292,9 @@ object GradientDescent extends Logging { } } } else { - logWarning(s"Iteration ($i/$numIterations). The size of sampled batch is zero") + logWarning(log"Iteration " + + log"(${MDC(LogKeys.INDEX, i)}/${MDC(LogKeys.NUM_ITERATIONS, numIterations)}). 
" + + log"The size of sampled batch is zero") } i += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index 9ffee8832db93..bc888aecec0ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -30,7 +30,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.mllib.linalg.BLAS import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.apache.spark.mllib.util.{Loader, Saveable} @@ -66,11 +66,12 @@ class MatrixFactorizationModel @Since("0.8.0") ( require(features.first()._2.length == rank, s"$name feature dimension does not match the rank $rank.") if (features.partitioner.isEmpty) { - logWarning(s"$name factor does not have a partitioner. " - + "Prediction on individual records could be slow.") + logWarning(log"${MDC(LogKeys.FEATURE_NAME, name)} factor does not have a partitioner. " + + log"Prediction on individual records could be slow.") } if (features.getStorageLevel == StorageLevel.NONE) { - logWarning(s"$name factor is not cached. Prediction could be slow.") + logWarning(log"${MDC(LogKeys.FEATURE_NAME, name)} factor is not cached. " + + log"Prediction could be slow.") } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala index 94848cb5033a1..2fa1339bc72a9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala @@ -21,7 +21,8 @@ import scala.reflect.ClassTag import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaSparkContext.fakeClassTag -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{MODEL_WEIGHTS, TIME} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.streaming.api.java.{JavaDStream, JavaPairDStream} import org.apache.spark.streaming.dstream.DStream @@ -90,12 +91,12 @@ abstract class StreamingLinearAlgorithm[ data.foreachRDD { (rdd, time) => if (!rdd.isEmpty()) { model = Some(algorithm.run(rdd, model.get.weights)) - logInfo(s"Model updated at time ${time.toString}") + logInfo(log"Model updated at time ${MDC(TIME, time)}") val display = model.get.weights.size match { case x if x > 100 => model.get.weights.toArray.take(100).mkString("[", ",", "...") case _ => model.get.weights.toArray.mkString("[", ",", "]") } - logInfo(s"Current model: weights, ${display}") + logInfo(log"Current model: weights, ${MDC(MODEL_WEIGHTS, display)}") } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index 9aeab65e25de4..2059a9f785381 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.stat.test import 
org.apache.commons.math3.distribution.ChiSquaredDistribution import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -221,8 +221,9 @@ private[spark] object ChiSqTest extends Logging { } val size = observed.size if (size > 1000) { - logWarning("Chi-squared approximation may not be accurate due to low expected frequencies " - + s" as a result of a large number of categories: $size.") + logWarning(log"Chi-squared approximation may not be accurate due to low expected " + + log"frequencies as a result of a large number of categories: " + + log"${MDC(LogKeys.NUM_CATEGORIES, size)}.") } val obsArr = observed.toArray val expArr = if (expected.size == 0) Array.tabulate(size)(_ => 1.0 / size) else expected.toArray diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index c282dc59fa8d3..2f65dea0c4a89 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -26,7 +26,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.tree.configuration.{Algo, FeatureType} import org.apache.spark.mllib.tree.configuration.Algo._ @@ -209,15 +209,19 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { .map(Utils.memoryStringToMb) .getOrElse(Utils.DEFAULT_DRIVER_MEM_MB) if (driverMemory <= memThreshold) { - logWarning(s"$thisClassName.save() was called, but it may fail because of too little" + - s" driver memory (${driverMemory}m)." + - s" If failure occurs, try setting driver-memory ${memThreshold}m (or larger).") + logWarning(log"${MDC(LogKeys.CLASS_NAME, thisClassName)}.save() was called, " + + log"but it may fail because of too little driver memory " + + log"(${MDC(LogKeys.DRIVER_MEMORY_SIZE, driverMemory)}m). If failure occurs, " + + log"try setting driver-memory ${MDC(LogKeys.MEMORY_THRESHOLD_SIZE, memThreshold)}m " + + log"(or larger).") } } else { if (sc.executorMemory <= memThreshold) { - logWarning(s"$thisClassName.save() was called, but it may fail because of too little" + - s" executor memory (${sc.executorMemory}m)." + - s" If failure occurs try setting executor-memory ${memThreshold}m (or larger).") + logWarning(log"${MDC(LogKeys.CLASS_NAME, thisClassName)}.save() was called, " + + log"but it may fail because of too little executor memory " + + log"(${MDC(LogKeys.EXECUTOR_MEMORY_SIZE, sc.executorMemory)}m). 
If failure occurs, " + + log"try setting executor-memory ${MDC(LogKeys.MEMORY_THRESHOLD_SIZE, memThreshold)}m " + + log"(or larger).") } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 579d6b77f62c3..aa2287f3af896 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -26,7 +26,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.ml.linalg.BLAS import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint @@ -407,15 +407,19 @@ private[tree] object TreeEnsembleModel extends Logging { .map(Utils.memoryStringToMb) .getOrElse(Utils.DEFAULT_DRIVER_MEM_MB) if (driverMemory <= memThreshold) { - logWarning(s"$className.save() was called, but it may fail because of too little" + - s" driver memory (${driverMemory}m)." + - s" If failure occurs, try setting driver-memory ${memThreshold}m (or larger).") + logWarning(log"${MDC(LogKeys.CLASS_NAME, className)}.save() was called, " + + log"but it may fail because of too little driver memory " + + log"(${MDC(LogKeys.DRIVER_MEMORY_SIZE, driverMemory)}m). If failure occurs, " + + log"try setting driver-memory ${MDC(LogKeys.MEMORY_THRESHOLD_SIZE, memThreshold)}m " + + log"(or larger).") } } else { if (sc.executorMemory <= memThreshold) { - logWarning(s"$className.save() was called, but it may fail because of too little" + - s" executor memory (${sc.executorMemory}m)." + - s" If failure occurs try setting executor-memory ${memThreshold}m (or larger).") + logWarning(log"${MDC(LogKeys.CLASS_NAME, className)}.save() was called, " + + log"but it may fail because of too little executor memory " + + log"(${MDC(LogKeys.EXECUTOR_MEMORY_SIZE, sc.executorMemory)}m). 
If failure occurs, " + + log"try setting executor-memory ${MDC(LogKeys.MEMORY_THRESHOLD_SIZE, memThreshold)}m " + + log"(or larger).") } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index d8c0f8711cabc..4857c9b00f421 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.util import org.apache.spark.annotation.Since import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{COUNT, RANGE} +import org.apache.spark.internal.LogKeys.{COUNT, RANGE} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 10adf10690b77..e23423e4c004e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -23,7 +23,7 @@ import scala.reflect.ClassTag import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.annotation.Since import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.OPTIMIZER_CLASS_NAME +import org.apache.spark.internal.LogKeys.OPTIMIZER_CLASS_NAME import org.apache.spark.ml.linalg.{MatrixUDT => MLMatrixUDT, VectorUDT => MLVectorUDT} import org.apache.spark.ml.util.Instrumentation import org.apache.spark.mllib.linalg._ diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala index 0a9347b87977e..384fcf6ceb859 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CATEGORICAL_FEATURES, MAX_CATEGORIES} +import org.apache.spark.internal.LogKeys.{CATEGORICAL_FEATURES, MAX_CATEGORIES} import org.apache.spark.ml.attribute._ import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index ff85831a7a6b2..94abeaf0804ed 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -28,7 +28,8 @@ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.apache.spark._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{RMSE, TEST_SIZE, TRAINING_SIZE} import org.apache.spark.ml.linalg.{BLAS, Vectors} import org.apache.spark.ml.recommendation.ALS._ import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} @@ -353,8 +354,8 @@ class ALSSuite extends MLTest with DefaultReadWriteTest with Logging { } } } - logInfo(s"Generated an explicit feedback dataset with ${training.size} ratings for training " + - s"and ${test.size} for test.") + logInfo(log"Generated an 
explicit feedback dataset with ${MDC(TRAINING_SIZE, training.size)} " + + log"ratings for training and ${MDC(TEST_SIZE, test.size)} for test.") (sc.parallelize(training.toSeq, 2), sc.parallelize(test.toSeq, 2)) } @@ -485,7 +486,7 @@ class ALSSuite extends MLTest with DefaultReadWriteTest with Logging { val mse = errorSquares.sum / errorSquares.length math.sqrt(mse) } - logInfo(s"Test RMSE is $rmse.") + logInfo(log"Test RMSE is ${MDC(RMSE, rmse)}.") assert(rmse < targetRMSE) } @@ -1246,8 +1247,8 @@ object ALSSuite extends Logging { } } } - logInfo(s"Generated an implicit feedback dataset with ${training.size} ratings for training " + - s"and ${test.size} for test.") + logInfo(log"Generated an implicit feedback dataset with ${MDC(TRAINING_SIZE, training.size)}" + + log" ratings for training and ${MDC(TEST_SIZE, test.size)} for test.") (sc.parallelize(training.toSeq, 2), sc.parallelize(test.toSeq, 2)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index a25a19e2d354b..135d7e26c6d8c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -25,6 +25,8 @@ import breeze.linalg.{DenseMatrix => BDM} import org.json4s.jackson.JsonMethods.{parse => parseJson} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} +import org.apache.spark.internal.LogKeys.MALFORMATTED_STRING +import org.apache.spark.internal.MDC import org.apache.spark.internal.config.Kryo._ import org.apache.spark.ml.{linalg => newlinalg} import org.apache.spark.mllib.util.TestingUtils._ @@ -226,7 +228,7 @@ class VectorsSuite extends SparkFunSuite { malformatted.foreach { s => intercept[SparkException] { Vectors.parse(s) - logInfo(s"Didn't detect malformatted string $s.") + logInfo(log"Didn't detect malformatted string ${MDC(MALFORMATTED_STRING, s)}.") } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala index f5c6abfc66f27..e654eac83649c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.tree import org.apache.spark.SparkFunSuite import org.apache.spark.internal.{MDC, MessageWithContext} -import org.apache.spark.internal.LogKey.{LEARNING_RATE, NUM_ITERATIONS, SUBSAMPLING_RATE} +import org.apache.spark.internal.LogKeys.{LEARNING_RATE, NUM_ITERATIONS, SUBSAMPLING_RATE} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} import org.apache.spark.mllib.tree.configuration.Algo._ diff --git a/pom.xml b/pom.xml index bf8d4f1b417d5..67ff14070b8bb 100644 --- a/pom.xml +++ b/pom.xml @@ -115,10 +115,10 @@ UTF-8 17 ${java.version} - 3.9.6 + 3.9.8 3.2.0 spark - 9.6 + 9.7 2.0.13 2.22.1 @@ -128,22 +128,19 @@ 3.11.4 ${hadoop.version} 3.9.2 - 5.6.0 + 5.7.0 org.apache.hive core - 2.3.9 - 2.3.9 - - 2.3 + 2.3.10 3.7.0 10.16.1.1 - 1.13.1 - 2.0.0 + 1.14.1 + 2.0.1 shaded-protobuf - 11.0.20 + 11.0.21 5.0.0 4.0.1 @@ -154,7 +151,7 @@ If you change codahale.metrics.version, you also need to change the link to metrics.dropwizard.io in docs/monitoring.md. 
--> - 4.2.25 + 4.2.26 1.11.3 1.12.0 @@ -172,10 +169,10 @@ 3.2.2 4.4 - 2.13.13 + 2.13.14 2.13 2.2.0 - 4.8.1 + 4.9.1 false 2.16.2 @@ -183,16 +180,16 @@ true true 1.9.13 - 2.17.0 - 2.17.0 + 2.17.1 + 2.17.1 2.3.1 3.0.2 1.1.10.5 3.0.3 - 1.16.1 - 1.26.1 + 1.17.0 + 1.26.2 2.16.1 - + 2.6 3.14.0 @@ -206,31 +203,34 @@ 3.5.2 3.0.0 2.2.11 - 0.12.0 + 0.16.0 4.13.1 1.1 - 4.17.0 - 4.17.0 + 4.21.0 + 4.21.0 3.1.0 1.1.0 - 1.6.0 + 1.8.0 1.78 1.13.0 - 5.0.1 - 4.1.108.Final + 6.0.0 + 4.1.110.Final 2.0.65.Final - 72.1 + 75.1 + 5.9.3 + 1.9.3 + 0.11.1 - 15.0.2 - 3.0.0-M1 + 16.1.0 + 3.0.0-M2 org.fusesource.leveldbjni - 6.12.0 + 6.13.0 ${java.home} @@ -294,6 +294,9 @@ 1.1.3 6.0.53 + + 3.0-9 + 128m yyyy-MM-dd HH:mm:ss z @@ -301,6 +304,7 @@ -XX:+IgnoreUnrecognizedVMOptions + --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED @@ -319,11 +323,11 @@ -Dio.netty.tryReflectionSetAccessible=true 2.7.12 - 8.3.0 + 8.4.0 42.7.3 11.5.9.0 - 9.4.1.jre8 - 23.3.0.23.09 + 12.6.2.jre11 + 23.4.0.24.05 @@ -411,17 +415,17 @@ org.scalatestplus - scalacheck-1-17_${scala.binary.version} + scalacheck-1-18_${scala.binary.version} test org.scalatestplus - mockito-5-10_${scala.binary.version} + mockito-5-12_${scala.binary.version} test org.scalatestplus - selenium-4-17_${scala.binary.version} + selenium-4-21_${scala.binary.version} test @@ -481,7 +485,7 @@ org.apache.xbean xbean-asm9-shaded - 4.24 + 4.25 @@ -826,7 +830,7 @@ org.roaringbitmap RoaringBitmap - 1.0.5 + 1.1.0 @@ -1016,11 +1020,6 @@ jackson-module-scala_${scala.binary.version} ${fasterxml.jackson.version} - - com.fasterxml.jackson.module - jackson-module-jaxb-annotations - ${fasterxml.jackson.version} - org.apache.ws.xmlschema xmlschema-core @@ -1079,15 +1078,6 @@ org.glassfish.jersey.test-framework.providers jersey-test-framework-provider-simple ${jersey.version} - - - - org.junit.jupiter - * - - test @@ -1130,7 +1120,7 @@ org.scala-lang.modules scala-xml_${scala.binary.version} - 2.2.0 + 2.3.0 org.scala-lang @@ -1156,7 +1146,7 @@ org.scala-lang.modules scala-parser-combinators_${scala.binary.version} - 2.3.0 + 2.4.0 jline @@ -1166,25 +1156,25 @@ org.scalatest scalatest_${scala.binary.version} - 3.2.18 + 3.2.19 test org.scalatestplus - scalacheck-1-17_${scala.binary.version} - 3.2.18.0 + scalacheck-1-18_${scala.binary.version} + 3.2.19.0 test org.scalatestplus - mockito-5-10_${scala.binary.version} - 3.2.18.0 + mockito-5-12_${scala.binary.version} + 3.2.19.0 test org.scalatestplus - selenium-4-17_${scala.binary.version} - 3.2.18.0 + selenium-4-21_${scala.binary.version} + 3.2.19.0 test @@ -1196,59 +1186,85 @@ org.mockito mockito-core - 5.10.0 + 5.12.0 test net.bytebuddy byte-buddy - 1.14.11 + 1.14.17 test net.bytebuddy byte-buddy-agent - 1.14.11 + 1.14.17 test org.jmock jmock-junit5 test - - - org.junit.jupiter - * - - - org.junit.platform - * - - 2.13.1 org.scalacheck scalacheck_${scala.binary.version} - 1.17.0 + 1.18.0 test org.junit.jupiter junit-jupiter - 5.9.3 + ${junit-jupiter.version} + test + + + org.junit.jupiter + junit-jupiter-api + ${junit-jupiter.version} + test + + + org.junit.jupiter + junit-jupiter-engine + ${junit-jupiter.version} + test + + + org.junit.jupiter + junit-jupiter-params + ${junit-jupiter.version} + test + + + org.junit.platform + junit-platform-commons + ${junit-platform.version} + test + + + org.junit.platform + junit-platform-engine + ${junit-platform.version} + test + + + org.junit.platform + junit-platform-launcher + 
${junit-platform.version} test net.aichler jupiter-interface - 0.11.1 + ${sbt-jupiter-interface.version} test com.github.docker-java docker-java - 3.3.5 + 3.3.6 test @@ -1268,7 +1284,7 @@ com.github.docker-java docker-java-transport-zerodep - 3.3.5 + 3.3.6 test @@ -1349,13 +1365,6 @@ org.apache.curator curator-test ${curator.version} - - - - org.junit.jupiter - junit-jupiter-api - - test @@ -1755,83 +1764,6 @@ ${yarn.version} test - - org.apache.hadoop - hadoop-yarn-server-web-proxy - ${yarn.version} - ${hadoop.deps.scope} - - - org.apache.hadoop - hadoop-yarn-server-common - - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-yarn-api - - - org.bouncycastle - bcprov-jdk15on - - - org.bouncycastle - bcpkix-jdk15on - - - org.fusesource.leveldbjni - leveldbjni-all - - - asm - asm - - - org.ow2.asm - asm - - - org.jboss.netty - netty - - - javax.servlet - servlet-api - - - javax.servlet - javax.servlet-api - - - commons-logging - commons-logging - - - com.sun.jersey - * - - - com.sun.jersey.jersey-test-framework - * - - - com.sun.jersey.contribs - * - - - - com.zaxxer - HikariCP-java7 - - - com.microsoft.sqlserver - mssql-jdbc - - - org.apache.hadoop hadoop-yarn-client @@ -1902,6 +1834,10 @@ io.netty netty-transport-native-epoll + + io.netty + netty-tcnative-boringssl-static + com.github.spotbugs spotbugs-annotations @@ -2095,7 +2031,6 @@ commons-logging commons-logging - + org.eclipse.jetty.aggregate jetty-all - org.apache.logging.log4j * @@ -2122,10 +2056,9 @@ - org.apache.hive + ${hive.group} hive-storage-api - @@ -2244,7 +2177,6 @@ org.json json - ${hive.group} @@ -2259,7 +2191,6 @@ org.apache.calcite.avatica avatica - org.apache.logging.log4j * @@ -2277,10 +2208,9 @@ janino - org.pentaho - pentaho-aggdesigner-algorithm + net.hydromatic + aggdesigner-algorithm - @@ -2348,6 +2278,10 @@ org.codehaus.groovy groovy-all + + com.lmax + disruptor + @@ -2389,7 +2323,6 @@ org.slf4j slf4j-log4j12 - org.apache.hbase @@ -2399,7 +2332,10 @@ co.cask.tephra * - + + com.jolbox + bonecp + @@ -2457,12 +2393,14 @@ org.codehaus.groovy groovy-all - ${hive.group} hive-service-rpc - + org.apache.parquet parquet-hadoop-bundle @@ -2476,7 +2414,6 @@ tomcat jasper-runtime - @@ -2553,30 +2490,28 @@ org.codehaus.groovy groovy-all - org.apache.logging.log4j log4j-slf4j-impl - - org.apache.hive + ${hive.group} hive-llap-common - ${hive23.version} + ${hive.version} ${hive.deps.scope} - org.apache.hive + ${hive.group} hive-common - org.apache.hive + ${hive.group} hive-serde @@ -2587,21 +2522,21 @@ - org.apache.hive + ${hive.group} hive-llap-client - ${hive23.version} + ${hive.version} test - org.apache.hive + ${hive.group} hive-common - org.apache.hive + ${hive.group} hive-serde - org.apache.hive + ${hive.group} hive-llap-common @@ -2662,7 +2597,7 @@ hadoop-client-api - org.apache.hive + ${hive.group} hive-storage-api @@ -2670,7 +2605,7 @@ io.airlift aircompressor - 0.26 + 0.27 org.apache.orc @@ -2692,7 +2627,7 @@ orc-core - org.apache.hive + ${hive.group} hive-storage-api @@ -2788,6 +2723,10 @@ org.slf4j slf4j-api + + javax.annotation + javax.annotation-api + @@ -2877,7 +2816,7 @@ 2.9.1 - org.apache.hive + ${hive.group} hive-storage-api ${hive.storage.version} ${hive.storage.scope} @@ -2967,6 +2906,14 @@ ${java.version} test provided + + + org.jline.terminal.impl.ffm.* + @@ -2987,7 +2934,7 @@ org.codehaus.mojo extra-enforcer-rules - 1.7.0 + 1.8.0 @@ -3110,7 +3057,6 @@ maven-compiler-plugin 3.13.0 - ${java.version} true true @@ -3361,12 +3307,15 @@ org.apache.maven.plugins maven-install-plugin - 
3.1.1 + 3.1.2 org.apache.maven.plugins maven-deploy-plugin - 3.1.1 + 3.1.2 + + 3 + org.apache.maven.plugins @@ -3535,7 +3484,7 @@ --> com.puppycrawl.tools checkstyle - 10.14.0 + 10.17.0 @@ -3597,7 +3546,7 @@ org.antipathy mvn-scalafmt_${scala.binary.version} - 1.1.1684076452.9f83818 + 1.1.1713302731.c3d0074 ${scalafmt.validateOnly} ${scalafmt.skip} diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 0783b6a611b8f..c684e2e30f7f1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -15,7 +15,8 @@ * limitations under the License. */ -import com.typesafe.tools.mima.core._ +import com.typesafe.tools.mima.core +import com.typesafe.tools.mima.core.* /** * Additional excludes for checking of Spark's binary compatibility. @@ -93,7 +94,9 @@ object MimaExcludes { ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.api.python.TestWritable"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.api.python.TestWritable$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.api.python.WriteInputFormatTestDataGenerator"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.api.python.WriteInputFormatTestDataGenerator$") + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.api.python.WriteInputFormatTestDataGenerator$"), + // SPARK-47764: Cleanup shuffle dependencies based on ShuffleCleanupMode + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.shuffle.MigratableResolver.addShuffleToSkip") ) // Default exclude rules diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bcaa51ec30ff6..d1b0ed953e30b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -255,16 +255,18 @@ object SparkBuild extends PomBuild { } ) + val noLintOnCompile = sys.env.contains("NOLINT_ON_COMPILE") && + !sys.env.get("NOLINT_ON_COMPILE").contains("false") lazy val sharedSettings = sparkGenjavadocSettings ++ compilerWarningSettings ++ - (if (sys.env.contains("NOLINT_ON_COMPILE")) Nil else enableScalaStyle) ++ Seq( + (if (noLintOnCompile) Nil else enableScalaStyle) ++ Seq( (Compile / exportJars) := true, (Test / exportJars) := false, javaHome := sys.env.get("JAVA_HOME") .orElse(sys.props.get("java.home").map { p => new File(p).getParentFile().getAbsolutePath() }) .map(file), publishMavenStyle := true, - unidocGenjavadocVersion := "0.18", + unidocGenjavadocVersion := "0.19", // Override SBT's default resolvers: resolvers := Seq( @@ -294,13 +296,8 @@ object SparkBuild extends PomBuild { publishLocal := Seq((MavenCompile / publishLocal), (SbtCompile / publishLocal)).dependOn.value, javaOptions ++= { - val versionParts = System.getProperty("java.version").split("[+.\\-]+", 3) - val major = versionParts(0).toInt - if (major >= 21) { - Seq("--add-modules=jdk.incubator.vector", "-Dforeign.restricted=warn") - } else { - Seq("--add-modules=jdk.incubator.vector,jdk.incubator.foreign", "-Dforeign.restricted=warn") - } + // for `dev.ludovic.netlib.blas` which implements such hardware-accelerated BLAS operations + Seq("--add-modules=jdk.incubator.vector") }, (Compile / doc / javacOptions) ++= { @@ -952,7 +949,7 @@ object Unsafe { object DockerIntegrationTests { // This serves to override the override specified in DependencyOverrides: lazy val settings = Seq( - dependencyOverrides += "com.google.guava" % "guava" % "33.0.0-jre" + dependencyOverrides += "com.google.guava" % "guava" % "33.1.0-jre" ) } diff --git a/project/plugins.sbt b/project/plugins.sbt index deb06738c642b..98170afd84759 100644 --- 
a/project/plugins.sbt +++ b/project/plugins.sbt @@ -17,13 +17,12 @@ addSbtPlugin("software.purpledragon" % "sbt-checkstyle-plugin" % "4.0.1") -// sbt-checkstyle-plugin uses an old version of checkstyle. Match it to Maven's. // If you are changing the dependency setting for checkstyle plugin, // please check pom.xml in the root of the source tree too. -libraryDependencies += "com.puppycrawl.tools" % "checkstyle" % "10.14.0" +libraryDependencies += "com.puppycrawl.tools" % "checkstyle" % "10.17.0" -// checkstyle uses guava 31.0.1-jre. -libraryDependencies += "com.google.guava" % "guava" % "31.0.1-jre" +// checkstyle uses guava 33.1.0-jre. +libraryDependencies += "com.google.guava" % "guava" % "33.1.0-jre" addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") @@ -37,9 +36,9 @@ addSbtPlugin("com.github.sbt" % "sbt-unidoc" % "0.5.0") addSbtPlugin("io.spray" % "sbt-revolver" % "0.10.0") -libraryDependencies += "org.ow2.asm" % "asm" % "9.6" +libraryDependencies += "org.ow2.asm" % "asm" % "9.7" -libraryDependencies += "org.ow2.asm" % "asm-commons" % "9.6" +libraryDependencies += "org.ow2.asm" % "asm-commons" % "9.7" addSbtPlugin("com.simplytyped" % "sbt-antlr4" % "0.8.3") diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 862d62b1d3b29..45c9dca8b474a 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -14,7 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -global-exclude *.py[cod] __pycache__ .DS_Store +# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html + +recursive-include pyspark *.pyi py.typed *.json recursive-include deps/jars *.jar graft deps/bin recursive-include deps/sbin spark-config.sh spark-daemon.sh start-history-server.sh stop-history-server.sh @@ -22,5 +24,8 @@ recursive-include deps/data *.data *.txt recursive-include deps/licenses *.txt recursive-include deps/examples *.py recursive-include lib *.zip -recursive-include pyspark *.pyi py.typed include README.md + +# Note that these commands are processed in the order they appear, so keep +# this exclude at the end. +global-exclude *.py[cod] __pycache__ .DS_Store diff --git a/python/docs/source/development/contributing.rst b/python/docs/source/development/contributing.rst index 94e485c706e39..d7e87c4de390e 100644 --- a/python/docs/source/development/contributing.rst +++ b/python/docs/source/development/contributing.rst @@ -129,7 +129,7 @@ If you are using Conda, the development environment can be set as follows. .. code-block:: bash - # Python 3.8+ is required + # Python 3.9+ is required conda create --name pyspark-dev-env python=3.9 conda activate pyspark-dev-env pip install --upgrade -r dev/requirements.txt @@ -145,7 +145,7 @@ Now, you can start developing and `running the tests `_. pip ~~~ -With Python 3.8+, pip can be used as below to install and set up the development environment. +With Python 3.9+, pip can be used as below to install and set up the development environment. .. code-block:: bash @@ -248,7 +248,7 @@ Usage 1. Check if an appropriate error class already exists in `Error classes in PySpark `_. If true, use the error class and skip to step 3. -2. Add a new class to `error_classes.py `_; keep in mind the invariants below. +2. Add a new class to `error-conditions.json `_; keep in mind the invariants below. 3. Check if the exception type already extends `PySparkException`. If true, skip to step 5. 4. Mix `PySparkException` into the exception. 
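As a rough sketch of the steps above (the condition name, argument names, and helper function are made up for illustration and are not taken from the real `error-conditions.json`; the snake_case keyword arguments ``error_class`` and ``message_parameters`` are assumed to match this branch's ``PySparkException`` signature and should be verified), a new entry plus the matching raise site could look something like this:

.. code-block:: python

    # Hypothetical entry added to python/pyspark/errors/error-conditions.json:
    #
    #   "EXAMPLE_ARGUMENT_NOT_POSITIVE": {
    #     "message": [
    #       "Argument `<arg_name>` must be positive, got <arg_value>."
    #     ]
    #   }

    from pyspark.errors import PySparkValueError

    def set_num_rows(num_rows: int) -> None:
        if num_rows <= 0:
            # PySparkValueError already mixes in PySparkException, so the error
            # condition and message parameters are resolved against the JSON file.
            raise PySparkValueError(
                error_class="EXAMPLE_ARGUMENT_NOT_POSITIVE",
                message_parameters={"arg_name": "num_rows", "arg_value": str(num_rows)},
            )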
@@ -266,7 +266,7 @@ Throw with arbitrary error message: **After** -`error_classes.py` +`error-conditions.json` .. code-block:: python diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 6a91a06a1c29d..6cc68cd46b117 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -30,7 +30,7 @@ and building from the source. Python Versions Supported ------------------------- -Python 3.8 and above. +Python 3.9 and above. Using PyPI @@ -53,6 +53,9 @@ If you want to install extra dependencies for a specific component, you can inst # Spark Connect pip install pyspark[connect] + +See :ref:`optional-dependencies` for more detail about extra dependencies. + For PySpark with/without a specific Hadoop version, you can install it by using ``PYSPARK_HADOOP_VERSION`` environment variables as below: .. code-block:: bash @@ -121,7 +124,7 @@ the same session as pyspark (you can install in several steps too). .. code-block:: bash - conda install -c conda-forge pyspark # can also add "python=3.8 some_package [etc.]" here + conda install -c conda-forge pyspark # can also add "python=3.9 some_package [etc.]" here Note that `PySpark for conda `_ is maintained separately by the community; while new versions generally get packaged quickly, the @@ -142,7 +145,7 @@ PySpark is included in the distributions available at the `Apache Spark website You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want to install Spark, for example, as below: -.. parsed-literal:: +.. code-block:: bash tar xzvf spark-\ |release|\-bin-hadoop3.tgz @@ -150,7 +153,7 @@ Ensure the ``SPARK_HOME`` environment variable points to the directory where the Update ``PYTHONPATH`` environment variable such that it can find the PySpark and Py4J under ``SPARK_HOME/python/lib``. One example of doing this is shown below: -.. parsed-literal:: +.. code-block:: bash cd spark-\ |release|\-bin-hadoop3 export SPARK_HOME=`pwd` @@ -165,16 +168,110 @@ To install PySpark from source, refer to |building_spark|_. Dependencies ------------ -========================== ========================= ====================================================================================== -Package Supported version Note -========================== ========================= ====================================================================================== -`py4j` >=0.10.9.7 Required -`pandas` >=1.4.4 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL -`pyarrow` >=10.0.0 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL -`numpy` >=1.21 Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL -`grpcio` >=1.62.0 Required for Spark Connect -`grpcio-status` >=1.62.0 Required for Spark Connect -`googleapis-common-protos` >=1.56.4 Required for Spark Connect -========================== ========================= ====================================================================================== + +Required dependencies +~~~~~~~~~~~~~~~~~~~~~ + +PySpark requires the following dependencies. 
+ +========================== ========================= ============================= +Package Supported version Note +========================== ========================= ============================= +`py4j` >=0.10.9.7 Required to interact with JVM +========================== ========================= ============================= + +Additional libraries that enhance functionality but are not included in the installation packages: + +- **memory-profiler**: Used for PySpark UDF memory profiling, ``spark.profile.show(...)`` and ``spark.sql.pyspark.udf.profiler``. Note that PySpark requires Java 17 or later with ``JAVA_HOME`` properly set and refer to |downloading|_. + + +.. _optional-dependencies: + +Optional dependencies +~~~~~~~~~~~~~~~~~~~~~ + +PySpark has several optional dependencies that enhance its functionality for specific modules. +These dependencies are only required for certain features and are not necessary for the basic functionality of PySpark. +If these optional dependencies are not installed, PySpark will function correctly for basic operations but will raise an ``ImportError`` +when you try to use features that require these dependencies. + +Spark Connect +^^^^^^^^^^^^^ + +Installable with ``pip install "pyspark[connect]"``. + +========================== ================= ========================== +Package Supported version Note +========================== ================= ========================== +`pandas` >=2.0.0 Required for Spark Connect +`pyarrow` >=10.0.0 Required for Spark Connect +`grpcio` >=1.62.0 Required for Spark Connect +`grpcio-status` >=1.62.0 Required for Spark Connect +`googleapis-common-protos` >=1.56.4 Required for Spark Connect +`graphviz` >=0.20 Optional for Spark Connect +========================== ================= ========================== + +Spark SQL +^^^^^^^^^ + +Installable with ``pip install "pyspark[sql]"``. + +========= ================= ====================== +Package Supported version Note +========= ================= ====================== +`pandas` >=2.0.0 Required for Spark SQL +`pyarrow` >=10.0.0 Required for Spark SQL +========= ================= ====================== + + +Pandas API on Spark +^^^^^^^^^^^^^^^^^^^ + +Installable with ``pip install "pyspark[pandas_on_spark]"``. + +========= ================= ================================ +Package Supported version Note +========= ================= ================================ +`pandas` >=2.0.0 Required for Pandas API on Spark +`pyarrow` >=10.0.0 Required for Pandas API on Spark +========= ================= ================================ + +Additional libraries that enhance functionality but are not included in the installation packages: + +- **mlflow**: Required for ``pyspark.pandas.mlflow``. +- **plotly**: Provide plotting for visualization. It is recommended using **plotly** over **matplotlib**. +- **matplotlib**: Provide plotting for visualization. The default is **plotly**. + + +MLlib DataFrame-based API +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Installable with ``pip install "pyspark[ml]"``. + +======= ================= ====================================== +Package Supported version Note +======= ================= ====================================== +`numpy` >=1.21 Required for MLlib DataFrame-based API +======= ================= ====================================== + +Additional libraries that enhance functionality but are not included in the installation packages: + +- **scipy**: Required for SciPy integration. 
+- **scikit-learn**: Required for implementing machine learning algorithms. +- **torch**: Required for machine learning model training. +- **torchvision**: Required for supporting image and video processing. +- **torcheval**: Required for facilitating model evaluation metrics. +- **deepspeed**: Required for providing high-performance model training optimizations. Installable on non-Darwin systems. + +MLlib +^^^^^ + +Installable with ``pip install "pyspark[mllib]"``. + +======= ================= ================== +Package Supported version Note +======= ================= ================== +`numpy` >=1.21 Required for MLlib +======= ================= ================== diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 36c1eacaf2c7b..5292530420025 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -22,8 +22,8 @@ Upgrading PySpark Upgrading from PySpark 3.5 to 4.0 --------------------------------- -* In Spark 4.0, it is recommended to use Pandas version 2.0.0 or above with PySpark for optimal compatibility. -* In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 1.4.4 in PySpark. +* In Spark 4.0, Python 3.8 support was dropped in PySpark. +* In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 2.0.0 in PySpark. * In Spark 4.0, the minimum supported version for Numpy has been raised from 1.15 to 1.21 in PySpark. * In Spark 4.0, the minimum supported version for PyArrow has been raised from 4.0.0 to 10.0.0 in PySpark. * In Spark 4.0, ``Int64Index`` and ``Float64Index`` have been removed from pandas API on Spark, ``Index`` should be used directly. @@ -71,7 +71,9 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, when applying ``astype`` to a decimal type object, the existing missing value is changed to ``True`` instead of ``False`` from Pandas API on Spark. * In Spark 4.0, ``pyspark.testing.assertPandasOnSparkEqual`` has been removed from Pandas API on Spark, use ``pyspark.pandas.testing.assert_frame_equal`` instead. * In Spark 4.0, the aliases ``Y``, ``M``, ``H``, ``T``, ``S`` have been deprecated from Pandas API on Spark, use ``YE``, ``ME``, ``h``, ``min``, ``s`` instead respectively. - +* In Spark 4.0, the schema of a map column is inferred by merging the schemas of all pairs in the map. To restore the previous behavior where the schema is only inferred from the first non-null pair, you can set ``spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled`` to ``true``. +* In Spark 4.0, `compute.ops_on_diff_frames` is on by default. To restore the previous behavior, set `compute.ops_on_diff_frames` to `false`. +* In Spark 4.0, the data type `YearMonthIntervalType` in ``DataFrame.collect`` no longer returns the underlying integers. To restore the previous behavior, set ``PYSPARK_YM_INTERVAL_LEGACY`` environment variable to ``1``. 
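For the three opt-outs above, a minimal sketch of how they can be applied in a PySpark 4.0 session (only apply the ones your workload actually needs; the configuration, option, and environment variable names are the ones listed in the notes above):

.. code-block:: python

    import os

    import pyspark.pandas as ps
    from pyspark.sql import SparkSession

    # Environment variable controlling how YearMonthIntervalType values are
    # converted by DataFrame.collect.
    os.environ["PYSPARK_YM_INTERVAL_LEGACY"] = "1"

    spark = SparkSession.builder.getOrCreate()

    # Infer map-column schemas from the first non-null pair only, as before 4.0.
    spark.conf.set("spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled", "true")

    # Disallow operations between pandas-on-Spark objects backed by different
    # DataFrames, as before 4.0.
    ps.set_option("compute.ops_on_diff_frames", False)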
Upgrading from PySpark 3.3 to 3.4 diff --git a/python/docs/source/reference/pyspark.sql/dataframe.rst b/python/docs/source/reference/pyspark.sql/dataframe.rst index b69a2771b04fc..d0196baa7a05b 100644 --- a/python/docs/source/reference/pyspark.sql/dataframe.rst +++ b/python/docs/source/reference/pyspark.sql/dataframe.rst @@ -55,6 +55,7 @@ DataFrame DataFrame.dropna DataFrame.dtypes DataFrame.exceptAll + DataFrame.executionInfo DataFrame.explain DataFrame.fillna DataFrame.filter @@ -109,6 +110,7 @@ DataFrame DataFrame.tail DataFrame.take DataFrame.to + DataFrame.toArrow DataFrame.toDF DataFrame.toJSON DataFrame.toLocalIterator diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index e9e2c44767ff1..e0895959e893b 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -143,6 +143,7 @@ Mathematical Functions try_add try_divide try_multiply + try_remainder try_subtract unhex width_bucket @@ -280,6 +281,8 @@ Date and Timestamp Functions quarter second session_window + timestamp_add + timestamp_diff timestamp_micros timestamp_millis timestamp_seconds @@ -532,11 +535,24 @@ JSON Functions json_array_length json_object_keys json_tuple - parse_json schema_of_json to_json +VARIANT Functions +----------------- +.. autosummary:: + :toctree: api/ + + is_variant_null + parse_json + schema_of_variant + schema_of_variant_agg + try_variant_get + variant_get + try_parse_json + + XML Functions -------------- .. autosummary:: diff --git a/python/docs/source/reference/pyspark.sql/variant_val.rst b/python/docs/source/reference/pyspark.sql/variant_val.rst index a7f592c18e3a3..8630ae8aace14 100644 --- a/python/docs/source/reference/pyspark.sql/variant_val.rst +++ b/python/docs/source/reference/pyspark.sql/variant_val.rst @@ -25,3 +25,4 @@ VariantVal :toctree: api/ VariantVal.toPython + VariantVal.toJson diff --git a/python/docs/source/user_guide/pandas_on_spark/typehints.rst b/python/docs/source/user_guide/pandas_on_spark/typehints.rst index 1405baa39c16e..23126664d78a6 100644 --- a/python/docs/source/user_guide/pandas_on_spark/typehints.rst +++ b/python/docs/source/user_guide/pandas_on_spark/typehints.rst @@ -62,7 +62,7 @@ it as a Spark schema. As an example, you can specify the return type hint as bel Notice that the function ``pandas_div`` actually takes and outputs a pandas DataFrame instead of pandas-on-Spark :class:`DataFrame`. So, technically the correct types should be of pandas. -With Python 3.8+, you can specify the type hints by using pandas instances as follows: +With Python 3.9+, you can specify the type hints by using pandas instances as follows: .. code-block:: python diff --git a/python/docs/source/user_guide/sql/arrow_pandas.rst b/python/docs/source/user_guide/sql/arrow_pandas.rst index 039671608b6d9..fde40140110f9 100644 --- a/python/docs/source/user_guide/sql/arrow_pandas.rst +++ b/python/docs/source/user_guide/sql/arrow_pandas.rst @@ -39,6 +39,22 @@ is installed and available on all cluster nodes. You can install it using pip or conda from the conda-forge channel. See PyArrow `installation `_ for details. +Conversion to/from Arrow Table +------------------------------ + +From Spark 4.0, you can create a Spark DataFrame from a PyArrow Table with +:meth:`SparkSession.createDataFrame`, and you can convert a Spark DataFrame to a PyArrow Table +with :meth:`DataFrame.toArrow`. + +.. 
literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py + :language: python + :lines: 37-52 + :dedent: 4 + +Note that :meth:`DataFrame.toArrow` results in the collection of all records in the DataFrame to +the driver program and should be done on a small subset of the data. Not all Spark and Arrow data +types are currently supported and an error can be raised if a column has an unsupported type. + Enabling for Conversion to/from Pandas -------------------------------------- @@ -53,7 +69,7 @@ This can be controlled by ``spark.sql.execution.arrow.pyspark.fallback.enabled`` .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 37-52 + :lines: 56-71 :dedent: 4 Using the above optimizations with Arrow will produce the same results as when Arrow is not @@ -90,7 +106,7 @@ specify the type hints of ``pandas.Series`` and ``pandas.DataFrame`` as below: .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 56-80 + :lines: 75-99 :dedent: 4 In the following sections, it describes the combinations of the supported type hints. For simplicity, @@ -113,7 +129,7 @@ The following example shows how to create this Pandas UDF that computes the prod .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 84-114 + :lines: 103-133 :dedent: 4 For detailed usage, please see :func:`pandas_udf`. @@ -152,7 +168,7 @@ The following example shows how to create this Pandas UDF: .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 118-140 + :lines: 137-159 :dedent: 4 For detailed usage, please see :func:`pandas_udf`. @@ -174,7 +190,7 @@ The following example shows how to create this Pandas UDF: .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 144-167 + :lines: 163-186 :dedent: 4 For detailed usage, please see :func:`pandas_udf`. @@ -205,7 +221,7 @@ and window operations: .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 171-212 + :lines: 190-231 :dedent: 4 .. currentmodule:: pyspark.sql.functions @@ -270,7 +286,7 @@ in the group. .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 216-234 + :lines: 235-253 :dedent: 4 For detailed usage, please see please see :meth:`GroupedData.applyInPandas` @@ -288,7 +304,7 @@ The following example shows how to use :meth:`DataFrame.mapInPandas`: .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 238-249 + :lines: 257-268 :dedent: 4 For detailed usage, please see :meth:`DataFrame.mapInPandas`. @@ -327,7 +343,7 @@ The following example shows how to use ``DataFrame.groupby().cogroup().applyInPa .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 253-275 + :lines: 272-294 :dedent: 4 @@ -339,9 +355,9 @@ Arrow Python UDFs Arrow Python UDFs are user defined functions that are executed row-by-row, utilizing Arrow for efficient batch data transfer and serialization. To define an Arrow Python UDF, you can use the :meth:`udf` decorator or wrap the function with the :meth:`udf` method, ensuring the ``useArrow`` parameter is set to True. Additionally, you can enable Arrow -optimization for Python UDFs throughout the entire SparkSession by setting the Spark configuration ``spark.sql -.execution.pythonUDF.arrow.enabled`` to true. 
It's important to note that the Spark configuration takes effect only -when ``useArrow`` is either not set or set to None. +optimization for Python UDFs throughout the entire SparkSession by setting the Spark configuration +``spark.sql.execution.pythonUDF.arrow.enabled`` to true. It's important to note that the Spark configuration takes +effect only when ``useArrow`` is either not set or set to None. The type hints for Arrow Python UDFs should be specified in the same way as for default, pickled Python UDFs. @@ -349,7 +365,7 @@ Here's an example that demonstrates the usage of both a default, pickled Python .. literalinclude:: ../../../../../examples/src/main/python/sql/arrow.py :language: python - :lines: 279-297 + :lines: 298-316 :dedent: 4 Compared to the default, pickled Python UDFs, Arrow Python UDFs provide a more coherent type coercion mechanism. UDF @@ -400,11 +416,15 @@ and each column will be converted to the Spark session time zone then localized zone, which removes the time zone and displays values as local time. This will occur when calling :meth:`DataFrame.toPandas()` or ``pandas_udf`` with timestamp columns. -When timestamp data is transferred from Pandas to Spark, it will be converted to UTC microseconds. This -occurs when calling :meth:`SparkSession.createDataFrame` with a Pandas DataFrame or when returning a timestamp from a -``pandas_udf``. These conversions are done automatically to ensure Spark will have data in the -expected format, so it is not necessary to do any of these conversions yourself. Any nanosecond -values will be truncated. +When timestamp data is transferred from Spark to a PyArrow Table, it will remain in microsecond +resolution with the UTC time zone. This occurs when calling :meth:`DataFrame.toArrow()` with +timestamp columns. + +When timestamp data is transferred from Pandas or PyArrow to Spark, it will be converted to UTC +microseconds. This occurs when calling :meth:`SparkSession.createDataFrame` with a Pandas DataFrame +or PyArrow Table, or when returning a timestamp from a ``pandas_udf``. These conversions are done +automatically to ensure Spark will have data in the expected format, so it is not necessary to do +any of these conversions yourself. Any nanosecond values will be truncated. Note that a standard UDF (non-Pandas) will load timestamp data as Python datetime objects, which is different from a Pandas timestamp. It is recommended to use Pandas time series functionality when @@ -414,16 +434,19 @@ working with timestamps in ``pandas_udf``\s to get the best performance, see Recommended Pandas and PyArrow Versions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For usage with pyspark.sql, the minimum supported versions of Pandas is 1.4.4 and PyArrow is 10.0.0. +For usage with pyspark.sql, the minimum supported versions of Pandas is 2.0.0 and PyArrow is 10.0.0. Higher versions may be used, however, compatibility and data correctness can not be guaranteed and should be verified by the user. Setting Arrow ``self_destruct`` for memory savings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Since Spark 3.2, the Spark configuration ``spark.sql.execution.arrow.pyspark.selfDestruct.enabled`` can be used to enable PyArrow's ``self_destruct`` feature, which can save memory when creating a Pandas DataFrame via ``toPandas`` by freeing Arrow-allocated memory while building the Pandas DataFrame. -This option is experimental, and some operations may fail on the resulting Pandas DataFrame due to immutable backing arrays. 
-Typically, you would see the error ``ValueError: buffer source array is read-only``. -Newer versions of Pandas may fix these errors by improving support for such cases. -You can work around this error by copying the column(s) beforehand. -Additionally, this conversion may be slower because it is single-threaded. +Since Spark 3.2, the Spark configuration ``spark.sql.execution.arrow.pyspark.selfDestruct.enabled`` +can be used to enable PyArrow's ``self_destruct`` feature, which can save memory when creating a +Pandas DataFrame via ``toPandas`` by freeing Arrow-allocated memory while building the Pandas +DataFrame. This option can also save memory when creating a PyArrow Table via ``toArrow``. +This option is experimental. When used with ``toPandas``, some operations may fail on the resulting +Pandas DataFrame due to immutable backing arrays. Typically, you would see the error +``ValueError: buffer source array is read-only``. Newer versions of Pandas may fix these errors by +improving support for such cases. You can work around this error by copying the column(s) +beforehand. Additionally, this conversion may be slower because it is single-threaded. diff --git a/python/docs/source/user_guide/sql/index.rst b/python/docs/source/user_guide/sql/index.rst index 118cf139d9b38..d1b67f7eeb909 100644 --- a/python/docs/source/user_guide/sql/index.rst +++ b/python/docs/source/user_guide/sql/index.rst @@ -25,5 +25,6 @@ Spark SQL arrow_pandas python_udtf + python_data_source type_conversions diff --git a/python/docs/source/user_guide/sql/python_data_source.rst b/python/docs/source/user_guide/sql/python_data_source.rst new file mode 100644 index 0000000000000..cdbc706993119 --- /dev/null +++ b/python/docs/source/user_guide/sql/python_data_source.rst @@ -0,0 +1,395 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +====================== +Python Data Source API +====================== + +.. currentmodule:: pyspark.sql + +Overview +-------- +The Python Data Source API is a new feature introduced in Spark 4.0, enabling developers to read from custom data sources and write to custom data sinks in Python. +This guide provides a comprehensive overview of the API and instructions on how to create, use, and manage Python data sources. + + +Creating a Python Data Source +----------------------------- +To create a custom Python data source, you'll need to subclass the :class:`DataSource` base classes and implement the necessary methods for reading and writing data. + +This example demonstrates creating a simple data source to generate synthetic data using the `faker` library. Ensure the `faker` library is installed and accessible in your Python environment. + +**Define the Data Source** + +Start by creating a new subclass of :class:`DataSource` with the source name, schema. 
+ +In order to be used as a source or sink in a batch or streaming query, the corresponding methods of DataSource need to be implemented. + +The methods that need to be implemented for each capability are: + ++------------+----------------------+------------------+ +| | source | sink | ++============+======================+==================+ +| batch | reader() | writer() | ++------------+----------------------+------------------+ +| | streamReader() | | +| streaming | or | streamWriter() | +| | simpleStreamReader() | | ++------------+----------------------+------------------+ + +.. code-block:: python + + from pyspark.sql.datasource import DataSource, DataSourceReader + from pyspark.sql.types import StructType + + class FakeDataSource(DataSource): + """ + A fake data source for PySpark to generate synthetic data using the `faker` library. + Options: + - numRows: specify number of rows to generate. Default value is 3. + """ + + @classmethod + def name(cls): + return "fake" + + def schema(self): + return "name string, date string, zipcode string, state string" + + def reader(self, schema: StructType): + return FakeDataSourceReader(schema, self.options) + + def writer(self, schema: StructType, overwrite: bool): + return FakeDataSourceWriter(self.options) + + def streamReader(self, schema: StructType): + return FakeStreamReader(schema, self.options) + + # Please skip the implementation of this method if streamReader has been implemented. + def simpleStreamReader(self, schema: StructType): + return SimpleStreamReader() + + def streamWriter(self, schema: StructType, overwrite: bool): + return FakeStreamWriter(self.options) + +Implementing Batch Reader and Writer for Python Data Source +----------------------------------------------------------- +**Implement the Reader** + +Define the reader logic to generate synthetic data. Use the `faker` library to populate each field in the schema. + +.. code-block:: python + + class FakeDataSourceReader(DataSourceReader): + + def __init__(self, schema, options): + self.schema: StructType = schema + self.options = options + + def read(self, partition): + from faker import Faker + fake = Faker() + # Note: every value in this `self.options` dictionary is a string. + num_rows = int(self.options.get("numRows", 3)) + for _ in range(num_rows): + row = [] + for field in self.schema.fields: + value = getattr(fake, field.name)() + row.append(value) + yield tuple(row) + +**Implement the Writer** + +Create a fake data source writer that processes each partition of data, counts the rows, and either +prints the total count of rows after a successful write or the number of failed tasks if the writing process fails. + +..
code-block:: python + + from dataclasses import dataclass + from typing import Iterator, List + + from pyspark.sql.types import Row + from pyspark.sql.datasource import DataSource, DataSourceWriter, WriterCommitMessage + + @dataclass + class SimpleCommitMessage(WriterCommitMessage): + partition_id: int + count: int + + class FakeDataSourceWriter(DataSourceWriter): + + def write(self, rows: Iterator[Row]) -> SimpleCommitMessage: + from pyspark import TaskContext + + context = TaskContext.get() + partition_id = context.partitionId() + cnt = sum(1 for _ in rows) + return SimpleCommitMessage(partition_id=partition_id, count=cnt) + + def commit(self, messages: List[SimpleCommitMessage]) -> None: + total_count = sum(message.count for message in messages) + print(f"Total number of rows: {total_count}") + + def abort(self, messages: List[SimpleCommitMessage]) -> None: + failed_count = sum(message is None for message in messages) + print(f"Number of failed tasks: {failed_count}") + + +Implementing Streaming Reader and Writer for Python Data Source +--------------------------------------------------------------- +**Implement the Stream Reader** + +This is a dummy streaming data reader that generates 2 rows in every microbatch. The stream reader instance has an integer offset that increases by 2 in every microbatch. + +.. code-block:: python + + class RangePartition(InputPartition): + def __init__(self, start, end): + self.start = start + self.end = end + + class FakeStreamReader(DataSourceStreamReader): + def __init__(self, schema, options): + self.current = 0 + + def initialOffset(self) -> dict: + """ + Return the initial start offset of the reader. + """ + return {"offset": 0} + + def latestOffset(self) -> dict: + """ + Return the current latest offset that the next microbatch will read to. + """ + self.current += 2 + return {"offset": self.current} + + def partitions(self, start: dict, end: dict): + """ + Plans the partitioning of the current microbatch defined by the start and end offsets. + It needs to return a sequence of :class:`InputPartition` objects. + """ + return [RangePartition(start["offset"], end["offset"])] + + def commit(self, end: dict): + """ + This is invoked when the query has finished processing data before end offset, this can be used to clean up resources. + """ + pass + + def read(self, partition) -> Iterator[Tuple]: + """ + Takes a partition as an input and reads an iterator of tuples from the data source. + """ + start, end = partition.start, partition.end + for i in range(start, end): + yield (i, str(i)) + +**Implement the Simple Stream Reader** + +If the data source has low throughput and doesn't require partitioning, you can implement SimpleDataSourceStreamReader instead of DataSourceStreamReader. + +Either simpleStreamReader() or streamReader() must be implemented for a readable streaming data source; simpleStreamReader() will only be invoked when streamReader() is not implemented. + +This is the same dummy streaming reader that generates 2 rows every batch, implemented with the SimpleDataSourceStreamReader interface. + +.. code-block:: python + + class SimpleStreamReader(SimpleDataSourceStreamReader): + def initialOffset(self): + """ + Return the initial start offset of the reader. + """ + return {"offset": 0} + + def read(self, start: dict) -> (Iterator[Tuple], dict): + """ + Takes the start offset as an input and returns an iterator of tuples and the start offset of the next read.
+ """ + start_idx = start["offset"] + it = iter([(i,) for i in range(start_idx, start_idx + 2)]) + return (it, {"offset": start_idx + 2}) + + def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]: + """ + Takes start and end offset as input and read an iterator of data deterministically. + This is called whe query replay batches during restart or after failure. + """ + start_idx = start["offset"] + end_idx = end["offset"] + return iter([(i,) for i in range(start_idx, end_idx)]) + + def commit(self, end): + """ + This is invoked when the query has finished processing data before end offset, this can be used to clean up resource. + """ + pass + +**Implement the Stream Writer** + +This is a streaming data writer that write the metadata information of each microbatch to a local path. + +.. code-block:: python + + class SimpleCommitMessage(WriterCommitMessage): + partition_id: int + count: int + + class FakeStreamWriter(DataSourceStreamWriter): + def __init__(self, options): + self.options = options + self.path = self.options.get("path") + assert self.path is not None + + def write(self, iterator): + """ + Write the data and return the commit message of that partition + """ + from pyspark import TaskContext + context = TaskContext.get() + partition_id = context.partitionId() + cnt = 0 + for row in iterator: + cnt += 1 + return SimpleCommitMessage(partition_id=partition_id, count=cnt) + + def commit(self, messages, batchId) -> None: + """ + Receives a sequence of :class:`WriterCommitMessage` when all write tasks succeed and decides what to do with it. + In this FakeStreamWriter, we write the metadata of the microbatch(number of rows and partitions) into a json file inside commit(). + """ + status = dict(num_partitions=len(messages), rows=sum(m.count for m in messages)) + with open(os.path.join(self.path, f"{batchId}.json"), "a") as file: + file.write(json.dumps(status) + "\n") + + def abort(self, messages, batchId) -> None: + """ + Receives a sequence of :class:`WriterCommitMessage` from successful tasks when some tasks fail and decides what to do with it. + In this FakeStreamWriter, we write a failure message into a txt file inside abort(). + """ + with open(os.path.join(self.path, f"{batchId}.txt"), "w") as file: + file.write(f"failed in batch {batchId}") + +Serialization Requirement +------------------------- +User defined DataSource, DataSourceReader, DataSourceWriter, DataSourceStreamReader and DataSourceStreamWriter and their methods must be able to be serialized by pickle. + +For library that are used inside a method, it must be imported inside the method. For example, TaskContext must be imported inside the read() method in the code below. + +.. code-block:: python + + def read(self, partition): + from pyspark import TaskContext + context = TaskContext.get() + +Using a Python Data Source +-------------------------- +**Use a Python Data Source in Batch Query** + +After defining your data source, it must be registered before usage. + +.. code-block:: python + + spark.dataSource.register(FakeDataSource) + +**Read From a Python Data Source** + +Read from the fake datasource with the default schema and options: + +.. 
code-block:: python + + spark.read.format("fake").load().show() + + # +-----------+----------+-------+-------+ + # | name| date|zipcode| state| + # +-----------+----------+-------+-------+ + # |Carlos Cobb|2018-07-15| 73003|Indiana| + # | Eric Scott|1991-08-22| 10085| Idaho| + # | Amy Martin|1988-10-28| 68076| Oregon| + # +-----------+----------+-------+-------+ + +Read from the fake datasource with a custom schema: + +.. code-block:: python + + spark.read.format("fake").schema("name string, company string").load().show() + + # +---------------------+--------------+ + # |name |company | + # +---------------------+--------------+ + # |Tanner Brennan |Adams Group | + # |Leslie Maxwell |Santiago Group| + # |Mrs. Jacqueline Brown|Maynard Inc | + # +---------------------+--------------+ + +Read from the fake datasource with a different number of rows: + +.. code-block:: python + + spark.read.format("fake").option("numRows", 5).load().show() + + # +--------------+----------+-------+------------+ + # | name| date|zipcode| state| + # +--------------+----------+-------+------------+ + # | Pam Mitchell|1988-10-20| 23788| Tennessee| + # |Melissa Turner|1996-06-14| 30851| Nevada| + # | Brian Ramsey|2021-08-21| 55277| Washington| + # | Caitlin Reed|1983-06-22| 89813|Pennsylvania| + # | Douglas James|2007-01-18| 46226| Alabama| + # +--------------+----------+-------+------------+ + +**Write To a Python Data Source** + +To write data to a custom location, make sure that you specify the `mode()` clause. Supported modes are `append` and `overwrite`. + +.. code-block:: python + + df = spark.range(0, 10, 1, 5) + df.write.format("fake").mode("append").save() + + # You can check the Spark log (standard error) to see the output of the write operation. + # Total number of rows: 10 + +**Use a Python Data Source in Streaming Query** + +Once we register the Python data source, we can also use it in streaming queries as the source of readStream() or the sink of writeStream() by passing its short name or full name to format(). + +Start a query that reads from the fake Python data source and writes to the console: + +.. code-block:: python + + query = spark.readStream.format("fake").load().writeStream.format("console").start() + + # +---+ + # | id| + # +---+ + # | 0| + # | 1| + # +---+ + # +---+ + # | id| + # +---+ + # | 2| + # | 3| + # +---+ + +We can also use the same data source as both the streaming reader and writer: + +.. code-block:: python + + query = spark.readStream.format("fake").load().writeStream.format("fake").start("/output_path") diff --git a/python/packaging/classic/setup.py b/python/packaging/classic/setup.py index 8eefc17db7002..5e94c2b653806 100755 --- a/python/packaging/classic/setup.py +++ b/python/packaging/classic/setup.py @@ -150,7 +150,7 @@ def _supports_symlinks(): # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst, and # python/packaging/connect/setup.py -_minimum_pandas_version = "1.4.4" +_minimum_pandas_version = "2.0.0" _minimum_numpy_version = "1.21" _minimum_pyarrow_version = "10.0.0" _minimum_grpc_version = "1.62.0" @@ -204,8 +204,13 @@ def run(self): copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") if in_spark: + # !!HACK ALERT!! + # `setup.py` has to be located in the same directory as the package. + # Therefore, we copy the current file and place it in the `spark/python` directory. + # After that, we remove it at the end.
copyfile("packaging/classic/setup.py", "setup.py") copyfile("packaging/classic/setup.cfg", "setup.cfg") + # Construct the symlink farm - this is nein_sparkcessary since we can't refer to # the path above the package root and we need to copy the jars and scripts which # are up above the python root. @@ -270,12 +275,14 @@ def run(self): "pyspark.ml.deepspeed", "pyspark.sql", "pyspark.sql.avro", + "pyspark.sql.classic", "pyspark.sql.connect", "pyspark.sql.connect.avro", "pyspark.sql.connect.client", "pyspark.sql.connect.functions", "pyspark.sql.connect.proto", "pyspark.sql.connect.protobuf", + "pyspark.sql.connect.resource", "pyspark.sql.connect.shell", "pyspark.sql.connect.streaming", "pyspark.sql.connect.streaming.worker", @@ -357,11 +364,10 @@ def run(self): "numpy>=%s" % _minimum_numpy_version, ], }, - python_requires=">=3.8", + python_requires=">=3.9", classifiers=[ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/python/packaging/connect/setup.py b/python/packaging/connect/setup.py index fe1e7486faa9b..bc1d4fd2868de 100755 --- a/python/packaging/connect/setup.py +++ b/python/packaging/connect/setup.py @@ -25,7 +25,7 @@ import sys from setuptools import setup import os -from shutil import copyfile +from shutil import copyfile, move import glob from pathlib import Path @@ -70,6 +70,7 @@ test_packages = [ "pyspark.tests", # for Memory profiler parity tests "pyspark.testing", + "pyspark.resource.tests", "pyspark.sql.tests", "pyspark.sql.tests.connect", "pyspark.sql.tests.connect.streaming", @@ -108,6 +109,13 @@ try: if in_spark: + # !!HACK ALTERT!! + # 1. `setup.py` has to be located with the same directory with the package. + # Therefore, we copy the current file, and place it at `spark/python` directory. + # After that, we remove it in the end. + # 2. Here it renames `lib` to `lib.ack` so MANIFEST.in does not pick `py4j` up. + # We rename it back in the end. + move("lib", "lib.back") copyfile("packaging/connect/setup.py", "setup.py") copyfile("packaging/connect/setup.cfg", "setup.cfg") @@ -116,7 +124,7 @@ # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst, and # python/packaging/classic/setup.py - _minimum_pandas_version = "1.4.4" + _minimum_pandas_version = "2.0.0" _minimum_numpy_version = "1.21" _minimum_pyarrow_version = "10.0.0" _minimum_grpc_version = "1.59.3" @@ -145,6 +153,7 @@ "pyspark.sql.connect.functions", "pyspark.sql.connect.proto", "pyspark.sql.connect.protobuf", + "pyspark.sql.connect.resource", "pyspark.sql.connect.shell", "pyspark.sql.connect.streaming", "pyspark.sql.connect.streaming.worker", @@ -178,6 +187,7 @@ author_email="dev@spark.apache.org", url="https://github.com/apache/spark/tree/master/python", packages=connect_packages + test_packages, + include_package_data=True, license="http://www.apache.org/licenses/LICENSE-2.0", # Don't forget to update python/docs/source/getting_started/install.rst # if you're updating the versions or dependencies. 
@@ -189,11 +199,10 @@ "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version, "numpy>=%s" % _minimum_numpy_version, ], - python_requires=">=3.8", + python_requires=">=3.9", classifiers=[ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -205,5 +214,6 @@ ) finally: if in_spark: + move("lib.back", "lib") os.remove("setup.py") os.remove("setup.cfg") diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 15c21df0c6bf4..49c594f8c7def 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -125,8 +125,10 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: # for backward compatibility references. sys.modules["pyspark.context"] = context -# for back compatibility -from pyspark.sql import SQLContext, HiveContext, Row # noqa: F401 + # for back compatibility + from pyspark.sql import SQLContext, HiveContext # noqa: F401 + +from pyspark.sql import Row # noqa: F401 __all__ = [ "SparkConf", diff --git a/python/pyspark/daemon.py b/python/pyspark/daemon.py index b0e06d13beda7..a23af109ea6de 100644 --- a/python/pyspark/daemon.py +++ b/python/pyspark/daemon.py @@ -28,9 +28,9 @@ from socket import AF_INET, AF_INET6, SOCK_STREAM, SOMAXCONN from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN, SIGINT -from pyspark.serializers import read_long, write_int, write_with_length, UTF8Deserializer +from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer -if len(sys.argv) > 1: +if len(sys.argv) > 1 and sys.argv[1].startswith("pyspark"): import importlib worker_module = importlib.import_module(sys.argv[1]) @@ -139,7 +139,7 @@ def handle_sigterm(*args): if 0 in ready_fds: try: - worker_pid = read_long(stdin_bin) + worker_pid = read_int(stdin_bin) except EOFError: # Spark told us to exit by closing stdin shutdown(0) diff --git a/python/pyspark/errors/error-conditions.json b/python/pyspark/errors/error-conditions.json new file mode 100644 index 0000000000000..dd70e814b1ea8 --- /dev/null +++ b/python/pyspark/errors/error-conditions.json @@ -0,0 +1,1166 @@ +{ + "APPLICATION_NAME_NOT_SET": { + "message": [ + "An application name must be set in your configuration." + ] + }, + "ARGUMENT_REQUIRED": { + "message": [ + "Argument `` is required when ." + ] + }, + "ARROW_LEGACY_IPC_FORMAT": { + "message": [ + "Arrow legacy IPC format is not supported in PySpark, please unset ARROW_PRE_0_15_IPC_FORMAT." + ] + }, + "ATTRIBUTE_NOT_CALLABLE": { + "message": [ + "Attribute `` in provided object `` is not callable." + ] + }, + "ATTRIBUTE_NOT_SUPPORTED": { + "message": [ + "Attribute `` is not supported." + ] + }, + "AXIS_LENGTH_MISMATCH": { + "message": [ + "Length mismatch: Expected axis has element, new values have elements." + ] + }, + "BROADCAST_VARIABLE_NOT_LOADED": { + "message": [ + "Broadcast variable `` not loaded." + ] + }, + "CALL_BEFORE_INITIALIZE": { + "message": [ + "Not supported to call `` before initialize ." + ] + }, + "CANNOT_ACCEPT_OBJECT_IN_TYPE": { + "message": [ + "`` can not accept object `` in type ``." + ] + }, + "CANNOT_ACCESS_TO_DUNDER": { + "message": [ + "Dunder(double underscore) attribute is for internal use only." 
+ ] + }, + "CANNOT_APPLY_IN_FOR_COLUMN": { + "message": [ + "Cannot apply 'in' operator against a column: please use 'contains' in a string column or 'array_contains' function for an array column." + ] + }, + "CANNOT_BE_EMPTY": { + "message": [ + "At least one must be specified." + ] + }, + "CANNOT_BE_NONE": { + "message": [ + "Argument `` cannot be None." + ] + }, + "CANNOT_CONFIGURE_SPARK_CONNECT": { + "message": [ + "Spark Connect server cannot be configured: Existing [], New []." + ] + }, + "CANNOT_CONFIGURE_SPARK_CONNECT_MASTER": { + "message": [ + "Spark Connect server and Spark master cannot be configured together: Spark master [], Spark Connect []." + ] + }, + "CANNOT_CONVERT_COLUMN_INTO_BOOL": { + "message": [ + "Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions." + ] + }, + "CANNOT_CONVERT_TYPE": { + "message": [ + "Cannot convert into ." + ] + }, + "CANNOT_DETERMINE_TYPE": { + "message": [ + "Some of types cannot be determined after inferring." + ] + }, + "CANNOT_GET_BATCH_ID": { + "message": [ + "Could not get batch id from ." + ] + }, + "CANNOT_INFER_ARRAY_TYPE": { + "message": [ + "Can not infer Array Type from a list with None as the first element." + ] + }, + "CANNOT_INFER_EMPTY_SCHEMA": { + "message": [ + "Can not infer schema from an empty dataset." + ] + }, + "CANNOT_INFER_SCHEMA_FOR_TYPE": { + "message": [ + "Can not infer schema for type: ``." + ] + }, + "CANNOT_INFER_TYPE_FOR_FIELD": { + "message": [ + "Unable to infer the type of the field ``." + ] + }, + "CANNOT_MERGE_TYPE": { + "message": [ + "Can not merge type `` and ``." + ] + }, + "CANNOT_OPEN_SOCKET": { + "message": [ + "Can not open socket: ." + ] + }, + "CANNOT_PARSE_DATATYPE": { + "message": [ + "Unable to parse datatype. ." + ] + }, + "CANNOT_PROVIDE_METADATA": { + "message": [ + "Metadata can only be provided for a single column." + ] + }, + "CANNOT_SET_TOGETHER": { + "message": [ + " should not be set together." + ] + }, + "CANNOT_SPECIFY_RETURN_TYPE_FOR_UDF": { + "message": [ + "returnType can not be specified when `` is a user-defined function, but got ." + ] + }, + "CANNOT_WITHOUT": { + "message": [ + "Cannot without ." + ] + }, + "CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF": { + "message": [ + "Calling property or member '' is not supported in PySpark Classic, please use Spark Connect instead." + ] + }, + "COLLATION_INVALID_PROVIDER" : { + "message" : [ + "The value does not represent a correct collation provider. Supported providers are: []." + ] + }, + "COLUMN_IN_LIST": { + "message": [ + "`` does not allow a Column in a list." + ] + }, + "CONNECT_URL_ALREADY_DEFINED": { + "message": [ + "Only one Spark Connect client URL can be set; however, got a different URL [] from the existing []." + ] + }, + "CONNECT_URL_NOT_SET": { + "message": [ + "Cannot create a Spark Connect session because the Spark Connect remote URL has not been set. Please define the remote URL by setting either the 'spark.remote' option or the 'SPARK_REMOTE' environment variable." + ] + }, + "CONTEXT_ONLY_VALID_ON_DRIVER": { + "message": [ + "It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063." + ] + }, + "CONTEXT_UNAVAILABLE_FOR_REMOTE_CLIENT": { + "message": [ + "Remote client cannot create a SparkContext. Create SparkSession instead." 
+ ] + }, + "DATA_SOURCE_CREATE_ERROR": { + "message": [ + "Failed to create python data source instance, error: ." + ] + }, + "DATA_SOURCE_INVALID_RETURN_TYPE": { + "message": [ + "Unsupported return type ('') from Python data source ''. Expected types: ." + ] + }, + "DATA_SOURCE_RETURN_SCHEMA_MISMATCH": { + "message": [ + "Return schema mismatch in the result from 'read' method. Expected: columns, Found: columns. Make sure the returned values match the required output schema." + ] + }, + "DATA_SOURCE_TYPE_MISMATCH": { + "message": [ + "Expected , but got ." + ] + }, + "DIFFERENT_PANDAS_DATAFRAME": { + "message": [ + "DataFrames are not almost equal:", + "Left:", + "", + "", + "Right:", + "", + "" + ] + }, + "DIFFERENT_PANDAS_INDEX": { + "message": [ + "Indices are not almost equal:", + "Left:", + "", + "", + "Right:", + "", + "" + ] + }, + "DIFFERENT_PANDAS_MULTIINDEX": { + "message": [ + "MultiIndices are not almost equal:", + "Left:", + "", + "", + "Right:", + "", + "" + ] + }, + "DIFFERENT_PANDAS_SERIES": { + "message": [ + "Series are not almost equal:", + "Left:", + "", + "", + "Right:", + "", + "" + ] + }, + "DIFFERENT_ROWS": { + "message": [ + "" + ] + }, + "DIFFERENT_SCHEMA": { + "message": [ + "Schemas do not match.", + "--- actual", + "+++ expected", + "" + ] + }, + "DISALLOWED_TYPE_FOR_CONTAINER": { + "message": [ + "Argument ``(type: ) should only contain a type in [], got " + ] + }, + "DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT": { + "message": [ + "Duplicated field names in Arrow Struct are not allowed, got " + ] + }, + "ERROR_OCCURRED_WHILE_CALLING": { + "message": [ + "An error occurred while calling : ." + ] + }, + "FIELD_DATA_TYPE_UNACCEPTABLE": { + "message": [ + " can not accept object in type ." + ] + }, + "FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME": { + "message": [ + ": can not accept object in type ." + ] + }, + "FIELD_NOT_NULLABLE": { + "message": [ + "Field is not nullable, but got None." + ] + }, + "FIELD_NOT_NULLABLE_WITH_NAME": { + "message": [ + ": This field is not nullable, but got None." + ] + }, + "FIELD_STRUCT_LENGTH_MISMATCH": { + "message": [ + "Length of object () does not match with length of fields ()." + ] + }, + "FIELD_STRUCT_LENGTH_MISMATCH_WITH_NAME": { + "message": [ + ": Length of object () does not match with length of fields ()." + ] + }, + "FIELD_TYPE_MISMATCH": { + "message": [ + " is not an instance of type ." + ] + }, + "FIELD_TYPE_MISMATCH_WITH_NAME": { + "message": [ + ": is not an instance of type ." + ] + }, + "HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN": { + "message": [ + "Function `` should return Column, got ." + ] + }, + "INCORRECT_CONF_FOR_PROFILE": { + "message": [ + "`spark.python.profile` or `spark.python.profile.memory` configuration", + " must be set to `true` to enable Python profile." + ] + }, + "INDEX_NOT_POSITIVE": { + "message": [ + "Index must be positive, got ''." + ] + }, + "INDEX_OUT_OF_RANGE": { + "message": [ + " index out of range, got ''." + ] + }, + "INVALID_ARROW_UDTF_RETURN_TYPE": { + "message": [ + "The return type of the arrow-optimized Python UDTF should be of type 'pandas.DataFrame', but the '' method returned a value of type with value: ." + ] + }, + "INVALID_BROADCAST_OPERATION": { + "message": [ + "Broadcast can only be in driver." + ] + }, + "INVALID_CALL_ON_UNRESOLVED_OBJECT": { + "message": [ + "Invalid call to `` on unresolved object." + ] + }, + "INVALID_CONNECT_URL": { + "message": [ + "Invalid URL for Spark Connect: " + ] + }, + "INVALID_INTERVAL_CASTING": { + "message": [ + "Interval to is invalid." 
+ ] + }, + "INVALID_ITEM_FOR_CONTAINER": { + "message": [ + "All items in `` should be in , got ." + ] + }, + "INVALID_JSON_DATA_TYPE_FOR_COLLATIONS" : { + "message" : [ + "Collations can only be applied to string types, but the JSON data type is ." + ] + }, + "INVALID_MULTIPLE_ARGUMENT_CONDITIONS": { + "message": [ + "[{arg_names}] cannot be ." + ] + }, + "INVALID_NDARRAY_DIMENSION": { + "message": [ + "NumPy array input should be of dimensions." + ] + }, + "INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP": { + "message": [ + "Invalid number of dataframes in group ." + ] + }, + "INVALID_PANDAS_UDF": { + "message": [ + "Invalid function: " + ] + }, + "INVALID_PANDAS_UDF_TYPE": { + "message": [ + "`` should be one of the values from PandasUDFType, got " + ] + }, + "INVALID_RETURN_TYPE_FOR_ARROW_UDF": { + "message": [ + "Grouped and Cogrouped map Arrow UDF should return StructType for , got ." + ] + }, + "INVALID_RETURN_TYPE_FOR_PANDAS_UDF": { + "message": [ + "Pandas UDF should return StructType for , got ." + ] + }, + "INVALID_SESSION_UUID_ID": { + "message": [ + "Parameter value must be a valid UUID format: " + ] + }, + "INVALID_TIMEOUT_TIMESTAMP": { + "message": [ + "Timeout timestamp () cannot be earlier than the current watermark ()." + ] + }, + "INVALID_TYPE": { + "message": [ + "Argument `` should not be a ." + ] + }, + "INVALID_TYPENAME_CALL": { + "message": [ + "StructField does not have typeName. Use typeName on its type explicitly instead." + ] + }, + "INVALID_TYPE_DF_EQUALITY_ARG": { + "message": [ + "Expected type for `` but got type ." + ] + }, + "INVALID_UDF_EVAL_TYPE": { + "message": [ + "Eval type for UDF must be ." + ] + }, + "INVALID_UDTF_BOTH_RETURN_TYPE_AND_ANALYZE": { + "message": [ + "The UDTF '' is invalid. It has both its return type and an 'analyze' attribute. Please make it have one of either the return type or the 'analyze' static method in '' and try again." + ] + }, + "INVALID_UDTF_EVAL_TYPE": { + "message": [ + "The eval type for the UDTF '' is invalid. It must be one of ." + ] + }, + "INVALID_UDTF_HANDLER_TYPE": { + "message": [ + "The UDTF is invalid. The function handler must be a class, but got ''. Please provide a class as the function handler." + ] + }, + "INVALID_UDTF_NO_EVAL": { + "message": [ + "The UDTF '' is invalid. It does not implement the required 'eval' method. Please implement the 'eval' method in '' and try again." + ] + }, + "INVALID_UDTF_RETURN_TYPE": { + "message": [ + "The UDTF '' is invalid. It does not specify its return type or implement the required 'analyze' static method. Please specify the return type or implement the 'analyze' static method in '' and try again." + ] + }, + "INVALID_WHEN_USAGE": { + "message": [ + "when() can only be applied on a Column previously generated by when() function, and cannot be applied once otherwise() is applied." + ] + }, + "INVALID_WINDOW_BOUND_TYPE": { + "message": [ + "Invalid window bound type: ." + ] + }, + "JAVA_GATEWAY_EXITED": { + "message": [ + "Java gateway process exited before sending its port number." + ] + }, + "JVM_ATTRIBUTE_NOT_SUPPORTED": { + "message": [ + "Attribute `` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session. Visit https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession for creating regular Spark Session in detail." + ] + }, + "KEY_NOT_EXISTS": { + "message": [ + "Key `` is not exists." 
+ ] + }, + "KEY_VALUE_PAIR_REQUIRED": { + "message": [ + "Key-value pair or a list of pairs is required." + ] + }, + "LENGTH_SHOULD_BE_THE_SAME": { + "message": [ + " and should be of the same length, got and ." + ] + }, + "MALFORMED_VARIANT" : { + "message" : [ + "Variant binary is malformed. Please check the data source is valid." + ] + }, + "MASTER_URL_NOT_SET": { + "message": [ + "A master URL must be set in your configuration." + ] + }, + "MISSING_LIBRARY_FOR_PROFILER": { + "message": [ + "Install the 'memory_profiler' library in the cluster to enable memory profiling." + ] + }, + "MISSING_VALID_PLAN": { + "message": [ + "Argument to does not contain a valid plan." + ] + }, + "MIXED_TYPE_REPLACEMENT": { + "message": [ + "Mixed type replacements are not supported." + ] + }, + "NEGATIVE_VALUE": { + "message": [ + "Value for `` must be greater than or equal to 0, got ''." + ] + }, + "NOT_BOOL": { + "message": [ + "Argument `` should be a bool, got ." + ] + }, + "NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_LIST_OR_STR_OR_TUPLE": { + "message": [ + "Argument `` should be a bool, dict, float, int, str or tuple, got ." + ] + }, + "NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR": { + "message": [ + "Argument `` should be a bool, dict, float, int or str, got ." + ] + }, + "NOT_BOOL_OR_FLOAT_OR_INT": { + "message": [ + "Argument `` should be a bool, float or int, got ." + ] + }, + "NOT_BOOL_OR_FLOAT_OR_INT_OR_LIST_OR_NONE_OR_STR_OR_TUPLE": { + "message": [ + "Argument `` should be a bool, float, int, list, None, str or tuple, got ." + ] + }, + "NOT_BOOL_OR_FLOAT_OR_INT_OR_STR": { + "message": [ + "Argument `` should be a bool, float, int or str, got ." + ] + }, + "NOT_BOOL_OR_LIST": { + "message": [ + "Argument `` should be a bool or list, got ." + ] + }, + "NOT_BOOL_OR_STR": { + "message": [ + "Argument `` should be a bool or str, got ." + ] + }, + "NOT_CALLABLE": { + "message": [ + "Argument `` should be a callable, got ." + ] + }, + "NOT_COLUMN": { + "message": [ + "Argument `` should be a Column, got ." + ] + }, + "NOT_COLUMN_OR_DATATYPE_OR_STR": { + "message": [ + "Argument `` should be a Column, str or DataType, but got ." + ] + }, + "NOT_COLUMN_OR_FLOAT_OR_INT_OR_LIST_OR_STR": { + "message": [ + "Argument `` should be a Column, float, integer, list or string, got ." + ] + }, + "NOT_COLUMN_OR_INT": { + "message": [ + "Argument `` should be a Column or int, got ." + ] + }, + "NOT_COLUMN_OR_INT_OR_LIST_OR_STR_OR_TUPLE": { + "message": [ + "Argument `` should be a Column, int, list, str or tuple, got ." + ] + }, + "NOT_COLUMN_OR_INT_OR_STR": { + "message": [ + "Argument `` should be a Column, int or str, got ." + ] + }, + "NOT_COLUMN_OR_LIST_OR_STR": { + "message": [ + "Argument `` should be a Column, list or str, got ." + ] + }, + "NOT_COLUMN_OR_STR": { + "message": [ + "Argument `` should be a Column or str, got ." + ] + }, + "NOT_COLUMN_OR_STR_OR_STRUCT": { + "message": [ + "Argument `` should be a StructType, Column or str, got ." + ] + }, + "NOT_DATAFRAME": { + "message": [ + "Argument `` should be a DataFrame, got ." + ] + }, + "NOT_DATATYPE_OR_STR": { + "message": [ + "Argument `` should be a DataType or str, got ." + ] + }, + "NOT_DICT": { + "message": [ + "Argument `` should be a dict, got ." + ] + }, + "NOT_EXPRESSION": { + "message": [ + "Argument `` should be an Expression, got ." + ] + }, + "NOT_FLOAT_OR_INT": { + "message": [ + "Argument `` should be a float or int, got ." + ] + }, + "NOT_FLOAT_OR_INT_OR_LIST_OR_STR": { + "message": [ + "Argument `` should be a float, int, list or str, got ." 
+ ] + }, + "NOT_IMPLEMENTED": { + "message": [ + " is not implemented." + ] + }, + "NOT_INT": { + "message": [ + "Argument `` should be an int, got ." + ] + }, + "NOT_INT_OR_SLICE_OR_STR": { + "message": [ + "Argument `` should be an int, slice or str, got ." + ] + }, + "NOT_IN_BARRIER_STAGE": { + "message": [ + "It is not in a barrier stage." + ] + }, + "NOT_ITERABLE": { + "message": [ + " is not iterable." + ] + }, + "NOT_LIST": { + "message": [ + "Argument `` should be a list, got ." + ] + }, + "NOT_LIST_OF_COLUMN": { + "message": [ + "Argument `` should be a list[Column]." + ] + }, + "NOT_LIST_OF_COLUMN_OR_STR": { + "message": [ + "Argument `` should be a list[Column]." + ] + }, + "NOT_LIST_OF_FLOAT_OR_INT": { + "message": [ + "Argument `` should be a list[float, int], got ." + ] + }, + "NOT_LIST_OF_STR": { + "message": [ + "Argument `` should be a list[str], got ." + ] + }, + "NOT_LIST_OR_NONE_OR_STRUCT": { + "message": [ + "Argument `` should be a list, None or StructType, got ." + ] + }, + "NOT_LIST_OR_STR_OR_TUPLE": { + "message": [ + "Argument `` should be a list, str or tuple, got ." + ] + }, + "NOT_LIST_OR_TUPLE": { + "message": [ + "Argument `` should be a list or tuple, got ." + ] + }, + "NOT_NUMERIC_COLUMNS": { + "message": [ + "Numeric aggregation function can only be applied on numeric columns, got ." + ] + }, + "NOT_OBSERVATION_OR_STR": { + "message": [ + "Argument `` should be an Observation or str, got ." + ] + }, + "NOT_SAME_TYPE": { + "message": [ + "Argument `` and `` should be the same type, got and ." + ] + }, + "NOT_STR": { + "message": [ + "Argument `` should be a str, got ." + ] + }, + "NOT_STRUCT": { + "message": [ + "Argument `` should be a struct type, got ." + ] + }, + "NOT_STR_OR_LIST_OF_RDD": { + "message": [ + "Argument `` should be a str or list[RDD], got ." + ] + }, + "NOT_STR_OR_STRUCT": { + "message": [ + "Argument `` should be a str or struct type, got ." + ] + }, + "NOT_WINDOWSPEC": { + "message": [ + "Argument `` should be a WindowSpec, got ." + ] + }, + "NO_ACTIVE_EXCEPTION": { + "message": [ + "No active exception." + ] + }, + "NO_ACTIVE_OR_DEFAULT_SESSION": { + "message": [ + "No active or default Spark session found. Please create a new Spark session before running the code." + ] + }, + "NO_ACTIVE_SESSION": { + "message": [ + "No active Spark session found. Please create a new Spark session before running the code." + ] + }, + "NO_OBSERVE_BEFORE_GET": { + "message": [ + "Should observe by calling `DataFrame.observe` before `get`." + ] + }, + "NO_SCHEMA_AND_DRIVER_DEFAULT_SCHEME": { + "message": [ + "Only allows to be a path without scheme, and Spark Driver should use the default scheme to determine the destination file system." + ] + }, + "ONLY_ALLOWED_FOR_SINGLE_COLUMN": { + "message": [ + "Argument `` can only be provided for a single column." + ] + }, + "ONLY_ALLOW_SINGLE_TRIGGER": { + "message": [ + "Only a single trigger is allowed." + ] + }, + "ONLY_SUPPORTED_WITH_SPARK_CONNECT": { + "message": [ + " is only supported with Spark Connect; however, the current Spark session does not use Spark Connect." + ] + }, + "PACKAGE_NOT_INSTALLED": { + "message": [ + " >= must be installed; however, it was not found." + ] + }, + "PIPE_FUNCTION_EXITED": { + "message": [ + "Pipe function `` exited with error code ." + ] + }, + "PYTHON_HASH_SEED_NOT_SET": { + "message": [ + "Randomness of hash of string should be disabled via PYTHONHASHSEED." 
+ ] + }, + "PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR": { + "message": [ + "Failed when running Python streaming data source: " + ] + }, + "PYTHON_VERSION_MISMATCH": { + "message": [ + "Python in worker has different version: than that in driver: , PySpark cannot run with different minor versions.", + "Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set." + ] + }, + "RDD_TRANSFORM_ONLY_VALID_ON_DRIVER": { + "message": [ + "It appears that you are attempting to broadcast an RDD or reference an RDD from an ", + "action or transformation. RDD transformations and actions can only be invoked by the ", + "driver, not inside of other transformations; for example, ", + "rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values ", + "transformation and count action cannot be performed inside of the rdd1.map ", + "transformation. For more information, see SPARK-5063." + ] + }, + "READ_ONLY": { + "message": [ + " is read-only." + ] + }, + "RESPONSE_ALREADY_RECEIVED": { + "message": [ + "OPERATION_NOT_FOUND on the server but responses were already received from it." + ] + }, + "RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDF": { + "message": [ + "Column names of the returned pyarrow.Table do not match specified schema." + ] + }, + "RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF": { + "message": [ + "Column names of the returned pandas.DataFrame do not match specified schema." + ] + }, + "RESULT_LENGTH_MISMATCH_FOR_PANDAS_UDF": { + "message": [ + "Number of columns of the returned pandas.DataFrame doesn't match specified schema. Expected: Actual: " + ] + }, + "RESULT_LENGTH_MISMATCH_FOR_SCALAR_ITER_PANDAS_UDF": { + "message": [ + "The length of output in Scalar iterator pandas UDF should be the same with the input's; however, the length of output was and the length of input was ." + ] + }, + "RESULT_TYPE_MISMATCH_FOR_ARROW_UDF": { + "message": [ + "Columns do not match in their data type: ." + ] + }, + "RETRIES_EXCEEDED": { + "message": [ + "The maximum number of retries has been exceeded." + ] + }, + "REUSE_OBSERVATION": { + "message": [ + "An Observation can be used with a DataFrame only once." + ] + }, + "SCHEMA_MISMATCH_FOR_PANDAS_UDF": { + "message": [ + "Result vector from pandas_udf was not the required length: expected , got ." + ] + }, + "SESSION_ALREADY_EXIST": { + "message": [ + "Cannot start a remote Spark session because there is a regular Spark session already running." + ] + }, + "SESSION_NEED_CONN_STR_OR_BUILDER": { + "message": [ + "Needs either connection string or channelBuilder (mutually exclusive) to create a new SparkSession." + ] + }, + "SESSION_NOT_SAME": { + "message": [ + "Both Datasets must belong to the same SparkSession." + ] + }, + "SESSION_OR_CONTEXT_EXISTS": { + "message": [ + "There should not be an existing Spark Session or Spark Context." + ] + }, + "SESSION_OR_CONTEXT_NOT_EXISTS": { + "message": [ + "SparkContext or SparkSession should be created first." + ] + }, + "SLICE_WITH_STEP": { + "message": [ + "Slice with step is not supported." + ] + }, + "STATE_NOT_EXISTS": { + "message": [ + "State is either not defined or has already been removed." + ] + }, + "STOP_ITERATION_OCCURRED": { + "message": [ + "Caught StopIteration thrown from user's code; failing the task: " + ] + }, + "STOP_ITERATION_OCCURRED_FROM_SCALAR_ITER_PANDAS_UDF": { + "message": [ + "pandas iterator UDF should exhaust the input iterator." + ] + }, + "STREAMING_CONNECT_SERIALIZATION_ERROR": { + "message": [ + "Cannot serialize the function ``. 
If you accessed the Spark session, or a DataFrame defined outside of the function, or any object that contains a Spark session, please be aware that they are not allowed in Spark Connect. For `foreachBatch`, please access the Spark session using `df.sparkSession`, where `df` is the first parameter in your `foreachBatch` function. For `StreamingQueryListener`, please access the Spark session using `self.spark`. For details please check out the PySpark doc for `foreachBatch` and `StreamingQueryListener`." + ] + }, + "TEST_CLASS_NOT_COMPILED": { + "message": [ + " doesn't exist. Spark sql test classes are not compiled." + ] + }, + "TOO_MANY_VALUES": { + "message": [ + "Expected values for ``, got ." + ] + }, + "TYPE_HINT_SHOULD_BE_SPECIFIED": { + "message": [ + "Type hints for should be specified; however, got ." + ] + }, + "UDF_RETURN_TYPE": { + "message": [ + "Return type of the user-defined function should be , but is ." + ] + }, + "UDTF_ARROW_TYPE_CAST_ERROR": { + "message": [ + "Cannot convert the output value of the column '' with type '' to the specified return type of the column: ''. Please check if the data types match and try again." + ] + }, + "UDTF_CONSTRUCTOR_INVALID_IMPLEMENTS_ANALYZE_METHOD": { + "message": [ + "Failed to evaluate the user-defined table function '' because its constructor is invalid: the function implements the 'analyze' method, but its constructor has more than two arguments (including the 'self' reference). Please update the table function so that its constructor accepts exactly one 'self' argument, or one 'self' argument plus another argument for the result of the 'analyze' method, and try the query again." + ] + }, + "UDTF_CONSTRUCTOR_INVALID_NO_ANALYZE_METHOD": { + "message": [ + "Failed to evaluate the user-defined table function '' because its constructor is invalid: the function does not implement the 'analyze' method, and its constructor has more than one argument (including the 'self' reference). Please update the table function so that its constructor accepts exactly one 'self' argument, and try the query again." + ] + }, + "UDTF_EVAL_METHOD_ARGUMENTS_DO_NOT_MATCH_SIGNATURE": { + "message": [ + "Failed to evaluate the user-defined table function '' because the function arguments did not match the expected signature of the 'eval' method (). Please update the query so that this table function call provides arguments matching the expected signature, or else update the table function so that its 'eval' method accepts the provided arguments, and then try the query again." + ] + }, + "UDTF_EXEC_ERROR": { + "message": [ + "User defined table function encountered an error in the '' method: " + ] + }, + "UDTF_INVALID_OUTPUT_ROW_TYPE": { + "message": [ + "The type of an individual output row in the '' method of the UDTF is invalid. Each row should be a tuple, list, or dict, but got ''. Please make sure that the output rows are of the correct type." + ] + }, + "UDTF_RETURN_NOT_ITERABLE": { + "message": [ + "The return value of the '' method of the UDTF is invalid. It should be an iterable (e.g., generator or list), but got ''. Please make sure that the UDTF returns one of these types." + ] + }, + "UDTF_RETURN_SCHEMA_MISMATCH": { + "message": [ + "The number of columns in the result does not match the specified schema. Expected column count: , Actual column count: . Please make sure the values returned by the '' method have the same number of columns as specified in the output schema." 
+ ] + }, + "UDTF_RETURN_TYPE_MISMATCH": { + "message": [ + "Mismatch in return type for the UDTF ''. Expected a 'StructType', but got ''. Please ensure the return type is a correctly formatted StructType." + ] + }, + "UDTF_SERIALIZATION_ERROR": { + "message": [ + "Cannot serialize the UDTF '': " + ] + }, + "UNEXPECTED_RESPONSE_FROM_SERVER": { + "message": [ + "Unexpected response from iterator server." + ] + }, + "UNEXPECTED_TUPLE_WITH_STRUCT": { + "message": [ + "Unexpected tuple with StructType." + ] + }, + "UNKNOWN_EXPLAIN_MODE": { + "message": [ + "Unknown explain mode: ''. Accepted explain modes are 'simple', 'extended', 'codegen', 'cost', 'formatted'." + ] + }, + "UNKNOWN_INTERRUPT_TYPE": { + "message": [ + "Unknown interrupt type: ''. Accepted interrupt types are 'all'." + ] + }, + "UNKNOWN_RESPONSE": { + "message": [ + "Unknown response: ." + ] + }, + "UNKNOWN_VALUE_FOR": { + "message": [ + "Unknown value for ``." + ] + }, + "UNSUPPORTED_DATA_TYPE": { + "message": [ + "Unsupported DataType ``." + ] + }, + "UNSUPPORTED_DATA_TYPE_FOR_ARROW": { + "message": [ + "Single data type is not supported with Arrow." + ] + }, + "UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION": { + "message": [ + " is not supported in conversion to Arrow." + ] + }, + "UNSUPPORTED_DATA_TYPE_FOR_ARROW_VERSION": { + "message": [ + " is only supported with pyarrow 2.0.0 and above." + ] + }, + "UNSUPPORTED_JOIN_TYPE": { + "message": [ + "Unsupported join type: . Supported join types include: 'inner', 'outer', 'full', 'fullouter', 'full_outer', 'leftouter', 'left', 'left_outer', 'rightouter', 'right', 'right_outer', 'leftsemi', 'left_semi', 'semi', 'leftanti', 'left_anti', 'anti', 'cross'." + ] + }, + "UNSUPPORTED_LITERAL": { + "message": [ + "Unsupported Literal ''." + ] + }, + "UNSUPPORTED_LOCAL_CONNECTION_STRING": { + "message": [ + "Creating new SparkSessions with `local` connection string is not supported." + ] + }, + "UNSUPPORTED_NUMPY_ARRAY_SCALAR": { + "message": [ + "The type of array scalar '' is not supported." + ] + }, + "UNSUPPORTED_OPERATION": { + "message": [ + " is not supported." + ] + }, + "UNSUPPORTED_PACKAGE_VERSION": { + "message": [ + " >= must be installed; however, your version is ." + ] + }, + "UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION": { + "message": [ + "Function `` should use only POSITIONAL or POSITIONAL OR KEYWORD arguments." + ] + }, + "UNSUPPORTED_SIGNATURE": { + "message": [ + "Unsupported signature: ." + ] + }, + "UNSUPPORTED_WITH_ARROW_OPTIMIZATION": { + "message": [ + " is not supported with Arrow optimization enabled in Python UDFs. Disable 'spark.sql.execution.pythonUDF.arrow.enabled' to workaround." + ] + }, + "VALUE_ALLOWED": { + "message": [ + "Value for `` does not allow ." + ] + }, + "VALUE_NOT_ACCESSIBLE": { + "message": [ + "Value `` cannot be accessed inside tasks." + ] + }, + "VALUE_NOT_ALLOWED": { + "message": [ + "Value for `` has to be amongst the following values: ." + ] + }, + "VALUE_NOT_ANY_OR_ALL": { + "message": [ + "Value for `` must be 'any' or 'all', got ''." + ] + }, + "VALUE_NOT_BETWEEN": { + "message": [ + "Value for `` must be between and ." + ] + }, + "VALUE_NOT_NON_EMPTY_STR": { + "message": [ + "Value for `` must be a non-empty string, got ''." + ] + }, + "VALUE_NOT_PEARSON": { + "message": [ + "Value for `` only supports the 'pearson', got ''." + ] + }, + "VALUE_NOT_PLAIN_COLUMN_REFERENCE": { + "message": [ + "Value `` in `` should be a plain column reference such as `df.col` or `col('column')`." 
+ ] + }, + "VALUE_NOT_POSITIVE": { + "message": [ + "Value for `` must be positive, got ''." + ] + }, + "VALUE_NOT_TRUE": { + "message": [ + "Value for `` must be True, got ''." + ] + }, + "VALUE_OUT_OF_BOUNDS": { + "message": [ + "Value for `` must be between and (inclusive), got " + ] + }, + "WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION": { + "message": [ + "Function `` should take between 1 and 3 arguments, but the provided function takes ." + ] + }, + "WRONG_NUM_COLUMNS": { + "message": [ + "Function `` should take at least columns." + ] + }, + "ZERO_INDEX": { + "message": [ + "Index must be non-zero." + ] + } +} diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py index 6b7f19b449185..30869a3fbb2d2 100644 --- a/python/pyspark/errors/error_classes.py +++ b/python/pyspark/errors/error_classes.py @@ -15,1160 +15,17 @@ # limitations under the License. # -# NOTE: Automatically sort this file via -# - cd $SPARK_HOME -# - bin/pyspark -# - from pyspark.errors.exceptions import _write_self; _write_self() import json +import importlib.resources - -ERROR_CLASSES_JSON = ''' -{ - "APPLICATION_NAME_NOT_SET": { - "message": [ - "An application name must be set in your configuration." - ] - }, - "ARGUMENT_REQUIRED": { - "message": [ - "Argument `` is required when ." - ] - }, - "ARROW_LEGACY_IPC_FORMAT": { - "message": [ - "Arrow legacy IPC format is not supported in PySpark, please unset ARROW_PRE_0_15_IPC_FORMAT." - ] - }, - "ATTRIBUTE_NOT_CALLABLE": { - "message": [ - "Attribute `` in provided object `` is not callable." - ] - }, - "ATTRIBUTE_NOT_SUPPORTED": { - "message": [ - "Attribute `` is not supported." - ] - }, - "AXIS_LENGTH_MISMATCH": { - "message": [ - "Length mismatch: Expected axis has element, new values have elements." - ] - }, - "BROADCAST_VARIABLE_NOT_LOADED": { - "message": [ - "Broadcast variable `` not loaded." - ] - }, - "CALL_BEFORE_INITIALIZE": { - "message": [ - "Not supported to call `` before initialize ." - ] - }, - "CANNOT_ACCEPT_OBJECT_IN_TYPE": { - "message": [ - "`` can not accept object `` in type ``." - ] - }, - "CANNOT_ACCESS_TO_DUNDER": { - "message": [ - "Dunder(double underscore) attribute is for internal use only." - ] - }, - "CANNOT_APPLY_IN_FOR_COLUMN": { - "message": [ - "Cannot apply 'in' operator against a column: please use 'contains' in a string column or 'array_contains' function for an array column." - ] - }, - "CANNOT_BE_EMPTY": { - "message": [ - "At least one must be specified." - ] - }, - "CANNOT_BE_NONE": { - "message": [ - "Argument `` cannot be None." - ] - }, - "CANNOT_CONFIGURE_SPARK_CONNECT": { - "message": [ - "Spark Connect server cannot be configured: Existing [], New []." - ] - }, - "CANNOT_CONFIGURE_SPARK_CONNECT_MASTER": { - "message": [ - "Spark Connect server and Spark master cannot be configured together: Spark master [], Spark Connect []." - ] - }, - "CANNOT_CONVERT_COLUMN_INTO_BOOL": { - "message": [ - "Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions." - ] - }, - "CANNOT_CONVERT_TYPE": { - "message": [ - "Cannot convert into ." - ] - }, - "CANNOT_DETERMINE_TYPE": { - "message": [ - "Some of types cannot be determined after inferring." - ] - }, - "CANNOT_GET_BATCH_ID": { - "message": [ - "Could not get batch id from ." - ] - }, - "CANNOT_INFER_ARRAY_TYPE": { - "message": [ - "Can not infer Array Type from a list with None as the first element." 
- ] - }, - "CANNOT_INFER_EMPTY_SCHEMA": { - "message": [ - "Can not infer schema from an empty dataset." - ] - }, - "CANNOT_INFER_SCHEMA_FOR_TYPE": { - "message": [ - "Can not infer schema for type: ``." - ] - }, - "CANNOT_INFER_TYPE_FOR_FIELD": { - "message": [ - "Unable to infer the type of the field ``." - ] - }, - "CANNOT_MERGE_TYPE": { - "message": [ - "Can not merge type `` and ``." - ] - }, - "CANNOT_OPEN_SOCKET": { - "message": [ - "Can not open socket: ." - ] - }, - "CANNOT_PARSE_DATATYPE": { - "message": [ - "Unable to parse datatype. ." - ] - }, - "CANNOT_PROVIDE_METADATA": { - "message": [ - "Metadata can only be provided for a single column." - ] - }, - "CANNOT_SET_TOGETHER": { - "message": [ - " should not be set together." - ] - }, - "CANNOT_SPECIFY_RETURN_TYPE_FOR_UDF": { - "message": [ - "returnType can not be specified when `` is a user-defined function, but got ." - ] - }, - "CANNOT_WITHOUT": { - "message": [ - "Cannot without ." - ] - }, - "COLUMN_IN_LIST": { - "message": [ - "`` does not allow a Column in a list." - ] - }, - "CONNECT_URL_ALREADY_DEFINED": { - "message": [ - "Only one Spark Connect client URL can be set; however, got a different URL [] from the existing []." - ] - }, - "CONNECT_URL_NOT_SET": { - "message": [ - "Cannot create a Spark Connect session because the Spark Connect remote URL has not been set. Please define the remote URL by setting either the 'spark.remote' option or the 'SPARK_REMOTE' environment variable." - ] - }, - "CONTEXT_ONLY_VALID_ON_DRIVER": { - "message": [ - "It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063." - ] - }, - "CONTEXT_UNAVAILABLE_FOR_REMOTE_CLIENT": { - "message": [ - "Remote client cannot create a SparkContext. Create SparkSession instead." - ] - }, - "DATA_SOURCE_CREATE_ERROR": { - "message": [ - "Failed to create python data source instance, error: ." - ] - }, - "DATA_SOURCE_INVALID_RETURN_TYPE": { - "message": [ - "Unsupported return type ('') from Python data source ''. Expected types: ." - ] - }, - "DATA_SOURCE_RETURN_SCHEMA_MISMATCH": { - "message": [ - "Return schema mismatch in the result from 'read' method. Expected: columns, Found: columns. Make sure the returned values match the required output schema." - ] - }, - "DATA_SOURCE_TYPE_MISMATCH": { - "message": [ - "Expected , but got ." 
- ] - }, - "DIFFERENT_PANDAS_DATAFRAME": { - "message": [ - "DataFrames are not almost equal:", - "Left:", - "", - "", - "Right:", - "", - "" - ] - }, - "DIFFERENT_PANDAS_INDEX": { - "message": [ - "Indices are not almost equal:", - "Left:", - "", - "", - "Right:", - "", - "" - ] - }, - "DIFFERENT_PANDAS_MULTIINDEX": { - "message": [ - "MultiIndices are not almost equal:", - "Left:", - "", - "", - "Right:", - "", - "" - ] - }, - "DIFFERENT_PANDAS_SERIES": { - "message": [ - "Series are not almost equal:", - "Left:", - "", - "", - "Right:", - "", - "" - ] - }, - "DIFFERENT_ROWS": { - "message": [ - "" - ] - }, - "DIFFERENT_SCHEMA": { - "message": [ - "Schemas do not match.", - "--- actual", - "+++ expected", - "" - ] - }, - "DISALLOWED_TYPE_FOR_CONTAINER": { - "message": [ - "Argument ``(type: ) should only contain a type in [], got " - ] - }, - "DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT": { - "message": [ - "Duplicated field names in Arrow Struct are not allowed, got " - ] - }, - "ERROR_OCCURRED_WHILE_CALLING": { - "message": [ - "An error occurred while calling : ." - ] - }, - "FIELD_DATA_TYPE_UNACCEPTABLE": { - "message": [ - " can not accept object in type ." - ] - }, - "FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME": { - "message": [ - ": can not accept object in type ." - ] - }, - "FIELD_NOT_NULLABLE": { - "message": [ - "Field is not nullable, but got None." - ] - }, - "FIELD_NOT_NULLABLE_WITH_NAME": { - "message": [ - ": This field is not nullable, but got None." - ] - }, - "FIELD_STRUCT_LENGTH_MISMATCH": { - "message": [ - "Length of object () does not match with length of fields ()." - ] - }, - "FIELD_STRUCT_LENGTH_MISMATCH_WITH_NAME": { - "message": [ - ": Length of object () does not match with length of fields ()." - ] - }, - "FIELD_TYPE_MISMATCH": { - "message": [ - " is not an instance of type ." - ] - }, - "FIELD_TYPE_MISMATCH_WITH_NAME": { - "message": [ - ": is not an instance of type ." - ] - }, - "HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN": { - "message": [ - "Function `` should return Column, got ." - ] - }, - "INCORRECT_CONF_FOR_PROFILE": { - "message": [ - "`spark.python.profile` or `spark.python.profile.memory` configuration", - " must be set to `true` to enable Python profile." - ] - }, - "INDEX_NOT_POSITIVE": { - "message": [ - "Index must be positive, got ''." - ] - }, - "INDEX_OUT_OF_RANGE": { - "message": [ - " index out of range, got ''." - ] - }, - "INVALID_ARROW_UDTF_RETURN_TYPE": { - "message": [ - "The return type of the arrow-optimized Python UDTF should be of type 'pandas.DataFrame', but the '' method returned a value of type with value: ." - ] - }, - "INVALID_BROADCAST_OPERATION": { - "message": [ - "Broadcast can only be in driver." - ] - }, - "INVALID_CALL_ON_UNRESOLVED_OBJECT": { - "message": [ - "Invalid call to `` on unresolved object." - ] - }, - "INVALID_CONNECT_URL": { - "message": [ - "Invalid URL for Spark Connect: " - ] - }, - "INVALID_INTERVAL_CASTING": { - "message": [ - "Interval to is invalid." - ] - }, - "INVALID_ITEM_FOR_CONTAINER": { - "message": [ - "All items in `` should be in , got ." - ] - }, - "INVALID_MULTIPLE_ARGUMENT_CONDITIONS": { - "message": [ - "[{arg_names}] cannot be ." - ] - }, - "INVALID_NDARRAY_DIMENSION": { - "message": [ - "NumPy array input should be of dimensions." - ] - }, - "INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP": { - "message": [ - "Invalid number of dataframes in group ." 
- ] - }, - "INVALID_PANDAS_UDF": { - "message": [ - "Invalid function: " - ] - }, - "INVALID_PANDAS_UDF_TYPE": { - "message": [ - "`` should be one of the values from PandasUDFType, got " - ] - }, - "INVALID_RETURN_TYPE_FOR_ARROW_UDF": { - "message": [ - "Grouped and Cogrouped map Arrow UDF should return StructType for , got ." - ] - }, - "INVALID_RETURN_TYPE_FOR_PANDAS_UDF": { - "message": [ - "Pandas UDF should return StructType for , got ." - ] - }, - "INVALID_SESSION_UUID_ID": { - "message": [ - "Parameter value must be a valid UUID format: " - ] - }, - "INVALID_TIMEOUT_TIMESTAMP": { - "message": [ - "Timeout timestamp () cannot be earlier than the current watermark ()." - ] - }, - "INVALID_TYPE": { - "message": [ - "Argument `` should not be a ." - ] - }, - "INVALID_TYPENAME_CALL": { - "message": [ - "StructField does not have typeName. Use typeName on its type explicitly instead." - ] - }, - "INVALID_TYPE_DF_EQUALITY_ARG": { - "message": [ - "Expected type for `` but got type ." - ] - }, - "INVALID_UDF_EVAL_TYPE": { - "message": [ - "Eval type for UDF must be ." - ] - }, - "INVALID_UDTF_BOTH_RETURN_TYPE_AND_ANALYZE": { - "message": [ - "The UDTF '' is invalid. It has both its return type and an 'analyze' attribute. Please make it have one of either the return type or the 'analyze' static method in '' and try again." - ] - }, - "INVALID_UDTF_EVAL_TYPE": { - "message": [ - "The eval type for the UDTF '' is invalid. It must be one of ." - ] - }, - "INVALID_UDTF_HANDLER_TYPE": { - "message": [ - "The UDTF is invalid. The function handler must be a class, but got ''. Please provide a class as the function handler." - ] - }, - "INVALID_UDTF_NO_EVAL": { - "message": [ - "The UDTF '' is invalid. It does not implement the required 'eval' method. Please implement the 'eval' method in '' and try again." - ] - }, - "INVALID_UDTF_RETURN_TYPE": { - "message": [ - "The UDTF '' is invalid. It does not specify its return type or implement the required 'analyze' static method. Please specify the return type or implement the 'analyze' static method in '' and try again." - ] - }, - "INVALID_WHEN_USAGE": { - "message": [ - "when() can only be applied on a Column previously generated by when() function, and cannot be applied once otherwise() is applied." - ] - }, - "INVALID_WINDOW_BOUND_TYPE": { - "message": [ - "Invalid window bound type: ." - ] - }, - "JAVA_GATEWAY_EXITED": { - "message": [ - "Java gateway process exited before sending its port number." - ] - }, - "JVM_ATTRIBUTE_NOT_SUPPORTED": { - "message": [ - "Attribute `` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session. Visit https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession for creating regular Spark Session in detail." - ] - }, - "KEY_NOT_EXISTS": { - "message": [ - "Key `` is not exists." - ] - }, - "KEY_VALUE_PAIR_REQUIRED": { - "message": [ - "Key-value pair or a list of pairs is required." - ] - }, - "LENGTH_SHOULD_BE_THE_SAME": { - "message": [ - " and should be of the same length, got and ." - ] - }, - "MASTER_URL_NOT_SET": { - "message": [ - "A master URL must be set in your configuration." - ] - }, - "MISSING_LIBRARY_FOR_PROFILER": { - "message": [ - "Install the 'memory_profiler' library in the cluster to enable memory profiling." - ] - }, - "MISSING_VALID_PLAN": { - "message": [ - "Argument to does not contain a valid plan." 
- ] - }, - "MIXED_TYPE_REPLACEMENT": { - "message": [ - "Mixed type replacements are not supported." - ] - }, - "NEGATIVE_VALUE": { - "message": [ - "Value for `` must be greater than or equal to 0, got ''." - ] - }, - "NOT_BOOL": { - "message": [ - "Argument `` should be a bool, got ." - ] - }, - "NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_LIST_OR_STR_OR_TUPLE": { - "message": [ - "Argument `` should be a bool, dict, float, int, str or tuple, got ." - ] - }, - "NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR": { - "message": [ - "Argument `` should be a bool, dict, float, int or str, got ." - ] - }, - "NOT_BOOL_OR_FLOAT_OR_INT": { - "message": [ - "Argument `` should be a bool, float or int, got ." - ] - }, - "NOT_BOOL_OR_FLOAT_OR_INT_OR_LIST_OR_NONE_OR_STR_OR_TUPLE": { - "message": [ - "Argument `` should be a bool, float, int, list, None, str or tuple, got ." - ] - }, - "NOT_BOOL_OR_FLOAT_OR_INT_OR_STR": { - "message": [ - "Argument `` should be a bool, float, int or str, got ." - ] - }, - "NOT_BOOL_OR_LIST": { - "message": [ - "Argument `` should be a bool or list, got ." - ] - }, - "NOT_BOOL_OR_STR": { - "message": [ - "Argument `` should be a bool or str, got ." - ] - }, - "NOT_CALLABLE": { - "message": [ - "Argument `` should be a callable, got ." - ] - }, - "NOT_COLUMN": { - "message": [ - "Argument `` should be a Column, got ." - ] - }, - "NOT_COLUMN_OR_DATATYPE_OR_STR": { - "message": [ - "Argument `` should be a Column, str or DataType, but got ." - ] - }, - "NOT_COLUMN_OR_FLOAT_OR_INT_OR_LIST_OR_STR": { - "message": [ - "Argument `` should be a Column, float, integer, list or string, got ." - ] - }, - "NOT_COLUMN_OR_INT": { - "message": [ - "Argument `` should be a Column or int, got ." - ] - }, - "NOT_COLUMN_OR_INT_OR_LIST_OR_STR_OR_TUPLE": { - "message": [ - "Argument `` should be a Column, int, list, str or tuple, got ." - ] - }, - "NOT_COLUMN_OR_INT_OR_STR": { - "message": [ - "Argument `` should be a Column, int or str, got ." - ] - }, - "NOT_COLUMN_OR_LIST_OR_STR": { - "message": [ - "Argument `` should be a Column, list or str, got ." - ] - }, - "NOT_COLUMN_OR_STR": { - "message": [ - "Argument `` should be a Column or str, got ." - ] - }, - "NOT_COLUMN_OR_STR_OR_STRUCT": { - "message": [ - "Argument `` should be a StructType, Column or str, got ." - ] - }, - "NOT_DATAFRAME": { - "message": [ - "Argument `` should be a DataFrame, got ." - ] - }, - "NOT_DATATYPE_OR_STR": { - "message": [ - "Argument `` should be a DataType or str, got ." - ] - }, - "NOT_DICT": { - "message": [ - "Argument `` should be a dict, got ." - ] - }, - "NOT_EXPRESSION": { - "message": [ - "Argument `` should be an Expression, got ." - ] - }, - "NOT_FLOAT_OR_INT": { - "message": [ - "Argument `` should be a float or int, got ." - ] - }, - "NOT_FLOAT_OR_INT_OR_LIST_OR_STR": { - "message": [ - "Argument `` should be a float, int, list or str, got ." - ] - }, - "NOT_IMPLEMENTED": { - "message": [ - " is not implemented." - ] - }, - "NOT_INT": { - "message": [ - "Argument `` should be an int, got ." - ] - }, - "NOT_INT_OR_SLICE_OR_STR": { - "message": [ - "Argument `` should be an int, slice or str, got ." - ] - }, - "NOT_IN_BARRIER_STAGE": { - "message": [ - "It is not in a barrier stage." - ] - }, - "NOT_ITERABLE": { - "message": [ - " is not iterable." - ] - }, - "NOT_LIST": { - "message": [ - "Argument `` should be a list, got ." - ] - }, - "NOT_LIST_OF_COLUMN": { - "message": [ - "Argument `` should be a list[Column]." 
- ] - }, - "NOT_LIST_OF_COLUMN_OR_STR": { - "message": [ - "Argument `` should be a list[Column]." - ] - }, - "NOT_LIST_OF_FLOAT_OR_INT": { - "message": [ - "Argument `` should be a list[float, int], got ." - ] - }, - "NOT_LIST_OF_STR": { - "message": [ - "Argument `` should be a list[str], got ." - ] - }, - "NOT_LIST_OR_NONE_OR_STRUCT": { - "message": [ - "Argument `` should be a list, None or StructType, got ." - ] - }, - "NOT_LIST_OR_STR_OR_TUPLE": { - "message": [ - "Argument `` should be a list, str or tuple, got ." - ] - }, - "NOT_LIST_OR_TUPLE": { - "message": [ - "Argument `` should be a list or tuple, got ." - ] - }, - "NOT_NUMERIC_COLUMNS": { - "message": [ - "Numeric aggregation function can only be applied on numeric columns, got ." - ] - }, - "NOT_OBSERVATION_OR_STR": { - "message": [ - "Argument `` should be an Observation or str, got ." - ] - }, - "NOT_SAME_TYPE": { - "message": [ - "Argument `` and `` should be the same type, got and ." - ] - }, - "NOT_STR": { - "message": [ - "Argument `` should be a str, got ." - ] - }, - "NOT_STRUCT": { - "message": [ - "Argument `` should be a struct type, got ." - ] - }, - "NOT_STR_OR_LIST_OF_RDD": { - "message": [ - "Argument `` should be a str or list[RDD], got ." - ] - }, - "NOT_STR_OR_STRUCT": { - "message": [ - "Argument `` should be a str or struct type, got ." - ] - }, - "NOT_WINDOWSPEC": { - "message": [ - "Argument `` should be a WindowSpec, got ." - ] - }, - "NO_ACTIVE_EXCEPTION": { - "message": [ - "No active exception." - ] - }, - "NO_ACTIVE_OR_DEFAULT_SESSION": { - "message": [ - "No active or default Spark session found. Please create a new Spark session before running the code." - ] - }, - "NO_ACTIVE_SESSION": { - "message": [ - "No active Spark session found. Please create a new Spark session before running the code." - ] - }, - "NO_OBSERVE_BEFORE_GET": { - "message": [ - "Should observe by calling `DataFrame.observe` before `get`." - ] - }, - "NO_SCHEMA_AND_DRIVER_DEFAULT_SCHEME": { - "message": [ - "Only allows to be a path without scheme, and Spark Driver should use the default scheme to determine the destination file system." - ] - }, - "ONLY_ALLOWED_FOR_SINGLE_COLUMN": { - "message": [ - "Argument `` can only be provided for a single column." - ] - }, - "ONLY_ALLOW_SINGLE_TRIGGER": { - "message": [ - "Only a single trigger is allowed." - ] - }, - "ONLY_SUPPORTED_WITH_SPARK_CONNECT": { - "message": [ - " is only supported with Spark Connect; however, the current Spark session does not use Spark Connect." - ] - }, - "PACKAGE_NOT_INSTALLED": { - "message": [ - " >= must be installed; however, it was not found." - ] - }, - "PIPE_FUNCTION_EXITED": { - "message": [ - "Pipe function `` exited with error code ." - ] - }, - "PYTHON_HASH_SEED_NOT_SET": { - "message": [ - "Randomness of hash of string should be disabled via PYTHONHASHSEED." - ] - }, - "PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR": { - "message": [ - "Failed when running Python streaming data source: " - ] - }, - "PYTHON_VERSION_MISMATCH": { - "message": [ - "Python in worker has different version: than that in driver: , PySpark cannot run with different minor versions.", - "Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set." - ] - }, - "RDD_TRANSFORM_ONLY_VALID_ON_DRIVER": { - "message": [ - "It appears that you are attempting to broadcast an RDD or reference an RDD from an ", - "action or transformation. 
RDD transformations and actions can only be invoked by the ", - "driver, not inside of other transformations; for example, ", - "rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values ", - "transformation and count action cannot be performed inside of the rdd1.map ", - "transformation. For more information, see SPARK-5063." - ] - }, - "READ_ONLY": { - "message": [ - " is read-only." - ] - }, - "RESPONSE_ALREADY_RECEIVED": { - "message": [ - "OPERATION_NOT_FOUND on the server but responses were already received from it." - ] - }, - "RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDF": { - "message": [ - "Column names of the returned pyarrow.Table do not match specified schema." - ] - }, - "RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF": { - "message": [ - "Column names of the returned pandas.DataFrame do not match specified schema." - ] - }, - "RESULT_LENGTH_MISMATCH_FOR_PANDAS_UDF": { - "message": [ - "Number of columns of the returned pandas.DataFrame doesn't match specified schema. Expected: Actual: " - ] - }, - "RESULT_LENGTH_MISMATCH_FOR_SCALAR_ITER_PANDAS_UDF": { - "message": [ - "The length of output in Scalar iterator pandas UDF should be the same with the input's; however, the length of output was and the length of input was ." - ] - }, - "RESULT_TYPE_MISMATCH_FOR_ARROW_UDF": { - "message": [ - "Columns do not match in their data type: ." - ] - }, - "RETRIES_EXCEEDED": { - "message": [ - "The maximum number of retries has been exceeded." - ] - }, - "REUSE_OBSERVATION": { - "message": [ - "An Observation can be used with a DataFrame only once." - ] - }, - "SCHEMA_MISMATCH_FOR_PANDAS_UDF": { - "message": [ - "Result vector from pandas_udf was not the required length: expected , got ." - ] - }, - "SESSION_ALREADY_EXIST": { - "message": [ - "Cannot start a remote Spark session because there is a regular Spark session already running." - ] - }, - "SESSION_NEED_CONN_STR_OR_BUILDER": { - "message": [ - "Needs either connection string or channelBuilder (mutually exclusive) to create a new SparkSession." - ] - }, - "SESSION_NOT_SAME": { - "message": [ - "Both Datasets must belong to the same SparkSession." - ] - }, - "SESSION_OR_CONTEXT_EXISTS": { - "message": [ - "There should not be an existing Spark Session or Spark Context." - ] - }, - "SESSION_OR_CONTEXT_NOT_EXISTS": { - "message": [ - "SparkContext or SparkSession should be created first." - ] - }, - "SLICE_WITH_STEP": { - "message": [ - "Slice with step is not supported." - ] - }, - "STATE_NOT_EXISTS": { - "message": [ - "State is either not defined or has already been removed." - ] - }, - "STOP_ITERATION_OCCURRED": { - "message": [ - "Caught StopIteration thrown from user's code; failing the task: " - ] - }, - "STOP_ITERATION_OCCURRED_FROM_SCALAR_ITER_PANDAS_UDF": { - "message": [ - "pandas iterator UDF should exhaust the input iterator." - ] - }, - "STREAMING_CONNECT_SERIALIZATION_ERROR": { - "message": [ - "Cannot serialize the function ``. If you accessed the Spark session, or a DataFrame defined outside of the function, or any object that contains a Spark session, please be aware that they are not allowed in Spark Connect. For `foreachBatch`, please access the Spark session using `df.sparkSession`, where `df` is the first parameter in your `foreachBatch` function. For `StreamingQueryListener`, please access the Spark session using `self.spark`. For details please check out the PySpark doc for `foreachBatch` and `StreamingQueryListener`." - ] - }, - "TEST_CLASS_NOT_COMPILED": { - "message": [ - " doesn't exist. 
Spark sql test classes are not compiled." - ] - }, - "TOO_MANY_VALUES": { - "message": [ - "Expected values for ``, got ." - ] - }, - "TYPE_HINT_SHOULD_BE_SPECIFIED": { - "message": [ - "Type hints for should be specified; however, got ." - ] - }, - "UDF_RETURN_TYPE": { - "message": [ - "Return type of the user-defined function should be , but is ." - ] - }, - "UDTF_ARROW_TYPE_CAST_ERROR": { - "message": [ - "Cannot convert the output value of the column '' with type '' to the specified return type of the column: ''. Please check if the data types match and try again." - ] - }, - "UDTF_CONSTRUCTOR_INVALID_IMPLEMENTS_ANALYZE_METHOD": { - "message": [ - "Failed to evaluate the user-defined table function '' because its constructor is invalid: the function implements the 'analyze' method, but its constructor has more than two arguments (including the 'self' reference). Please update the table function so that its constructor accepts exactly one 'self' argument, or one 'self' argument plus another argument for the result of the 'analyze' method, and try the query again." - ] - }, - "UDTF_CONSTRUCTOR_INVALID_NO_ANALYZE_METHOD": { - "message": [ - "Failed to evaluate the user-defined table function '' because its constructor is invalid: the function does not implement the 'analyze' method, and its constructor has more than one argument (including the 'self' reference). Please update the table function so that its constructor accepts exactly one 'self' argument, and try the query again." - ] - }, - "UDTF_EVAL_METHOD_ARGUMENTS_DO_NOT_MATCH_SIGNATURE": { - "message": [ - "Failed to evaluate the user-defined table function '' because the function arguments did not match the expected signature of the 'eval' method (). Please update the query so that this table function call provides arguments matching the expected signature, or else update the table function so that its 'eval' method accepts the provided arguments, and then try the query again." - ] - }, - "UDTF_EXEC_ERROR": { - "message": [ - "User defined table function encountered an error in the '' method: " - ] - }, - "UDTF_INVALID_OUTPUT_ROW_TYPE": { - "message": [ - "The type of an individual output row in the '' method of the UDTF is invalid. Each row should be a tuple, list, or dict, but got ''. Please make sure that the output rows are of the correct type." - ] - }, - "UDTF_RETURN_NOT_ITERABLE": { - "message": [ - "The return value of the '' method of the UDTF is invalid. It should be an iterable (e.g., generator or list), but got ''. Please make sure that the UDTF returns one of these types." - ] - }, - "UDTF_RETURN_SCHEMA_MISMATCH": { - "message": [ - "The number of columns in the result does not match the specified schema. Expected column count: , Actual column count: . Please make sure the values returned by the '' method have the same number of columns as specified in the output schema." - ] - }, - "UDTF_RETURN_TYPE_MISMATCH": { - "message": [ - "Mismatch in return type for the UDTF ''. Expected a 'StructType', but got ''. Please ensure the return type is a correctly formatted StructType." - ] - }, - "UDTF_SERIALIZATION_ERROR": { - "message": [ - "Cannot serialize the UDTF '': " - ] - }, - "UNEXPECTED_RESPONSE_FROM_SERVER": { - "message": [ - "Unexpected response from iterator server." - ] - }, - "UNEXPECTED_TUPLE_WITH_STRUCT": { - "message": [ - "Unexpected tuple with StructType." - ] - }, - "UNKNOWN_EXPLAIN_MODE": { - "message": [ - "Unknown explain mode: ''. 
Accepted explain modes are 'simple', 'extended', 'codegen', 'cost', 'formatted'." - ] - }, - "UNKNOWN_INTERRUPT_TYPE": { - "message": [ - "Unknown interrupt type: ''. Accepted interrupt types are 'all'." - ] - }, - "UNKNOWN_RESPONSE": { - "message": [ - "Unknown response: ." - ] - }, - "UNKNOWN_VALUE_FOR": { - "message": [ - "Unknown value for ``." - ] - }, - "UNSUPPORTED_DATA_TYPE": { - "message": [ - "Unsupported DataType ``." - ] - }, - "UNSUPPORTED_DATA_TYPE_FOR_ARROW": { - "message": [ - "Single data type is not supported with Arrow." - ] - }, - "UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION": { - "message": [ - " is not supported in conversion to Arrow." - ] - }, - "UNSUPPORTED_DATA_TYPE_FOR_ARROW_VERSION": { - "message": [ - " is only supported with pyarrow 2.0.0 and above." - ] - }, - "UNSUPPORTED_JOIN_TYPE": { - "message": [ - "Unsupported join type: . Supported join types include: 'inner', 'outer', 'full', 'fullouter', 'full_outer', 'leftouter', 'left', 'left_outer', 'rightouter', 'right', 'right_outer', 'leftsemi', 'left_semi', 'semi', 'leftanti', 'left_anti', 'anti', 'cross'." - ] - }, - "UNSUPPORTED_LITERAL": { - "message": [ - "Unsupported Literal ''." - ] - }, - "UNSUPPORTED_LOCAL_CONNECTION_STRING": { - "message": [ - "Creating new SparkSessions with `local` connection string is not supported." - ] - }, - "UNSUPPORTED_NUMPY_ARRAY_SCALAR": { - "message": [ - "The type of array scalar '' is not supported." - ] - }, - "UNSUPPORTED_OPERATION": { - "message": [ - " is not supported." - ] - }, - "UNSUPPORTED_PACKAGE_VERSION": { - "message": [ - " >= must be installed; however, your version is ." - ] - }, - "UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION": { - "message": [ - "Function `` should use only POSITIONAL or POSITIONAL OR KEYWORD arguments." - ] - }, - "UNSUPPORTED_SIGNATURE": { - "message": [ - "Unsupported signature: ." - ] - }, - "UNSUPPORTED_WITH_ARROW_OPTIMIZATION": { - "message": [ - " is not supported with Arrow optimization enabled in Python UDFs. Disable 'spark.sql.execution.pythonUDF.arrow.enabled' to workaround." - ] - }, - "VALUE_ALLOWED": { - "message": [ - "Value for `` does not allow ." - ] - }, - "VALUE_NOT_ACCESSIBLE": { - "message": [ - "Value `` cannot be accessed inside tasks." - ] - }, - "VALUE_NOT_ALLOWED": { - "message": [ - "Value for `` has to be amongst the following values: ." - ] - }, - "VALUE_NOT_ANY_OR_ALL": { - "message": [ - "Value for `` must be 'any' or 'all', got ''." - ] - }, - "VALUE_NOT_BETWEEN": { - "message": [ - "Value for `` must be between and ." - ] - }, - "VALUE_NOT_NON_EMPTY_STR": { - "message": [ - "Value for `` must be a non-empty string, got ''." - ] - }, - "VALUE_NOT_PEARSON": { - "message": [ - "Value for `` only supports the 'pearson', got ''." - ] - }, - "VALUE_NOT_PLAIN_COLUMN_REFERENCE": { - "message": [ - "Value `` in `` should be a plain column reference such as `df.col` or `col('column')`." - ] - }, - "VALUE_NOT_POSITIVE": { - "message": [ - "Value for `` must be positive, got ''." - ] - }, - "VALUE_NOT_TRUE": { - "message": [ - "Value for `` must be True, got ''." - ] - }, - "VALUE_OUT_OF_BOUNDS": { - "message": [ - "Value for `` must be between and (inclusive), got " - ] - }, - "WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION": { - "message": [ - "Function `` should take between 1 and 3 arguments, but the provided function takes ." - ] - }, - "WRONG_NUM_COLUMNS": { - "message": [ - "Function `` should take at least columns." - ] - }, - "ZERO_INDEX": { - "message": [ - "Index must be non-zero." 
-    ]
-  }
-}
-'''
-
+# Note: Though we call them "error classes" here, the proper name is "error conditions",
+# hence why the name of the JSON file is different.
+# For more information, please see: https://issues.apache.org/jira/browse/SPARK-46810
+# This discrepancy will be resolved as part of: https://issues.apache.org/jira/browse/SPARK-47429
+ERROR_CLASSES_JSON = (
+    importlib.resources
+    .files("pyspark.errors")
+    .joinpath("error-conditions.json")
+    .read_text()
+)
 ERROR_CLASSES_MAP = json.loads(ERROR_CLASSES_JSON)
diff --git a/python/pyspark/errors/exceptions/__init__.py b/python/pyspark/errors/exceptions/__init__.py
index 4fd16c6a2e1ad..c66f35958f8dd 100644
--- a/python/pyspark/errors/exceptions/__init__.py
+++ b/python/pyspark/errors/exceptions/__init__.py
@@ -18,39 +18,15 @@
 def _write_self() -> None:
     import json
+    from pathlib import Path
     from pyspark.errors import error_classes
-    with open("python/pyspark/errors/error_classes.py", "w") as f:
-        error_class_py_file = """#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
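The hunk above swaps the generated Python literal for a packaged JSON file read through importlib.resources. A minimal sketch of that loading pattern, outside the patch and with placeholder package and file names ("mypkg.errors", "conditions.json"):

    import importlib.resources
    import json

    def load_conditions() -> dict:
        # Resolve the file relative to the installed package so the lookup also
        # works from a wheel or zipped install, then parse the text as JSON.
        text = (
            importlib.resources.files("mypkg.errors")   # placeholder package
            .joinpath("conditions.json")                 # placeholder resource
            .read_text()
        )
        return json.loads(text)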
-# - -# NOTE: Automatically sort this file via -# - cd $SPARK_HOME -# - bin/pyspark -# - from pyspark.errors.exceptions import _write_self; _write_self() -import json - - -ERROR_CLASSES_JSON = ''' -%s -''' + ERRORS_DIR = Path(__file__).parents[1] -ERROR_CLASSES_MAP = json.loads(ERROR_CLASSES_JSON) -""" % json.dumps( - error_classes.ERROR_CLASSES_MAP, sort_keys=True, indent=2 + with open(ERRORS_DIR / "error-conditions.json", "w") as f: + json.dump( + error_classes.ERROR_CLASSES_MAP, + f, + sort_keys=True, + indent=2, ) - f.write(error_class_py_file) diff --git a/python/pyspark/errors/exceptions/captured.py b/python/pyspark/errors/exceptions/captured.py index 2a30eba3fb22f..b5bb742161c06 100644 --- a/python/pyspark/errors/exceptions/captured.py +++ b/python/pyspark/errors/exceptions/captured.py @@ -166,7 +166,14 @@ def getQueryContext(self) -> List[BaseQueryContext]: if self._origin is not None and is_instance_of( gw, self._origin, "org.apache.spark.SparkThrowable" ): - return [QueryContext(q) for q in self._origin.getQueryContext()] + contexts: List[BaseQueryContext] = [] + for q in self._origin.getQueryContext(): + if q.contextType().toString() == "SQL": + contexts.append(SQLQueryContext(q)) + else: + contexts.append(DataFrameQueryContext(q)) + + return contexts else: return [] @@ -379,17 +386,12 @@ class UnknownException(CapturedException, BaseUnknownException): """ -class QueryContext(BaseQueryContext): +class SQLQueryContext(BaseQueryContext): def __init__(self, q: "JavaObject"): self._q = q def contextType(self) -> QueryContextType: - context_type = self._q.contextType().toString() - assert context_type in ("SQL", "DataFrame") - if context_type == "DataFrame": - return QueryContextType.DataFrame - else: - return QueryContextType.SQL + return QueryContextType.SQL def objectType(self) -> str: return str(self._q.objectType()) @@ -409,13 +411,34 @@ def fragment(self) -> str: def callSite(self) -> str: return str(self._q.callSite()) - def pysparkFragment(self) -> Optional[str]: # type: ignore[return] - if self.contextType() == QueryContextType.DataFrame: - return str(self._q.pysparkFragment()) + def summary(self) -> str: + return str(self._q.summary()) + + +class DataFrameQueryContext(BaseQueryContext): + def __init__(self, q: "JavaObject"): + self._q = q + + def contextType(self) -> QueryContextType: + return QueryContextType.DataFrame + + def objectType(self) -> str: + return str(self._q.objectType()) + + def objectName(self) -> str: + return str(self._q.objectName()) - def pysparkCallSite(self) -> Optional[str]: # type: ignore[return] - if self.contextType() == QueryContextType.DataFrame: - return str(self._q.pysparkCallSite()) + def startIndex(self) -> int: + return int(self._q.startIndex()) + + def stopIndex(self) -> int: + return int(self._q.stopIndex()) + + def fragment(self) -> str: + return str(self._q.fragment()) + + def callSite(self) -> str: + return str(self._q.callSite()) def summary(self) -> str: return str(self._q.summary()) diff --git a/python/pyspark/errors/exceptions/connect.py b/python/pyspark/errors/exceptions/connect.py index 0cffe72687539..8a95358f26975 100644 --- a/python/pyspark/errors/exceptions/connect.py +++ b/python/pyspark/errors/exceptions/connect.py @@ -91,7 +91,10 @@ def convert_exception( ) query_contexts = [] for query_context in resp.errors[resp.root_error_idx].spark_throwable.query_contexts: - query_contexts.append(QueryContext(query_context)) + if query_context.context_type == pb2.FetchErrorDetailsResponse.QueryContext.SQL: + 
query_contexts.append(SQLQueryContext(query_context)) + else: + query_contexts.append(DataFrameQueryContext(query_context)) if "org.apache.spark.sql.catalyst.parser.ParseException" in classes: return ParseException( @@ -430,17 +433,12 @@ class SparkNoSuchElementException(SparkConnectGrpcException, BaseNoSuchElementEx """ -class QueryContext(BaseQueryContext): +class SQLQueryContext(BaseQueryContext): def __init__(self, q: pb2.FetchErrorDetailsResponse.QueryContext): self._q = q def contextType(self) -> QueryContextType: - context_type = self._q.context_type - - if int(context_type) == QueryContextType.DataFrame.value: - return QueryContextType.DataFrame - else: - return QueryContextType.SQL + return QueryContextType.SQL def objectType(self) -> str: return str(self._q.object_type) @@ -457,6 +455,75 @@ def stopIndex(self) -> int: def fragment(self) -> str: return str(self._q.fragment) + def callSite(self) -> str: + raise UnsupportedOperationException( + "", + error_class="UNSUPPORTED_CALL.WITHOUT_SUGGESTION", + message_parameters={"className": "SQLQueryContext", "methodName": "callSite"}, + sql_state="0A000", + server_stacktrace=None, + display_server_stacktrace=False, + query_contexts=[], + ) + + def summary(self) -> str: + return str(self._q.summary) + + +class DataFrameQueryContext(BaseQueryContext): + def __init__(self, q: pb2.FetchErrorDetailsResponse.QueryContext): + self._q = q + + def contextType(self) -> QueryContextType: + return QueryContextType.DataFrame + + def objectType(self) -> str: + raise UnsupportedOperationException( + "", + error_class="UNSUPPORTED_CALL.WITHOUT_SUGGESTION", + message_parameters={"className": "DataFrameQueryContext", "methodName": "objectType"}, + sql_state="0A000", + server_stacktrace=None, + display_server_stacktrace=False, + query_contexts=[], + ) + + def objectName(self) -> str: + raise UnsupportedOperationException( + "", + error_class="UNSUPPORTED_CALL.WITHOUT_SUGGESTION", + message_parameters={"className": "DataFrameQueryContext", "methodName": "objectName"}, + sql_state="0A000", + server_stacktrace=None, + display_server_stacktrace=False, + query_contexts=[], + ) + + def startIndex(self) -> int: + raise UnsupportedOperationException( + "", + error_class="UNSUPPORTED_CALL.WITHOUT_SUGGESTION", + message_parameters={"className": "DataFrameQueryContext", "methodName": "startIndex"}, + sql_state="0A000", + server_stacktrace=None, + display_server_stacktrace=False, + query_contexts=[], + ) + + def stopIndex(self) -> int: + raise UnsupportedOperationException( + "", + error_class="UNSUPPORTED_CALL.WITHOUT_SUGGESTION", + message_parameters={"className": "DataFrameQueryContext", "methodName": "stopIndex"}, + sql_state="0A000", + server_stacktrace=None, + display_server_stacktrace=False, + query_contexts=[], + ) + + def fragment(self) -> str: + return str(self._q.fragment) + def callSite(self) -> str: return str(self._q.call_site) diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py index e1f249506dd02..89721d23c3858 100644 --- a/python/pyspark/errors/utils.py +++ b/python/pyspark/errors/utils.py @@ -16,14 +16,42 @@ # import re -from typing import Dict, Match - +import functools +import inspect +import os +import threading +from typing import Any, Callable, Dict, Match, TypeVar, Type, Optional, TYPE_CHECKING +import pyspark from pyspark.errors.error_classes import ERROR_CLASSES_MAP +if TYPE_CHECKING: + from pyspark.sql import SparkSession + +T = TypeVar("T") + +_current_origin = threading.local() + + +def current_origin() -> 
threading.local: + global _current_origin + + if not hasattr(_current_origin, "fragment"): + _current_origin.fragment = None + if not hasattr(_current_origin, "call_site"): + _current_origin.call_site = None + return _current_origin + + +def set_current_origin(fragment: Optional[str], call_site: Optional[str]) -> None: + global _current_origin + + _current_origin.fragment = fragment + _current_origin.call_site = call_site + class ErrorClassesReader: """ - A reader to load error information from error_classes.py. + A reader to load error information from error-conditions.json. """ def __init__(self) -> None: @@ -51,11 +79,11 @@ def replace_match(match: Match[str]) -> str: def get_message_template(self, error_class: str) -> str: """ - Returns the message template for corresponding error class from error_classes.py. + Returns the message template for corresponding error class from error-conditions.json. For example, when given `error_class` is "EXAMPLE_ERROR_CLASS", - and corresponding error class in error_classes.py looks like the below: + and corresponding error class in error-conditions.json looks like the below: .. code-block:: python @@ -69,7 +97,7 @@ def get_message_template(self, error_class: str) -> str: "Problem because of ." For sub error class, when given `error_class` is "EXAMPLE_ERROR_CLASS.SUB_ERROR_CLASS", - and corresponding error class in error_classes.py looks like the below: + and corresponding error class in error-conditions.json looks like the below: .. code-block:: python @@ -119,3 +147,124 @@ def get_message_template(self, error_class: str) -> str: message_template = main_message_template + " " + sub_message_template return message_template + + +def _capture_call_site(spark_session: "SparkSession", depth: int) -> str: + """ + Capture the call site information including file name, line number, and function name. + This function updates the thread-local storage from JVM side (PySparkCurrentOrigin) + with the current call site information when a PySpark API function is called. + + Parameters + ---------- + spark_session : SparkSession + Current active Spark session. + + Notes + ----- + The call site information is used to enhance error messages with the exact location + in the user code that led to the error. 
+ """ + # Filtering out PySpark code and keeping user code only + pyspark_root = os.path.dirname(pyspark.__file__) + stack = [ + frame_info for frame_info in inspect.stack() if pyspark_root not in frame_info.filename + ] + + selected_frames = stack[:depth] + + # We try import here since IPython is not a required dependency + try: + import IPython + + # ipykernel is required for IPython + import ipykernel # type: ignore[import-not-found] + + ipython = IPython.get_ipython() + # Filtering out IPython related frames + ipy_root = os.path.dirname(IPython.__file__) + ipykernel_root = os.path.dirname(ipykernel.__file__) + selected_frames = [ + frame + for frame in selected_frames + if (ipy_root not in frame.filename) and (ipykernel_root not in frame.filename) + ] + except ImportError: + ipython = None + + # Identifying the cell is useful when the error is generated from IPython Notebook + if ipython: + call_sites = [ + f"line {frame.lineno} in cell [{ipython.execution_count}]" for frame in selected_frames + ] + else: + call_sites = [f"{frame.filename}:{frame.lineno}" for frame in selected_frames] + call_sites_str = "\n".join(call_sites) + + return call_sites_str + + +def _with_origin(func: Callable[..., Any]) -> Callable[..., Any]: + """ + A decorator to capture and provide the call site information to the server side + when PySpark API functions are invoked. + """ + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + from pyspark.sql import SparkSession + from pyspark.sql.utils import is_remote + + spark = SparkSession.getActiveSession() + if spark is not None and hasattr(func, "__name__"): + if is_remote(): + global current_origin + + # Getting the configuration requires RPC call. Uses the default value for now. + depth = 1 + set_current_origin(func.__name__, _capture_call_site(spark, depth)) + + try: + return func(*args, **kwargs) + finally: + set_current_origin(None, None) + else: + assert spark._jvm is not None + jvm_pyspark_origin = ( + spark._jvm.org.apache.spark.sql.catalyst.trees.PySparkCurrentOrigin + ) + depth = int( + spark.conf.get( # type: ignore[arg-type] + "spark.sql.stackTracesInDataFrameContext" + ) + ) + # Update call site when the function is called + jvm_pyspark_origin.set(func.__name__, _capture_call_site(spark, depth)) + + try: + return func(*args, **kwargs) + finally: + jvm_pyspark_origin.clear() + else: + return func(*args, **kwargs) + + return wrapper + + +def with_origin_to_class(cls: Type[T]) -> Type[T]: + """ + Decorate all methods of a class with `_with_origin` to capture call site information. + """ + if os.environ.get("PYSPARK_PIN_THREAD", "true").lower() == "true": + for name, method in cls.__dict__.items(): + # Excluding Python magic methods that do not utilize JVM functions. + if callable(method) and name not in ( + "__init__", + "__new__", + "__iter__", + "__nonzero__", + "__repr__", + "__bool__", + ): + setattr(cls, name, _with_origin(method)) + return cls diff --git a/python/pyspark/errors_doc_gen.py b/python/pyspark/errors_doc_gen.py index ad32745348127..e1bd94dcec4d0 100644 --- a/python/pyspark/errors_doc_gen.py +++ b/python/pyspark/errors_doc_gen.py @@ -41,7 +41,7 @@ def generate_errors_doc(output_rst_file_path: str) -> None: Error classes in PySpark ======================== -This is a list of common, named error classes returned by PySpark which are defined at `error_classes.py `_. +This is a list of common, named error classes returned by PySpark which are defined at `error-conditions.json `_. 
When writing PySpark errors, developers must use an error class from the list. If an appropriate error class is not available, add a new one into the list. For more information, please refer to `Contributing Error and Exception `_. """ # noqa diff --git a/python/pyspark/ml/connect/functions.py b/python/pyspark/ml/connect/functions.py index b305c04519ae8..6597e6c4118ad 100644 --- a/python/pyspark/ml/connect/functions.py +++ b/python/pyspark/ml/connect/functions.py @@ -15,7 +15,7 @@ # limitations under the License. # from pyspark.ml import functions as PyMLFunctions -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.functions.builtin import _invoke_function, _to_col, lit diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py index 466d94ccc8889..32941b33c4603 100644 --- a/python/pyspark/ml/functions.py +++ b/python/pyspark/ml/functions.py @@ -28,7 +28,7 @@ pass # Let it throw a better error message later when the API is invoked. from pyspark.sql.functions import pandas_udf -from pyspark.sql.column import Column, _to_java_column +from pyspark.sql.column import Column from pyspark.sql.types import ( ArrayType, ByteType, @@ -116,6 +116,7 @@ def vector_to_array(col: Column, dtype: str = "float64") -> Column: StructField('oldVec', ArrayType(FloatType(), False), False)] """ from pyspark.core.context import SparkContext + from pyspark.sql.classic.column import Column, _to_java_column sc = SparkContext._active_spark_context assert sc is not None and sc._jvm is not None @@ -159,6 +160,7 @@ def array_to_vector(col: Column) -> Column: [Row(vec1=DenseVector([1.0, 3.0]))] """ from pyspark.core.context import SparkContext + from pyspark.sql.classic.column import Column, _to_java_column sc = SparkContext._active_spark_context assert sc is not None and sc._jvm is not None diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py index ec5da94079ea3..4dcc961909520 100644 --- a/python/pyspark/ml/stat.py +++ b/python/pyspark/ml/stat.py @@ -22,7 +22,7 @@ from pyspark.ml.common import _java2py, _py2java from pyspark.ml.linalg import Matrix, Vector from pyspark.ml.wrapper import JavaWrapper, _jvm -from pyspark.sql.column import Column, _to_seq +from pyspark.sql.column import Column from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import lit @@ -431,6 +431,7 @@ def metrics(*metrics: str) -> "SummaryBuilder": :py:class:`pyspark.ml.stat.SummaryBuilder` """ from pyspark.core.context import SparkContext + from pyspark.sql.classic.column import _to_seq sc = SparkContext._active_spark_context assert sc is not None diff --git a/python/pyspark/ml/tests/connect/test_connect_function.py b/python/pyspark/ml/tests/connect/test_connect_function.py index f503761106608..393d38fdc426a 100644 --- a/python/pyspark/ml/tests/connect/test_connect_function.py +++ b/python/pyspark/ml/tests/connect/test_connect_function.py @@ -19,7 +19,6 @@ from pyspark.util import is_remote_only from pyspark.sql import SparkSession as PySparkSession -from pyspark.sql.dataframe import DataFrame as SDF from pyspark.ml import functions as SF from pyspark.testing.sqlutils import SQLTestUtils from pyspark.testing.connectutils import ( @@ -55,6 +54,8 @@ def tearDownClass(cls): del os.environ["PYSPARK_NO_NAMESPACE_SHARE"] def compare_by_show(self, df1, df2, n: int = 20, truncate: int = 20): + from pyspark.sql.classic.dataframe import DataFrame as SDF + assert isinstance(df1, (SDF, CDF)) if isinstance(df1, SDF): str1 = df1._jdf.showString(n, 
truncate, False) diff --git a/python/pyspark/pandas/config.py b/python/pyspark/pandas/config.py index a66cb5a16d2dc..bfa88253dc6f4 100644 --- a/python/pyspark/pandas/config.py +++ b/python/pyspark/pandas/config.py @@ -169,7 +169,7 @@ def validate(self, v: Any) -> None: "can be expensive in general. So, if `compute.ops_on_diff_frames` variable is not " "True, that method throws an exception." ), - default=False, + default=True, types=bool, ), Option( diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py index 2df40252965bc..b4a6b1abbcaf9 100644 --- a/python/pyspark/pandas/data_type_ops/base.py +++ b/python/pyspark/pandas/data_type_ops/base.py @@ -24,7 +24,7 @@ import pandas as pd from pandas.api.types import CategoricalDtype -from pyspark.sql import functions as F +from pyspark.sql import functions as F, Column as PySparkColumn from pyspark.sql.types import ( ArrayType, BinaryType, @@ -53,9 +53,6 @@ spark_type_to_pandas_dtype, ) -# For supporting Spark Connect -from pyspark.sql.utils import get_column_class - if extension_dtypes_available: from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype @@ -485,16 +482,14 @@ def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: else: from pyspark.pandas.base import column_op - Column = get_column_class() - return column_op(Column.__eq__)(left, right) + return column_op(PySparkColumn.__eq__)(left, right) def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: from pyspark.pandas.base import column_op _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__ne__)(left, right) + return column_op(PySparkColumn.__ne__)(left, right) def invert(self, operand: IndexOpsLike) -> IndexOpsLike: raise TypeError("Unary ~ can not be applied to %s." 
% self.pretty_name) diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py index 7e7ea7eb0738c..c91dcc913080b 100644 --- a/python/pyspark/pandas/data_type_ops/boolean_ops.py +++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py @@ -35,10 +35,8 @@ _is_boolean_type, ) from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type -from pyspark.sql import functions as F -from pyspark.sql.column import Column as PySparkColumn +from pyspark.sql import functions as F, Column as PySparkColumn from pyspark.sql.types import BooleanType, StringType -from pyspark.sql.utils import get_column_class from pyspark.errors import PySparkValueError @@ -331,23 +329,19 @@ def abs(self, operand: IndexOpsLike) -> IndexOpsLike: def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__lt__)(left, right) + return column_op(PySparkColumn.__lt__)(left, right) def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__le__)(left, right) + return column_op(PySparkColumn.__le__)(left, right) def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__ge__)(left, right) + return column_op(PySparkColumn.__ge__)(left, right) def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__gt__)(left, right) + return column_op(PySparkColumn.__gt__)(left, right) def invert(self, operand: IndexOpsLike) -> IndexOpsLike: return operand._with_new_scol(~operand.spark.column, field=operand._internal.data_fields[0]) diff --git a/python/pyspark/pandas/data_type_ops/date_ops.py b/python/pyspark/pandas/data_type_ops/date_ops.py index 771b5d38a17ac..9a0b82de6ce8b 100644 --- a/python/pyspark/pandas/data_type_ops/date_ops.py +++ b/python/pyspark/pandas/data_type_ops/date_ops.py @@ -23,9 +23,8 @@ import pandas as pd from pandas.api.types import CategoricalDtype -from pyspark.sql import functions as F +from pyspark.sql import functions as F, Column as PySparkColumn from pyspark.sql.types import BooleanType, DateType, StringType -from pyspark.sql.utils import get_column_class from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex from pyspark.pandas.base import column_op, IndexOpsMixin from pyspark.pandas.data_type_ops.base import ( @@ -84,29 +83,25 @@ def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: from pyspark.pandas.base import column_op _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__lt__)(left, right) + return column_op(PySparkColumn.__lt__)(left, right) def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: from pyspark.pandas.base import column_op _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__le__)(left, right) + return column_op(PySparkColumn.__le__)(left, right) def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: from pyspark.pandas.base import column_op _sanitize_list_like(right) - Column = get_column_class() - return column_op(Column.__ge__)(left, right) + return column_op(PySparkColumn.__ge__)(left, right) def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: from pyspark.pandas.base import column_op _sanitize_list_like(right) - Column = 
get_column_class() - return column_op(Column.__gt__)(left, right) + return column_op(PySparkColumn.__gt__)(left, right) def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: dtype, spark_type = pandas_on_spark_type(dtype) diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py index 6f393c9652d7d..8e8dfee9990e3 100644 --- a/python/pyspark/pandas/data_type_ops/num_ops.py +++ b/python/pyspark/pandas/data_type_ops/num_ops.py @@ -43,8 +43,7 @@ _is_boolean_type, ) from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type -from pyspark.sql import functions as F -from pyspark.sql import Column as PySparkColumn +from pyspark.sql import functions as F, Column as PySparkColumn from pyspark.sql.types import ( BooleanType, DataType, @@ -53,7 +52,7 @@ from pyspark.errors import PySparkValueError # For Supporting Spark Connect -from pyspark.sql.utils import pyspark_column_op, get_column_class +from pyspark.sql.utils import pyspark_column_op def _non_fractional_astype( @@ -82,8 +81,7 @@ def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: raise TypeError("Addition can not be applied to given types.") right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type) - Column = get_column_class() - return column_op(Column.__add__)(left, right) + return column_op(PySparkColumn.__add__)(left, right) def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) @@ -91,8 +89,7 @@ def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: raise TypeError("Subtraction can not be applied to given types.") right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type) - Column = get_column_class() - return column_op(Column.__sub__)(left, right) + return column_op(PySparkColumn.__sub__)(left, right) def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) @@ -110,13 +107,11 @@ def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: if not is_valid_operand_for_numeric_arithmetic(right): raise TypeError("Exponentiation can not be applied to given types.") - Column = get_column_class() - - def pow_func(left: Column, right: Any) -> Column: # type: ignore[valid-type] + def pow_func(left: PySparkColumn, right: Any) -> PySparkColumn: return ( - F.when(left == 1, left) # type: ignore + F.when(left == 1, left) .when(F.lit(right) == 0, 1) - .otherwise(Column.__pow__(left, right)) + .otherwise(PySparkColumn.__pow__(left, right)) ) right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type) @@ -127,34 +122,29 @@ def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: if not isinstance(right, numbers.Number): raise TypeError("Addition can not be applied to given types.") right = transform_boolean_operand_to_numeric(right) - Column = get_column_class() - return column_op(Column.__radd__)(left, right) + return column_op(PySparkColumn.__radd__)(left, right) def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) if not isinstance(right, numbers.Number): raise TypeError("Subtraction can not be applied to given types.") right = transform_boolean_operand_to_numeric(right) - Column = get_column_class() - return column_op(Column.__rsub__)(left, right) + return column_op(PySparkColumn.__rsub__)(left, right) def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) if not isinstance(right, 
numbers.Number): raise TypeError("Multiplication can not be applied to given types.") right = transform_boolean_operand_to_numeric(right) - Column = get_column_class() - return column_op(Column.__rmul__)(left, right) + return column_op(PySparkColumn.__rmul__)(left, right) def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) if not isinstance(right, numbers.Number): raise TypeError("Exponentiation can not be applied to given types.") - Column = get_column_class() - - def rpow_func(left: Column, right: Any) -> Column: # type: ignore[valid-type] - return F.when(F.lit(right == 1), right).otherwise(Column.__rpow__(left, right)) + def rpow_func(left: PySparkColumn, right: Any) -> PySparkColumn: + return F.when(F.lit(right == 1), right).otherwise(PySparkColumn.__rpow__(left, right)) right = transform_boolean_operand_to_numeric(right) return column_op(rpow_func)(left, right) @@ -250,8 +240,8 @@ def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: raise TypeError("Multiplication can not be applied to given types.") right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type) - Column = get_column_class() - return column_op(Column.__mul__)(left, right) + + return column_op(PySparkColumn.__mul__)(left, right) def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) @@ -335,8 +325,8 @@ def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: raise TypeError("Multiplication can not be applied to given types.") right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type) - Column = get_column_class() - return column_op(Column.__mul__)(left, right) + + return column_op(PySparkColumn.__mul__)(left, right) def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: _sanitize_list_like(right) @@ -496,13 +486,11 @@ def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: if not isinstance(right, numbers.Number): raise TypeError("Exponentiation can not be applied to given types.") - Column = get_column_class() - - def rpow_func(left: Column, right: Any) -> Column: # type: ignore[valid-type] + def rpow_func(left: PySparkColumn, right: Any) -> PySparkColumn: return ( - F.when(left.isNull(), np.nan) # type: ignore + F.when(left.isNull(), np.nan) .when(F.lit(right == 1), right) - .otherwise(Column.__rpow__(left, right)) + .otherwise(PySparkColumn.__rpow__(left, right)) ) right = transform_boolean_operand_to_numeric(right) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index e8369544124eb..52f7a327b5be0 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -149,7 +149,6 @@ create_tuple_for_frame_type, ) from pyspark.pandas.plot import PandasOnSparkPlotAccessor -from pyspark.sql.utils import get_column_class, get_dataframe_class if TYPE_CHECKING: from pyspark.sql._typing import OptionalPrimitiveType @@ -491,7 +490,8 @@ class DataFrame(Frame, Generic[T]): >>> import pandas as pd >>> sdf = spark.createDataFrame([("Data", 1), ("Bricks", 2)], ["x", "y"]) - >>> ps.DataFrame(data=sdf, index=pd.Index([0, 1, 2])) + >>> with ps.option_context("compute.ops_on_diff_frames", False): + ... ps.DataFrame(data=sdf, index=pd.Index([0, 1, 2])) Traceback (most recent call last): ... ValueError: Cannot combine the series or dataframe...'compute.ops_on_diff_frames' option. 
@@ -509,7 +509,8 @@ class DataFrame(Frame, Generic[T]): >>> import pandas as pd >>> sdf = spark.createDataFrame([("Data", 1), ("Bricks", 2)], ["x", "y"]) - >>> ps.DataFrame(data=sdf, index=ps.Index([0, 1, 2])) + >>> with ps.option_context("compute.ops_on_diff_frames", False): + ... ps.DataFrame(data=sdf, index=ps.Index([0, 1, 2])) Traceback (most recent call last): ... ValueError: Cannot combine the series or dataframe...'compute.ops_on_diff_frames' option. @@ -527,7 +528,6 @@ class DataFrame(Frame, Generic[T]): def __init__( # type: ignore[no-untyped-def] self, data=None, index=None, columns=None, dtype=None, copy=False ): - SparkDataFrame = get_dataframe_class() index_assigned = False if isinstance(data, InternalFrame): assert columns is None @@ -535,7 +535,7 @@ def __init__( # type: ignore[no-untyped-def] assert not copy if index is None: internal = data - elif isinstance(data, SparkDataFrame): + elif isinstance(data, PySparkDataFrame): assert columns is None assert dtype is None assert not copy @@ -5627,10 +5627,9 @@ def _assign(self, kwargs: Any) -> "DataFrame": from pyspark.pandas.indexes import MultiIndex from pyspark.pandas.series import IndexOpsMixin - Column = get_column_class() for k, v in kwargs.items(): is_invalid_assignee = ( - not (isinstance(v, (IndexOpsMixin, Column)) or callable(v) or is_scalar(v)) + not (isinstance(v, (IndexOpsMixin, PySparkColumn)) or callable(v) or is_scalar(v)) ) or isinstance(v, MultiIndex) if is_invalid_assignee: raise TypeError( @@ -5644,7 +5643,7 @@ def _assign(self, kwargs: Any) -> "DataFrame": (v.spark.column, v._internal.data_fields[0]) if isinstance(v, IndexOpsMixin) and not isinstance(v, MultiIndex) else (v, None) - if isinstance(v, Column) + if isinstance(v, PySparkColumn) else (F.lit(v), None) ) for k, v in kwargs.items() @@ -7687,21 +7686,20 @@ def _sort( if na_position not in ("first", "last"): raise ValueError("invalid na_position: '{}'".format(na_position)) - Column = get_column_class() # Mapper: Get a spark colum # n function for (ascending, na_position) combination mapper = { - (True, "first"): Column.asc_nulls_first, - (True, "last"): Column.asc_nulls_last, - (False, "first"): Column.desc_nulls_first, - (False, "last"): Column.desc_nulls_last, + (True, "first"): PySparkColumn.asc_nulls_first, + (True, "last"): PySparkColumn.asc_nulls_last, + (False, "first"): PySparkColumn.desc_nulls_first, + (False, "last"): PySparkColumn.desc_nulls_last, } by = [mapper[(asc, na_position)](scol) for scol, asc in zip(by, ascending)] natural_order_scol = F.col(NATURAL_ORDER_COLUMN_NAME) if keep == "last": - natural_order_scol = Column.desc(natural_order_scol) + natural_order_scol = PySparkColumn.desc(natural_order_scol) elif keep == "all": raise NotImplementedError("`keep`=all is not implemented yet.") elif keep != "first": @@ -13626,14 +13624,6 @@ def _set_axis_fallback(self, *args: Any, **kwargs: Any) -> "DataFrame": _f = self._build_fallback_method("set_axis") return _f(*args, **kwargs) - def _to_feather_fallback(self, *args: Any, **kwargs: Any) -> None: - _f = self._build_fallback_driver_method("to_feather") - return _f(*args, **kwargs) - - def _to_stata_fallback(self, *args: Any, **kwargs: Any) -> None: - _f = self._build_fallback_driver_method("to_stata") - return _f(*args, **kwargs) - def __getattr__(self, key: str) -> Any: if key.startswith("__"): raise AttributeError(key) @@ -13738,8 +13728,7 @@ def _reduce_spark_multi(sdf: PySparkDataFrame, aggs: List[PySparkColumn]) -> Any """ Performs a reduction on a spark DataFrame, the functions being 
known SQL aggregate functions. """ - SparkDataFrame = get_dataframe_class() - assert isinstance(sdf, SparkDataFrame) + assert isinstance(sdf, PySparkDataFrame) sdf0 = sdf.agg(*aggs) lst = sdf0.limit(2).toPandas() assert len(lst) == 1, (sdf, lst) diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index ec47ab75c43cf..55627a4c740c3 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -308,6 +308,7 @@ def aggregate( ) if not self._as_index: + index_cols = psdf._internal.column_labels should_drop_index = set( i for i, gkey in enumerate(self._groupkeys) if gkey._psdf is not self._psdf ) @@ -322,8 +323,12 @@ def aggregate( psdf = psdf.reset_index(level=should_drop_index, drop=drop) if len(should_drop_index) < len(self._groupkeys): psdf = psdf.reset_index() + index_cols = [c for c in psdf._internal.column_labels if c not in index_cols] + if relabeling: + psdf = psdf[pd.Index(index_cols + list(order))] + psdf.columns = pd.Index([c[0] for c in index_cols] + list(columns)) - if relabeling: + if relabeling and self._as_index: psdf = psdf[order] psdf.columns = columns # type: ignore[assignment] return psdf diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index 7d2712cbb531e..b5aae890d50a2 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -23,7 +23,6 @@ from pyspark.sql import functions as F, Column as PySparkColumn, Window from pyspark.sql.types import DataType -from pyspark.sql.utils import get_column_class from pyspark import pandas as ps from pyspark.pandas._typing import Label, Name, Scalar from pyspark.pandas.exceptions import PandasNotImplementedError @@ -514,7 +513,6 @@ def _is_monotonic_increasing(self) -> Series: cond = F.lit(True) has_not_null = F.lit(True) - Column = get_column_class() for scol in self._internal.index_spark_columns[::-1]: data_type = self._internal.spark_type_for(scol) prev = F.lag(scol, 1).over(window) @@ -522,7 +520,9 @@ def _is_monotonic_increasing(self) -> Series: # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex. # Therefore, we should check `has_not_null` over all levels. has_not_null = has_not_null & scol.isNotNull() - cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__gt__)) + cond = F.when(scol.eqNullSafe(prev), cond).otherwise( + compare(scol, prev, PySparkColumn.__gt__) + ) cond = has_not_null & (prev.isNull() | cond) @@ -560,7 +560,6 @@ def _is_monotonic_decreasing(self) -> Series: cond = F.lit(True) has_not_null = F.lit(True) - Column = get_column_class() for scol in self._internal.index_spark_columns[::-1]: data_type = self._internal.spark_type_for(scol) prev = F.lag(scol, 1).over(window) @@ -568,7 +567,9 @@ def _is_monotonic_decreasing(self) -> Series: # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex. # Therefore, we should check `has_not_null` over all levels. 
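The monotonicity checks above compare each index level with its lagged value over a window, using a null-safe equality to fall through to the next level on ties. A stripped-down sketch of that lag/eqNullSafe pattern against a plain Spark DataFrame, with made-up column names and data:

    from pyspark.sql import SparkSession, Window, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(0, 1), (1, 2), (2, 2), (3, 5)], ["id", "x"])

    w = Window.orderBy("id")
    prev = F.lag("x", 1).over(w)

    # True where the value did not decrease relative to the previous row;
    # ties are handled by the null-safe equality before the > comparison.
    sdf.select(
        "id",
        F.when(F.col("x").eqNullSafe(prev), F.lit(True))
        .otherwise(F.col("x") > prev)
        .alias("non_decreasing"),
    ).show()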
has_not_null = has_not_null & scol.isNotNull() - cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__lt__)) + cond = F.when(scol.eqNullSafe(prev), cond).otherwise( + compare(scol, prev, PySparkColumn.__lt__) + ) cond = has_not_null & (prev.isNull() | cond) diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index 24b7c53eea997..fada94cf383a0 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -50,7 +50,6 @@ spark_column_equals, verify_temp_column_name, ) -from pyspark.sql.utils import get_column_class if TYPE_CHECKING: from pyspark.pandas.frame import DataFrame @@ -259,12 +258,11 @@ def _select_rows( """ from pyspark.pandas.series import Series - Column = get_column_class() if rows_sel is None: return None, None, None elif isinstance(rows_sel, Series): return self._select_rows_by_series(rows_sel) - elif isinstance(rows_sel, Column): + elif isinstance(rows_sel, PySparkColumn): return self._select_rows_by_spark_column(rows_sel) elif isinstance(rows_sel, slice): if rows_sel == slice(None): @@ -306,7 +304,6 @@ def _select_cols( """ from pyspark.pandas.series import Series - Column = get_column_class() if cols_sel is None: column_labels = self._internal.column_labels data_spark_columns = self._internal.data_spark_columns @@ -314,7 +311,7 @@ def _select_cols( return column_labels, data_spark_columns, data_fields, False, None elif isinstance(cols_sel, Series): return self._select_cols_by_series(cols_sel, missing_keys) - elif isinstance(cols_sel, Column): + elif isinstance(cols_sel, PySparkColumn): return self._select_cols_by_spark_column(cols_sel, missing_keys) elif isinstance(cols_sel, slice): if cols_sel == slice(None): @@ -579,7 +576,6 @@ def __setitem__(self, key: Any, value: Any) -> None: from pyspark.pandas.frame import DataFrame from pyspark.pandas.series import Series, first_series - Column = get_column_class() if self._is_series: if ( isinstance(key, Series) @@ -639,7 +635,7 @@ def __setitem__(self, key: Any, value: Any) -> None: self._internal.spark_frame[cast(iLocIndexer, self)._sequence_col] < F.lit(limit) ) - if isinstance(value, (Series, Column)): + if isinstance(value, (Series, PySparkColumn)): if remaining_index is not None and remaining_index == 0: raise ValueError( "No axis named {} for object type {}".format(key, type(value).__name__) @@ -724,7 +720,7 @@ def __setitem__(self, key: Any, value: Any) -> None: self._internal.spark_frame[cast(iLocIndexer, self)._sequence_col] < F.lit(limit) ) - if isinstance(value, (Series, Column)): + if isinstance(value, (Series, PySparkColumn)): if remaining_index is not None and remaining_index == 0: raise ValueError("Incompatible indexer with Series") if len(data_spark_columns) > 1: @@ -1125,9 +1121,8 @@ def _select_rows_by_slice( ) )[::-1]: compare = MultiIndex._comparator_for_monotonic_increasing(dt) - Column = get_column_class() cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise( - compare(scol, F.lit(value).cast(dt), Column.__gt__) + compare(scol, F.lit(value).cast(dt), PySparkColumn.__gt__) ) conds.append(cond) if stop is not None: @@ -1140,9 +1135,8 @@ def _select_rows_by_slice( ) )[::-1]: compare = MultiIndex._comparator_for_monotonic_increasing(dt) - Column = get_column_class() cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise( - compare(scol, F.lit(value).cast(dt), Column.__lt__) + compare(scol, F.lit(value).cast(dt), PySparkColumn.__lt__) ) conds.append(cond) @@ -1300,12 +1294,11 @@ def 
_select_cols_by_iterable( ]: from pyspark.pandas.series import Series - Column = get_column_class() if all(isinstance(key, Series) for key in cols_sel): column_labels = [key._column_label for key in cols_sel] data_spark_columns = [key.spark.column for key in cols_sel] data_fields = [key._internal.data_fields[0] for key in cols_sel] - elif all(isinstance(key, Column) for key in cols_sel): + elif all(isinstance(key, PySparkColumn) for key in cols_sel): column_labels = [ (self._internal.spark_frame.select(col).columns[0],) for col in cols_sel ] @@ -1804,8 +1797,7 @@ def _select_cols_else( ) def __setitem__(self, key: Any, value: Any) -> None: - Column = get_column_class() - if not isinstance(value, Column) and is_list_like(value): + if not isinstance(value, PySparkColumn) and is_list_like(value): iloc_item = self[key] if not is_list_like(key) or not is_list_like(iloc_item): raise ValueError("setting an array element with a sequence.") diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py index 2966db073d0ca..c5fef3b138254 100644 --- a/python/pyspark/pandas/internal.py +++ b/python/pyspark/pandas/internal.py @@ -33,6 +33,7 @@ Window, ) from pyspark.sql.types import ( # noqa: F401 + _drop_metadata, BooleanType, DataType, LongType, @@ -40,8 +41,7 @@ StructType, StringType, ) -from pyspark.sql.utils import is_timestamp_ntz_preferred -from pyspark.sql.utils import is_remote, get_column_class, get_dataframe_class +from pyspark.sql.utils import is_timestamp_ntz_preferred, is_remote from pyspark import pandas as ps from pyspark.pandas._typing import Label from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale @@ -619,8 +619,7 @@ def __init__( >>> internal.column_label_names [('column_labels_a',), ('column_labels_b',)] """ - SparkDataFrame = get_dataframe_class() - assert isinstance(spark_frame, SparkDataFrame) + assert isinstance(spark_frame, PySparkDataFrame) assert not spark_frame.isStreaming, "pandas-on-Spark does not support Structured Streaming." if not index_spark_columns: @@ -672,12 +671,12 @@ def __init__( self._sdf = spark_frame # index_spark_columns - Column = get_column_class() + assert all( - isinstance(index_scol, Column) for index_scol in index_spark_columns + isinstance(index_scol, PySparkColumn) for index_scol in index_spark_columns ), index_spark_columns - self._index_spark_columns: List[Column] = index_spark_columns # type: ignore[valid-type] + self._index_spark_columns: List[PySparkColumn] = index_spark_columns # data_spark_columns if data_spark_columns is None: @@ -691,9 +690,9 @@ def __init__( and col not in HIDDEN_COLUMNS ] else: - assert all(isinstance(scol, Column) for scol in data_spark_columns) + assert all(isinstance(scol, PySparkColumn) for scol in data_spark_columns) - self._data_spark_columns: List[Column] = data_spark_columns # type: ignore[valid-type] + self._data_spark_columns: List[PySparkColumn] = data_spark_columns # fields if index_fields is None: @@ -761,14 +760,8 @@ def __init__( # in a few tests when using Spark Connect. However, the function works properly. # Therefore, we temporarily perform Spark Connect tests by excluding metadata # until the issue is resolved. 
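The internal.py hunks below replace a local remove_metadata helper with _drop_metadata so that the schema assertions ignore column metadata under Spark Connect. The idea can be illustrated with a hand-rolled equivalent of the deleted helper; the asserts reflect that StructField equality compares metadata as well:

    from pyspark.sql.types import IntegerType, StructField

    def drop_metadata(field: StructField) -> StructField:
        # Rebuild the field without its metadata dict so comparisons only look
        # at name, data type, and nullability.
        return StructField(field.name, field.dataType, field.nullable)

    a = StructField("x", IntegerType(), True, metadata={"comment": "left"})
    b = StructField("x", IntegerType(), True, metadata={"comment": "right"})

    assert a != b                                 # metadata participates in ==
    assert drop_metadata(a) == drop_metadata(b)   # ... unless it is stripped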
- def remove_metadata(struct_field: StructField) -> StructField: - new_struct_field = StructField( - struct_field.name, struct_field.dataType, struct_field.nullable - ) - return new_struct_field - assert all( - remove_metadata(index_field.struct_field) == remove_metadata(struct_field) + _drop_metadata(index_field.struct_field) == _drop_metadata(struct_field) for index_field, struct_field in zip(index_fields, struct_fields) ), (index_fields, struct_fields) else: @@ -795,14 +788,8 @@ def remove_metadata(struct_field: StructField) -> StructField: # in a few tests when using Spark Connect. However, the function works properly. # Therefore, we temporarily perform Spark Connect tests by excluding metadata # until the issue is resolved. - def remove_metadata(struct_field: StructField) -> StructField: - new_struct_field = StructField( - struct_field.name, struct_field.dataType, struct_field.nullable - ) - return new_struct_field - assert all( - remove_metadata(data_field.struct_field) == remove_metadata(struct_field) + _drop_metadata(data_field.struct_field) == _drop_metadata(struct_field) for data_field, struct_field in zip(data_fields, struct_fields) ), (data_fields, struct_fields) else: @@ -959,7 +946,7 @@ def attach_distributed_sequence_column( return sdf.select( ConnectColumn(DistributedSequenceID()).alias(column_name), - "*", # type: ignore[call-overload] + "*", ) else: return PySparkDataFrame( @@ -985,27 +972,27 @@ def spark_column_for(self, label: Label) -> PySparkColumn: def spark_column_name_for(self, label_or_scol: Union[Label, PySparkColumn]) -> str: """Return the actual Spark column name for the given column label.""" - Column = get_column_class() - if isinstance(label_or_scol, Column): + + if isinstance(label_or_scol, PySparkColumn): return self.spark_frame.select(label_or_scol).columns[0] else: - return self.field_for(label_or_scol).name # type: ignore[arg-type] + return self.field_for(label_or_scol).name def spark_type_for(self, label_or_scol: Union[Label, PySparkColumn]) -> DataType: """Return DataType for the given column label.""" - Column = get_column_class() - if isinstance(label_or_scol, Column): + + if isinstance(label_or_scol, PySparkColumn): return self.spark_frame.select(label_or_scol).schema[0].dataType else: - return self.field_for(label_or_scol).spark_type # type: ignore[arg-type] + return self.field_for(label_or_scol).spark_type def spark_column_nullable_for(self, label_or_scol: Union[Label, PySparkColumn]) -> bool: """Return nullability for the given column label.""" - Column = get_column_class() - if isinstance(label_or_scol, Column): + + if isinstance(label_or_scol, PySparkColumn): return self.spark_frame.select(label_or_scol).schema[0].nullable else: - return self.field_for(label_or_scol).nullable # type: ignore[arg-type] + return self.field_for(label_or_scol).nullable def field_for(self, label: Label) -> InternalField: """Return InternalField for the given column label.""" diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index 42a0ce49faa56..4cea4b4fff225 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -94,9 +94,6 @@ from pyspark.pandas.indexes import Index, DatetimeIndex, TimedeltaIndex from pyspark.pandas.indexes.multi import MultiIndex -# For Supporting Spark Connect -from pyspark.sql.utils import get_column_class - __all__ = [ "from_pandas", "range", @@ -3398,8 +3395,7 @@ def rename(col: str) -> str: else: on = None - Column = get_column_class() - if tolerance is not None and not 
isinstance(tolerance, Column): + if tolerance is not None and not isinstance(tolerance, PySparkColumn): tolerance = F.lit(tolerance) as_of_joined_table = left_table._joinAsOf( @@ -3424,10 +3420,10 @@ def rename(col: str) -> str: data_columns = [] column_labels = [] - def left_scol_for(label: Label) -> Column: # type: ignore[valid-type] + def left_scol_for(label: Label) -> PySparkColumn: return scol_for(as_of_joined_table, left_internal.spark_column_name_for(label)) - def right_scol_for(label: Label) -> Column: # type: ignore[valid-type] + def right_scol_for(label: Label) -> PySparkColumn: return scol_for(as_of_joined_table, right_internal.spark_column_name_for(label)) for label in left_internal.column_labels: @@ -3441,7 +3437,7 @@ def right_scol_for(label: Label) -> Column: # type: ignore[valid-type] pass else: col = col + left_suffix - scol = scol.alias(col) # type: ignore[attr-defined] + scol = scol.alias(col) label = tuple([str(label[0]) + left_suffix] + list(label[1:])) exprs.append(scol) data_columns.append(col) @@ -3449,7 +3445,7 @@ def right_scol_for(label: Label) -> Column: # type: ignore[valid-type] for label in right_internal.column_labels: # recover `right_prefix` here. col = right_internal.spark_column_name_for(label)[len(right_prefix) :] - scol = right_scol_for(label).alias(col) # type: ignore[attr-defined] + scol = right_scol_for(label).alias(col) if label in duplicate_columns: spark_column_name = left_internal.spark_column_name_for(label) if spark_column_name in left_as_of_names + left_join_on_names and ( @@ -3458,7 +3454,7 @@ def right_scol_for(label: Label) -> Column: # type: ignore[valid-type] continue else: col = col + right_suffix - scol = scol.alias(col) # type: ignore[attr-defined] + scol = scol.alias(col) label = tuple([str(label[0]) + right_suffix] + list(label[1:])) exprs.append(scol) data_columns.append(col) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index 5bd2a67ed39bb..819ac02a51266 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -23,6 +23,7 @@ from pandas.core.dtypes.inference import is_integer from pyspark.sql import functions as F +from pyspark.sql.utils import is_remote from pyspark.pandas.missing import unsupported_function from pyspark.pandas.config import get_option from pyspark.pandas.utils import name_like_string @@ -571,10 +572,14 @@ def _get_plot_backend(backend=None): return module def __call__(self, kind="line", backend=None, **kwargs): + kind = {"density": "kde"}.get(kind, kind) + + if is_remote() and kind in ["hist", "kde"]: + return unsupported_function(class_name="pd.DataFrame", method_name=kind)() + plot_backend = PandasOnSparkPlotAccessor._get_plot_backend(backend) plot_data = self.data - kind = {"density": "kde"}.get(kind, kind) if hasattr(plot_backend, "plot_pandas_on_spark"): # use if there's pandas-on-Spark specific method. return plot_backend.plot_pandas_on_spark(plot_data, kind=kind, **kwargs) @@ -948,6 +953,9 @@ def hist(self, bins=10, **kwds): >>> df = ps.from_pandas(df) >>> df.plot.hist(bins=12, alpha=0.5) # doctest: +SKIP """ + if is_remote(): + return unsupported_function(class_name="pd.DataFrame", method_name="hist")() + return self(kind="hist", bins=bins, **kwds) def kde(self, bw_method=None, ind=None, **kwargs): @@ -1023,6 +1031,9 @@ def kde(self, bw_method=None, ind=None, **kwargs): ... 
}) >>> df.plot.kde(ind=[1, 2, 3, 4, 5, 6], bw_method=0.3) # doctest: +SKIP """ + if is_remote(): + return unsupported_function(class_name="pd.DataFrame", method_name="kde")() + return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) density = kde diff --git a/python/pyspark/pandas/resample.py b/python/pyspark/pandas/resample.py index 9683fc4f4e7ff..5557ca2af7738 100644 --- a/python/pyspark/pandas/resample.py +++ b/python/pyspark/pandas/resample.py @@ -56,7 +56,6 @@ scol_for, verify_temp_column_name, ) -from pyspark.pandas.spark.functions import timestampdiff class Resampler(Generic[FrameLike], metaclass=ABCMeta): @@ -279,7 +278,7 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column: truncated_ts_scol = F.date_trunc(unit_str, ts_scol) if isinstance(key_type, TimestampNTZType): truncated_ts_scol = F.to_timestamp_ntz(truncated_ts_scol) - diff = timestampdiff(unit_str, origin_scol, truncated_ts_scol) + diff = F.timestamp_diff(unit_str, origin_scol, truncated_ts_scol) mod = F.lit(0) if n == 1 else (diff % F.lit(n)) if rule_code in ["h", "H"]: diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index 8edc2c531b518..4e0f3ca349177 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -54,7 +54,12 @@ ) from pandas.tseries.frequencies import DateOffset -from pyspark.sql import functions as F, Column as PySparkColumn, DataFrame as SparkDataFrame +from pyspark.sql import ( + functions as F, + Column as PySparkColumn, + DataFrame as SparkDataFrame, + Window as PySparkWindow, +) from pyspark.sql.types import ( ArrayType, BooleanType, @@ -70,7 +75,6 @@ NullType, ) from pyspark.sql.window import Window -from pyspark.sql.utils import get_column_class, get_window_class from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm. 
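The resample.py change above drops the pandas-on-Spark timestampdiff wrapper in favour of the built-in F.timestamp_diff. Assuming a Spark build where pyspark.sql.functions.timestamp_diff is available (as that hunk requires), a minimal usage sketch with made-up data:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame(
        [("2024-01-01 00:00:00", "2024-01-03 12:00:00")], ["start", "end"]
    ).select(F.col("start").cast("timestamp"), F.col("end").cast("timestamp"))

    # Whole hours elapsed between the two timestamps, computed per row.
    sdf.select(
        F.timestamp_diff("HOUR", F.col("start"), F.col("end")).alias("hours")
    ).show()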
from pyspark.pandas._typing import Axis, Dtype, Label, Name, Scalar, T from pyspark.pandas.accessors import PandasOnSparkSeriesMethods @@ -2257,15 +2261,14 @@ def _interpolate( last_non_null = F.last(scol, True) null_index = SF.null_index(scol) - Window = get_window_class() - window_forward = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween( - Window.unboundedPreceding, Window.currentRow + window_forward = PySparkWindow.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween( + PySparkWindow.unboundedPreceding, PySparkWindow.currentRow ) last_non_null_forward = last_non_null.over(window_forward) null_index_forward = null_index.over(window_forward) - window_backward = Window.orderBy(F.desc(NATURAL_ORDER_COLUMN_NAME)).rowsBetween( - Window.unboundedPreceding, Window.currentRow + window_backward = PySparkWindow.orderBy(F.desc(NATURAL_ORDER_COLUMN_NAME)).rowsBetween( + PySparkWindow.unboundedPreceding, PySparkWindow.currentRow ) last_non_null_backward = last_non_null.over(window_backward) null_index_backward = null_index.over(window_backward) @@ -4171,11 +4174,10 @@ def _rank( if self._internal.index_level > 1: raise NotImplementedError("rank do not support MultiIndex now") - Column = get_column_class() if ascending: - asc_func = Column.asc + asc_func = PySparkColumn.asc else: - asc_func = Column.desc + asc_func = PySparkColumn.desc if method == "first": window = ( diff --git a/python/pyspark/pandas/spark/accessors.py b/python/pyspark/pandas/spark/accessors.py index 1ac12bb59ddc4..7f3041cf79c7c 100644 --- a/python/pyspark/pandas/spark/accessors.py +++ b/python/pyspark/pandas/spark/accessors.py @@ -27,7 +27,6 @@ from pyspark.sql.types import DataType, StructType from pyspark.pandas._typing import IndexOpsLike from pyspark.pandas.internal import InternalField -from pyspark.sql.utils import get_column_class, get_dataframe_class if TYPE_CHECKING: from pyspark.sql._typing import OptionalPrimitiveType @@ -116,8 +115,7 @@ def transform(self, func: Callable[[PySparkColumn], PySparkColumn]) -> IndexOpsL if isinstance(self._data, MultiIndex): raise NotImplementedError("MultiIndex does not support spark.transform yet.") output = func(self._data.spark.column) - Column = get_column_class() - if not isinstance(output, Column): + if not isinstance(output, PySparkColumn): raise ValueError( "The output of the function [%s] should be of a " "pyspark.sql.Column; however, got [%s]." % (func, type(output)) @@ -192,8 +190,7 @@ def apply(self, func: Callable[[PySparkColumn], PySparkColumn]) -> "ps.Series": from pyspark.pandas.internal import HIDDEN_COLUMNS output = func(self._data.spark.column) - Column = get_column_class() - if not isinstance(output, Column): + if not isinstance(output, PySparkColumn): raise ValueError( "The output of the function [%s] should be of a " "pyspark.sql.Column; however, got [%s]." % (func, type(output)) @@ -240,7 +237,8 @@ def analyzed(self) -> "ps.Series": However, it won't work with the same anchor Series. - >>> ser + ser.spark.analyzed + >>> with ps.option_context('compute.ops_on_diff_frames', False): + ... ser + ser.spark.analyzed Traceback (most recent call last): ... ValueError: ... enable 'compute.ops_on_diff_frames' option. @@ -290,7 +288,8 @@ def analyzed(self) -> "ps.Index": However, it won't work with the same anchor Index. - >>> idx + idx.spark.analyzed + >>> with ps.option_context('compute.ops_on_diff_frames', False): + ... idx + idx.spark.analyzed Traceback (most recent call last): ... ValueError: ... enable 'compute.ops_on_diff_frames' option. 
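The interpolation hunk above leans on the classic forward-fill idiom: F.last with ignorenulls=True over an unbounded-preceding window carries the latest non-null value forward, and a descending window does the backward pass. A compact sketch of the forward pass with made-up data:

    from pyspark.sql import SparkSession, Window, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame(
        [(0, 1.0), (1, None), (2, None), (3, 4.0)], ["id", "v"]
    )

    w_fwd = Window.orderBy("id").rowsBetween(Window.unboundedPreceding, Window.currentRow)

    # The last non-null value seen so far, in row order: 1.0, 1.0, 1.0, 4.0.
    sdf.select("id", F.last("v", ignorenulls=True).over(w_fwd).alias("v_ffill")).show()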
@@ -936,8 +935,7 @@ def apply( 2 3 1 """ output = func(self.frame(index_col)) - SparkDataFrame = get_dataframe_class() - if not isinstance(output, SparkDataFrame): + if not isinstance(output, PySparkDataFrame): raise ValueError( "The output of the function [%s] should be of a " "pyspark.sql.DataFrame; however, got [%s]." % (func, type(output)) @@ -1148,7 +1146,8 @@ def analyzed(self) -> "ps.DataFrame": However, it won't work with the same anchor Series. - >>> df + df.spark.analyzed + >>> with ps.option_context('compute.ops_on_diff_frames', False): + ... df + df.spark.analyzed Traceback (most recent call last): ... ValueError: ... enable 'compute.ops_on_diff_frames' option. diff --git a/python/pyspark/pandas/spark/functions.py b/python/pyspark/pandas/spark/functions.py index a598ff37c47f3..db1cc423078a7 100644 --- a/python/pyspark/pandas/spark/functions.py +++ b/python/pyspark/pandas/spark/functions.py @@ -25,9 +25,9 @@ def product(col: Column, dropna: bool) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_product", - col, # type: ignore[arg-type] + col, lit(dropna), ) @@ -42,9 +42,9 @@ def stddev(col: Column, ddof: int) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_stddev", - col, # type: ignore[arg-type] + col, lit(ddof), ) @@ -59,9 +59,9 @@ def var(col: Column, ddof: int) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_var", - col, # type: ignore[arg-type] + col, lit(ddof), ) @@ -76,9 +76,9 @@ def skew(col: Column) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_skew", - col, # type: ignore[arg-type] + col, ) else: @@ -92,9 +92,9 @@ def kurt(col: Column) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_kurt", - col, # type: ignore[arg-type] + col, ) else: @@ -108,9 +108,9 @@ def mode(col: Column, dropna: bool) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_mode", - col, # type: ignore[arg-type] + col, lit(dropna), ) @@ -125,10 +125,10 @@ def covar(col1: Column, col2: Column, ddof: int) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "pandas_covar", - col1, # type: ignore[arg-type] - col2, # type: ignore[arg-type] + col1, + col2, lit(ddof), ) @@ -143,9 +143,9 @@ def ewm(col: Column, alpha: float, ignore_na: bool) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - return _invoke_function_over_columns( 
# type: ignore[return-value] + return _invoke_function_over_columns( "ewm", - col, # type: ignore[arg-type] + col, lit(alpha), lit(ignore_na), ) @@ -161,9 +161,9 @@ def null_index(col: Column) -> Column: if is_remote(): from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns - return _invoke_function_over_columns( # type: ignore[return-value] + return _invoke_function_over_columns( "null_index", - col, # type: ignore[arg-type] + col, ) else: @@ -171,21 +171,3 @@ def null_index(col: Column) -> Column: sc = SparkContext._active_spark_context return Column(sc._jvm.PythonSQLUtils.nullIndex(col._jc)) - - -def timestampdiff(unit: str, start: Column, end: Column) -> Column: - if is_remote(): - from pyspark.sql.connect.functions.builtin import _invoke_function_over_columns, lit - - return _invoke_function_over_columns( # type: ignore[return-value] - "timestampdiff", - lit(unit), - start, # type: ignore[arg-type] - end, # type: ignore[arg-type] - ) - - else: - from pyspark import SparkContext - - sc = SparkContext._active_spark_context - return Column(sc._jvm.PythonSQLUtils.timestampDiff(unit, start._jc, end._jc)) diff --git a/python/pyspark/pandas/sql_formatter.py b/python/pyspark/pandas/sql_formatter.py index 7e8263f552f0c..b6d48077675bd 100644 --- a/python/pyspark/pandas/sql_formatter.py +++ b/python/pyspark/pandas/sql_formatter.py @@ -27,10 +27,10 @@ from pyspark.pandas.namespace import _get_index_map from pyspark import pandas as ps from pyspark.sql import SparkSession +from pyspark.sql.utils import get_lit_sql_str from pyspark.pandas.utils import default_session from pyspark.pandas.frame import DataFrame from pyspark.pandas.series import Series -from pyspark.errors import PySparkTypeError from pyspark.sql.utils import is_remote @@ -203,15 +203,16 @@ def sql( session = default_session() formatter = PandasSQLStringFormatter(session) try: - # ps.DataFrame are not supported for Spark Connect currently. - if is_remote(): - for obj in kwargs.values(): - if isinstance(obj, ps.DataFrame): - raise PySparkTypeError( - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(obj).__name__}, - ) - sdf = session.sql(formatter.format(query, **kwargs), args) + if not is_remote(): + sdf = session.sql(formatter.format(query, **kwargs), args) + else: + ps_query = formatter.format(query, **kwargs) + # here the new_kwargs stores the views + new_kwargs = {} + for psdf, name in formatter._temp_views: + new_kwargs[name] = psdf._to_spark() + # delegate views to spark.sql + sdf = session.sql(ps_query, args, **new_kwargs) finally: formatter.clear() @@ -264,30 +265,42 @@ def _convert_value(self, val: Any, name: str) -> Optional[str]: elif isinstance(val, (DataFrame, pd.DataFrame)): df_name = "_pandas_api_%s" % str(uuid.uuid4()).replace("-", "") - if isinstance(val, pd.DataFrame): - # Don't store temp view for plain pandas instances - # because it is unable to know which pandas DataFrame - # holds which Series. - val = ps.from_pandas(val) + if not is_remote(): + if isinstance(val, pd.DataFrame): + # Don't store temp view for plain pandas instances + # because it is unable to know which pandas DataFrame + # holds which Series. + val = ps.from_pandas(val) + else: + for df, n in self._temp_views: + if df is val: + return n + self._temp_views.append((val, df_name)) + val._to_spark().createOrReplaceTempView(df_name) + return df_name else: + if isinstance(val, pd.DataFrame): + # Always convert pd.DataFrame to ps.DataFrame, and record it in _temp_views. 
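The helpers in `python/pyspark/pandas/spark/functions.py` that these hunks touch all share the same two-way dispatch: under Spark Connect they build an unresolved function call via `_invoke_function_over_columns`, otherwise they call a JVM helper through the active `SparkContext`. A minimal sketch of that shape; `pandas_foo` and the JVM method `pandasFoo` are hypothetical names used only for illustration:

```python
# Sketch of the dispatch pattern shared by the helpers in
# pyspark/pandas/spark/functions.py. "pandas_foo" and the JVM helper
# "pandasFoo" are hypothetical names, not part of the patch.
from pyspark.sql.column import Column
from pyspark.sql.utils import is_remote


def pandas_foo(col: Column, dropna: bool) -> Column:
    if is_remote():
        # Spark Connect: build an unresolved function call on the client side.
        from pyspark.sql.connect.functions.builtin import (
            _invoke_function_over_columns,
            lit,
        )

        return _invoke_function_over_columns("pandas_foo", col, lit(dropna))
    else:
        # Classic: call a JVM helper through the active SparkContext.
        from pyspark import SparkContext
        from pyspark.sql.classic.column import Column as ClassicColumn

        sc = SparkContext._active_spark_context
        return ClassicColumn(sc._jvm.PythonSQLUtils.pandasFoo(col._jc, dropna))
```

The dropped `# type: ignore[return-value]` / `# type: ignore[arg-type]` suppressions appear to be possible because the Connect column type now fits the shared `pyspark.sql.Column` interface, so no cast is needed on either branch.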
+ val = ps.from_pandas(val) + for df, n in self._temp_views: if df is val: return n - self._temp_views.append((val, df_name)) - - val._to_spark().createOrReplaceTempView(df_name) - return df_name + self._temp_views.append((val, name)) + # In Spark Connect, keep the original view name here (not the UUID one), + # the reformatted query is like: 'select * from {tbl} where A > 1' + # and then delegate the view operations to spark.sql. + return "{" + name + "}" elif isinstance(val, str): - # This is matched to behavior from JVM implementation. - # See `sql` definition from `sql/catalyst/src/main/scala/org/apache/spark/ - # sql/catalyst/expressions/literals.scala` - return "'" + val.replace("\\", "\\\\").replace("'", "\\'") + "'" + return get_lit_sql_str(val) else: return val def clear(self) -> None: - for _, n in self._temp_views: - self._session.catalog.dropTempView(n) + # In Spark Connect, views are created and dropped in Connect Server + if not is_remote(): + for _, n in self._temp_views: + self._session.catalog.dropTempView(n) self._temp_views = [] self._ref_sers = [] diff --git a/python/pyspark/pandas/tests/computation/test_binary_ops.py b/python/pyspark/pandas/tests/computation/test_binary_ops.py index 966b3d9cee7ec..44aa9380c19ec 100644 --- a/python/pyspark/pandas/tests/computation/test_binary_ops.py +++ b/python/pyspark/pandas/tests/computation/test_binary_ops.py @@ -49,11 +49,12 @@ def test_binary_operators(self): self.assert_eq(psdf + psdf.loc[:, ["A", "B"]], pdf + pdf.loc[:, ["A", "B"]]) self.assert_eq(psdf.loc[:, ["A", "B"]] + psdf, pdf.loc[:, ["A", "B"]] + pdf) - self.assertRaisesRegex( - ValueError, - "it comes from a different dataframe", - lambda: ps.range(10).add(ps.range(10)), - ) + with ps.option_context("compute.ops_on_diff_frames", False): + self.assertRaisesRegex( + ValueError, + "it comes from a different dataframe", + lambda: ps.range(10).add(ps.range(10)), + ) self.assertRaisesRegex( TypeError, diff --git a/python/pyspark/pandas/tests/computation/test_corr.py b/python/pyspark/pandas/tests/computation/test_corr.py index 99dc1733539af..49cb84ca22e04 100644 --- a/python/pyspark/pandas/tests/computation/test_corr.py +++ b/python/pyspark/pandas/tests/computation/test_corr.py @@ -160,8 +160,9 @@ def test_series_corr(self): psser1 = ps.from_pandas(pser1) psser2 = ps.from_pandas(pser2) - with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - psser1.corr(psser2) + with ps.option_context("compute.ops_on_diff_frames", False): + with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): + psser1.corr(psser2) for method in ["pearson", "spearman", "kendall"]: with ps.option_context("compute.ops_on_diff_frames", True): diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_default.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_default.py index d6f0cadbf0cd1..4240eb8fdbc81 100644 --- a/python/pyspark/pandas/tests/connect/indexes/test_parity_default.py +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_default.py @@ -19,6 +19,7 @@ from pyspark.pandas.tests.indexes.test_default import DefaultIndexTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils +from pyspark.util import is_remote_only class DefaultIndexParityTests( @@ -26,7 +27,7 @@ class DefaultIndexParityTests( PandasOnSparkTestUtils, ReusedConnectTestCase, ): - @unittest.skip("Test depends on SparkContext which is not supported from Spark Connect.") + 
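With the `sql_formatter.py` change above, `ps.sql` on Spark Connect no longer rejects pandas-on-Spark frames: the formatter keeps `{name}` placeholders in the query and hands the backing Spark DataFrames to `session.sql` as keyword arguments, so temp-view creation and cleanup happen on the server side. A small usage sketch (data and column names are illustrative):

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]})

# Works the same on classic Spark and on Spark Connect; on Connect the frame
# referenced as {tbl} is passed through to spark.sql(..., tbl=...) instead of
# being registered as a client-side temp view keyed by a UUID.
result = ps.sql("SELECT A, B FROM {tbl} WHERE A > 1", tbl=psdf)
print(result.sort_values("A").to_pandas())
```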
@unittest.skipIf(is_remote_only(), "Requires JVM access") def test_index_distributed_sequence_cleanup(self): super().test_index_distributed_sequence_cleanup() diff --git a/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py b/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py index f093f48b16e9c..abb18d473bf8d 100644 --- a/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py +++ b/python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py @@ -24,6 +24,10 @@ class SeriesPlotMatplotlibParityTests( SeriesPlotMatplotlibTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase ): + @unittest.skip("Test depends on Spark ML which is not supported from Spark Connect.") + def test_empty_hist(self): + super().test_empty_hist() + @unittest.skip("Test depends on Spark ML which is not supported from Spark Connect.") def test_hist(self): super().test_hist() diff --git a/python/pyspark/pandas/tests/connect/test_connect_plotting.py b/python/pyspark/pandas/tests/connect/test_connect_plotting.py new file mode 100644 index 0000000000000..9b7cfebfcd552 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/test_connect_plotting.py @@ -0,0 +1,124 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.pandas.exceptions import PandasNotImplementedError +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils + + +class ConnectPlottingTests(PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase): + @property + def pdf1(self): + return pd.DataFrame( + [[1, 2], [4, 5], [7, 8]], + index=["cobra", "viper", None], + columns=["max_speed", "shield"], + ) + + @property + def psdf1(self): + return ps.from_pandas(self.pdf1) + + def test_unsupported_functions(self): + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot.hist() + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot.hist(bins=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot.kde() + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot.kde(bw_method=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot.density() + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot.density(bw_method=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot.hist() + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot.hist(bins=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot.kde() + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot.kde(bw_method=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot.density() + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot.density(bw_method=3) + + def test_unsupported_kinds(self): + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot(kind="hist") + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot(kind="hist", bins=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot(kind="kde") + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot(kind="kde", bw_method=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot(kind="density") + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.plot(kind="density", bw_method=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot(kind="hist") + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot(kind="hist", bins=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot(kind="kde") + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot(kind="kde", bw_method=3) + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot(kind="density") + + with self.assertRaises(PandasNotImplementedError): + self.psdf1.shield.plot(kind="density", bw_method=3) + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.test_connect_plotting import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py b/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py index 24626a9164e84..4a8417382253e 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py +++ b/python/pyspark/pandas/tests/connect/test_parity_frame_spark.py @@ 
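The new `test_connect_plotting.py` pins down the current Spark Connect behavior: histogram, KDE, and density plots for pandas-on-Spark raise `PandasNotImplementedError` (the matplotlib parity skips note these depend on Spark ML, which is not available over Connect). What a caller sees, assuming an active Spark Connect session:

```python
import pandas as pd
import pyspark.pandas as ps
from pyspark.pandas.exceptions import PandasNotImplementedError

psdf = ps.from_pandas(
    pd.DataFrame({"max_speed": [1, 4, 7], "shield": [2, 5, 8]})
)

try:
    # Same outcome for .plot.kde(), .plot.density(), and kind="hist"/"kde"/"density".
    psdf.plot.hist(bins=3)
except PandasNotImplementedError as e:
    print("Not supported over Spark Connect:", e)
```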
-24,19 +24,19 @@ class SparkFrameMethodsParityTests( SparkFrameMethodsTestsMixin, TestUtils, PandasOnSparkTestUtils, ReusedConnectTestCase ): - @unittest.skip("Test depends on checkpoint which is not supported from Spark Connect.") + @unittest.skip("Test depends on SparkContext which is not supported from Spark Connect.") def test_checkpoint(self): super().test_checkpoint() - @unittest.skip("Test depends on RDD which is not supported from Spark Connect.") + @unittest.skip( + "Test depends on RDD, and cannot use SQL expression due to Catalyst optimization" + ) def test_coalesce(self): super().test_coalesce() - @unittest.skip("Test depends on localCheckpoint which is not supported from Spark Connect.") - def test_local_checkpoint(self): - super().test_local_checkpoint() - - @unittest.skip("Test depends on RDD which is not supported from Spark Connect.") + @unittest.skip( + "Test depends on RDD, and cannot use SQL expression due to Catalyst optimization" + ) def test_repartition(self): super().test_repartition() diff --git a/python/pyspark/pandas/tests/connect/test_parity_sql.py b/python/pyspark/pandas/tests/connect/test_parity_sql.py index 2e503cac07a8a..29abbda8c0ebb 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_sql.py +++ b/python/pyspark/pandas/tests/connect/test_parity_sql.py @@ -22,13 +22,7 @@ class SQLParityTests(SQLTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): - @unittest.skip("Test depends on temp view issue on JVM side.") - def test_sql_with_index_col(self): - super().test_sql_with_index_col() - - @unittest.skip("Test depends on temp view issue on JVM side.") - def test_sql_with_pandas_on_spark_objects(self): - super().test_sql_with_pandas_on_spark_objects() + pass if __name__ == "__main__": diff --git a/python/pyspark/pandas/tests/frame/test_constructor.py b/python/pyspark/pandas/tests/frame/test_constructor.py index ee010d8f023dd..d0d842d8264c7 100644 --- a/python/pyspark/pandas/tests/frame/test_constructor.py +++ b/python/pyspark/pandas/tests/frame/test_constructor.py @@ -137,13 +137,14 @@ def test_creation_index(self): pd.DataFrame(data=data, index=pd.Index([1, 2, 3, 5, 6])), ) - err_msg = "Cannot combine the series or dataframe" - with self.assertRaisesRegex(ValueError, err_msg): - # test ps.DataFrame with ps.Index - ps.DataFrame(data=ps.DataFrame([1, 2]), index=ps.Index([1, 2])) - with self.assertRaisesRegex(ValueError, err_msg): - # test ps.DataFrame with pd.Index - ps.DataFrame(data=ps.DataFrame([1, 2]), index=pd.Index([3, 4])) + with ps.option_context("compute.ops_on_diff_frames", False): + err_msg = "Cannot combine the series or dataframe" + with self.assertRaisesRegex(ValueError, err_msg): + # test ps.DataFrame with ps.Index + ps.DataFrame(data=ps.DataFrame([1, 2]), index=ps.Index([1, 2])) + with self.assertRaisesRegex(ValueError, err_msg): + # test ps.DataFrame with pd.Index + ps.DataFrame(data=ps.DataFrame([1, 2]), index=pd.Index([3, 4])) with ps.option_context("compute.ops_on_diff_frames", True): # test pd.DataFrame with pd.Index @@ -195,14 +196,14 @@ def test_creation_index(self): with ps.option_context("compute.ops_on_diff_frames", True): # test with ps.DataFrame and pd.Index self.assert_eq( - ps.DataFrame(data=psdf, index=pd.Index([2, 3, 4, 5, 6])), - pd.DataFrame(data=pdf, index=pd.Index([2, 3, 4, 5, 6])), + ps.DataFrame(data=psdf, index=pd.Index([2, 3, 4, 5, 6])).sort_index(), + pd.DataFrame(data=pdf, index=pd.Index([2, 3, 4, 5, 6])).sort_index(), ) # test with ps.DataFrame and ps.Index self.assert_eq( - ps.DataFrame(data=psdf, 
index=ps.Index([2, 3, 4, 5, 6])), - pd.DataFrame(data=pdf, index=pd.Index([2, 3, 4, 5, 6])), + ps.DataFrame(data=psdf, index=ps.Index([2, 3, 4, 5, 6])).sort_index(), + pd.DataFrame(data=pdf, index=pd.Index([2, 3, 4, 5, 6])).sort_index(), ) # test String Index @@ -269,11 +270,11 @@ def test_creation_index(self): ps.DataFrame( data=pdf, index=pd.DatetimeIndex(["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"]), - ), + ).sort_index(), pd.DataFrame( data=pdf, index=pd.DatetimeIndex(["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"]), - ), + ).sort_index(), ) # test with pd.DataFrame and ps.DatetimeIndex @@ -281,11 +282,11 @@ def test_creation_index(self): ps.DataFrame( data=pdf, index=ps.DatetimeIndex(["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"]), - ), + ).sort_index(), pd.DataFrame( data=pdf, index=pd.DatetimeIndex(["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"]), - ), + ).sort_index(), ) with ps.option_context("compute.ops_on_diff_frames", True): @@ -296,13 +297,13 @@ def test_creation_index(self): index=pd.DatetimeIndex( ["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"] ), - ), + ).sort_index(), pd.DataFrame( data=pdf, index=pd.DatetimeIndex( ["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"] ), - ), + ).sort_index(), ) # test with ps.DataFrame and ps.DatetimeIndex @@ -312,13 +313,13 @@ def test_creation_index(self): index=ps.DatetimeIndex( ["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"] ), - ), + ).sort_index(), pd.DataFrame( data=pdf, index=pd.DatetimeIndex( ["2022-08-31", "2022-09-02", "2022-09-03", "2022-09-05"] ), - ), + ).sort_index(), ) # test MultiIndex diff --git a/python/pyspark/pandas/tests/groupby/test_groupby.py b/python/pyspark/pandas/tests/groupby/test_groupby.py index 5867f7b62fa5e..b58bfddb4b996 100644 --- a/python/pyspark/pandas/tests/groupby/test_groupby.py +++ b/python/pyspark/pandas/tests/groupby/test_groupby.py @@ -451,6 +451,27 @@ def test_diff(self): pdf.groupby([("x", "a"), ("x", "b")]).diff().sort_index(), ) + def test_aggregate_relabel_index_false(self): + pdf = pd.DataFrame( + { + "A": [0, 0, 1, 1, 1], + "B": ["a", "a", "b", "a", "b"], + "C": [10, 15, 10, 20, 30], + } + ) + psdf = ps.from_pandas(pdf) + + self.assert_eq( + pdf.groupby(["B", "A"], as_index=False) + .agg(C_MAX=("C", "max")) + .sort_values(["B", "A"]) + .reset_index(drop=True), + psdf.groupby(["B", "A"], as_index=False) + .agg(C_MAX=("C", "max")) + .sort_values(["B", "A"]) + .reset_index(drop=True), + ) + class GroupByTests( GroupByTestsMixin, diff --git a/python/pyspark/pandas/tests/indexes/test_default.py b/python/pyspark/pandas/tests/indexes/test_default.py index 3d19eb407b42c..5cd9fae76dfbe 100644 --- a/python/pyspark/pandas/tests/indexes/test_default.py +++ b/python/pyspark/pandas/tests/indexes/test_default.py @@ -44,7 +44,7 @@ def test_index_distributed_sequence_cleanup(self): "compute.default_index_type", "distributed-sequence" ), ps.option_context("compute.ops_on_diff_frames", True): with ps.option_context("compute.default_index_cache", "LOCAL_CHECKPOINT"): - cached_rdd_ids = [rdd_id for rdd_id in self.spark._jsc.getPersistentRDDs()] + cached_rdd_ids = [rdd_id for rdd_id in self._legacy_sc._jsc.getPersistentRDDs()] psdf1 = ( self.spark.range(0, 100, 1, 10).withColumn("Key", F.col("id") % 33).pandas_api() @@ -61,13 +61,13 @@ def test_index_distributed_sequence_cleanup(self): self.assertTrue( any( rdd_id not in cached_rdd_ids - for rdd_id in self.spark._jsc.getPersistentRDDs() + for rdd_id in self._legacy_sc._jsc.getPersistentRDDs() ) ) for 
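The added `test_aggregate_relabel_index_false` exercises named aggregation combined with `as_index=False`, which keeps the grouping keys as ordinary columns. The pattern being tested, using the same toy data as the test:

```python
import pyspark.pandas as ps

psdf = ps.DataFrame(
    {
        "A": [0, 0, 1, 1, 1],
        "B": ["a", "a", "b", "a", "b"],
        "C": [10, 15, 10, 20, 30],
    }
)

# Named aggregation: the result has columns B, A, and C_MAX, matching pandas.
out = (
    psdf.groupby(["B", "A"], as_index=False)
    .agg(C_MAX=("C", "max"))
    .sort_values(["B", "A"])
)
print(out.to_pandas())
```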
storage_level in ["NONE", "DISK_ONLY_2", "MEMORY_AND_DISK_SER"]: with ps.option_context("compute.default_index_cache", storage_level): - cached_rdd_ids = [rdd_id for rdd_id in self.spark._jsc.getPersistentRDDs()] + cached_rdd_ids = [rdd_id for rdd_id in self._legacy_sc._jsc.getPersistentRDDs()] psdf1 = ( self.spark.range(0, 100, 1, 10) @@ -86,7 +86,7 @@ def test_index_distributed_sequence_cleanup(self): self.assertTrue( all( rdd_id in cached_rdd_ids - for rdd_id in self.spark._jsc.getPersistentRDDs() + for rdd_id in self._legacy_sc._jsc.getPersistentRDDs() ) ) diff --git a/python/pyspark/pandas/tests/indexes/test_indexing.py b/python/pyspark/pandas/tests/indexes/test_indexing.py index c7367492a20f5..3178e8b17665a 100644 --- a/python/pyspark/pandas/tests/indexes/test_indexing.py +++ b/python/pyspark/pandas/tests/indexes/test_indexing.py @@ -235,7 +235,9 @@ def test_insert(self): self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True) psser = ps.Series([4, 5, 6]) - self.assertRaises(ValueError, lambda: psdf.insert(0, "y", psser)) + with ps.option_context("compute.ops_on_diff_frames", False): + self.assertRaises(ValueError, lambda: psdf.insert(0, "y", psser)) + self.assertRaisesRegex( ValueError, "cannot insert b, already exists", lambda: psdf.insert(1, "b", 10) ) @@ -256,7 +258,9 @@ def test_insert(self): ) self.assertRaises(ValueError, lambda: psdf.insert(0, "e", [7, 8, 9, 10])) - self.assertRaises(ValueError, lambda: psdf.insert(0, "f", ps.Series([7, 8]))) + with ps.option_context("compute.ops_on_diff_frames", False): + self.assertRaises(ValueError, lambda: psdf.insert(0, "f", ps.Series([7, 8]))) + self.assertRaises(AssertionError, lambda: psdf.insert(100, "y", psser)) self.assertRaises(AssertionError, lambda: psdf.insert(1, "y", psser, allow_duplicates=True)) diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py index 0fe2944bcabe9..fec45072cf93a 100644 --- a/python/pyspark/pandas/utils.py +++ b/python/pyspark/pandas/utils.py @@ -42,7 +42,7 @@ from pyspark.sql import functions as F, Column, DataFrame as PySparkDataFrame, SparkSession from pyspark.sql.types import DoubleType -from pyspark.sql.utils import is_remote, get_dataframe_class +from pyspark.sql.utils import is_remote from pyspark.errors import PySparkTypeError from pyspark import pandas as ps # noqa: F401 from pyspark.pandas._typing import ( @@ -915,8 +915,7 @@ def verify_temp_column_name( ) column_name = column_name_or_label - SparkDataFrame = get_dataframe_class() - assert isinstance(df, SparkDataFrame), type(df) + assert isinstance(df, PySparkDataFrame), type(df) assert ( column_name not in df.columns ), "The given column name `{}` already exists in the Spark DataFrame: {}".format( diff --git a/python/pyspark/resource/profile.py b/python/pyspark/resource/profile.py index a22afdf16c8b0..e9e6ef3520eea 100644 --- a/python/pyspark/resource/profile.py +++ b/python/pyspark/resource/profile.py @@ -201,14 +201,15 @@ class ResourceProfileBuilder: """ def __init__(self) -> None: - from pyspark.core.context import SparkContext + from pyspark.sql import is_remote - # TODO: ignore[attr-defined] will be removed, once SparkContext is inlined - _jvm = SparkContext._jvm + _jvm = None + if not is_remote(): + from pyspark.core.context import SparkContext - from pyspark.sql import is_remote + _jvm = SparkContext._jvm - if _jvm is not None and not is_remote(): + if _jvm is not None: self._jvm = _jvm self._java_resource_profile_builder = ( _jvm.org.apache.spark.resource.ResourceProfileBuilder() diff --git 
a/python/pyspark/resource/requests.py b/python/pyspark/resource/requests.py index 746fca9848393..fa8bb43ee2c49 100644 --- a/python/pyspark/resource/requests.py +++ b/python/pyspark/resource/requests.py @@ -164,14 +164,17 @@ def __init__( _jvm: Optional["JVMView"] = None, _requests: Optional[Dict[str, ExecutorResourceRequest]] = None, ): - from pyspark import SparkContext from pyspark.sql import is_remote - _jvm = _jvm or SparkContext._jvm + jvm = None + if not is_remote(): + from pyspark.core.context import SparkContext - if _jvm is not None and not is_remote(): + jvm = _jvm or SparkContext._jvm + + if jvm is not None: self._java_executor_resource_requests = ( - _jvm.org.apache.spark.resource.ExecutorResourceRequests() + jvm.org.apache.spark.resource.ExecutorResourceRequests() ) if _requests is not None: for k, v in _requests.items(): @@ -462,15 +465,18 @@ def __init__( _jvm: Optional["JVMView"] = None, _requests: Optional[Dict[str, TaskResourceRequest]] = None, ): - from pyspark import SparkContext from pyspark.sql import is_remote - _jvm = _jvm or SparkContext._jvm + jvm = None + if not is_remote(): + from pyspark.core.context import SparkContext + + jvm = _jvm or SparkContext._jvm - if _jvm is not None and not is_remote(): + if jvm is not None: self._java_task_resource_requests: Optional[ "JavaObject" - ] = _jvm.org.apache.spark.resource.TaskResourceRequests() + ] = jvm.org.apache.spark.resource.TaskResourceRequests() if _requests is not None: for k, v in _requests.items(): if k == self._CPUS: diff --git a/python/pyspark/resource/tests/test_connect_resources.py b/python/pyspark/resource/tests/test_connect_resources.py index 1529a33cb0ad0..90bae85c2a1b7 100644 --- a/python/pyspark/resource/tests/test_connect_resources.py +++ b/python/pyspark/resource/tests/test_connect_resources.py @@ -15,6 +15,7 @@ # limitations under the License. # import unittest +import os from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests, ExecutorResourceRequests from pyspark.sql import SparkSession @@ -35,20 +36,20 @@ def test_profile_before_sc_for_connect(self): # check taskResources, similar to executorResources. self.assertEqual(rp.taskResources["cpus"].amount, 2.0) - # SparkContext is not initialized and is not remote. - with self.assertRaisesRegex( - RuntimeError, "SparkContext must be created to get the profile id." - ): + # SparkContext or SparkSesssion is not initialized. + with self.assertRaises(RuntimeError): rp.id # Remote mode. - spark = SparkSession.builder.remote("local-cluster[1, 2, 1024]").getOrCreate() + spark = SparkSession.builder.remote( + os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local-cluster[1, 2, 1024]") + ).getOrCreate() # Still can access taskResources, similar to executorResources. 
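After the `profile.py`/`requests.py` changes above, the resource-request classes only reach for the JVM when `is_remote()` is false, so a `ResourceProfile` can be assembled on a pure Spark Connect client before any session exists; its `id` is still resolved lazily, as the updated test checks. A short sketch of the builder usage the test exercises, with illustrative amounts:

```python
from pyspark.resource import (
    ExecutorResourceRequests,
    ResourceProfileBuilder,
    TaskResourceRequests,
)

# No SparkContext or JVM is needed to build the profile itself.
exec_reqs = ExecutorResourceRequests().cores(2).memory("6g")
task_reqs = TaskResourceRequests().cpus(2)

rp = ResourceProfileBuilder().require(exec_reqs).require(task_reqs).build
print(rp.taskResources["cpus"].amount)  # 2.0

# Accessing rp.id before a SparkContext or SparkSession exists raises RuntimeError.
```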
self.assertEqual(rp.taskResources["cpus"].amount, 2.0) rp.id df = spark.range(10) - df.mapInPandas(lambda x: x, df.schema, False, rp).collect() - df.mapInArrow(lambda x: x, df.schema, False, rp).collect() + df.mapInPandas(lambda x: x, df.schema, False, rp).show(n=10) + df.mapInArrow(lambda x: x, df.schema, False, rp).show(n=10) def assert_request_contents(exec_reqs, task_reqs): self.assertEqual(len(exec_reqs), 6) diff --git a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py index 5cebfa384045e..fb3bd53984959 100644 --- a/python/pyspark/sql/avro/functions.py +++ b/python/pyspark/sql/avro/functions.py @@ -22,7 +22,8 @@ from typing import Dict, Optional, TYPE_CHECKING, cast -from pyspark.sql.column import Column, _to_java_column +from pyspark.errors import PySparkTypeError +from pyspark.sql.column import Column from pyspark.sql.utils import get_active_spark_context, try_remote_avro_functions from pyspark.util import _print_missing_jar @@ -78,6 +79,26 @@ def from_avro( [Row(value=Row(avro=Row(age=2, name='Alice')))] """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_java_column + + if not isinstance(data, (Column, str)): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={ + "arg_name": "data", + "arg_type": "pyspark.sql.Column or str", + }, + ) + if not isinstance(jsonFormatSchema, str): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={"arg_name": "jsonFormatSchema", "arg_type": "str"}, + ) + if options is not None and not isinstance(options, dict): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={"arg_name": "options", "arg_type": "dict, optional"}, + ) sc = get_active_spark_context() try: @@ -128,6 +149,21 @@ def to_avro(data: "ColumnOrName", jsonFormatSchema: str = "") -> Column: [Row(suite=bytearray(b'\\x02\\x00'))] """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_java_column + + if not isinstance(data, (Column, str)): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={ + "arg_name": "data", + "arg_type": "pyspark.sql.Column or str", + }, + ) + if not isinstance(jsonFormatSchema, str): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={"arg_name": "jsonFormatSchema", "arg_type": "str"}, + ) sc = get_active_spark_context() try: diff --git a/python/pyspark/sql/classic/__init__.py b/python/pyspark/sql/classic/__init__.py new file mode 100644 index 0000000000000..f7ae391c3186d --- /dev/null +++ b/python/pyspark/sql/classic/__init__.py @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Spark Classic specific""" diff --git a/python/pyspark/sql/classic/column.py b/python/pyspark/sql/classic/column.py new file mode 100644 index 0000000000000..7630cfed5c173 --- /dev/null +++ b/python/pyspark/sql/classic/column.py @@ -0,0 +1,637 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import json +import warnings +from typing import ( + cast, + overload, + Any, + Callable, + Iterable, + List, + Optional, + Tuple, + TYPE_CHECKING, + Union, +) + +from pyspark.sql.column import Column as ParentColumn +from pyspark.errors import PySparkAttributeError, PySparkTypeError, PySparkValueError +from pyspark.errors.utils import with_origin_to_class +from pyspark.sql.types import DataType +from pyspark.sql.utils import get_active_spark_context + +if TYPE_CHECKING: + from py4j.java_gateway import JavaObject + from pyspark.core.context import SparkContext + from pyspark.sql._typing import ColumnOrName, LiteralType, DecimalLiteral, DateTimeLiteral + from pyspark.sql.window import WindowSpec + +__all__ = ["Column"] + + +def _create_column_from_literal( + literal: Union["LiteralType", "DecimalLiteral", "DateTimeLiteral", "ParentColumn"] +) -> "JavaObject": + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + return cast(JVMView, sc._jvm).functions.lit(literal) + + +def _create_column_from_name(name: str) -> "JavaObject": + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + return cast(JVMView, sc._jvm).functions.col(name) + + +def _to_java_column(col: "ColumnOrName") -> "JavaObject": + if isinstance(col, Column): + jcol = col._jc + elif isinstance(col, str): + jcol = _create_column_from_name(col) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_STR", + message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, + ) + return jcol + + +def _to_java_expr(col: "ColumnOrName") -> "JavaObject": + return _to_java_column(col).expr() + + +@overload +def _to_seq(sc: "SparkContext", cols: Iterable["JavaObject"]) -> "JavaObject": + ... + + +@overload +def _to_seq( + sc: "SparkContext", + cols: Iterable["ColumnOrName"], + converter: Optional[Callable[["ColumnOrName"], "JavaObject"]], +) -> "JavaObject": + ... + + +def _to_seq( + sc: "SparkContext", + cols: Union[Iterable["ColumnOrName"], Iterable["JavaObject"]], + converter: Optional[Callable[["ColumnOrName"], "JavaObject"]] = None, +) -> "JavaObject": + """ + Convert a list of Columns (or names) into a JVM Seq of Column. + + An optional `converter` could be used to convert items in `cols` + into JVM Column objects. 
+ """ + if converter: + cols = [converter(c) for c in cols] + assert sc._jvm is not None + return sc._jvm.PythonUtils.toSeq(cols) + + +def _to_list( + sc: "SparkContext", + cols: List["ColumnOrName"], + converter: Optional[Callable[["ColumnOrName"], "JavaObject"]] = None, +) -> "JavaObject": + """ + Convert a list of Columns (or names) into a JVM (Scala) List of Columns. + + An optional `converter` could be used to convert items in `cols` + into JVM Column objects. + """ + if converter: + cols = [converter(c) for c in cols] + assert sc._jvm is not None + return sc._jvm.PythonUtils.toList(cols) + + +def _unary_op(name: str, self: ParentColumn) -> ParentColumn: + """Create a method for given unary operator""" + + jc = getattr(self._jc, name)() + return Column(jc) + + +def _func_op(name: str, self: ParentColumn) -> ParentColumn: + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + jc = getattr(cast(JVMView, sc._jvm).functions, name)(self._jc) + return Column(jc) + + +def _bin_func_op( + name: str, + self: ParentColumn, + other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"], + reverse: bool = False, +) -> ParentColumn: + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + fn = getattr(cast(JVMView, sc._jvm).functions, name) + jc = other._jc if isinstance(other, ParentColumn) else _create_column_from_literal(other) + njc = fn(self._jc, jc) if not reverse else fn(jc, self._jc) + return Column(njc) + + +def _bin_op( + name: str, + self: ParentColumn, + other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"], +) -> ParentColumn: + """Create a method for given binary operator""" + jc = other._jc if isinstance(other, ParentColumn) else other + njc = getattr(self._jc, name)(jc) + return Column(njc) + + +def _reverse_op( + name: str, + self: ParentColumn, + other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"], +) -> ParentColumn: + """Create a method for binary operator (this object is on right side)""" + jother = _create_column_from_literal(other) + jc = getattr(jother, name)(self._jc) + return Column(jc) + + +@with_origin_to_class +class Column(ParentColumn): + def __new__( + cls, + jc: "JavaObject", + ) -> "Column": + self = object.__new__(cls) + self.__init__(jc) # type: ignore[misc] + return self + + def __init__(self, jc: "JavaObject") -> None: + self._jc = jc + + # arithmetic operators + def __neg__(self) -> ParentColumn: + return _func_op("negate", self) + + def __add__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("plus", self, other) + + def __sub__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("minus", self, other) + + def __mul__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("multiply", self, other) + + def __div__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("divide", self, other) + + def __truediv__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("divide", self, other) + + def __mod__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("mod", self, other) + + def __radd__( + self, other: 
Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("plus", self, other) + + def __rsub__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _reverse_op("minus", self, other) + + def __rmul__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("multiply", self, other) + + def __rdiv__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _reverse_op("divide", self, other) + + def __rtruediv__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _reverse_op("divide", self, other) + + def __rmod__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _reverse_op("mod", self, other) + + def __pow__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_func_op("pow", self, other) + + def __rpow__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_func_op("pow", self, other, reverse=True) + + # logistic operators + def __eq__( # type: ignore[override] + self, + other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"], + ) -> ParentColumn: + return _bin_op("equalTo", self, other) + + def __ne__( # type: ignore[override] + self, + other: Any, + ) -> ParentColumn: + return _bin_op("notEqual", self, other) + + def __lt__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("lt", self, other) + + def __le__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("leq", self, other) + + def __ge__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("geq", self, other) + + def __gt__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("gt", self, other) + + def eqNullSafe( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("eqNullSafe", self, other) + + # `and`, `or`, `not` cannot be overloaded in Python, + # so use bitwise operators as boolean operators + def __and__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("and", self, other) + + def __or__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("or", self, other) + + def __invert__(self) -> ParentColumn: + return _func_op("not", self) + + def __rand__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("and", self, other) + + def __ror__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("or", self, other) + + # container operators + def __contains__(self, item: Any) -> None: + raise PySparkValueError( + error_class="CANNOT_APPLY_IN_FOR_COLUMN", + message_parameters={}, + ) + + # bitwise operators + def bitwiseOR( + self, other: Union[ParentColumn, 
"LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("bitwiseOR", self, other) + + def bitwiseAND( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("bitwiseAND", self, other) + + def bitwiseXOR( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("bitwiseXOR", self, other) + + def getItem(self, key: Any) -> ParentColumn: + if isinstance(key, Column): + warnings.warn( + "A column as 'key' in getItem is deprecated as of Spark 3.0, and will not " + "be supported in the future release. Use `column[key]` or `column.key` syntax " + "instead.", + FutureWarning, + ) + return self[key] + + def getField(self, name: Any) -> ParentColumn: + if isinstance(name, Column): + warnings.warn( + "A column as 'name' in getField is deprecated as of Spark 3.0, and will not " + "be supported in the future release. Use `column[name]` or `column.name` syntax " + "instead.", + FutureWarning, + ) + return self[name] + + def withField(self, fieldName: str, col: ParentColumn) -> ParentColumn: + if not isinstance(fieldName, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "fieldName", "arg_type": type(fieldName).__name__}, + ) + + if not isinstance(col, Column): + raise PySparkTypeError( + error_class="NOT_COLUMN", + message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, + ) + + return Column(self._jc.withField(fieldName, col._jc)) + + def dropFields(self, *fieldNames: str) -> ParentColumn: + sc = get_active_spark_context() + jc = self._jc.dropFields(_to_seq(sc, fieldNames)) + return Column(jc) + + def __getattr__(self, item: Any) -> ParentColumn: + if item.startswith("__"): + raise PySparkAttributeError( + error_class="CANNOT_ACCESS_TO_DUNDER", + message_parameters={}, + ) + return self[item] + + def __getitem__(self, k: Any) -> ParentColumn: + if isinstance(k, slice): + if k.step is not None: + raise PySparkValueError( + error_class="SLICE_WITH_STEP", + message_parameters={}, + ) + return self.substr(k.start, k.stop) + else: + return _bin_op("apply", self, k) + + def __iter__(self) -> None: + raise PySparkTypeError( + error_class="NOT_ITERABLE", message_parameters={"objectName": "Column"} + ) + + # string methods + def contains( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("contains", self, other) + + def startswith( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("startsWith", self, other) + + def endswith( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("endsWith", self, other) + + def like(self: ParentColumn, other: str) -> ParentColumn: + njc = getattr(self._jc, "like")(other) + return Column(njc) + + def rlike(self: ParentColumn, other: str) -> ParentColumn: + njc = getattr(self._jc, "rlike")(other) + return Column(njc) + + def ilike(self: ParentColumn, other: str) -> ParentColumn: + njc = getattr(self._jc, "ilike")(other) + return Column(njc) + + def substr( + self, startPos: Union[int, ParentColumn], length: Union[int, ParentColumn] + ) -> ParentColumn: + if type(startPos) != type(length): + raise PySparkTypeError( + error_class="NOT_SAME_TYPE", + message_parameters={ + "arg_name1": "startPos", + "arg_name2": "length", + "arg_type1": 
type(startPos).__name__, + "arg_type2": type(length).__name__, + }, + ) + if isinstance(startPos, int): + jc = self._jc.substr(startPos, length) + elif isinstance(startPos, Column): + jc = self._jc.substr(startPos._jc, cast(ParentColumn, length)._jc) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_INT", + message_parameters={"arg_name": "startPos", "arg_type": type(startPos).__name__}, + ) + return Column(jc) + + def isin(self, *cols: Any) -> ParentColumn: + if len(cols) == 1 and isinstance(cols[0], (list, set)): + cols = cast(Tuple, cols[0]) + cols = cast( + Tuple, + [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols], + ) + sc = get_active_spark_context() + jc = getattr(self._jc, "isin")(_to_seq(sc, cols)) + return Column(jc) + + # order + def asc(self) -> ParentColumn: + return _unary_op("asc", self) + + def asc_nulls_first(self) -> ParentColumn: + return _unary_op("asc_nulls_first", self) + + def asc_nulls_last(self) -> ParentColumn: + return _unary_op("asc_nulls_last", self) + + def desc(self) -> ParentColumn: + return _unary_op("desc", self) + + def desc_nulls_first(self) -> ParentColumn: + return _unary_op("desc_nulls_first", self) + + def desc_nulls_last(self) -> ParentColumn: + return _unary_op("desc_nulls_last", self) + + def isNull(self) -> ParentColumn: + return _unary_op("isNull", self) + + def isNotNull(self) -> ParentColumn: + return _unary_op("isNotNull", self) + + def isNaN(self) -> ParentColumn: + return _unary_op("isNaN", self) + + def alias(self, *alias: str, **kwargs: Any) -> ParentColumn: + metadata = kwargs.pop("metadata", None) + assert not kwargs, "Unexpected kwargs where passed: %s" % kwargs + + sc = get_active_spark_context() + if len(alias) == 1: + if metadata: + assert sc._jvm is not None + jmeta = sc._jvm.org.apache.spark.sql.types.Metadata.fromJson(json.dumps(metadata)) + return Column(getattr(self._jc, "as")(alias[0], jmeta)) + else: + return Column(getattr(self._jc, "as")(alias[0])) + else: + if metadata is not None: + raise PySparkValueError( + error_class="ONLY_ALLOWED_FOR_SINGLE_COLUMN", + message_parameters={"arg_name": "metadata"}, + ) + return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias)))) + + def name(self, *alias: str, **kwargs: Any) -> ParentColumn: + return self.alias(*alias, **kwargs) + + def cast(self, dataType: Union[DataType, str]) -> ParentColumn: + if isinstance(dataType, str): + jc = self._jc.cast(dataType) + elif isinstance(dataType, DataType): + from pyspark.sql import SparkSession + + spark = SparkSession._getActiveSessionOrCreate() + jdt = spark._jsparkSession.parseDataType(dataType.json()) + jc = self._jc.cast(jdt) + else: + raise PySparkTypeError( + error_class="NOT_DATATYPE_OR_STR", + message_parameters={"arg_name": "dataType", "arg_type": type(dataType).__name__}, + ) + return Column(jc) + + def try_cast(self, dataType: Union[DataType, str]) -> ParentColumn: + if isinstance(dataType, str): + jc = self._jc.try_cast(dataType) + elif isinstance(dataType, DataType): + from pyspark.sql import SparkSession + + spark = SparkSession._getActiveSessionOrCreate() + jdt = spark._jsparkSession.parseDataType(dataType.json()) + jc = self._jc.try_cast(jdt) + else: + raise PySparkTypeError( + error_class="NOT_DATATYPE_OR_STR", + message_parameters={"arg_name": "dataType", "arg_type": type(dataType).__name__}, + ) + return Column(jc) + + def astype(self, dataType: Union[DataType, str]) -> ParentColumn: + return self.cast(dataType) + + def between( + self, + lowerBound: Union[ParentColumn, 
"LiteralType", "DateTimeLiteral", "DecimalLiteral"], + upperBound: Union[ParentColumn, "LiteralType", "DateTimeLiteral", "DecimalLiteral"], + ) -> ParentColumn: + return (self >= lowerBound) & (self <= upperBound) + + def when(self, condition: ParentColumn, value: Any) -> ParentColumn: + if not isinstance(condition, Column): + raise PySparkTypeError( + error_class="NOT_COLUMN", + message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__}, + ) + v = value._jc if isinstance(value, Column) else value + jc = self._jc.when(condition._jc, v) + return Column(jc) + + def otherwise(self, value: Any) -> ParentColumn: + v = value._jc if isinstance(value, Column) else value + jc = self._jc.otherwise(v) + return Column(jc) + + def over(self, window: "WindowSpec") -> ParentColumn: + from pyspark.sql.classic.window import WindowSpec + + if not isinstance(window, WindowSpec): + raise PySparkTypeError( + error_class="NOT_WINDOWSPEC", + message_parameters={"arg_name": "window", "arg_type": type(window).__name__}, + ) + jc = self._jc.over(window._jspec) + return Column(jc) + + def __nonzero__(self) -> None: + raise PySparkValueError( + error_class="CANNOT_CONVERT_COLUMN_INTO_BOOL", + message_parameters={}, + ) + + __bool__ = __nonzero__ + + def __repr__(self) -> str: + return "Column<'%s'>" % self._jc.toString() + + +def _test() -> None: + import doctest + from pyspark.sql import SparkSession + import pyspark.sql.column + + # It inherits docstrings but doctests cannot detect them so we run + # the parent classe's doctests here directly. + globs = pyspark.sql.column.__dict__.copy() + spark = ( + SparkSession.builder.master("local[4]").appName("sql.classic.column tests").getOrCreate() + ) + globs["spark"] = spark + + (failure_count, test_count) = doctest.testmod( + pyspark.sql.column, + globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, + ) + spark.stop() + if failure_count: + sys.exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql/classic/dataframe.py b/python/pyspark/sql/classic/dataframe.py new file mode 100644 index 0000000000000..1bedd624603e1 --- /dev/null +++ b/python/pyspark/sql/classic/dataframe.py @@ -0,0 +1,1990 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import json +import sys +import random +import warnings +from collections.abc import Iterable +from functools import reduce +from typing import ( + Any, + Callable, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, + overload, + TYPE_CHECKING, +) + +from pyspark import _NoValue +from pyspark.resource import ResourceProfile +from pyspark._globals import _NoValueType +from pyspark.errors import ( + PySparkTypeError, + PySparkValueError, + PySparkIndexError, + PySparkAttributeError, +) +from pyspark.util import ( + _load_from_socket, + _local_iterator_from_socket, +) +from pyspark.serializers import BatchedSerializer, CPickleSerializer, UTF8Deserializer +from pyspark.storagelevel import StorageLevel +from pyspark.traceback_utils import SCCallSiteSync +from pyspark.sql.column import Column +from pyspark.sql.classic.column import _to_seq, _to_list, _to_java_column +from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2 +from pyspark.sql.streaming import DataStreamWriter +from pyspark.sql.types import ( + StructType, + Row, + _parse_datatype_json_string, +) +from pyspark.sql.dataframe import ( + DataFrame as ParentDataFrame, + DataFrameNaFunctions as ParentDataFrameNaFunctions, + DataFrameStatFunctions as ParentDataFrameStatFunctions, +) +from pyspark.sql.utils import get_active_spark_context, toJArray +from pyspark.sql.pandas.conversion import PandasConversionMixin +from pyspark.sql.pandas.map_ops import PandasMapOpsMixin + +if TYPE_CHECKING: + from py4j.java_gateway import JavaObject + import pyarrow as pa + from pyspark.core.rdd import RDD + from pyspark.core.context import SparkContext + from pyspark._typing import PrimitiveType + from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame + from pyspark.sql._typing import ( + ColumnOrName, + ColumnOrNameOrOrdinal, + LiteralType, + OptionalPrimitiveType, + ) + from pyspark.sql.pandas._typing import ( + PandasMapIterFunction, + ArrowMapIterFunction, + DataFrameLike as PandasDataFrameLike, + ) + from pyspark.sql.context import SQLContext + from pyspark.sql.session import SparkSession + from pyspark.sql.group import GroupedData + from pyspark.sql.observation import Observation + from pyspark.sql.metrics import ExecutionInfo + + +class DataFrame(ParentDataFrame, PandasMapOpsMixin, PandasConversionMixin): + def __new__( + cls, + jdf: "JavaObject", + sql_ctx: Union["SQLContext", "SparkSession"], + ) -> "DataFrame": + self = object.__new__(cls) + self.__init__(jdf, sql_ctx) # type: ignore[misc] + return self + + def __init__( + self, + jdf: "JavaObject", + sql_ctx: Union["SQLContext", "SparkSession"], + ): + from pyspark.sql.context import SQLContext + + self._sql_ctx: Optional["SQLContext"] = None + + if isinstance(sql_ctx, SQLContext): + assert not os.environ.get("SPARK_TESTING") # Sanity check for our internal usage. + assert isinstance(sql_ctx, SQLContext) + # We should remove this if-else branch in the future release, and rename + # sql_ctx to session in the constructor. This is an internal code path but + # was kept with a warning because it's used intensively by third-party libraries. + warnings.warn("DataFrame constructor is internal. 
Do not directly use it.") + self._sql_ctx = sql_ctx + session = sql_ctx.sparkSession + else: + session = sql_ctx + self._session: "SparkSession" = session + + self._sc: "SparkContext" = sql_ctx._sc + self._jdf: "JavaObject" = jdf + self.is_cached = False + # initialized lazily + self._schema: Optional[StructType] = None + self._lazy_rdd: Optional["RDD[Row]"] = None + # Check whether _repr_html is supported or not, we use it to avoid calling _jdf twice + # by __repr__ and _repr_html_ while eager evaluation opens. + self._support_repr_html = False + + @property + def sql_ctx(self) -> "SQLContext": + from pyspark.sql.context import SQLContext + + warnings.warn( + "DataFrame.sql_ctx is an internal property, and will be removed " + "in future releases. Use DataFrame.sparkSession instead." + ) + if self._sql_ctx is None: + self._sql_ctx = SQLContext._get_or_create(self._sc) + return self._sql_ctx + + @property + def sparkSession(self) -> "SparkSession": + return self._session + + @property + def rdd(self) -> "RDD[Row]": + from pyspark.core.rdd import RDD + + if self._lazy_rdd is None: + jrdd = self._jdf.javaToPython() + self._lazy_rdd = RDD( + jrdd, self.sparkSession._sc, BatchedSerializer(CPickleSerializer()) + ) + return self._lazy_rdd + + @property + def na(self) -> ParentDataFrameNaFunctions: + return DataFrameNaFunctions(self) + + @property + def stat(self) -> ParentDataFrameStatFunctions: + return DataFrameStatFunctions(self) + + def toJSON(self, use_unicode: bool = True) -> "RDD[str]": + from pyspark.core.rdd import RDD + + rdd = self._jdf.toJSON() + return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) + + def registerTempTable(self, name: str) -> None: + warnings.warn("Deprecated in 2.0, use createOrReplaceTempView instead.", FutureWarning) + self._jdf.createOrReplaceTempView(name) + + def createTempView(self, name: str) -> None: + self._jdf.createTempView(name) + + def createOrReplaceTempView(self, name: str) -> None: + self._jdf.createOrReplaceTempView(name) + + def createGlobalTempView(self, name: str) -> None: + self._jdf.createGlobalTempView(name) + + def createOrReplaceGlobalTempView(self, name: str) -> None: + self._jdf.createOrReplaceGlobalTempView(name) + + @property + def write(self) -> DataFrameWriter: + return DataFrameWriter(self) + + @property + def writeStream(self) -> DataStreamWriter: + return DataStreamWriter(self) + + @property + def schema(self) -> StructType: + if self._schema is None: + try: + self._schema = cast( + StructType, _parse_datatype_json_string(self._jdf.schema().json()) + ) + except Exception as e: + raise PySparkValueError( + error_class="CANNOT_PARSE_DATATYPE", + message_parameters={"error": str(e)}, + ) + return self._schema + + def printSchema(self, level: Optional[int] = None) -> None: + if level: + print(self._jdf.schema().treeString(level)) + else: + print(self._jdf.schema().treeString()) + + def explain( + self, extended: Optional[Union[bool, str]] = None, mode: Optional[str] = None + ) -> None: + if extended is not None and mode is not None: + raise PySparkValueError( + error_class="CANNOT_SET_TOGETHER", + message_parameters={"arg_list": "extended and mode"}, + ) + + # For the no argument case: df.explain() + is_no_argument = extended is None and mode is None + + # For the cases below: + # explain(True) + # explain(extended=False) + is_extended_case = isinstance(extended, bool) and mode is None + + # For the case when extended is mode: + # df.explain("formatted") + is_extended_as_mode = isinstance(extended, str) and mode is None 
+ + # For the mode specified: + # df.explain(mode="formatted") + is_mode_case = extended is None and isinstance(mode, str) + + if not (is_no_argument or is_extended_case or is_extended_as_mode or is_mode_case): + if (extended is not None) and (not isinstance(extended, (bool, str))): + raise PySparkTypeError( + error_class="NOT_BOOL_OR_STR", + message_parameters={ + "arg_name": "extended", + "arg_type": type(extended).__name__, + }, + ) + if (mode is not None) and (not isinstance(mode, str)): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "mode", "arg_type": type(mode).__name__}, + ) + + # Sets an explain mode depending on a given argument + if is_no_argument: + explain_mode = "simple" + elif is_extended_case: + explain_mode = "extended" if extended else "simple" + elif is_mode_case: + explain_mode = cast(str, mode) + elif is_extended_as_mode: + explain_mode = cast(str, extended) + assert self._sc._jvm is not None + print(self._sc._jvm.PythonSQLUtils.explainString(self._jdf.queryExecution(), explain_mode)) + + def exceptAll(self, other: ParentDataFrame) -> ParentDataFrame: + return DataFrame(self._jdf.exceptAll(other._jdf), self.sparkSession) + + def isLocal(self) -> bool: + return self._jdf.isLocal() + + @property + def isStreaming(self) -> bool: + return self._jdf.isStreaming() + + def isEmpty(self) -> bool: + return self._jdf.isEmpty() + + def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False) -> None: + print(self._show_string(n, truncate, vertical)) + + def _show_string( + self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False + ) -> str: + if not isinstance(n, int) or isinstance(n, bool): + raise PySparkTypeError( + error_class="NOT_INT", + message_parameters={"arg_name": "n", "arg_type": type(n).__name__}, + ) + + if not isinstance(vertical, bool): + raise PySparkTypeError( + error_class="NOT_BOOL", + message_parameters={"arg_name": "vertical", "arg_type": type(vertical).__name__}, + ) + + if isinstance(truncate, bool) and truncate: + return self._jdf.showString(n, 20, vertical) + else: + try: + int_truncate = int(truncate) + except ValueError: + raise PySparkTypeError( + error_class="NOT_BOOL", + message_parameters={ + "arg_name": "truncate", + "arg_type": type(truncate).__name__, + }, + ) + + return self._jdf.showString(n, int_truncate, vertical) + + def __repr__(self) -> str: + if not self._support_repr_html and self.sparkSession._jconf.isReplEagerEvalEnabled(): + vertical = False + return self._jdf.showString( + self.sparkSession._jconf.replEagerEvalMaxNumRows(), + self.sparkSession._jconf.replEagerEvalTruncate(), + vertical, + ) + else: + return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) + + def _repr_html_(self) -> Optional[str]: + """Returns a :class:`DataFrame` with html code when you enabled eager evaluation + by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are + using support eager evaluation with HTML. 
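`show` and `explain` validate their arguments in Python before delegating to the JVM (`showString` / `PythonSQLUtils.explainString`): an integer `truncate` becomes the cell width, and `explain` accepts a boolean, a mode string passed positionally, or an explicit `mode=`. Quick usage reference:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(i, "row-%d" % i) for i in range(30)], ["id", "label"])

df.show(5)                   # showString(5, 20, vertical=False)
df.show(5, truncate=10)      # int truncate becomes the cell width
df.show(5, vertical=True)    # one line per column value

df.explain()                 # "simple"
df.explain(True)             # "extended"
df.explain("formatted")      # the extended argument used as the mode
df.explain(mode="cost")      # explicit mode keyword
```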
+ """ + if not self._support_repr_html: + self._support_repr_html = True + if self.sparkSession._jconf.isReplEagerEvalEnabled(): + return self._jdf.htmlString( + self.sparkSession._jconf.replEagerEvalMaxNumRows(), + self.sparkSession._jconf.replEagerEvalTruncate(), + ) + else: + return None + + def checkpoint(self, eager: bool = True) -> ParentDataFrame: + jdf = self._jdf.checkpoint(eager) + return DataFrame(jdf, self.sparkSession) + + def localCheckpoint(self, eager: bool = True) -> ParentDataFrame: + jdf = self._jdf.localCheckpoint(eager) + return DataFrame(jdf, self.sparkSession) + + def withWatermark(self, eventTime: str, delayThreshold: str) -> ParentDataFrame: + if not eventTime or type(eventTime) is not str: + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "eventTime", "arg_type": type(eventTime).__name__}, + ) + if not delayThreshold or type(delayThreshold) is not str: + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={ + "arg_name": "delayThreshold", + "arg_type": type(delayThreshold).__name__, + }, + ) + jdf = self._jdf.withWatermark(eventTime, delayThreshold) + return DataFrame(jdf, self.sparkSession) + + def hint( + self, name: str, *parameters: Union["PrimitiveType", "Column", List["PrimitiveType"]] + ) -> ParentDataFrame: + if len(parameters) == 1 and isinstance(parameters[0], list): + parameters = parameters[0] # type: ignore[assignment] + + if not isinstance(name, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "name", "arg_type": type(name).__name__}, + ) + + allowed_types = (str, float, int, Column, list) + allowed_primitive_types = (str, float, int) + allowed_types_repr = ", ".join( + [t.__name__ for t in allowed_types[:-1]] + + ["list[" + t.__name__ + "]" for t in allowed_primitive_types] + ) + for p in parameters: + if not isinstance(p, allowed_types): + raise PySparkTypeError( + error_class="DISALLOWED_TYPE_FOR_CONTAINER", + message_parameters={ + "arg_name": "parameters", + "arg_type": type(parameters).__name__, + "allowed_types": allowed_types_repr, + "item_type": type(p).__name__, + }, + ) + if isinstance(p, list): + if not all(isinstance(e, allowed_primitive_types) for e in p): + raise PySparkTypeError( + error_class="DISALLOWED_TYPE_FOR_CONTAINER", + message_parameters={ + "arg_name": "parameters", + "arg_type": type(parameters).__name__, + "allowed_types": allowed_types_repr, + "item_type": type(p).__name__ + "[" + type(p[0]).__name__ + "]", + }, + ) + + def _converter(parameter: Union[str, list, float, int, Column]) -> Any: + if isinstance(parameter, Column): + return _to_java_column(parameter) + elif isinstance(parameter, list): + # for list input, we are assuming only one element type exist in the list. + # for empty list, we are converting it into an empty long[] in the JVM side. 
+ gateway = self._sc._gateway + assert gateway is not None + jclass = gateway.jvm.long + if len(parameter) >= 1: + mapping = { + str: gateway.jvm.java.lang.String, + float: gateway.jvm.double, + int: gateway.jvm.long, + } + jclass = mapping[type(parameter[0])] + return toJArray(gateway, jclass, parameter) + else: + return parameter + + jdf = self._jdf.hint(name, self._jseq(parameters, _converter)) + return DataFrame(jdf, self.sparkSession) + + def count(self) -> int: + return int(self._jdf.count()) + + def collect(self) -> List[Row]: + with SCCallSiteSync(self._sc): + sock_info = self._jdf.collectToPython() + return list(_load_from_socket(sock_info, BatchedSerializer(CPickleSerializer()))) + + def toLocalIterator(self, prefetchPartitions: bool = False) -> Iterator[Row]: + with SCCallSiteSync(self._sc): + sock_info = self._jdf.toPythonIterator(prefetchPartitions) + return _local_iterator_from_socket(sock_info, BatchedSerializer(CPickleSerializer())) + + def limit(self, num: int) -> ParentDataFrame: + jdf = self._jdf.limit(num) + return DataFrame(jdf, self.sparkSession) + + def offset(self, num: int) -> ParentDataFrame: + jdf = self._jdf.offset(num) + return DataFrame(jdf, self.sparkSession) + + def take(self, num: int) -> List[Row]: + return self.limit(num).collect() + + def tail(self, num: int) -> List[Row]: + with SCCallSiteSync(self._sc): + sock_info = self._jdf.tailToPython(num) + return list(_load_from_socket(sock_info, BatchedSerializer(CPickleSerializer()))) + + def foreach(self, f: Callable[[Row], None]) -> None: + self.rdd.foreach(f) + + def foreachPartition(self, f: Callable[[Iterator[Row]], None]) -> None: + self.rdd.foreachPartition(f) # type: ignore[arg-type] + + def cache(self) -> ParentDataFrame: + self.is_cached = True + self._jdf.cache() + return self + + def persist( + self, + storageLevel: StorageLevel = (StorageLevel.MEMORY_AND_DISK_DESER), + ) -> ParentDataFrame: + self.is_cached = True + javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) + self._jdf.persist(javaStorageLevel) + return self + + @property + def storageLevel(self) -> StorageLevel: + java_storage_level = self._jdf.storageLevel() + storage_level = StorageLevel( + java_storage_level.useDisk(), + java_storage_level.useMemory(), + java_storage_level.useOffHeap(), + java_storage_level.deserialized(), + java_storage_level.replication(), + ) + return storage_level + + def unpersist(self, blocking: bool = False) -> ParentDataFrame: + self.is_cached = False + self._jdf.unpersist(blocking) + return self + + def coalesce(self, numPartitions: int) -> ParentDataFrame: + return DataFrame(self._jdf.coalesce(numPartitions), self.sparkSession) + + @overload + def repartition(self, numPartitions: int, *cols: "ColumnOrName") -> ParentDataFrame: + ... + + @overload + def repartition(self, *cols: "ColumnOrName") -> ParentDataFrame: + ... 
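# Usage sketch for DataFrame.hint above (assumes an active SparkSession bound to
# `spark`; the DataFrame names are illustrative, not from this patch). Parameters
# passed to hint() go through `_converter`: Columns via _to_java_column, lists into
# typed Java arrays, and primitives as-is.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.range(100).withColumnRenamed("id", "k")
right = spark.range(10).withColumnRenamed("id", "k")

# A parameterless hint: mark the smaller side for a broadcast join.
joined = left.join(right.hint("broadcast"), "k")
joined.explain()  # the physical plan should show a broadcast join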
+ + def repartition( # type: ignore[misc] + self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName" + ) -> ParentDataFrame: + if isinstance(numPartitions, int): + if len(cols) == 0: + return DataFrame(self._jdf.repartition(numPartitions), self.sparkSession) + else: + return DataFrame( + self._jdf.repartition(numPartitions, self._jcols(*cols)), + self.sparkSession, + ) + elif isinstance(numPartitions, (str, Column)): + cols = (numPartitions,) + cols + return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sparkSession) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_STR", + message_parameters={ + "arg_name": "numPartitions", + "arg_type": type(numPartitions).__name__, + }, + ) + + @overload + def repartitionByRange(self, numPartitions: int, *cols: "ColumnOrName") -> ParentDataFrame: + ... + + @overload + def repartitionByRange(self, *cols: "ColumnOrName") -> ParentDataFrame: + ... + + def repartitionByRange( # type: ignore[misc] + self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName" + ) -> ParentDataFrame: + if isinstance(numPartitions, int): + if len(cols) == 0: + raise PySparkValueError( + error_class="CANNOT_BE_EMPTY", + message_parameters={"item": "partition-by expression"}, + ) + else: + return DataFrame( + self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), + self.sparkSession, + ) + elif isinstance(numPartitions, (str, Column)): + cols = (numPartitions,) + cols + return DataFrame(self._jdf.repartitionByRange(self._jcols(*cols)), self.sparkSession) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_INT_OR_STR", + message_parameters={ + "arg_name": "numPartitions", + "arg_type": type(numPartitions).__name__, + }, + ) + + def distinct(self) -> ParentDataFrame: + return DataFrame(self._jdf.distinct(), self.sparkSession) + + @overload + def sample(self, fraction: float, seed: Optional[int] = ...) -> ParentDataFrame: + ... + + @overload + def sample( + self, + withReplacement: Optional[bool], + fraction: float, + seed: Optional[int] = ..., + ) -> ParentDataFrame: + ... 
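# Call-form sketch for repartition / repartitionByRange, matching the dispatch
# above: a leading int sets the partition count, Column/str arguments set the
# partitioning keys, and repartitionByRange additionally requires at least one key.
# `df` is assumed to be an existing DataFrame with columns "k" and "v".
df2 = df.repartition(8)               # by count only
df3 = df.repartition(8, "k")          # by count and key
df4 = df.repartition("k", df.v)       # keys only, default partition count
df5 = df.repartitionByRange(8, "k")   # range partitioning needs a key
print(df3.rdd.getNumPartitions())     # 8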
+ + def sample( # type: ignore[misc] + self, + withReplacement: Optional[Union[float, bool]] = None, + fraction: Optional[Union[int, float]] = None, + seed: Optional[int] = None, + ) -> ParentDataFrame: + # For the cases below: + # sample(True, 0.5 [, seed]) + # sample(True, fraction=0.5 [, seed]) + # sample(withReplacement=False, fraction=0.5 [, seed]) + is_withReplacement_set = type(withReplacement) == bool and isinstance(fraction, float) + + # For the case below: + # sample(faction=0.5 [, seed]) + is_withReplacement_omitted_kwargs = withReplacement is None and isinstance(fraction, float) + + # For the case below: + # sample(0.5 [, seed]) + is_withReplacement_omitted_args = isinstance(withReplacement, float) + + if not ( + is_withReplacement_set + or is_withReplacement_omitted_kwargs + or is_withReplacement_omitted_args + ): + argtypes = [type(arg).__name__ for arg in [withReplacement, fraction, seed]] + raise PySparkTypeError( + error_class="NOT_BOOL_OR_FLOAT_OR_INT", + message_parameters={ + "arg_name": "withReplacement (optional), " + + "fraction (required) and seed (optional)", + "arg_type": ", ".join(argtypes), + }, + ) + + if is_withReplacement_omitted_args: + if fraction is not None: + seed = cast(int, fraction) + fraction = withReplacement + withReplacement = None + + seed = int(seed) if seed is not None else None + args = [arg for arg in [withReplacement, fraction, seed] if arg is not None] + jdf = self._jdf.sample(*args) + return DataFrame(jdf, self.sparkSession) + + def sampleBy( + self, col: "ColumnOrName", fractions: Dict[Any, float], seed: Optional[int] = None + ) -> ParentDataFrame: + if isinstance(col, str): + col = Column(col) + elif not isinstance(col, Column): + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_STR", + message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, + ) + if not isinstance(fractions, dict): + raise PySparkTypeError( + error_class="NOT_DICT", + message_parameters={"arg_name": "fractions", "arg_type": type(fractions).__name__}, + ) + for k, v in fractions.items(): + if not isinstance(k, (float, int, str)): + raise PySparkTypeError( + error_class="DISALLOWED_TYPE_FOR_CONTAINER", + message_parameters={ + "arg_name": "fractions", + "arg_type": type(fractions).__name__, + "allowed_types": "float, int, str", + "item_type": type(k).__name__, + }, + ) + fractions[k] = float(v) + col = col._jc + seed = seed if seed is not None else random.randint(0, sys.maxsize) + return DataFrame( + self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sparkSession + ) + + def randomSplit( + self, weights: List[float], seed: Optional[int] = None + ) -> List[ParentDataFrame]: + for w in weights: + if w < 0.0: + raise PySparkValueError( + error_class="VALUE_NOT_POSITIVE", + message_parameters={"arg_name": "weights", "arg_value": str(w)}, + ) + seed = seed if seed is not None else random.randint(0, sys.maxsize) + df_array = self._jdf.randomSplit( + _to_list(self.sparkSession._sc, cast(List["ColumnOrName"], weights)), int(seed) + ) + return [DataFrame(df, self.sparkSession) for df in df_array] + + @property + def dtypes(self) -> List[Tuple[str, str]]: + return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] + + @property + def columns(self) -> List[str]: + return [f.name for f in self.schema.fields] + + def colRegex(self, colName: str) -> Column: + if not isinstance(colName, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "colName", "arg_type": type(colName).__name__}, + ) + jc 
= self._jdf.colRegex(colName) + return Column(jc) + + def to(self, schema: StructType) -> ParentDataFrame: + assert schema is not None + jschema = self._jdf.sparkSession().parseDataType(schema.json()) + return DataFrame(self._jdf.to(jschema), self.sparkSession) + + def alias(self, alias: str) -> ParentDataFrame: + assert isinstance(alias, str), "alias should be a string" + return DataFrame(getattr(self._jdf, "as")(alias), self.sparkSession) + + def crossJoin(self, other: ParentDataFrame) -> ParentDataFrame: + jdf = self._jdf.crossJoin(other._jdf) + return DataFrame(jdf, self.sparkSession) + + def join( + self, + other: ParentDataFrame, + on: Optional[Union[str, List[str], Column, List[Column]]] = None, + how: Optional[str] = None, + ) -> ParentDataFrame: + if on is not None and not isinstance(on, list): + on = [on] # type: ignore[assignment] + + if on is not None: + if isinstance(on[0], str): + on = self._jseq(cast(List[str], on)) + else: + assert isinstance(on[0], Column), "on should be Column or list of Column" + on = reduce(lambda x, y: x.__and__(y), cast(List[Column], on)) + on = on._jc + + if on is None and how is None: + jdf = self._jdf.join(other._jdf) + else: + if how is None: + how = "inner" + if on is None: + on = self._jseq([]) + assert isinstance(how, str), "how should be a string" + jdf = self._jdf.join(other._jdf, on, how) + return DataFrame(jdf, self.sparkSession) + + # TODO(SPARK-22947): Fix the DataFrame API. + def _joinAsOf( + self, + other: ParentDataFrame, + leftAsOfColumn: Union[str, Column], + rightAsOfColumn: Union[str, Column], + on: Optional[Union[str, List[str], Column, List[Column]]] = None, + how: Optional[str] = None, + *, + tolerance: Optional[Column] = None, + allowExactMatches: bool = True, + direction: str = "backward", + ) -> ParentDataFrame: + """ + Perform an as-of join. + + This is similar to a left-join except that we match on the nearest + key rather than equal keys. + + .. versionchanged:: 4.0.0 + Supports Spark Connect. + + Parameters + ---------- + other : :class:`DataFrame` + Right side of the join + leftAsOfColumn : str or :class:`Column` + a string for the as-of join column name, or a Column + rightAsOfColumn : str or :class:`Column` + a string for the as-of join column name, or a Column + on : str, list or :class:`Column`, optional + a string for the join column name, a list of column names, + a join expression (Column), or a list of Columns. + If `on` is a string or a list of strings indicating the name of the join column(s), + the column(s) must exist on both sides, and this performs an equi-join. + how : str, optional + default ``inner``. Must be one of: ``inner`` and ``left``. + tolerance : :class:`Column`, optional + an asof tolerance within this range; must be compatible + with the merge index. + allowExactMatches : bool, optional + default ``True``. + direction : str, optional + default ``backward``. Must be one of: ``backward``, ``forward``, and ``nearest``. + + Examples + -------- + The following performs an as-of join between ``left`` and ``right``. + + >>> left = spark.createDataFrame([(1, "a"), (5, "b"), (10, "c")], ["a", "left_val"]) + >>> right = spark.createDataFrame([(1, 1), (2, 2), (3, 3), (6, 6), (7, 7)], + ... ["a", "right_val"]) + >>> left._joinAsOf( + ... right, leftAsOfColumn="a", rightAsOfColumn="a" + ... 
).select(left.a, 'left_val', 'right_val').sort("a").collect() + [Row(a=1, left_val='a', right_val=1), + Row(a=5, left_val='b', right_val=3), + Row(a=10, left_val='c', right_val=7)] + + >>> from pyspark.sql import functions as sf + >>> left._joinAsOf( + ... right, leftAsOfColumn="a", rightAsOfColumn="a", tolerance=sf.lit(1) + ... ).select(left.a, 'left_val', 'right_val').sort("a").collect() + [Row(a=1, left_val='a', right_val=1)] + + >>> left._joinAsOf( + ... right, leftAsOfColumn="a", rightAsOfColumn="a", how="left", tolerance=sf.lit(1) + ... ).select(left.a, 'left_val', 'right_val').sort("a").collect() + [Row(a=1, left_val='a', right_val=1), + Row(a=5, left_val='b', right_val=None), + Row(a=10, left_val='c', right_val=None)] + + >>> left._joinAsOf( + ... right, leftAsOfColumn="a", rightAsOfColumn="a", allowExactMatches=False + ... ).select(left.a, 'left_val', 'right_val').sort("a").collect() + [Row(a=5, left_val='b', right_val=3), + Row(a=10, left_val='c', right_val=7)] + + >>> left._joinAsOf( + ... right, leftAsOfColumn="a", rightAsOfColumn="a", direction="forward" + ... ).select(left.a, 'left_val', 'right_val').sort("a").collect() + [Row(a=1, left_val='a', right_val=1), + Row(a=5, left_val='b', right_val=6)] + """ + if isinstance(leftAsOfColumn, str): + leftAsOfColumn = self[leftAsOfColumn] + left_as_of_jcol = leftAsOfColumn._jc + if isinstance(rightAsOfColumn, str): + rightAsOfColumn = other[rightAsOfColumn] + right_as_of_jcol = rightAsOfColumn._jc + + if on is not None and not isinstance(on, list): + on = [on] # type: ignore[assignment] + + if on is not None: + if isinstance(on[0], str): + on = self._jseq(cast(List[str], on)) + else: + assert isinstance(on[0], Column), "on should be Column or list of Column" + on = reduce(lambda x, y: x.__and__(y), cast(List[Column], on)) + on = on._jc + + if how is None: + how = "inner" + assert isinstance(how, str), "how should be a string" + + if tolerance is not None: + assert isinstance(tolerance, Column), "tolerance should be Column" + tolerance = tolerance._jc + + jdf = self._jdf.joinAsOf( + other._jdf, + left_as_of_jcol, + right_as_of_jcol, + on, + how, + tolerance, + allowExactMatches, + direction, + ) + return DataFrame(jdf, self.sparkSession) + + def sortWithinPartitions( + self, + *cols: Union[int, str, Column, List[Union[int, str, Column]]], + **kwargs: Any, + ) -> ParentDataFrame: + jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs)) + return DataFrame(jdf, self.sparkSession) + + def sort( + self, + *cols: Union[int, str, Column, List[Union[int, str, Column]]], + **kwargs: Any, + ) -> ParentDataFrame: + jdf = self._jdf.sort(self._sort_cols(cols, kwargs)) + return DataFrame(jdf, self.sparkSession) + + orderBy = sort + + def _jseq( + self, + cols: Sequence, + converter: Optional[Callable[..., Union["PrimitiveType", "JavaObject"]]] = None, + ) -> "JavaObject": + """Return a JVM Seq of Columns from a list of Column or names""" + return _to_seq(self.sparkSession._sc, cols, converter) + + def _jmap(self, jm: Dict) -> "JavaObject": + """Return a JVM Scala Map from a dict""" + return _to_scala_map(self.sparkSession._sc, jm) + + def _jcols(self, *cols: "ColumnOrName") -> "JavaObject": + """Return a JVM Seq of Columns from a list of Column or column names + + If `cols` has only one list in it, cols[0] will be used as the list. 
+ """ + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] + return self._jseq(cols, _to_java_column) + + def _jcols_ordinal(self, *cols: "ColumnOrNameOrOrdinal") -> "JavaObject": + """Return a JVM Seq of Columns from a list of Column or column names or column ordinals. + + If `cols` has only one list in it, cols[0] will be used as the list. + """ + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] + + _cols = [] + for c in cols: + if isinstance(c, int) and not isinstance(c, bool): + if c < 1: + raise PySparkIndexError( + error_class="INDEX_NOT_POSITIVE", message_parameters={"index": str(c)} + ) + # ordinal is 1-based + _cols.append(self[c - 1]) + else: + _cols.append(c) # type: ignore[arg-type] + return self._jseq(_cols, _to_java_column) + + def _sort_cols( + self, + cols: Sequence[Union[int, str, Column, List[Union[int, str, Column]]]], + kwargs: Dict[str, Any], + ) -> "JavaObject": + """Return a JVM Seq of Columns that describes the sort order""" + if not cols: + raise PySparkValueError( + error_class="CANNOT_BE_EMPTY", + message_parameters={"item": "column"}, + ) + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] + + jcols = [] + for c in cols: + if isinstance(c, int) and not isinstance(c, bool): + # ordinal is 1-based + if c > 0: + _c = self[c - 1] + # negative ordinal means sort by desc + elif c < 0: + _c = self[-c - 1].desc() + else: + raise PySparkIndexError( + error_class="ZERO_INDEX", + message_parameters={}, + ) + else: + _c = c # type: ignore[assignment] + jcols.append(_to_java_column(cast("ColumnOrName", _c))) + + ascending = kwargs.get("ascending", True) + if isinstance(ascending, (bool, int)): + if not ascending: + jcols = [jc.desc() for jc in jcols] + elif isinstance(ascending, list): + jcols = [jc if asc else jc.desc() for asc, jc in zip(ascending, jcols)] + else: + raise PySparkTypeError( + error_class="NOT_BOOL_OR_LIST", + message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__}, + ) + return self._jseq(jcols) + + def describe(self, *cols: Union[str, List[str]]) -> ParentDataFrame: + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] # type: ignore[assignment] + jdf = self._jdf.describe(self._jseq(cols)) + return DataFrame(jdf, self.sparkSession) + + def summary(self, *statistics: str) -> ParentDataFrame: + if len(statistics) == 1 and isinstance(statistics[0], list): + statistics = statistics[0] + jdf = self._jdf.summary(self._jseq(statistics)) + return DataFrame(jdf, self.sparkSession) + + @overload + def head(self) -> Optional[Row]: + ... + + @overload + def head(self, n: int) -> List[Row]: + ... + + def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]: + if n is None: + rs = self.head(1) + return rs[0] if rs else None + return self.take(n) + + def first(self) -> Optional[Row]: + return self.head() + + @overload + def __getitem__(self, item: Union[int, str]) -> Column: + ... + + @overload + def __getitem__(self, item: Union[Column, List, Tuple]) -> ParentDataFrame: + ... 
+ + def __getitem__( + self, item: Union[int, str, Column, List, Tuple] + ) -> Union[Column, ParentDataFrame]: + if isinstance(item, str): + jc = self._jdf.apply(item) + return Column(jc) + elif isinstance(item, Column): + return self.filter(item) + elif isinstance(item, (list, tuple)): + return self.select(*item) + elif isinstance(item, int): + jc = self._jdf.apply(self.columns[item]) + return Column(jc) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_FLOAT_OR_INT_OR_LIST_OR_STR", + message_parameters={"arg_name": "item", "arg_type": type(item).__name__}, + ) + + def __getattr__(self, name: str) -> Column: + if name not in self.columns: + raise PySparkAttributeError( + error_class="ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name} + ) + jc = self._jdf.apply(name) + return Column(jc) + + def __dir__(self) -> List[str]: + attrs = set(dir(DataFrame)) + attrs.update(filter(lambda s: s.isidentifier(), self.columns)) + return sorted(attrs) + + @overload + def select(self, *cols: "ColumnOrName") -> ParentDataFrame: + ... + + @overload + def select(self, __cols: Union[List[Column], List[str]]) -> ParentDataFrame: + ... + + def select(self, *cols: "ColumnOrName") -> ParentDataFrame: # type: ignore[misc] + jdf = self._jdf.select(self._jcols(*cols)) + return DataFrame(jdf, self.sparkSession) + + @overload + def selectExpr(self, *expr: str) -> ParentDataFrame: + ... + + @overload + def selectExpr(self, *expr: List[str]) -> ParentDataFrame: + ... + + def selectExpr(self, *expr: Union[str, List[str]]) -> ParentDataFrame: + if len(expr) == 1 and isinstance(expr[0], list): + expr = expr[0] # type: ignore[assignment] + jdf = self._jdf.selectExpr(self._jseq(expr)) + return DataFrame(jdf, self.sparkSession) + + def filter(self, condition: "ColumnOrName") -> ParentDataFrame: + if isinstance(condition, str): + jdf = self._jdf.filter(condition) + elif isinstance(condition, Column): + jdf = self._jdf.filter(condition._jc) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_STR", + message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__}, + ) + return DataFrame(jdf, self.sparkSession) + + @overload + def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": + ... + + @overload + def groupBy(self, __cols: Union[List[Column], List[str], List[int]]) -> "GroupedData": + ... + + def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] + jgd = self._jdf.groupBy(self._jcols_ordinal(*cols)) + from pyspark.sql.group import GroupedData + + return GroupedData(jgd, self) + + @overload + def rollup(self, *cols: "ColumnOrName") -> "GroupedData": + ... + + @overload + def rollup(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": + ... + + def rollup(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] + jgd = self._jdf.rollup(self._jcols_ordinal(*cols)) + from pyspark.sql.group import GroupedData + + return GroupedData(jgd, self) + + @overload + def cube(self, *cols: "ColumnOrName") -> "GroupedData": + ... + + @overload + def cube(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": + ... 
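# Column-access and selection sketch for the accessors above; assumes `df` has
# columns "name" and "age".
df["age"]                     # __getitem__ with a str  -> Column
df[df.age > 21]               # __getitem__ with a Column -> filtered DataFrame
df[["name", "age"]]           # __getitem__ with a list -> select
df.select(df.name, "age")
df.selectExpr("age + 1 AS age_next")
df.filter("age > 21").groupBy("name").count()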
+ + def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] + jgd = self._jdf.cube(self._jcols_ordinal(*cols)) + from pyspark.sql.group import GroupedData + + return GroupedData(jgd, self) + + def groupingSets( + self, groupingSets: Sequence[Sequence["ColumnOrName"]], *cols: "ColumnOrName" + ) -> "GroupedData": + from pyspark.sql.group import GroupedData + + jgrouping_sets = _to_seq(self._sc, [self._jcols(*inner) for inner in groupingSets]) + + jgd = self._jdf.groupingSets(jgrouping_sets, self._jcols(*cols)) + return GroupedData(jgd, self) + + def unpivot( + self, + ids: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]], + values: Optional[Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]]], + variableColumnName: str, + valueColumnName: str, + ) -> ParentDataFrame: + assert ids is not None, "ids must not be None" + + def to_jcols( + cols: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]] + ) -> "JavaObject": + if isinstance(cols, list): + return self._jcols(*cols) + if isinstance(cols, tuple): + return self._jcols(*list(cols)) + return self._jcols(cols) + + jids = to_jcols(ids) + if values is None: + jdf = self._jdf.unpivotWithSeq(jids, variableColumnName, valueColumnName) + else: + jvals = to_jcols(values) + jdf = self._jdf.unpivotWithSeq(jids, jvals, variableColumnName, valueColumnName) + + return DataFrame(jdf, self.sparkSession) + + def melt( + self, + ids: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]], + values: Optional[Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]]], + variableColumnName: str, + valueColumnName: str, + ) -> ParentDataFrame: + return self.unpivot(ids, values, variableColumnName, valueColumnName) + + def agg(self, *exprs: Union[Column, Dict[str, str]]) -> ParentDataFrame: + return self.groupBy().agg(*exprs) # type: ignore[arg-type] + + def observe( + self, + observation: Union["Observation", str], + *exprs: Column, + ) -> ParentDataFrame: + from pyspark.sql import Observation + + if len(exprs) == 0: + raise PySparkValueError( + error_class="CANNOT_BE_EMPTY", + message_parameters={"item": "exprs"}, + ) + if not all(isinstance(c, Column) for c in exprs): + raise PySparkTypeError( + error_class="NOT_LIST_OF_COLUMN", + message_parameters={"arg_name": "exprs"}, + ) + + if isinstance(observation, Observation): + return observation._on(self, *exprs) + elif isinstance(observation, str): + return DataFrame( + self._jdf.observe( + observation, exprs[0]._jc, _to_seq(self._sc, [c._jc for c in exprs[1:]]) + ), + self.sparkSession, + ) + else: + raise PySparkTypeError( + error_class="NOT_LIST_OF_COLUMN", + message_parameters={ + "arg_name": "observation", + "arg_type": type(observation).__name__, + }, + ) + + def union(self, other: ParentDataFrame) -> ParentDataFrame: + return DataFrame(self._jdf.union(other._jdf), self.sparkSession) + + def unionAll(self, other: ParentDataFrame) -> ParentDataFrame: + return self.union(other) + + def unionByName( + self, other: ParentDataFrame, allowMissingColumns: bool = False + ) -> ParentDataFrame: + return DataFrame(self._jdf.unionByName(other._jdf, allowMissingColumns), self.sparkSession) + + def intersect(self, other: ParentDataFrame) -> ParentDataFrame: + return DataFrame(self._jdf.intersect(other._jdf), self.sparkSession) + + def intersectAll(self, other: ParentDataFrame) -> ParentDataFrame: + return DataFrame(self._jdf.intersectAll(other._jdf), self.sparkSession) + + def subtract(self, other: 
ParentDataFrame) -> ParentDataFrame: + return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sparkSession) + + def dropDuplicates(self, *subset: Union[str, List[str]]) -> ParentDataFrame: + # Acceptable args should be str, ... or a single List[str] + # So if subset length is 1, it can be either single str, or a list of str + # if subset length is greater than 1, it must be a sequence of str + if len(subset) > 1: + assert all(isinstance(c, str) for c in subset) + + if not subset: + jdf = self._jdf.dropDuplicates() + elif len(subset) == 1 and isinstance(subset[0], list): + jdf = self._jdf.dropDuplicates(self._jseq(subset[0])) + else: + jdf = self._jdf.dropDuplicates(self._jseq(subset)) + return DataFrame(jdf, self.sparkSession) + + drop_duplicates = dropDuplicates + + def dropDuplicatesWithinWatermark(self, *subset: Union[str, List[str]]) -> ParentDataFrame: + # Acceptable args should be str, ... or a single List[str] + # So if subset length is 1, it can be either single str, or a list of str + # if subset length is greater than 1, it must be a sequence of str + if len(subset) > 1: + assert all(isinstance(c, str) for c in subset) + + if not subset: + jdf = self._jdf.dropDuplicatesWithinWatermark() + elif len(subset) == 1 and isinstance(subset[0], list): + jdf = self._jdf.dropDuplicatesWithinWatermark(self._jseq(subset[0])) + else: + jdf = self._jdf.dropDuplicatesWithinWatermark(self._jseq(subset)) + return DataFrame(jdf, self.sparkSession) + + def dropna( + self, + how: str = "any", + thresh: Optional[int] = None, + subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, + ) -> ParentDataFrame: + if how is not None and how not in ["any", "all"]: + raise PySparkValueError( + error_class="VALUE_NOT_ANY_OR_ALL", + message_parameters={"arg_name": "how", "arg_type": how}, + ) + + if subset is None: + subset = self.columns + elif isinstance(subset, str): + subset = [subset] + elif not isinstance(subset, (list, tuple)): + raise PySparkTypeError( + error_class="NOT_LIST_OR_STR_OR_TUPLE", + message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, + ) + + if thresh is None: + thresh = len(subset) if how == "any" else 1 + + return DataFrame(self._jdf.na().drop(thresh, self._jseq(subset)), self.sparkSession) + + @overload + def fillna( + self, + value: "LiteralType", + subset: Optional[Union[str, Tuple[str, ...], List[str]]] = ..., + ) -> ParentDataFrame: + ... + + @overload + def fillna(self, value: Dict[str, "LiteralType"]) -> ParentDataFrame: + ... 
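# Wide-to-long sketch for unpivot/melt above: `ids` columns are kept, the `values`
# columns are folded into (variableColumnName, valueColumnName) pairs. Assumes an
# active SparkSession bound to `spark`.
wide = spark.createDataFrame([(1, 11.0, 12.0)], ["id", "x", "y"])
tall = wide.unpivot("id", ["x", "y"], "var", "val")
# expected rows: (1, "x", 11.0) and (1, "y", 12.0)
tall.show()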
+ + def fillna( + self, + value: Union["LiteralType", Dict[str, "LiteralType"]], + subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, + ) -> ParentDataFrame: + if not isinstance(value, (float, int, str, bool, dict)): + raise PySparkTypeError( + error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR", + message_parameters={"arg_name": "value", "arg_type": type(value).__name__}, + ) + + # Note that bool validates isinstance(int), but we don't want to + # convert bools to floats + + if not isinstance(value, bool) and isinstance(value, int): + value = float(value) + + if isinstance(value, dict): + return DataFrame(self._jdf.na().fill(value), self.sparkSession) + elif subset is None: + return DataFrame(self._jdf.na().fill(value), self.sparkSession) + else: + if isinstance(subset, str): + subset = [subset] + elif not isinstance(subset, (list, tuple)): + raise PySparkTypeError( + error_class="NOT_LIST_OR_TUPLE", + message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, + ) + + return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sparkSession) + + @overload + def replace( + self, + to_replace: "LiteralType", + value: "OptionalPrimitiveType", + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... + + @overload + def replace( + self, + to_replace: List["LiteralType"], + value: List["OptionalPrimitiveType"], + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... + + @overload + def replace( + self, + to_replace: Dict["LiteralType", "OptionalPrimitiveType"], + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... + + @overload + def replace( + self, + to_replace: List["LiteralType"], + value: "OptionalPrimitiveType", + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... 
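# fillna sketch matching the value handling above: a scalar fills every compatible
# column (an int value is handed to the JVM as float unless it is a bool), a dict
# maps column name -> replacement, and `subset` limits which columns are touched.
# Assumes `df` has an int column "age" and a string column "name".
df.fillna(0)                              # numeric columns only
df.fillna({"age": 0, "name": "unknown"})  # per-column replacements
df.na.fill("unknown", subset=["name"])    # same thing via DataFrameNaFunctions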
+ + def replace( # type: ignore[misc] + self, + to_replace: Union[ + "LiteralType", List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"] + ], + value: Optional[ + Union["OptionalPrimitiveType", List["OptionalPrimitiveType"], _NoValueType] + ] = _NoValue, + subset: Optional[List[str]] = None, + ) -> ParentDataFrame: + if value is _NoValue: + if isinstance(to_replace, dict): + value = None + else: + raise PySparkTypeError( + error_class="ARGUMENT_REQUIRED", + message_parameters={"arg_name": "value", "condition": "`to_replace` is dict"}, + ) + + # Helper functions + def all_of(types: Union[Type, Tuple[Type, ...]]) -> Callable[[Iterable], bool]: + """Given a type or tuple of types and a sequence of xs + check if each x is instance of type(s) + + >>> all_of(bool)([True, False]) + True + >>> all_of(str)(["a", 1]) + False + """ + + def all_of_(xs: Iterable) -> bool: + return all(isinstance(x, types) for x in xs) + + return all_of_ + + all_of_bool = all_of(bool) + all_of_str = all_of(str) + all_of_numeric = all_of((float, int)) + + # Validate input types + valid_types = (bool, float, int, str, list, tuple) + if not isinstance(to_replace, valid_types + (dict,)): + raise PySparkTypeError( + error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_LIST_OR_STR_OR_TUPLE", + message_parameters={ + "arg_name": "to_replace", + "arg_type": type(to_replace).__name__, + }, + ) + + if ( + not isinstance(value, valid_types) + and value is not None + and not isinstance(to_replace, dict) + ): + raise PySparkTypeError( + error_class="NOT_BOOL_OR_FLOAT_OR_INT_OR_LIST_OR_NONE_OR_STR_OR_TUPLE", + message_parameters={ + "arg_name": "value", + "arg_type": type(value).__name__, + }, + ) + + if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): + if len(to_replace) != len(value): + raise PySparkValueError( + error_class="LENGTH_SHOULD_BE_THE_SAME", + message_parameters={ + "arg1": "to_replace", + "arg2": "value", + "arg1_length": str(len(to_replace)), + "arg2_length": str(len(value)), + }, + ) + + if not (subset is None or isinstance(subset, (list, tuple, str))): + raise PySparkTypeError( + error_class="NOT_LIST_OR_STR_OR_TUPLE", + message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, + ) + + # Reshape input arguments if necessary + if isinstance(to_replace, (float, int, str)): + to_replace = [to_replace] + + if isinstance(to_replace, dict): + rep_dict = to_replace + if value is not None: + warnings.warn("to_replace is a dict and value is not None. value will be ignored.") + else: + if isinstance(value, (float, int, str)) or value is None: + value = [value for _ in range(len(to_replace))] + rep_dict = dict(zip(to_replace, cast("Iterable[Optional[Union[float, str]]]", value))) + + if isinstance(subset, str): + subset = [subset] + + # Verify we were not passed in mixed type generics. + if not any( + all_of_type(rep_dict.keys()) + and all_of_type(x for x in rep_dict.values() if x is not None) + for all_of_type in [all_of_bool, all_of_str, all_of_numeric] + ): + raise PySparkValueError( + error_class="MIXED_TYPE_REPLACEMENT", + message_parameters={}, + ) + + if subset is None: + return DataFrame(self._jdf.na().replace("*", rep_dict), self.sparkSession) + else: + return DataFrame( + self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), + self.sparkSession, + ) + + @overload + def approxQuantile( + self, + col: str, + probabilities: Union[List[float], Tuple[float]], + relativeError: float, + ) -> List[float]: + ... 
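# replace() call-form sketch reflecting the reshaping logic above: a scalar or list
# is zipped into a replacement dict, a dict argument is used directly (its values
# may be None, meaning "replace with null"), and mixed-type replacements are
# rejected. Assumes `df` has a string column "name" and a numeric column "age".
df.replace("Alice", "Bob")                        # single value, all string columns
df.replace(["Alice", "Tom"], ["A", "T"], "name")  # parallel lists, limited to "name"
df.replace({10: 20, 30: None}, subset=["age"])    # dict form; 30 becomes null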
+ + @overload + def approxQuantile( + self, + col: Union[List[str], Tuple[str]], + probabilities: Union[List[float], Tuple[float]], + relativeError: float, + ) -> List[List[float]]: + ... + + def approxQuantile( + self, + col: Union[str, List[str], Tuple[str]], + probabilities: Union[List[float], Tuple[float]], + relativeError: float, + ) -> Union[List[float], List[List[float]]]: + if not isinstance(col, (str, list, tuple)): + raise PySparkTypeError( + error_class="NOT_LIST_OR_STR_OR_TUPLE", + message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, + ) + + isStr = isinstance(col, str) + + if isinstance(col, tuple): + col = list(col) + elif isStr: + col = [cast(str, col)] + + for c in col: + if not isinstance(c, str): + raise PySparkTypeError( + error_class="DISALLOWED_TYPE_FOR_CONTAINER", + message_parameters={ + "arg_name": "col", + "arg_type": type(col).__name__, + "allowed_types": "str", + "item_type": type(c).__name__, + }, + ) + col = _to_list(self._sc, cast(List["ColumnOrName"], col)) + + if not isinstance(probabilities, (list, tuple)): + raise PySparkTypeError( + error_class="NOT_LIST_OR_TUPLE", + message_parameters={ + "arg_name": "probabilities", + "arg_type": type(probabilities).__name__, + }, + ) + if isinstance(probabilities, tuple): + probabilities = list(probabilities) + for p in probabilities: + if not isinstance(p, (float, int)) or p < 0 or p > 1: + raise PySparkTypeError( + error_class="NOT_LIST_OF_FLOAT_OR_INT", + message_parameters={ + "arg_name": "probabilities", + "arg_type": type(p).__name__, + }, + ) + probabilities = _to_list(self._sc, cast(List["ColumnOrName"], probabilities)) + + if not isinstance(relativeError, (float, int)): + raise PySparkTypeError( + error_class="NOT_FLOAT_OR_INT", + message_parameters={ + "arg_name": "relativeError", + "arg_type": type(relativeError).__name__, + }, + ) + if relativeError < 0: + raise PySparkValueError( + error_class="NEGATIVE_VALUE", + message_parameters={ + "arg_name": "relativeError", + "arg_value": str(relativeError), + }, + ) + relativeError = float(relativeError) + + jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) + jaq_list = [list(j) for j in jaq] + return jaq_list[0] if isStr else jaq_list + + def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: + if not isinstance(col1, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__}, + ) + if not isinstance(col2, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__}, + ) + if not method: + method = "pearson" + if not method == "pearson": + raise PySparkValueError( + error_class="VALUE_NOT_PEARSON", + message_parameters={"arg_name": "method", "arg_value": method}, + ) + return self._jdf.stat().corr(col1, col2, method) + + def cov(self, col1: str, col2: str) -> float: + if not isinstance(col1, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__}, + ) + if not isinstance(col2, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__}, + ) + return self._jdf.stat().cov(col1, col2) + + def crosstab(self, col1: str, col2: str) -> ParentDataFrame: + if not isinstance(col1, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__}, + ) + if not 
isinstance(col2, str): + raise PySparkTypeError( + error_class="NOT_STR", + message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__}, + ) + return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sparkSession) + + def freqItems( + self, cols: Union[List[str], Tuple[str]], support: Optional[float] = None + ) -> ParentDataFrame: + if isinstance(cols, tuple): + cols = list(cols) + if not isinstance(cols, list): + raise PySparkTypeError( + error_class="NOT_LIST_OR_TUPLE", + message_parameters={"arg_name": "cols", "arg_type": type(cols).__name__}, + ) + if not support: + support = 0.01 + return DataFrame( + self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sparkSession + ) + + def _ipython_key_completions_(self) -> List[str]: + """Returns the names of columns in this :class:`DataFrame`. + + Examples + -------- + >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"]) + >>> df._ipython_key_completions_() + ['age', 'name'] + + Would return illegal identifiers. + >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age 1", "name?1"]) + >>> df._ipython_key_completions_() + ['age 1', 'name?1'] + """ + return self.columns + + def withColumns(self, *colsMap: Dict[str, Column]) -> ParentDataFrame: + # Below code is to help enable kwargs in future. + assert len(colsMap) == 1 + colsMap = colsMap[0] # type: ignore[assignment] + + if not isinstance(colsMap, dict): + raise PySparkTypeError( + error_class="NOT_DICT", + message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__}, + ) + + col_names = list(colsMap.keys()) + cols = list(colsMap.values()) + + return DataFrame( + self._jdf.withColumns(_to_seq(self._sc, col_names), self._jcols(*cols)), + self.sparkSession, + ) + + def withColumn(self, colName: str, col: Column) -> ParentDataFrame: + if not isinstance(col, Column): + raise PySparkTypeError( + error_class="NOT_COLUMN", + message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, + ) + return DataFrame(self._jdf.withColumn(colName, col._jc), self.sparkSession) + + def withColumnRenamed(self, existing: str, new: str) -> ParentDataFrame: + return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sparkSession) + + def withColumnsRenamed(self, colsMap: Dict[str, str]) -> ParentDataFrame: + if not isinstance(colsMap, dict): + raise PySparkTypeError( + error_class="NOT_DICT", + message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__}, + ) + + col_names: List[str] = [] + new_col_names: List[str] = [] + for k, v in colsMap.items(): + col_names.append(k) + new_col_names.append(v) + + return DataFrame( + self._jdf.withColumnsRenamed( + _to_seq(self._sc, col_names), _to_seq(self._sc, new_col_names) + ), + self.sparkSession, + ) + + def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> ParentDataFrame: + from py4j.java_gateway import JVMView + + if not isinstance(metadata, dict): + raise PySparkTypeError( + error_class="NOT_DICT", + message_parameters={"arg_name": "metadata", "arg_type": type(metadata).__name__}, + ) + sc = get_active_spark_context() + jmeta = cast(JVMView, sc._jvm).org.apache.spark.sql.types.Metadata.fromJson( + json.dumps(metadata) + ) + return DataFrame(self._jdf.withMetadata(columnName, jmeta), self.sparkSession) + + @overload + def drop(self, cols: "ColumnOrName") -> ParentDataFrame: + ... + + @overload + def drop(self, *cols: str) -> ParentDataFrame: + ... 
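# Stat-helper sketch for the wrappers above; assumes `df` has numeric columns
# "age" and "height". relativeError trades accuracy for speed (0.0 is exact).
quartiles = df.approxQuantile("age", [0.25, 0.5, 0.75], 0.01)
corr_val = df.stat.corr("age", "height")            # Pearson only
pairs = df.stat.crosstab("age", "height")           # contingency-table DataFrame
frequent = df.stat.freqItems(["age"], support=0.25)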
+ + def drop(self, *cols: "ColumnOrName") -> ParentDataFrame: # type: ignore[misc] + column_names: List[str] = [] + java_columns: List["JavaObject"] = [] + + for c in cols: + if isinstance(c, str): + column_names.append(c) + elif isinstance(c, Column): + java_columns.append(c._jc) + else: + raise PySparkTypeError( + error_class="NOT_COLUMN_OR_STR", + message_parameters={"arg_name": "col", "arg_type": type(c).__name__}, + ) + + jdf = self._jdf + if len(java_columns) > 0: + first_column, *remaining_columns = java_columns + jdf = jdf.drop(first_column, self._jseq(remaining_columns)) + if len(column_names) > 0: + jdf = jdf.drop(self._jseq(column_names)) + + return DataFrame(jdf, self.sparkSession) + + def toDF(self, *cols: str) -> ParentDataFrame: + for col in cols: + if not isinstance(col, str): + raise PySparkTypeError( + error_class="NOT_LIST_OF_STR", + message_parameters={"arg_name": "cols", "arg_type": type(col).__name__}, + ) + jdf = self._jdf.toDF(self._jseq(cols)) + return DataFrame(jdf, self.sparkSession) + + def transform( + self, func: Callable[..., ParentDataFrame], *args: Any, **kwargs: Any + ) -> ParentDataFrame: + result = func(self, *args, **kwargs) + assert isinstance( + result, DataFrame + ), "Func returned an instance of type [%s], " "should have been DataFrame." % type(result) + return result + + def sameSemantics(self, other: ParentDataFrame) -> bool: + if not isinstance(other, DataFrame): + raise PySparkTypeError( + error_class="NOT_DATAFRAME", + message_parameters={"arg_name": "other", "arg_type": type(other).__name__}, + ) + return self._jdf.sameSemantics(other._jdf) + + def semanticHash(self) -> int: + return self._jdf.semanticHash() + + def inputFiles(self) -> List[str]: + return list(self._jdf.inputFiles()) + + def where(self, condition: "ColumnOrName") -> ParentDataFrame: + return self.filter(condition) + + # Two aliases below were added for pandas compatibility many years ago. + # There are too many differences compared to pandas and we cannot just + # make it "compatible" by adding aliases. Therefore, we stop adding such + # aliases as of Spark 3.0. Two methods below remain just + # for legacy users currently. + @overload + def groupby(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": + ... + + @overload + def groupby(self, __cols: Union[List[Column], List[str], List[int]]) -> "GroupedData": + ... 
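# transform() sketch: chain reusable DataFrame-to-DataFrame functions without
# breaking method chaining; extra positional/keyword arguments are forwarded to the
# function. The helper name below is illustrative and assumes `df` has an "age"
# column.
from pyspark.sql import functions as sf

def with_age_bucket(input_df, width=10):
    return input_df.withColumn("age_bucket", sf.floor(sf.col("age") / width))

result = df.transform(with_age_bucket, width=5)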
+ + def groupby(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] + return self.groupBy(*cols) + + def writeTo(self, table: str) -> DataFrameWriterV2: + return DataFrameWriterV2(self, table) + + def pandas_api( + self, index_col: Optional[Union[str, List[str]]] = None + ) -> "PandasOnSparkDataFrame": + from pyspark.pandas.namespace import _get_index_map + from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame + from pyspark.pandas.internal import InternalFrame + + index_spark_columns, index_names = _get_index_map(self, index_col) + internal = InternalFrame( + spark_frame=self, + index_spark_columns=index_spark_columns, + index_names=index_names, # type: ignore[arg-type] + ) + return PandasOnSparkDataFrame(internal) + + def mapInPandas( + self, + func: "PandasMapIterFunction", + schema: Union[StructType, str], + barrier: bool = False, + profile: Optional[ResourceProfile] = None, + ) -> ParentDataFrame: + return PandasMapOpsMixin.mapInPandas(self, func, schema, barrier, profile) + + def mapInArrow( + self, + func: "ArrowMapIterFunction", + schema: Union[StructType, str], + barrier: bool = False, + profile: Optional[ResourceProfile] = None, + ) -> ParentDataFrame: + return PandasMapOpsMixin.mapInArrow(self, func, schema, barrier, profile) + + def toArrow(self) -> "pa.Table": + return PandasConversionMixin.toArrow(self) + + def toPandas(self) -> "PandasDataFrameLike": + return PandasConversionMixin.toPandas(self) + + @property + def executionInfo(self) -> Optional["ExecutionInfo"]: + raise PySparkValueError( + error_class="CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF", + message_parameters={"member": "queryExecution"}, + ) + + +def _to_scala_map(sc: "SparkContext", jm: Dict) -> "JavaObject": + """ + Convert a dict into a JVM Map. + """ + assert sc._jvm is not None + return sc._jvm.PythonUtils.toScalaMap(jm) + + +class DataFrameNaFunctions(ParentDataFrameNaFunctions): + def __init__(self, df: ParentDataFrame): + self.df = df + + def drop( + self, + how: str = "any", + thresh: Optional[int] = None, + subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, + ) -> ParentDataFrame: + return self.df.dropna(how=how, thresh=thresh, subset=subset) + + @overload + def fill(self, value: "LiteralType", subset: Optional[List[str]] = ...) -> ParentDataFrame: + ... + + @overload + def fill(self, value: Dict[str, "LiteralType"]) -> ParentDataFrame: + ... + + def fill( + self, + value: Union["LiteralType", Dict[str, "LiteralType"]], + subset: Optional[List[str]] = None, + ) -> ParentDataFrame: + return self.df.fillna(value=value, subset=subset) # type: ignore[arg-type] + + @overload + def replace( + self, + to_replace: List["LiteralType"], + value: List["OptionalPrimitiveType"], + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... + + @overload + def replace( + self, + to_replace: Dict["LiteralType", "OptionalPrimitiveType"], + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... + + @overload + def replace( + self, + to_replace: List["LiteralType"], + value: "OptionalPrimitiveType", + subset: Optional[List[str]] = ..., + ) -> ParentDataFrame: + ... 
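# mapInPandas sketch for the vectorized entry points above: the function receives an
# iterator of pandas DataFrames per partition and yields pandas DataFrames matching
# `schema`. Assumes pandas and PyArrow are installed and `df` has (name, age).
def double_age(batches):
    for pdf in batches:
        pdf["age"] = pdf["age"] * 2
        yield pdf

doubled = df.mapInPandas(double_age, schema="name string, age long")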
+ + def replace( # type: ignore[misc] + self, + to_replace: Union[List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"]], + value: Optional[ + Union["OptionalPrimitiveType", List["OptionalPrimitiveType"], _NoValueType] + ] = _NoValue, + subset: Optional[List[str]] = None, + ) -> ParentDataFrame: + return self.df.replace(to_replace, value, subset) # type: ignore[arg-type] + + +class DataFrameStatFunctions(ParentDataFrameStatFunctions): + def __init__(self, df: ParentDataFrame): + self.df = df + + @overload + def approxQuantile( + self, + col: str, + probabilities: Union[List[float], Tuple[float]], + relativeError: float, + ) -> List[float]: + ... + + @overload + def approxQuantile( + self, + col: Union[List[str], Tuple[str]], + probabilities: Union[List[float], Tuple[float]], + relativeError: float, + ) -> List[List[float]]: + ... + + def approxQuantile( + self, + col: Union[str, List[str], Tuple[str]], + probabilities: Union[List[float], Tuple[float]], + relativeError: float, + ) -> Union[List[float], List[List[float]]]: + return self.df.approxQuantile(col, probabilities, relativeError) + + def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: + return self.df.corr(col1, col2, method) + + def cov(self, col1: str, col2: str) -> float: + return self.df.cov(col1, col2) + + def crosstab(self, col1: str, col2: str) -> ParentDataFrame: + return self.df.crosstab(col1, col2) + + def freqItems(self, cols: List[str], support: Optional[float] = None) -> ParentDataFrame: + return self.df.freqItems(cols, support) + + def sampleBy( + self, col: str, fractions: Dict[Any, float], seed: Optional[int] = None + ) -> ParentDataFrame: + return self.df.sampleBy(col, fractions, seed) + + +def _test() -> None: + import doctest + from pyspark.sql import SparkSession + import pyspark.sql.dataframe + + # It inherits docstrings but doctests cannot detect them so we run + # the parent classe's doctests here directly. + globs = pyspark.sql.dataframe.__dict__.copy() + spark = ( + SparkSession.builder.master("local[4]").appName("sql.classic.dataframe tests").getOrCreate() + ) + globs["spark"] = spark + (failure_count, test_count) = doctest.testmod( + pyspark.sql.dataframe, + globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, + ) + spark.stop() + if failure_count: + sys.exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql/classic/window.py b/python/pyspark/sql/classic/window.py new file mode 100644 index 0000000000000..b5c528eec10a1 --- /dev/null +++ b/python/pyspark/sql/classic/window.py @@ -0,0 +1,146 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import sys +from typing import cast, Iterable, List, Tuple, TYPE_CHECKING, Union + +from pyspark.sql.window import ( + Window as ParentWindow, + WindowSpec as ParentWindowSpec, +) +from pyspark.sql.utils import get_active_spark_context + +if TYPE_CHECKING: + from py4j.java_gateway import JavaObject + from pyspark.sql._typing import ColumnOrName, ColumnOrName_ + + +__all__ = ["Window", "WindowSpec"] + + +def _to_java_cols(cols: Tuple[Union["ColumnOrName", List["ColumnOrName_"]], ...]) -> "JavaObject": + from pyspark.sql.classic.column import _to_seq, _to_java_column + + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] # type: ignore[assignment] + sc = get_active_spark_context() + return _to_seq(sc, cast(Iterable["ColumnOrName"], cols), _to_java_column) + + +class Window(ParentWindow): + @staticmethod + def partitionBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.partitionBy( + _to_java_cols(cols) + ) + return WindowSpec(jspec) + + @staticmethod + def orderBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.orderBy( + _to_java_cols(cols) + ) + return WindowSpec(jspec) + + @staticmethod + def rowsBetween(start: int, end: int) -> ParentWindowSpec: + from py4j.java_gateway import JVMView + + if start <= Window._PRECEDING_THRESHOLD: + start = Window.unboundedPreceding + if end >= Window._FOLLOWING_THRESHOLD: + end = Window.unboundedFollowing + sc = get_active_spark_context() + jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.rowsBetween( + start, end + ) + return WindowSpec(jspec) + + @staticmethod + def rangeBetween(start: int, end: int) -> ParentWindowSpec: + from py4j.java_gateway import JVMView + + if start <= Window._PRECEDING_THRESHOLD: + start = Window.unboundedPreceding + if end >= Window._FOLLOWING_THRESHOLD: + end = Window.unboundedFollowing + sc = get_active_spark_context() + jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.rangeBetween( + start, end + ) + return WindowSpec(jspec) + + +class WindowSpec(ParentWindowSpec): + def __new__(cls, jspec: "JavaObject") -> "WindowSpec": + self = object.__new__(cls) + self.__init__(jspec) # type: ignore[misc] + return self + + def __init__(self, jspec: "JavaObject") -> None: + self._jspec = jspec + + def partitionBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: + return WindowSpec(self._jspec.partitionBy(_to_java_cols(cols))) + + def orderBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: + return WindowSpec(self._jspec.orderBy(_to_java_cols(cols))) + + def rowsBetween(self, start: int, end: int) -> ParentWindowSpec: + if start <= Window._PRECEDING_THRESHOLD: + start = Window.unboundedPreceding + if end >= Window._FOLLOWING_THRESHOLD: + end = Window.unboundedFollowing + return WindowSpec(self._jspec.rowsBetween(start, end)) + + def rangeBetween(self, start: int, end: int) -> ParentWindowSpec: + if start <= Window._PRECEDING_THRESHOLD: + start = Window.unboundedPreceding + if end >= Window._FOLLOWING_THRESHOLD: + end = Window.unboundedFollowing + return WindowSpec(self._jspec.rangeBetween(start, end)) + + +def _test() -> None: + import doctest + from pyspark.sql import 
SparkSession + import pyspark.sql.window + + # It inherits docstrings but doctests cannot detect them so we run + # the parent classe's doctests here directly. + globs = pyspark.sql.window.__dict__.copy() + spark = ( + SparkSession.builder.master("local[4]").appName("sql.classic.window tests").getOrCreate() + ) + globs["spark"] = spark + (failure_count, test_count) = doctest.testmod( + pyspark.sql.window, + globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, + ) + spark.stop() + if failure_count: + sys.exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index fb266b03c2ffd..4ea621b626bb8 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -15,228 +15,28 @@ # limitations under the License. # +# mypy: disable-error-code="empty-body" + import sys -import json -import warnings -import inspect from typing import ( - cast, overload, Any, - Callable, - Iterable, - List, - Optional, - Tuple, TYPE_CHECKING, Union, ) -from pyspark.errors import PySparkAttributeError, PySparkTypeError, PySparkValueError +from pyspark.sql.utils import dispatch_col_method from pyspark.sql.types import DataType -from pyspark.sql.utils import get_active_spark_context +from pyspark.errors import PySparkValueError if TYPE_CHECKING: from py4j.java_gateway import JavaObject - from pyspark.core.context import SparkContext - from pyspark.sql._typing import ColumnOrName, LiteralType, DecimalLiteral, DateTimeLiteral + from pyspark.sql._typing import LiteralType, DecimalLiteral, DateTimeLiteral from pyspark.sql.window import WindowSpec __all__ = ["Column"] -def _create_column_from_literal(literal: Union["LiteralType", "DecimalLiteral"]) -> "Column": - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - return cast(JVMView, sc._jvm).functions.lit(literal) - - -def _create_column_from_name(name: str) -> "Column": - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - return cast(JVMView, sc._jvm).functions.col(name) - - -def _to_java_column(col: "ColumnOrName") -> "JavaObject": - if isinstance(col, Column): - jcol = col._jc - elif isinstance(col, str): - jcol = _create_column_from_name(col) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_STR", - message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, - ) - return jcol - - -def _to_java_expr(col: "ColumnOrName") -> "JavaObject": - return _to_java_column(col).expr() - - -@overload -def _to_seq(sc: "SparkContext", cols: Iterable["JavaObject"]) -> "JavaObject": - ... - - -@overload -def _to_seq( - sc: "SparkContext", - cols: Iterable["ColumnOrName"], - converter: Optional[Callable[["ColumnOrName"], "JavaObject"]], -) -> "JavaObject": - ... - - -def _to_seq( - sc: "SparkContext", - cols: Union[Iterable["ColumnOrName"], Iterable["JavaObject"]], - converter: Optional[Callable[["ColumnOrName"], "JavaObject"]] = None, -) -> "JavaObject": - """ - Convert a list of Columns (or names) into a JVM Seq of Column. - - An optional `converter` could be used to convert items in `cols` - into JVM Column objects. - """ - if converter: - cols = [converter(c) for c in cols] - assert sc._jvm is not None - return sc._jvm.PythonUtils.toSeq(cols) - - -def _to_list( - sc: "SparkContext", - cols: List["ColumnOrName"], - converter: Optional[Callable[["ColumnOrName"], "JavaObject"]] = None, -) -> "JavaObject": - """ - Convert a list of Columns (or names) into a JVM (Scala) List of Columns. 
- - An optional `converter` could be used to convert items in `cols` - into JVM Column objects. - """ - if converter: - cols = [converter(c) for c in cols] - assert sc._jvm is not None - return sc._jvm.PythonUtils.toList(cols) - - -def _unary_op( - name: str, - doc: str = "unary operator", -) -> Callable[["Column"], "Column"]: - """Create a method for given unary operator""" - - def _(self: "Column") -> "Column": - jc = getattr(self._jc, name)() - return Column(jc) - - _.__doc__ = doc - return _ - - -def _func_op(name: str, doc: str = "") -> Callable[["Column"], "Column"]: - def _(self: "Column") -> "Column": - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - jc = getattr(cast(JVMView, sc._jvm).functions, name)(self._jc) - return Column(jc) - - _.__doc__ = doc - return _ - - -def _bin_func_op( - name: str, - reverse: bool = False, - doc: str = "binary function", -) -> Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"]: - def _(self: "Column", other: Union["Column", "LiteralType", "DecimalLiteral"]) -> "Column": - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - fn = getattr(cast(JVMView, sc._jvm).functions, name) - jc = other._jc if isinstance(other, Column) else _create_column_from_literal(other) - njc = fn(self._jc, jc) if not reverse else fn(jc, self._jc) - return Column(njc) - - _.__doc__ = doc - return _ - - -def _bin_op( - name: str, - doc: str = "binary operator", -) -> Callable[ - ["Column", Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]], "Column" -]: - """Create a method for given binary operator""" - binary_operator_map = { - "plus": "+", - "minus": "-", - "divide": "/", - "multiply": "*", - "mod": "%", - "equalTo": "=", - "lt": "<", - "leq": "<=", - "geq": ">=", - "gt": ">", - "eqNullSafe": "<=>", - "bitwiseOR": "|", - "bitwiseAND": "&", - "bitwiseXOR": "^", - # Just following JVM rule even if the names of source and target are the same. - "and": "and", - "or": "or", - } - - def _( - self: "Column", - other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"], - ) -> "Column": - jc = other._jc if isinstance(other, Column) else other - if name in binary_operator_map: - from pyspark.sql import SparkSession - - spark = SparkSession._getActiveSessionOrCreate() - stack = list(reversed(inspect.stack())) - depth = int( - spark.conf.get("spark.sql.stackTracesInDataFrameContext") # type: ignore[arg-type] - ) - selected_frames = stack[:depth] - call_sites = [f"{frame.filename}:{frame.lineno}" for frame in selected_frames] - call_site_str = "\n".join(call_sites) - - njc = getattr(self._jc, "fn")(binary_operator_map[name], jc, name, call_site_str) - else: - njc = getattr(self._jc, name)(jc) - return Column(njc) - - _.__doc__ = doc - _.__name__ = name - return _ - - -def _reverse_op( - name: str, - doc: str = "binary operator", -) -> Callable[["Column", Union["LiteralType", "DecimalLiteral"]], "Column"]: - """Create a method for binary operator (this object is on right side)""" - - def _(self: "Column", other: Union["LiteralType", "DecimalLiteral"]) -> "Column": - jother = _create_column_from_literal(other) - jc = getattr(jother, name)(self._jc) - return Column(jc) - - _.__doc__ = doc - return _ - - class Column: """ @@ -268,146 +68,251 @@ class Column: Column<...> """ + # HACK ALERT!! this is to reduce the backward compatibility concern, and returns + # Spark Classic Column by default. This is NOT an API, and NOT supposed to + # be directly invoked. 
DO NOT use this constructor. + def __new__( + cls, + jc: "JavaObject", + ) -> "Column": + from pyspark.sql.classic.column import Column + + return Column.__new__(Column, jc) + def __init__(self, jc: "JavaObject") -> None: self._jc = jc # arithmetic operators - __neg__ = _func_op("negate") - __add__ = cast( - Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"], - _bin_op("plus"), - ) - __sub__ = cast( - Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"], - _bin_op("minus"), - ) - __mul__ = cast( - Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"], - _bin_op("multiply"), - ) - __div__ = cast( - Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"], - _bin_op("divide"), - ) - __truediv__ = cast( - Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"], - _bin_op("divide"), - ) - __mod__ = cast( - Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral"]], "Column"], - _bin_op("mod"), - ) - __radd__ = cast( - Callable[["Column", Union["LiteralType", "DecimalLiteral"]], "Column"], _bin_op("plus") - ) - __rsub__ = _reverse_op("minus") - __rmul__ = cast( - Callable[["Column", Union["LiteralType", "DecimalLiteral"]], "Column"], _bin_op("multiply") - ) - __rdiv__ = _reverse_op("divide") - __rtruediv__ = _reverse_op("divide") - __rmod__ = _reverse_op("mod") - - __pow__ = _bin_func_op("pow") - __rpow__ = cast( - Callable[["Column", Union["LiteralType", "DecimalLiteral"]], "Column"], - _bin_func_op("pow", reverse=True), - ) + @dispatch_col_method + def __neg__(self) -> "Column": + ... + + @dispatch_col_method + def __add__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __sub__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __mul__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __div__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __truediv__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __mod__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __radd__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __rsub__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __rmul__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __rdiv__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __rtruediv__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __rmod__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... 
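# --- Editor's note (illustrative aside, not part of the patch) ---
# The operator methods above are interface stubs: their bodies are `...` and
# `@dispatch_col_method` forwards each call to a concrete implementation. The
# decorator itself lives in pyspark.sql.utils and is not shown in this patch;
# the self-contained toy below (made-up names: `pick_backend`, `dispatch`,
# `Interface`, `UpperBackend`) only sketches the general stub-plus-dispatch
# shape under that assumption.
import functools
from typing import Any, Callable


def pick_backend() -> type:
    # A real dispatcher would choose between the classic (py4j-backed) and the
    # Spark Connect Column classes; the choice is hard-coded for this demo.
    return UpperBackend


def dispatch(f: Callable[..., Any]) -> Callable[..., Any]:
    @functools.wraps(f)
    def wrapped(*args: Any, **kwargs: Any) -> Any:
        # Resolve the same-named method on the chosen backend and call it.
        return getattr(pick_backend(), f.__name__)(*args, **kwargs)

    return wrapped


class Interface:
    @dispatch
    def shout(self, text: str) -> str:
        ...


class UpperBackend(Interface):
    def shout(self, text: str) -> str:
        return text.upper() + "!"


print(Interface().shout("hello"))  # prints: HELLO!
# --- End of editor's note ---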
+ + @dispatch_col_method + def __pow__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __rpow__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... # logistic operators + @dispatch_col_method def __eq__( # type: ignore[override] self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"], ) -> "Column": """binary function""" - return _bin_op("equalTo")(self, other) + ... + @dispatch_col_method def __ne__( # type: ignore[override] self, other: Any, ) -> "Column": """binary function""" - return _bin_op("notEqual")(self, other) + ... - __lt__ = _bin_op("lt") - __le__ = _bin_op("leq") - __ge__ = _bin_op("geq") - __gt__ = _bin_op("gt") + @dispatch_col_method + def __lt__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... - _eqNullSafe_doc = """ - Equality test that is safe for null values. + @dispatch_col_method + def __le__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... - .. versionadded:: 2.3.0 + @dispatch_col_method + def __ge__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... - .. versionchanged:: 3.4.0 - Supports Spark Connect. + @dispatch_col_method + def __gt__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... - Parameters - ---------- - other - a value or :class:`Column` + @dispatch_col_method + def eqNullSafe( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ + Equality test that is safe for null values. - Examples - -------- - >>> from pyspark.sql import Row - >>> df1 = spark.createDataFrame([ - ... Row(id=1, value='foo'), - ... Row(id=2, value=None) - ... ]) - >>> df1.select( - ... df1['value'] == 'foo', - ... df1['value'].eqNullSafe('foo'), - ... df1['value'].eqNullSafe(None) - ... ).show() - +-------------+---------------+----------------+ - |(value = foo)|(value <=> foo)|(value <=> NULL)| - +-------------+---------------+----------------+ - | true| true| false| - | NULL| false| true| - +-------------+---------------+----------------+ - >>> df2 = spark.createDataFrame([ - ... Row(value = 'bar'), - ... Row(value = None) - ... ]) - >>> df1.join(df2, df1["value"] == df2["value"]).count() - 0 - >>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count() - 1 - >>> df2 = spark.createDataFrame([ - ... Row(id=1, value=float('NaN')), - ... Row(id=2, value=42.0), - ... Row(id=3, value=None) - ... ]) - >>> df2.select( - ... df2['value'].eqNullSafe(None), - ... df2['value'].eqNullSafe(float('NaN')), - ... df2['value'].eqNullSafe(42.0) - ... ).show() - +----------------+---------------+----------------+ - |(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)| - +----------------+---------------+----------------+ - | false| true| false| - | false| false| true| - | true| false| false| - +----------------+---------------+----------------+ - - Notes - ----- - Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the - `NaN Semantics `_ - for details. - """ - eqNullSafe = _bin_op("eqNullSafe", _eqNullSafe_doc) + .. versionadded:: 2.3.0 + + .. versionchanged:: 3.4.0 + Supports Spark Connect. 
+ + Parameters + ---------- + other + a value or :class:`Column` + + Examples + -------- + >>> from pyspark.sql import Row + >>> df1 = spark.createDataFrame([ + ... Row(id=1, value='foo'), + ... Row(id=2, value=None) + ... ]) + >>> df1.select( + ... df1['value'] == 'foo', + ... df1['value'].eqNullSafe('foo'), + ... df1['value'].eqNullSafe(None) + ... ).show() + +-------------+---------------+----------------+ + |(value = foo)|(value <=> foo)|(value <=> NULL)| + +-------------+---------------+----------------+ + | true| true| false| + | NULL| false| true| + +-------------+---------------+----------------+ + >>> df2 = spark.createDataFrame([ + ... Row(value = 'bar'), + ... Row(value = None) + ... ]) + >>> df1.join(df2, df1["value"] == df2["value"]).count() + 0 + >>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count() + 1 + >>> df2 = spark.createDataFrame([ + ... Row(id=1, value=float('NaN')), + ... Row(id=2, value=42.0), + ... Row(id=3, value=None) + ... ]) + >>> df2.select( + ... df2['value'].eqNullSafe(None), + ... df2['value'].eqNullSafe(float('NaN')), + ... df2['value'].eqNullSafe(42.0) + ... ).show() + +----------------+---------------+----------------+ + |(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)| + +----------------+---------------+----------------+ + | false| true| false| + | false| false| true| + | true| false| false| + +----------------+---------------+----------------+ + + Notes + ----- + Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the + `NaN Semantics `_ + for details. + """ + ... # `and`, `or`, `not` cannot be overloaded in Python, # so use bitwise operators as boolean operators - __and__ = _bin_op("and") - __or__ = _bin_op("or") - __invert__ = _func_op("not") - __rand__ = _bin_op("and") - __ror__ = _bin_op("or") + @dispatch_col_method + def __and__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __or__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __invert__(self) -> "Column": + ... + + @dispatch_col_method + def __rand__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... + + @dispatch_col_method + def __ror__( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + ... # container operators + @dispatch_col_method def __contains__(self, item: Any) -> None: raise PySparkValueError( error_class="CANNOT_APPLY_IN_FOR_COLUMN", @@ -415,68 +320,82 @@ def __contains__(self, item: Any) -> None: ) # bitwise operators - _bitwiseOR_doc = """ - Compute bitwise OR of this expression with another expression. + @dispatch_col_method + def bitwiseOR( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ " + Compute bitwise OR of this expression with another expression. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Parameters - ---------- - other - a value or :class:`Column` to calculate bitwise or(|) with - this :class:`Column`. + Parameters + ---------- + other + a value or :class:`Column` to calculate bitwise or(|) with + this :class:`Column`. 
- Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=170, b=75)]) - >>> df.select(df.a.bitwiseOR(df.b)).collect() - [Row((a | b)=235)] - """ - _bitwiseAND_doc = """ - Compute bitwise AND of this expression with another expression. + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([Row(a=170, b=75)]) + >>> df.select(df.a.bitwiseOR(df.b)).collect() + [Row((a | b)=235)] + """ + ... - .. versionchanged:: 3.4.0 - Supports Spark Connect. + @dispatch_col_method + def bitwiseAND( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ + Compute bitwise AND of this expression with another expression. - Parameters - ---------- - other - a value or :class:`Column` to calculate bitwise and(&) with - this :class:`Column`. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=170, b=75)]) - >>> df.select(df.a.bitwiseAND(df.b)).collect() - [Row((a & b)=10)] - """ - _bitwiseXOR_doc = """ - Compute bitwise XOR of this expression with another expression. + Parameters + ---------- + other + a value or :class:`Column` to calculate bitwise and(&) with + this :class:`Column`. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([Row(a=170, b=75)]) + >>> df.select(df.a.bitwiseAND(df.b)).collect() + [Row((a & b)=10)] + """ + ... + + @dispatch_col_method + def bitwiseXOR( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ + Compute bitwise XOR of this expression with another expression. - Parameters - ---------- - other - a value or :class:`Column` to calculate bitwise xor(^) with - this :class:`Column`. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=170, b=75)]) - >>> df.select(df.a.bitwiseXOR(df.b)).collect() - [Row((a ^ b)=225)] - """ + Parameters + ---------- + other + a value or :class:`Column` to calculate bitwise xor(^) with + this :class:`Column`. - bitwiseOR = _bin_op("bitwiseOR", _bitwiseOR_doc) - bitwiseAND = _bin_op("bitwiseAND", _bitwiseAND_doc) - bitwiseXOR = _bin_op("bitwiseXOR", _bitwiseXOR_doc) + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([Row(a=170, b=75)]) + >>> df.select(df.a.bitwiseXOR(df.b)).collect() + [Row((a ^ b)=225)] + """ + ... + @dispatch_col_method def getItem(self, key: Any) -> "Column": """ An expression that gets an item at position ``ordinal`` out of a list, @@ -511,15 +430,9 @@ def getItem(self, key: Any) -> "Column": | 1| value| +----+------+ """ - if isinstance(key, Column): - warnings.warn( - "A column as 'key' in getItem is deprecated as of Spark 3.0, and will not " - "be supported in the future release. Use `column[key]` or `column.key` syntax " - "instead.", - FutureWarning, - ) - return self[key] + ... + @dispatch_col_method def getField(self, name: Any) -> "Column": """ An expression that gets a field by name in a :class:`StructType`. @@ -559,15 +472,9 @@ def getField(self, name: Any) -> "Column": | 1| +---+ """ - if isinstance(name, Column): - warnings.warn( - "A column as 'name' in getField is deprecated as of Spark 3.0, and will not " - "be supported in the future release. 
Use `column[name]` or `column.name` syntax " - "instead.", - FutureWarning, - ) - return self[name] + ... + @dispatch_col_method def withField(self, fieldName: str, col: "Column") -> "Column": """ An expression that adds/replaces a field in :class:`StructType` by name. @@ -609,20 +516,9 @@ def withField(self, fieldName: str, col: "Column") -> "Column": | 4| +---+ """ - if not isinstance(fieldName, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "fieldName", "arg_type": type(fieldName).__name__}, - ) - - if not isinstance(col, Column): - raise PySparkTypeError( - error_class="NOT_COLUMN", - message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, - ) - - return Column(self._jc.withField(fieldName, col._jc)) + ... + @dispatch_col_method def dropFields(self, *fieldNames: str) -> "Column": """ An expression that drops fields in :class:`StructType` by name. @@ -687,10 +583,9 @@ def dropFields(self, *fieldNames: str) -> "Column": +--------------+ """ - sc = get_active_spark_context() - jc = self._jc.dropFields(_to_seq(sc, fieldNames)) - return Column(jc) + ... + @dispatch_col_method def __getattr__(self, item: Any) -> "Column": """ An expression that gets an item at position ``ordinal`` out of a list, @@ -721,13 +616,9 @@ def __getattr__(self, item: Any) -> "Column": | value| +------+ """ - if item.startswith("__"): - raise PySparkAttributeError( - error_class="CANNOT_ACCESS_TO_DUNDER", - message_parameters={}, - ) - return self[item] + ... + @dispatch_col_method def __getitem__(self, k: Any) -> "Column": """ An expression that gets an item at position ``ordinal`` out of a list, @@ -759,85 +650,90 @@ def __getitem__(self, k: Any) -> "Column": | abc| value| +---------------+------+ """ - if isinstance(k, slice): - if k.step is not None: - raise PySparkValueError( - error_class="SLICE_WITH_STEP", - message_parameters={}, - ) - return self.substr(k.start, k.stop) - else: - return _bin_op("apply")(self, k) + ... + @dispatch_col_method def __iter__(self) -> None: - raise PySparkTypeError( - error_class="NOT_ITERABLE", message_parameters={"objectName": "Column"} - ) + ... # string methods - _contains_doc = """ - Contains the other element. Returns a boolean :class:`Column` based on a string match. + @dispatch_col_method + def contains( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ + Contains the other element. Returns a boolean :class:`Column` based on a string match. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionchanged:: 3.4.0 + Supports Spark Connect. + + Parameters + ---------- + other + string in line. A value as a literal or a :class:`Column`. - Parameters - ---------- - other - string in line. A value as a literal or a :class:`Column`. + Examples + -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) + >>> df.filter(df.name.contains('o')).collect() + [Row(age=5, name='Bob')] + """ + ... - Examples - -------- - >>> df = spark.createDataFrame( - ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) - >>> df.filter(df.name.contains('o')).collect() - [Row(age=5, name='Bob')] - """ - _startswith_doc = """ - String starts with. Returns a boolean :class:`Column` based on a string match. + @dispatch_col_method + def startswith( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ + String starts with. Returns a boolean :class:`Column` based on a string match. - .. 
versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Parameters - ---------- - other : :class:`Column` or str - string at start of line (do not use a regex `^`) + Parameters + ---------- + other : :class:`Column` or str + string at start of line (do not use a regex `^`) - Examples - -------- - >>> df = spark.createDataFrame( - ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) - >>> df.filter(df.name.startswith('Al')).collect() - [Row(age=2, name='Alice')] - >>> df.filter(df.name.startswith('^Al')).collect() - [] - """ - _endswith_doc = """ - String ends with. Returns a boolean :class:`Column` based on a string match. + Examples + -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) + >>> df.filter(df.name.startswith('Al')).collect() + [Row(age=2, name='Alice')] + >>> df.filter(df.name.startswith('^Al')).collect() + [] + """ + ... - .. versionchanged:: 3.4.0 - Supports Spark Connect. + @dispatch_col_method + def endswith( + self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> "Column": + """ + String ends with. Returns a boolean :class:`Column` based on a string match. - Parameters - ---------- - other : :class:`Column` or str - string at end of line (do not use a regex `$`) + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> df = spark.createDataFrame( - ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) - >>> df.filter(df.name.endswith('ice')).collect() - [Row(age=2, name='Alice')] - >>> df.filter(df.name.endswith('ice$')).collect() - [] - """ + Parameters + ---------- + other : :class:`Column` or str + string at end of line (do not use a regex `$`) - contains = _bin_op("contains", _contains_doc) - startswith = _bin_op("startsWith", _startswith_doc) - endswith = _bin_op("endsWith", _endswith_doc) + Examples + -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) + >>> df.filter(df.name.endswith('ice')).collect() + [Row(age=2, name='Alice')] + >>> df.filter(df.name.endswith('ice$')).collect() + [] + """ + ... + @dispatch_col_method def like(self: "Column", other: str) -> "Column": """ SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. @@ -867,9 +763,9 @@ def like(self: "Column", other: str) -> "Column": >>> df.filter(df.name.like('Al%')).collect() [Row(age=2, name='Alice')] """ - njc = getattr(self._jc, "like")(other) - return Column(njc) + ... + @dispatch_col_method def rlike(self: "Column", other: str) -> "Column": """ SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex @@ -896,9 +792,9 @@ def rlike(self: "Column", other: str) -> "Column": >>> df.filter(df.name.rlike('ice$')).collect() [Row(age=2, name='Alice')] """ - njc = getattr(self._jc, "rlike")(other) - return Column(njc) + ... + @dispatch_col_method def ilike(self: "Column", other: str) -> "Column": """ SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column` @@ -931,8 +827,7 @@ def ilike(self: "Column", other: str) -> "Column": >>> df.filter(df.name.ilike('%Ice')).collect() [Row(age=2, name='Alice')] """ - njc = getattr(self._jc, "ilike")(other) - return Column(njc) + ... @overload def substr(self, startPos: int, length: int) -> "Column": @@ -942,6 +837,7 @@ def substr(self, startPos: int, length: int) -> "Column": def substr(self, startPos: "Column", length: "Column") -> "Column": ... 
+ @dispatch_col_method def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column": """ Return a :class:`Column` which is a substring of the column. @@ -980,27 +876,9 @@ def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) - >>> df.select(df.name.substr(df.sidx, df.eidx).alias("col")).collect() [Row(col='ice'), Row(col='ob')] """ - if type(startPos) != type(length): - raise PySparkTypeError( - error_class="NOT_SAME_TYPE", - message_parameters={ - "arg_name1": "startPos", - "arg_name2": "length", - "arg_type1": type(startPos).__name__, - "arg_type2": type(length).__name__, - }, - ) - if isinstance(startPos, int): - jc = self._jc.substr(startPos, length) - elif isinstance(startPos, Column): - jc = self._jc.substr(startPos._jc, cast("Column", length)._jc) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_INT", - message_parameters={"arg_name": "startPos", "arg_type": type(startPos).__name__}, - ) - return Column(jc) + ... + @dispatch_col_method def isin(self, *cols: Any) -> "Column": """ A boolean expression that is evaluated to true if the value of this @@ -1054,164 +932,184 @@ def isin(self, *cols: Any) -> "Column": | 8|Mike| +---+----+ """ - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cast(Tuple, cols[0]) - cols = cast( - Tuple, - [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols], - ) - sc = get_active_spark_context() - jc = getattr(self._jc, "isin")(_to_seq(sc, cols)) - return Column(jc) + ... # order - _asc_doc = """ - Returns a sort expression based on the ascending order of the column. + @dispatch_col_method + def asc(self) -> "Column": + """ + Returns a sort expression based on the ascending order of the column. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.asc()).collect() - [Row(name='Alice'), Row(name='Tom')] - """ - _asc_nulls_first_doc = """ - Returns a sort expression based on ascending order of the column, and null values - return before non-null values. + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) + >>> df.select(df.name).orderBy(df.name.asc()).collect() + [Row(name='Alice'), Row(name='Tom')] + """ + ... - .. versionadded:: 2.4.0 + @dispatch_col_method + def asc_nulls_first(self) -> "Column": + """ + Returns a sort expression based on ascending order of the column, and null values + return before non-null values. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionadded:: 2.4.0 - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect() - [Row(name=None), Row(name='Alice'), Row(name='Tom')] + .. versionchanged:: 3.4.0 + Supports Spark Connect. - """ - _asc_nulls_last_doc = """ - Returns a sort expression based on ascending order of the column, and null values - appear after non-null values. + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame( + ... 
[('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) + >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect() + [Row(name=None), Row(name='Alice'), Row(name='Tom')] - .. versionadded:: 2.4.0 + """ + ... - .. versionchanged:: 3.4.0 - Supports Spark Connect. + @dispatch_col_method + def asc_nulls_last(self) -> "Column": + """ + Returns a sort expression based on ascending order of the column, and null values + appear after non-null values. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect() - [Row(name='Alice'), Row(name='Tom'), Row(name=None)] + .. versionadded:: 2.4.0 - """ - _desc_doc = """ - Returns a sort expression based on the descending order of the column. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - .. versionadded:: 2.4.0 + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame( + ... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) + >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect() + [Row(name='Alice'), Row(name='Tom'), Row(name=None)] - .. versionchanged:: 3.4.0 - Supports Spark Connect. + """ + ... - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.desc()).collect() - [Row(name='Tom'), Row(name='Alice')] - """ - _desc_nulls_first_doc = """ - Returns a sort expression based on the descending order of the column, and null values - appear before non-null values. + @dispatch_col_method + def desc(self) -> "Column": + """ + Returns a sort expression based on the descending order of the column. - .. versionadded:: 2.4.0 + .. versionadded:: 2.4.0 - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect() - [Row(name=None), Row(name='Tom'), Row(name='Alice')] + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) + >>> df.select(df.name).orderBy(df.name.desc()).collect() + [Row(name='Tom'), Row(name='Alice')] + """ + ... - """ - _desc_nulls_last_doc = """ - Returns a sort expression based on the descending order of the column, and null values - appear after non-null values. + @dispatch_col_method + def desc_nulls_first(self) -> "Column": + """ + Returns a sort expression based on the descending order of the column, and null values + appear before non-null values. - .. versionadded:: 2.4.0 + .. versionadded:: 2.4.0 - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect() - [Row(name='Tom'), Row(name='Alice'), Row(name=None)] - """ + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame( + ... 
[('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) + >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect() + [Row(name=None), Row(name='Tom'), Row(name='Alice')] - asc = _unary_op("asc", _asc_doc) - asc_nulls_first = _unary_op("asc_nulls_first", _asc_nulls_first_doc) - asc_nulls_last = _unary_op("asc_nulls_last", _asc_nulls_last_doc) - desc = _unary_op("desc", _desc_doc) - desc_nulls_first = _unary_op("desc_nulls_first", _desc_nulls_first_doc) - desc_nulls_last = _unary_op("desc_nulls_last", _desc_nulls_last_doc) + """ + ... - _isNull_doc = """ - True if the current expression is null. + @dispatch_col_method + def desc_nulls_last(self) -> "Column": + """ + Returns a sort expression based on the descending order of the column, and null values + appear after non-null values. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + .. versionadded:: 2.4.0 - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) - >>> df.filter(df.height.isNull()).collect() - [Row(name='Alice', height=None)] - """ - _isNotNull_doc = """ - True if the current expression is NOT null. + .. versionchanged:: 3.4.0 + Supports Spark Connect. - .. versionchanged:: 3.4.0 - Supports Spark Connect. + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame( + ... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) + >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect() + [Row(name='Tom'), Row(name='Alice'), Row(name=None)] + """ + ... - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) - >>> df.filter(df.height.isNotNull()).collect() - [Row(name='Tom', height=80)] - """ - _isNaN_doc = """ - True if the current expression is NaN. + @dispatch_col_method + def isNull(self) -> "Column": + """ + True if the current expression is null. - .. versionadded:: 4.0.0 + .. versionchanged:: 3.4.0 + Supports Spark Connect. - Examples - -------- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame( - ... [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))]) - >>> df.filter(df.height.isNaN()).collect() - [Row(name='Alice', height=nan)] - """ + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) + >>> df.filter(df.height.isNull()).collect() + [Row(name='Alice', height=None)] + """ + ... + + @dispatch_col_method + def isNotNull(self) -> "Column": + """ + True if the current expression is NOT null. + + .. versionchanged:: 3.4.0 + Supports Spark Connect. + + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) + >>> df.filter(df.height.isNotNull()).collect() + [Row(name='Tom', height=80)] + """ + ... + + @dispatch_col_method + def isNaN(self) -> "Column": + """ + True if the current expression is NaN. - isNull = _unary_op("isNull", _isNull_doc) - isNotNull = _unary_op("isNotNull", _isNotNull_doc) - isNaN = _unary_op("isNaN", _isNaN_doc) + .. versionadded:: 4.0.0 + Examples + -------- + >>> from pyspark.sql import Row + >>> df = spark.createDataFrame( + ... [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))]) + >>> df.filter(df.height.isNaN()).collect() + [Row(name='Alice', height=nan)] + """ + ... 
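# --- Editor's note (illustrative aside, not part of the patch) ---
# A minimal usage sketch of the Column API declared above. It assumes a local
# PySpark installation; the session and column names below are arbitrary. The
# same expressions work unchanged on classic and Connect sessions, because
# both concrete Column classes implement these stubs.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("column-api-sketch").getOrCreate()
df = spark.createDataFrame([(1, None), (2, 3.5)], ["a", "b"])
df.select(
    (df.a + 1).alias("a_plus_1"),          # __add__ / alias
    df.b.isNull().alias("b_is_null"),      # isNull
    ((df.a % 2) == 0).alias("a_is_even"),  # __mod__ / __eq__
).orderBy(df.a.desc_nulls_last()).show()   # desc_nulls_last
spark.stop()
# --- End of editor's note ---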
+ + @dispatch_col_method def alias(self, *alias: str, **kwargs: Any) -> "Column": """ Returns this column aliased with a new name or names (in the case of expressions that @@ -1251,34 +1149,18 @@ def alias(self, *alias: str, **kwargs: Any) -> "Column": >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max'] 99 """ + ... - metadata = kwargs.pop("metadata", None) - assert not kwargs, "Unexpected kwargs where passed: %s" % kwargs - - sc = get_active_spark_context() - if len(alias) == 1: - if metadata: - assert sc._jvm is not None - jmeta = sc._jvm.org.apache.spark.sql.types.Metadata.fromJson(json.dumps(metadata)) - return Column(getattr(self._jc, "as")(alias[0], jmeta)) - else: - return Column(getattr(self._jc, "as")(alias[0])) - else: - if metadata is not None: - raise PySparkValueError( - error_class="ONLY_ALLOWED_FOR_SINGLE_COLUMN", - message_parameters={"arg_name": "metadata"}, - ) - return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias)))) - + @dispatch_col_method def name(self, *alias: str, **kwargs: Any) -> "Column": """ :func:`name` is an alias for :func:`alias`. .. versionadded:: 2.0.0 """ - return self.alias(*alias, **kwargs) + ... + @dispatch_col_method def cast(self, dataType: Union[DataType, str]) -> "Column": """ Casts the column into type ``dataType``. @@ -1309,21 +1191,9 @@ def cast(self, dataType: Union[DataType, str]) -> "Column": >>> df.select(df.age.cast(StringType()).alias('ages')).collect() [Row(ages='2'), Row(ages='5')] """ - if isinstance(dataType, str): - jc = self._jc.cast(dataType) - elif isinstance(dataType, DataType): - from pyspark.sql import SparkSession - - spark = SparkSession._getActiveSessionOrCreate() - jdt = spark._jsparkSession.parseDataType(dataType.json()) - jc = self._jc.cast(jdt) - else: - raise PySparkTypeError( - error_class="NOT_DATATYPE_OR_STR", - message_parameters={"arg_name": "dataType", "arg_type": type(dataType).__name__}, - ) - return Column(jc) + ... + @dispatch_col_method def try_cast(self, dataType: Union[DataType, str]) -> "Column": """ This is a special version of `cast` that performs the same operation, but returns a NULL @@ -1371,29 +1241,18 @@ def try_cast(self, dataType: Union[DataType, str]) -> "Column": | NULL| +-----+ """ - if isinstance(dataType, str): - jc = self._jc.try_cast(dataType) - elif isinstance(dataType, DataType): - from pyspark.sql import SparkSession - - spark = SparkSession._getActiveSessionOrCreate() - jdt = spark._jsparkSession.parseDataType(dataType.json()) - jc = self._jc.try_cast(jdt) - else: - raise PySparkTypeError( - error_class="NOT_DATATYPE_OR_STR", - message_parameters={"arg_name": "dataType", "arg_type": type(dataType).__name__}, - ) - return Column(jc) + ... + @dispatch_col_method def astype(self, dataType: Union[DataType, str]) -> "Column": """ :func:`astype` is an alias for :func:`cast`. .. versionadded:: 1.4.0 """ - return self.cast(dataType) + ... + @dispatch_col_method def between( self, lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"], @@ -1501,8 +1360,9 @@ def between( | Bob| true| +-----+------------------------------------------------------------------+ """ - return (self >= lowerBound) & (self <= upperBound) + ... + @dispatch_col_method def when(self, condition: "Column", value: Any) -> "Column": """ Evaluates a list of conditions and returns one of multiple possible result expressions. 
@@ -1576,15 +1436,9 @@ def when(self, condition: "Column", value: Any) -> "Column": -------- pyspark.sql.functions.when """ - if not isinstance(condition, Column): - raise PySparkTypeError( - error_class="NOT_COLUMN", - message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__}, - ) - v = value._jc if isinstance(value, Column) else value - jc = self._jc.when(condition._jc, v) - return Column(jc) + ... + @dispatch_col_method def otherwise(self, value: Any) -> "Column": """ Evaluates a list of conditions and returns one of multiple possible result expressions. @@ -1622,10 +1476,9 @@ def otherwise(self, value: Any) -> "Column": -------- pyspark.sql.functions.when """ - v = value._jc if isinstance(value, Column) else value - jc = self._jc.otherwise(v) - return Column(jc) + ... + @dispatch_col_method def over(self, window: "WindowSpec") -> "Column": """ Define a windowing column. @@ -1666,26 +1519,19 @@ def over(self, window: "WindowSpec") -> "Column": | 2|Alice| 1| 2| +---+-----+----+---+ """ - from pyspark.sql.window import WindowSpec - - if not isinstance(window, WindowSpec): - raise PySparkTypeError( - error_class="NOT_WINDOWSPEC", - message_parameters={"arg_name": "window", "arg_type": type(window).__name__}, - ) - jc = self._jc.over(window._jspec) - return Column(jc) + ... + @dispatch_col_method def __nonzero__(self) -> None: - raise PySparkValueError( - error_class="CANNOT_CONVERT_COLUMN_INTO_BOOL", - message_parameters={}, - ) + ... - __bool__ = __nonzero__ + @dispatch_col_method + def __bool__(self) -> None: + ... + @dispatch_col_method def __repr__(self) -> str: - return "Column<'%s'>" % self._jc.toString() + ... def _test() -> None: diff --git a/python/pyspark/sql/connect/_typing.py b/python/pyspark/sql/connect/_typing.py index 1b8516427dbdd..806476af1eb60 100644 --- a/python/pyspark/sql/connect/_typing.py +++ b/python/pyspark/sql/connect/_typing.py @@ -15,19 +15,20 @@ # limitations under the License. 
# from types import FunctionType -from typing import Any, Callable, Iterable, Union, Optional, NewType, Protocol, Tuple +from typing import Any, Callable, Iterable, Union, Optional, NewType, Protocol, Tuple, TypeVar import datetime import decimal import pyarrow from pandas.core.frame import DataFrame as PandasDataFrame -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.types import DataType from pyspark.sql.streaming.state import GroupState ColumnOrName = Union[Column, str] +ColumnOrName_ = TypeVar("ColumnOrName_", bound=ColumnOrName) ColumnOrNameOrOrdinal = Union[Column, str, int] diff --git a/python/pyspark/sql/connect/avro/functions.py b/python/pyspark/sql/connect/avro/functions.py index 43088333b1086..da350f92a531a 100644 --- a/python/pyspark/sql/connect/avro/functions.py +++ b/python/pyspark/sql/connect/avro/functions.py @@ -19,6 +19,7 @@ A collections of builtin avro functions """ +from pyspark.errors import PySparkTypeError from pyspark.sql.connect.utils import check_dependencies check_dependencies(__name__) @@ -26,8 +27,7 @@ from typing import Dict, Optional, TYPE_CHECKING from pyspark.sql.avro import functions as PyAvroFunctions - -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.functions.builtin import _invoke_function, _to_col, _options_to_col, lit if TYPE_CHECKING: @@ -37,6 +37,25 @@ def from_avro( data: "ColumnOrName", jsonFormatSchema: str, options: Optional[Dict[str, str]] = None ) -> Column: + if not isinstance(data, (Column, str)): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={ + "arg_name": "data", + "arg_type": "pyspark.sql.Column or str", + }, + ) + if not isinstance(jsonFormatSchema, str): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={"arg_name": "jsonFormatSchema", "arg_type": "str"}, + ) + if options is not None and not isinstance(options, dict): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={"arg_name": "options", "arg_type": "dict, optional"}, + ) + if options is None: return _invoke_function("from_avro", _to_col(data), lit(jsonFormatSchema)) else: @@ -49,6 +68,20 @@ def from_avro( def to_avro(data: "ColumnOrName", jsonFormatSchema: str = "") -> Column: + if not isinstance(data, (Column, str)): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={ + "arg_name": "data", + "arg_type": "pyspark.sql.Column or str", + }, + ) + if not isinstance(jsonFormatSchema, str): + raise PySparkTypeError( + error_class="INVALID_TYPE", + message_parameters={"arg_name": "jsonFormatSchema", "arg_type": "str"}, + ) + if jsonFormatSchema == "": return _invoke_function("to_avro", _to_col(data)) else: @@ -80,15 +113,8 @@ def _test() -> None: import doctest from pyspark.sql import SparkSession as PySparkSession import pyspark.sql.connect.avro.functions - from pyspark.util import is_remote_only globs = pyspark.sql.connect.avro.functions.__dict__.copy() - - # TODO(SPARK-47760): Reeanble Avro function doctests - if is_remote_only(): - del pyspark.sql.connect.avro.functions.from_avro - del pyspark.sql.connect.avro.functions.to_avro - globs["spark"] = ( PySparkSession.builder.appName("sql.connect.avro.functions tests") .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]")) diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 0bdfb4bb7910d..e91324150cbd8 100644 --- 
a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -61,12 +61,17 @@ from pyspark.loose_version import LooseVersion from pyspark.version import __version__ from pyspark.resource.information import ResourceInformation +from pyspark.sql.metrics import MetricValue, PlanMetrics, ExecutionInfo, ObservedMetrics from pyspark.sql.connect.client.artifact import ArtifactManager from pyspark.sql.connect.client.logging import logger from pyspark.sql.connect.profiler import ConnectProfilerCollector from pyspark.sql.connect.client.reattach import ExecutePlanResponseReattachableIterator from pyspark.sql.connect.client.retries import RetryPolicy, Retrying, DefaultPolicy -from pyspark.sql.connect.conversion import storage_level_to_proto, proto_to_storage_level +from pyspark.sql.connect.conversion import ( + storage_level_to_proto, + proto_to_storage_level, + proto_to_remote_cached_dataframe, +) import pyspark.sql.connect.proto as pb2 import pyspark.sql.connect.proto.base_pb2_grpc as grpc_lib import pyspark.sql.connect.types as types @@ -443,56 +448,7 @@ def toChannel(self) -> grpc.Channel: return self._secure_channel(self.endpoint, creds) -class MetricValue: - def __init__(self, name: str, value: Union[int, float], type: str): - self._name = name - self._type = type - self._value = value - - def __repr__(self) -> str: - return f"<{self._name}={self._value} ({self._type})>" - - @property - def name(self) -> str: - return self._name - - @property - def value(self) -> Union[int, float]: - return self._value - - @property - def metric_type(self) -> str: - return self._type - - -class PlanMetrics: - def __init__(self, name: str, id: int, parent: int, metrics: List[MetricValue]): - self._name = name - self._id = id - self._parent_id = parent - self._metrics = metrics - - def __repr__(self) -> str: - return f"Plan({self._name})={self._metrics}" - - @property - def name(self) -> str: - return self._name - - @property - def plan_id(self) -> int: - return self._id - - @property - def parent_plan_id(self) -> int: - return self._parent_id - - @property - def metrics(self) -> List[MetricValue]: - return self._metrics - - -class PlanObservedMetrics: +class PlanObservedMetrics(ObservedMetrics): def __init__(self, name: str, metrics: List[pb2.Expression.Literal], keys: List[str]): self._name = name self._metrics = metrics @@ -509,6 +465,13 @@ def name(self) -> str: def metrics(self) -> List[pb2.Expression.Literal]: return self._metrics + @property + def pairs(self) -> dict[str, Any]: + result = {} + for x in range(len(self._metrics)): + result[self.keys[x]] = LiteralExpression._to_value(self.metrics[x]) + return result + @property def keys(self) -> List[str]: return self._keys @@ -655,12 +618,7 @@ def __init__( use_reattachable_execute: bool Enable reattachable execution. """ - - class ClientThreadLocals(threading.local): - tags: set = set() - inside_error_handling: bool = False - - self.thread_local = ClientThreadLocals() + self.thread_local = threading.local() # Parse the connection string. 
self._builder = ( @@ -889,7 +847,7 @@ def _resources(self) -> Dict[str, ResourceInformation]: logger.info("Fetching the resources") cmd = pb2.Command() cmd.get_resources_command.SetInParent() - (_, properties) = self.execute_command(cmd) + (_, properties, _) = self.execute_command(cmd) resources = properties["get_resources_command_result"] return resources @@ -916,18 +874,23 @@ def to_table_as_iterator( def to_table( self, plan: pb2.Plan, observations: Dict[str, Observation] - ) -> Tuple["pa.Table", Optional[StructType]]: + ) -> Tuple["pa.Table", Optional[StructType], ExecutionInfo]: """ Return given plan as a PyArrow Table. """ logger.info(f"Executing plan {self._proto_to_string(plan)}") req = self._execute_plan_request_with_metadata() req.plan.CopyFrom(plan) - table, schema, _, _, _ = self._execute_and_fetch(req, observations) + table, schema, metrics, observed_metrics, _ = self._execute_and_fetch(req, observations) + + # Create a query execution object. + ei = ExecutionInfo(metrics, observed_metrics) assert table is not None - return table, schema + return table, schema, ei - def to_pandas(self, plan: pb2.Plan, observations: Dict[str, Observation]) -> "pd.DataFrame": + def to_pandas( + self, plan: pb2.Plan, observations: Dict[str, Observation] + ) -> Tuple["pd.DataFrame", "ExecutionInfo"]: """ Return given plan as a pandas DataFrame. """ @@ -942,6 +905,7 @@ def to_pandas(self, plan: pb2.Plan, observations: Dict[str, Observation]) -> "pd req, observations, self_destruct=self_destruct ) assert table is not None + ei = ExecutionInfo(metrics, observed_metrics) schema = schema or from_arrow_schema(table.schema, prefer_timestamp_ntz=True) assert schema is not None and isinstance(schema, StructType) @@ -1008,7 +972,7 @@ def to_pandas(self, plan: pb2.Plan, observations: Dict[str, Observation]) -> "pd pdf.attrs["metrics"] = metrics if len(observed_metrics) > 0: pdf.attrs["observed_metrics"] = observed_metrics - return pdf + return pdf, ei def _proto_to_string(self, p: google.protobuf.message.Message) -> str: """ @@ -1052,7 +1016,7 @@ def explain_string(self, plan: pb2.Plan, explain_mode: str = "extended") -> str: def execute_command( self, command: pb2.Command, observations: Optional[Dict[str, Observation]] = None - ) -> Tuple[Optional[pd.DataFrame], Dict[str, Any]]: + ) -> Tuple[Optional[pd.DataFrame], Dict[str, Any], ExecutionInfo]: """ Execute given command. """ @@ -1061,11 +1025,15 @@ def execute_command( if self._user_id: req.user_context.user_id = self._user_id req.plan.command.CopyFrom(command) - data, _, _, _, properties = self._execute_and_fetch(req, observations or {}) + data, _, metrics, observed_metrics, properties = self._execute_and_fetch( + req, observations or {} + ) + # Create a query execution object. 
+ ei = ExecutionInfo(metrics, observed_metrics) if data is not None: - return (data.to_pandas(), properties) + return (data.to_pandas(), properties, ei) else: - return (None, properties) + return (None, properties, ei) def execute_command_as_iterator( self, command: pb2.Command, observations: Optional[Dict[str, Observation]] = None @@ -1400,6 +1368,12 @@ def handle_response( if b.HasField("create_resource_profile_command_result"): profile_id = b.create_resource_profile_command_result.profile_id yield {"create_resource_profile_command_result": profile_id} + if b.HasField("checkpoint_command_result"): + yield { + "checkpoint_command_result": proto_to_remote_cached_dataframe( + b.checkpoint_command_result.relation + ) + } try: if self._use_reattachable_execute: @@ -1683,7 +1657,7 @@ def _handle_error(self, error: Exception) -> NoReturn: Throws the appropriate internal Python exception. """ - if self.thread_local.inside_error_handling: + if getattr(self.thread_local, "inside_error_handling", False): # We are already inside error handling routine, # avoid recursive error processing (with potentially infinite recursion) raise error @@ -1763,6 +1737,9 @@ def _handle_rpc_error(self, rpc_error: grpc.RpcError) -> NoReturn: info = error_details_pb2.ErrorInfo() d.Unpack(info) + if info.metadata["errorClass"] == "INVALID_HANDLE.SESSION_CHANGED": + self._closed = True + raise convert_exception( info, status.message, @@ -1826,6 +1803,7 @@ def _verify_response_integrity( response.server_side_session_id and response.server_side_session_id != self._server_session_id ): + self._closed = True raise PySparkAssertionError( "Received incorrect server side session identifier for request. " "Please create a new Spark Session to reconnect. (" @@ -1840,6 +1818,6 @@ def _create_profile(self, profile: pb2.ResourceProfile) -> int: logger.info("Creating the ResourceProfile") cmd = pb2.Command() cmd.create_resource_profile_command.profile.CopyFrom(profile) - (_, properties) = self.execute_command(cmd) + (_, properties, _) = self.execute_command(cmd) profile_id = properties["create_resource_profile_command_result"] return profile_id diff --git a/python/pyspark/sql/connect/client/reattach.py b/python/pyspark/sql/connect/client/reattach.py index 4468582ca80ea..82c7ae9772188 100644 --- a/python/pyspark/sql/connect/client/reattach.py +++ b/python/pyspark/sql/connect/client/reattach.py @@ -58,7 +58,20 @@ class ExecutePlanResponseReattachableIterator(Generator): # Lock to manage the pool _lock: ClassVar[RLock] = RLock() - _release_thread_pool: Optional[ThreadPool] = ThreadPool(os.cpu_count() if os.cpu_count() else 8) + _release_thread_pool_instance: Optional[ThreadPool] = None + + @classmethod # type: ignore[misc] + @property + def _release_thread_pool(cls) -> ThreadPool: + # Perform a first check outside the critical path. + if cls._release_thread_pool_instance is not None: + return cls._release_thread_pool_instance + with cls._lock: + if cls._release_thread_pool_instance is None: + cls._release_thread_pool_instance = ThreadPool( + os.cpu_count() if os.cpu_count() else 8 + ) + return cls._release_thread_pool_instance @classmethod def shutdown(cls: Type["ExecutePlanResponseReattachableIterator"]) -> None: @@ -67,19 +80,10 @@ def shutdown(cls: Type["ExecutePlanResponseReattachableIterator"]) -> None: outstanding calls are closed. 
""" with cls._lock: - if cls._release_thread_pool is not None: - cls._release_thread_pool.close() - cls._release_thread_pool.join() - cls._release_thread_pool = None - - @classmethod - def _initialize_pool_if_necessary(cls: Type["ExecutePlanResponseReattachableIterator"]) -> None: - """ - If the processing pool for the release calls is None, initialize the pool exactly once. - """ - with cls._lock: - if cls._release_thread_pool is None: - cls._release_thread_pool = ThreadPool(os.cpu_count() if os.cpu_count() else 8) + if cls._release_thread_pool_instance is not None: + cls._release_thread_pool.close() # type: ignore[attr-defined] + cls._release_thread_pool.join() # type: ignore[attr-defined] + cls._release_thread_pool_instance = None def __init__( self, @@ -88,7 +92,7 @@ def __init__( retrying: Callable[[], Retrying], metadata: Iterable[Tuple[str, str]], ): - ExecutePlanResponseReattachableIterator._initialize_pool_if_necessary() + self._release_thread_pool # Trigger initialization self._request = request self._retrying = retrying if request.operation_id: @@ -206,8 +210,9 @@ def target() -> None: except Exception as e: warnings.warn(f"ReleaseExecute failed with exception: {e}.") - if ExecutePlanResponseReattachableIterator._release_thread_pool is not None: - ExecutePlanResponseReattachableIterator._release_thread_pool.apply_async(target) + with self._lock: + if self._release_thread_pool_instance is not None: + self._release_thread_pool.apply_async(target) def _release_all(self) -> None: """ @@ -230,8 +235,9 @@ def target() -> None: except Exception as e: warnings.warn(f"ReleaseExecute failed with exception: {e}.") - if ExecutePlanResponseReattachableIterator._release_thread_pool is not None: - ExecutePlanResponseReattachableIterator._release_thread_pool.apply_async(target) + with self._lock: + if self._release_thread_pool_instance is not None: + self._release_thread_pool.apply_async(target) self._result_complete = True def _call_iter(self, iter_fun: Callable) -> Any: @@ -254,7 +260,10 @@ def _call_iter(self, iter_fun: Callable) -> Any: return iter_fun() except grpc.RpcError as e: status = rpc_status.from_call(cast(grpc.Call, e)) - if status is not None and "INVALID_HANDLE.OPERATION_NOT_FOUND" in status.message: + if status is not None and ( + "INVALID_HANDLE.OPERATION_NOT_FOUND" in status.message + or "INVALID_HANDLE.SESSION_NOT_FOUND" in status.message + ): if self._last_returned_response_id is not None: raise PySparkRuntimeError( error_class="RESPONSE_ALREADY_RECEIVED", diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py index 4436b36907a96..ef48091a35b0c 100644 --- a/python/pyspark/sql/connect/column.py +++ b/python/pyspark/sql/connect/column.py @@ -24,16 +24,14 @@ from typing import ( TYPE_CHECKING, - Callable, Any, Union, - overload, Optional, ) +from pyspark.sql.column import Column as ParentColumn from pyspark.errors import PySparkTypeError, PySparkAttributeError, PySparkValueError from pyspark.sql.types import DataType -from pyspark.sql.column import Column as PySparkColumn import pyspark.sql.connect.proto as proto from pyspark.sql.connect.expressions import ( @@ -48,6 +46,7 @@ WithField, DropField, ) +from pyspark.errors.utils import with_origin_to_class if TYPE_CHECKING: @@ -60,53 +59,57 @@ from pyspark.sql.connect.window import WindowSpec -def _func_op(name: str, doc: Optional[str] = "") -> Callable[["Column"], "Column"]: - def wrapped(self: "Column") -> "Column": - return Column(UnresolvedFunction(name, [self._expr])) - - wrapped.__doc__ = 
doc - return wrapped +def _func_op(name: str, self: ParentColumn) -> ParentColumn: + return Column(UnresolvedFunction(name, [self._expr])) # type: ignore[list-item] def _bin_op( - name: str, doc: Optional[str] = "binary function", reverse: bool = False -) -> Callable[["Column", Any], "Column"]: - def wrapped(self: "Column", other: Any) -> "Column": - if other is None or isinstance( - other, - ( - bool, - float, - int, - str, - datetime.datetime, - datetime.date, - decimal.Decimal, - datetime.timedelta, - ), - ): - other_expr = LiteralExpression._from_value(other) - else: - other_expr = other._expr - - if not reverse: - return Column(UnresolvedFunction(name, [self._expr, other_expr])) - else: - return Column(UnresolvedFunction(name, [other_expr, self._expr])) - - wrapped.__doc__ = doc - return wrapped - - -def _unary_op(name: str, doc: Optional[str] = "unary function") -> Callable[["Column"], "Column"]: - def wrapped(self: "Column") -> "Column": - return Column(UnresolvedFunction(name, [self._expr])) - - wrapped.__doc__ = doc - return wrapped - + name: str, + self: ParentColumn, + other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"], + reverse: bool = False, +) -> ParentColumn: + if other is None or isinstance( + other, + ( + bool, + float, + int, + str, + datetime.datetime, + datetime.date, + decimal.Decimal, + datetime.timedelta, + ), + ): + other_expr = LiteralExpression._from_value(other) + else: + other_expr = other._expr # type: ignore[assignment] + + if not reverse: + return Column(UnresolvedFunction(name, [self._expr, other_expr])) # type: ignore[list-item] + else: + return Column(UnresolvedFunction(name, [other_expr, self._expr])) # type: ignore[list-item] + + +def _unary_op(name: str, self: ParentColumn) -> ParentColumn: + return Column(UnresolvedFunction(name, [self._expr])) # type: ignore[list-item] + + +def _to_expr(v: Any) -> Expression: + return v._expr if isinstance(v, Column) else LiteralExpression._from_value(v) + + +@with_origin_to_class +class Column(ParentColumn): + def __new__( + cls, + expr: "Expression", + ) -> "Column": + self = object.__new__(cls) + self.__init__(expr) # type: ignore[misc] + return self -class Column: def __init__(self, expr: "Expression") -> None: if not isinstance(expr, Expression): raise PySparkTypeError( @@ -115,36 +118,128 @@ def __init__(self, expr: "Expression") -> None: ) self._expr = expr - __gt__ = _bin_op(">") - __lt__ = _bin_op("<") - __add__ = _bin_op("+") - __sub__ = _bin_op("-") - __mul__ = _bin_op("*") - __div__ = _bin_op("/") - __truediv__ = _bin_op("/") - __mod__ = _bin_op("%") - __radd__ = _bin_op("+", reverse=True) - __rsub__ = _bin_op("-", reverse=True) - __rmul__ = _bin_op("*", reverse=True) - __rdiv__ = _bin_op("/", reverse=True) - __rtruediv__ = _bin_op("/", reverse=True) - __rmod__ = _bin_op("%", reverse=True) - __pow__ = _bin_op("power") - __rpow__ = _bin_op("power", reverse=True) - __ge__ = _bin_op(">=") - __le__ = _bin_op("<=") - - eqNullSafe = _bin_op("<=>", PySparkColumn.eqNullSafe.__doc__) - - __neg__ = _func_op("negative") + def __gt__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op(">", self, other) + + def __lt__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("<", self, other) + + def __add__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("+", self, other) + 
+ def __sub__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("-", self, other) + + def __mul__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("*", self, other) + + def __div__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("/", self, other) + + def __truediv__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("/", self, other) + + def __mod__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("%", self, other) + + def __radd__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("+", self, other, reverse=True) + + def __rsub__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("-", self, other, reverse=True) + + def __rmul__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("*", self, other, reverse=True) + + def __rdiv__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("/", self, other, reverse=True) + + def __rtruediv__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("/", self, other, reverse=True) + + def __rmod__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("%", self, other, reverse=True) + + def __pow__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("power", self, other) + + def __rpow__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("power", self, other, reverse=True) + + def __ge__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op(">=", self, other) + + def __le__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("<=", self, other) + + def eqNullSafe( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("<=>", self, other) + + def __neg__(self) -> ParentColumn: + return _func_op("negative", self) # `and`, `or`, `not` cannot be overloaded in Python, # so use bitwise operators as boolean operators - __and__ = _bin_op("and") - __or__ = _bin_op("or") - __invert__ = _func_op("not") - __rand__ = _bin_op("and") - __ror__ = _bin_op("or") + def __and__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("and", self, other) + + def __or__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("or", self, other) + + def __invert__(self) -> ParentColumn: + return _func_op("not", self) + + def __rand__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("and", self, other) + 
+ def __ror__( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("or", self, other) # container operators def __contains__(self, item: Any) -> None: @@ -154,27 +249,53 @@ def __contains__(self, item: Any) -> None: ) # bitwise operators - bitwiseOR = _bin_op("|", PySparkColumn.bitwiseOR.__doc__) - bitwiseAND = _bin_op("&", PySparkColumn.bitwiseAND.__doc__) - bitwiseXOR = _bin_op("^", PySparkColumn.bitwiseXOR.__doc__) + def bitwiseOR( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("|", self, other) + + def bitwiseAND( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("&", self, other) + + def bitwiseXOR( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("^", self, other) + + def isNull(self) -> ParentColumn: + return _unary_op("isNull", self) + + def isNotNull(self) -> ParentColumn: + return _unary_op("isNotNull", self) - isNull = _unary_op("isnull", PySparkColumn.isNull.__doc__) - isNotNull = _unary_op("isnotnull", PySparkColumn.isNotNull.__doc__) - isNaN = _unary_op("isNaN", PySparkColumn.isNaN.__doc__) + def isNaN(self) -> ParentColumn: + return _unary_op("isNaN", self) def __ne__( # type: ignore[override] self, other: Any, - ) -> "Column": - """binary function""" - return _func_op("not")(_bin_op("==")(self, other)) + ) -> ParentColumn: + return _func_op("not", _bin_op("==", self, other)) # string methods - contains = _bin_op("contains", PySparkColumn.contains.__doc__) - startswith = _bin_op("startswith", PySparkColumn.startswith.__doc__) - endswith = _bin_op("endswith", PySparkColumn.endswith.__doc__) - - def when(self, condition: "Column", value: Any) -> "Column": + def contains( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("contains", self, other) + + def startswith( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("startsWith", self, other) + + def endswith( + self, other: Union[ParentColumn, "LiteralType", "DecimalLiteral", "DateTimeLiteral"] + ) -> ParentColumn: + return _bin_op("endsWith", self, other) + + def when(self, condition: ParentColumn, value: Any) -> ParentColumn: if not isinstance(condition, Column): raise PySparkTypeError( error_class="NOT_COLUMN", @@ -193,18 +314,14 @@ def when(self, condition: "Column", value: Any) -> "Column": message_parameters={}, ) - if isinstance(value, Column): - _value = value._expr - else: - _value = LiteralExpression._from_value(value) - - _branches = self._expr._branches + [(condition._expr, _value)] - - return Column(CaseWhen(branches=_branches, else_value=None)) - - when.__doc__ = PySparkColumn.when.__doc__ + return Column( + CaseWhen( + branches=self._expr._branches + [(condition._expr, _to_expr(value))], + else_value=None, + ) + ) - def otherwise(self, value: Any) -> "Column": + def otherwise(self, value: Any) -> ParentColumn: if not isinstance(self._expr, CaseWhen): raise PySparkTypeError( "otherwise() can only be applied on a Column previously generated by when()" @@ -215,28 +332,25 @@ def otherwise(self, value: Any) -> "Column": "otherwise() can only be applied once on a Column previously generated by when()" ) - if isinstance(value, Column): - _value = value._expr - else: - _value = 
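Since `and`, `or` and `not` cannot be overloaded in Python, the __and__/__or__/__invert__ definitions above are what back the user-facing &, | and ~ operators; each comparison needs its own parentheses because the bitwise operators bind more tightly. A small usage sketch, assuming an existing DataFrame `df` with `age` and `name` columns:

from pyspark.sql import functions as F

# & / | / ~ route through the methods above and build "and"/"or"/"not" expressions.
adults_named = df.filter((F.col("age") >= 18) & ~F.col("name").isNull())
extremes = df.filter((F.col("age") < 18) | (F.col("age") > 65))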
LiteralExpression._from_value(value) - - return Column(CaseWhen(branches=self._expr._branches, else_value=_value)) - - otherwise.__doc__ = PySparkColumn.otherwise.__doc__ + return Column( + CaseWhen( + branches=self._expr._branches, + else_value=_to_expr(value), + ) + ) - like = _bin_op("like", PySparkColumn.like.__doc__) - rlike = _bin_op("rlike", PySparkColumn.rlike.__doc__) - ilike = _bin_op("ilike", PySparkColumn.ilike.__doc__) + def like(self: ParentColumn, other: str) -> ParentColumn: + return _bin_op("like", self, other) - @overload - def substr(self, startPos: int, length: int) -> "Column": - ... + def rlike(self: ParentColumn, other: str) -> ParentColumn: + return _bin_op("rlike", self, other) - @overload - def substr(self, startPos: "Column", length: "Column") -> "Column": - ... + def ilike(self: ParentColumn, other: str) -> ParentColumn: + return _bin_op("ilike", self, other) - def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column": + def substr( + self, startPos: Union[int, ParentColumn], length: Union[int, ParentColumn] + ) -> ParentColumn: if type(startPos) != type(length): raise PySparkTypeError( error_class="NOT_SAME_TYPE", @@ -248,12 +362,9 @@ def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) - }, ) - if isinstance(length, Column): - length_expr = length._expr - start_expr = startPos._expr # type: ignore[union-attr] - elif isinstance(length, int): - length_expr = LiteralExpression._from_value(length) - start_expr = LiteralExpression._from_value(startPos) + if isinstance(length, (Column, int)): + length_expr = _to_expr(length) + start_expr = _to_expr(startPos) else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_INT", @@ -261,12 +372,7 @@ def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) - ) return Column(UnresolvedFunction("substr", [self._expr, start_expr, length_expr])) - substr.__doc__ = PySparkColumn.substr.__doc__ - - def __eq__(self, other: Any) -> "Column": # type: ignore[override] - """Returns a binary expression with the current column as the left - side and the other expression as the right side. 
- """ + def __eq__(self, other: Any) -> ParentColumn: # type: ignore[override] if other is None or isinstance( other, (bool, float, int, str, datetime.datetime, datetime.date, decimal.Decimal) ): @@ -279,46 +385,30 @@ def __eq__(self, other: Any) -> "Column": # type: ignore[override] def to_plan(self, session: "SparkConnectClient") -> proto.Expression: return self._expr.to_plan(session) - def alias(self, *alias: str, **kwargs: Any) -> "Column": + def alias(self, *alias: str, **kwargs: Any) -> ParentColumn: return Column(self._expr.alias(*alias, **kwargs)) - alias.__doc__ = PySparkColumn.alias.__doc__ - name = alias - name.__doc__ = PySparkColumn.name.__doc__ - - def asc(self) -> "Column": + def asc(self) -> ParentColumn: return self.asc_nulls_first() - asc.__doc__ = PySparkColumn.asc.__doc__ - - def asc_nulls_first(self) -> "Column": + def asc_nulls_first(self) -> ParentColumn: return Column(SortOrder(self._expr, ascending=True, nullsFirst=True)) - asc_nulls_first.__doc__ = PySparkColumn.asc_nulls_first.__doc__ - - def asc_nulls_last(self) -> "Column": + def asc_nulls_last(self) -> ParentColumn: return Column(SortOrder(self._expr, ascending=True, nullsFirst=False)) - asc_nulls_last.__doc__ = PySparkColumn.asc_nulls_last.__doc__ - - def desc(self) -> "Column": + def desc(self) -> ParentColumn: return self.desc_nulls_last() - desc.__doc__ = PySparkColumn.desc.__doc__ - - def desc_nulls_first(self) -> "Column": + def desc_nulls_first(self) -> ParentColumn: return Column(SortOrder(self._expr, ascending=False, nullsFirst=True)) - desc_nulls_first.__doc__ = PySparkColumn.desc_nulls_first.__doc__ - - def desc_nulls_last(self) -> "Column": + def desc_nulls_last(self) -> ParentColumn: return Column(SortOrder(self._expr, ascending=False, nullsFirst=False)) - desc_nulls_last.__doc__ = PySparkColumn.desc_nulls_last.__doc__ - - def cast(self, dataType: Union[DataType, str]) -> "Column": + def cast(self, dataType: Union[DataType, str]) -> ParentColumn: if isinstance(dataType, (DataType, str)): return Column(CastExpression(expr=self._expr, data_type=dataType)) else: @@ -327,11 +417,9 @@ def cast(self, dataType: Union[DataType, str]) -> "Column": message_parameters={"arg_name": "dataType", "arg_type": type(dataType).__name__}, ) - cast.__doc__ = PySparkColumn.cast.__doc__ - astype = cast - def try_cast(self, dataType: Union[DataType, str]) -> "Column": + def try_cast(self, dataType: Union[DataType, str]) -> ParentColumn: if isinstance(dataType, (DataType, str)): return Column( CastExpression( @@ -346,12 +434,10 @@ def try_cast(self, dataType: Union[DataType, str]) -> "Column": message_parameters={"arg_name": "dataType", "arg_type": type(dataType).__name__}, ) - try_cast.__doc__ = PySparkColumn.try_cast.__doc__ - def __repr__(self) -> str: return "Column<'%s'>" % self._expr.__repr__() - def over(self, window: "WindowSpec") -> "Column": + def over(self, window: "WindowSpec") -> ParentColumn: # type: ignore[override] from pyspark.sql.connect.window import WindowSpec if not isinstance(window, WindowSpec): @@ -362,35 +448,22 @@ def over(self, window: "WindowSpec") -> "Column": return Column(WindowExpression(windowFunction=self._expr, windowSpec=window)) - over.__doc__ = PySparkColumn.over.__doc__ - - def isin(self, *cols: Any) -> "Column": + def isin(self, *cols: Any) -> ParentColumn: if len(cols) == 1 and isinstance(cols[0], (list, set)): _cols = list(cols[0]) else: _cols = list(cols) - _exprs = [self._expr] - for c in _cols: - if isinstance(c, Column): - _exprs.append(c._expr) - else: - 
_exprs.append(LiteralExpression._from_value(c)) - - return Column(UnresolvedFunction("in", _exprs)) - - isin.__doc__ = PySparkColumn.isin.__doc__ + return Column(UnresolvedFunction("in", [self._expr] + [_to_expr(c) for c in _cols])) def between( self, - lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"], - upperBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"], - ) -> "Column": + lowerBound: Union[ParentColumn, "LiteralType", "DateTimeLiteral", "DecimalLiteral"], + upperBound: Union[ParentColumn, "LiteralType", "DateTimeLiteral", "DecimalLiteral"], + ) -> ParentColumn: return (self >= lowerBound) & (self <= upperBound) - between.__doc__ = PySparkColumn.between.__doc__ - - def getItem(self, key: Any) -> "Column": + def getItem(self, key: Any) -> ParentColumn: if isinstance(key, Column): warnings.warn( "A column as 'key' in getItem is deprecated as of Spark 3.0, and will not " @@ -400,9 +473,7 @@ def getItem(self, key: Any) -> "Column": ) return self[key] - getItem.__doc__ = PySparkColumn.getItem.__doc__ - - def getField(self, name: Any) -> "Column": + def getField(self, name: Any) -> ParentColumn: if isinstance(name, Column): warnings.warn( "A column as 'name' in getField is deprecated as of Spark 3.0, and will not " @@ -412,9 +483,7 @@ def getField(self, name: Any) -> "Column": ) return self[name] - getField.__doc__ = PySparkColumn.getField.__doc__ - - def withField(self, fieldName: str, col: "Column") -> "Column": + def withField(self, fieldName: str, col: ParentColumn) -> ParentColumn: if not isinstance(fieldName, str): raise PySparkTypeError( error_class="NOT_STR", @@ -429,9 +498,7 @@ def withField(self, fieldName: str, col: "Column") -> "Column": return Column(WithField(self._expr, fieldName, col._expr)) - withField.__doc__ = PySparkColumn.withField.__doc__ - - def dropFields(self, *fieldNames: str) -> "Column": + def dropFields(self, *fieldNames: str) -> ParentColumn: dropField: Optional[DropField] = None for fieldName in fieldNames: if not isinstance(fieldName, str): @@ -458,9 +525,7 @@ def dropFields(self, *fieldNames: str) -> "Column": return Column(dropField) - dropFields.__doc__ = PySparkColumn.dropFields.__doc__ - - def __getattr__(self, item: Any) -> "Column": + def __getattr__(self, item: Any) -> ParentColumn: if item == "_jc": raise PySparkAttributeError( error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"} @@ -471,7 +536,7 @@ def __getattr__(self, item: Any) -> "Column": ) return self[item] - def __getitem__(self, k: Any) -> "Column": + def __getitem__(self, k: Any) -> ParentColumn: if isinstance(k, slice): if k.step is not None: raise PySparkValueError( @@ -479,10 +544,8 @@ def __getitem__(self, k: Any) -> "Column": message_parameters={}, ) return self.substr(k.start, k.stop) - elif isinstance(k, Column): - return Column(UnresolvedExtractValue(self._expr, k._expr)) else: - return Column(UnresolvedExtractValue(self._expr, LiteralExpression._from_value(k))) + return Column(UnresolvedExtractValue(self._expr, _to_expr(k))) def __iter__(self) -> None: raise PySparkTypeError( @@ -499,17 +562,14 @@ def __nonzero__(self) -> None: __bool__ = __nonzero__ -Column.__doc__ = PySparkColumn.__doc__ - - def _test() -> None: import os import sys import doctest from pyspark.sql import SparkSession as PySparkSession - import pyspark.sql.connect.column + import pyspark.sql.column - globs = pyspark.sql.connect.column.__dict__.copy() + globs = pyspark.sql.column.__dict__.copy() globs["spark"] = ( 
PySparkSession.builder.appName("sql.connect.column tests") .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]")) @@ -517,7 +577,7 @@ def _test() -> None: ) (failure_count, test_count) = doctest.testmod( - pyspark.sql.connect.column, + pyspark.sql.column, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE diff --git a/python/pyspark/sql/connect/conversion.py b/python/pyspark/sql/connect/conversion.py index 9b1007c41f9c0..1c205586d6096 100644 --- a/python/pyspark/sql/connect/conversion.py +++ b/python/pyspark/sql/connect/conversion.py @@ -48,12 +48,10 @@ import pyspark.sql.connect.proto as pb2 from pyspark.sql.pandas.types import to_arrow_schema, _dedup_names, _deduplicate_field_names -from typing import ( - Any, - Callable, - Sequence, - List, -) +from typing import Any, Callable, Sequence, List, TYPE_CHECKING + +if TYPE_CHECKING: + from pyspark.sql.connect.dataframe import DataFrame class LocalDataToArrowConversion: @@ -570,3 +568,17 @@ def proto_to_storage_level(storage_level: pb2.StorageLevel) -> StorageLevel: deserialized=storage_level.deserialized, replication=storage_level.replication, ) + + +def proto_to_remote_cached_dataframe(relation: pb2.CachedRemoteRelation) -> "DataFrame": + assert relation is not None and isinstance(relation, pb2.CachedRemoteRelation) + + from pyspark.sql.connect.dataframe import DataFrame + from pyspark.sql.connect.session import SparkSession + import pyspark.sql.connect.plan as plan + + session = SparkSession.active() + return DataFrame( + plan=plan.CachedRemoteRelation(relation.relation_id, session), + session=session, + ) diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index f0dc412760a4a..46698c2530eab 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
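between() above is defined purely in terms of the comparison and boolean operators, so the two filters below build the same expression (usage sketch, assuming a DataFrame `df` with an `age` column):

from pyspark.sql import functions as F

a = df.filter(F.col("age").between(20, 30))
b = df.filter((F.col("age") >= 20) & (F.col("age") <= 30))
# between() simply expands to the expression used in b.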
# + +# mypy: disable-error-code="override" from pyspark.errors.exceptions.base import ( SessionNotSameException, PySparkIndexError, @@ -40,21 +42,23 @@ Type, ) +import copy import sys import random -import pandas import pyarrow as pa import json import warnings from collections.abc import Iterable +import functools from pyspark import _NoValue from pyspark._globals import _NoValueType +from pyspark.util import is_remote_only from pyspark.sql.types import Row, StructType, _create_row from pyspark.sql.dataframe import ( - DataFrame as PySparkDataFrame, - DataFrameNaFunctions as PySparkDataFrameNaFunctions, - DataFrameStatFunctions as PySparkDataFrameStatFunctions, + DataFrame as ParentDataFrame, + DataFrameNaFunctions as ParentDataFrameNaFunctions, + DataFrameStatFunctions as ParentDataFrameStatFunctions, ) from pyspark.errors import ( @@ -71,14 +75,15 @@ from pyspark.sql.connect.group import GroupedData from pyspark.sql.connect.readwriter import DataFrameWriter, DataFrameWriterV2 from pyspark.sql.connect.streaming.readwriter import DataStreamWriter -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.expressions import ( ColumnReference, UnresolvedRegex, UnresolvedStar, ) from pyspark.sql.connect.functions import builtin as F -from pyspark.sql.pandas.types import from_arrow_schema +from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema +from pyspark.sql.pandas.functions import _validate_pandas_udf # type: ignore[attr-defined] if TYPE_CHECKING: @@ -91,12 +96,24 @@ PandasMapIterFunction, ArrowMapIterFunction, ) + from pyspark.core.rdd import RDD + from pyspark.sql.pandas._typing import DataFrameLike as PandasDataFrameLike from pyspark.sql.connect.observation import Observation from pyspark.sql.connect.session import SparkSession from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame + from pyspark.sql.metrics import ExecutionInfo + +class DataFrame(ParentDataFrame): + def __new__( + cls, + plan: plan.LogicalPlan, + session: "SparkSession", + ) -> "DataFrame": + self = object.__new__(cls) + self.__init__(plan, session) # type: ignore[misc] + return self -class DataFrame: def __init__( self, plan: plan.LogicalPlan, @@ -110,7 +127,7 @@ def __init__( message_parameters={"operator": "__init__"}, ) - self._session: "SparkSession" = session + self._session: "SparkSession" = session # type: ignore[assignment] if self._session is None: raise PySparkRuntimeError( error_class="NO_ACTIVE_SESSION", @@ -121,6 +138,7 @@ def __init__( # by __repr__ and _repr_html_ while eager evaluation opens. self._support_repr_html = False self._cached_schema: Optional[StructType] = None + self._execution_info: Optional["ExecutionInfo"] = None def __reduce__(self) -> Tuple: """ @@ -188,30 +206,39 @@ def _repr_html_(self) -> Optional[str]: else: return None - _repr_html_.__doc__ = PySparkDataFrame._repr_html_.__doc__ - @property def write(self) -> "DataFrameWriter": - return DataFrameWriter(self._plan, self._session) + def cb(qe: "ExecutionInfo") -> None: + self._execution_info = qe - write.__doc__ = PySparkDataFrame.write.__doc__ + return DataFrameWriter(self._plan, self._session, cb) + @functools.cache def isEmpty(self) -> bool: return len(self.select().take(1)) == 0 - isEmpty.__doc__ = PySparkDataFrame.isEmpty.__doc__ + @overload + def select(self, *cols: "ColumnOrName") -> ParentDataFrame: + ... + + @overload + def select(self, __cols: Union[List[Column], List[str]]) -> ParentDataFrame: + ... 
- def select(self, *cols: "ColumnOrName") -> "DataFrame": + def select(self, *cols: "ColumnOrName") -> ParentDataFrame: # type: ignore[misc] if len(cols) == 1 and isinstance(cols[0], list): cols = cols[0] + if any(not isinstance(c, (str, Column)) for c in cols): + raise PySparkTypeError( + error_class="NOT_LIST_OF_COLUMN_OR_STR", + message_parameters={"arg_name": "columns"}, + ) return DataFrame( plan.Project(self._plan, [F._to_col(c) for c in cols]), session=self._session, ) - select.__doc__ = PySparkDataFrame.select.__doc__ - - def selectExpr(self, *expr: Union[str, List[str]]) -> "DataFrame": + def selectExpr(self, *expr: Union[str, List[str]]) -> ParentDataFrame: sql_expr = [] if len(expr) == 1 and isinstance(expr[0], list): expr = expr[0] # type: ignore[assignment] @@ -223,9 +250,7 @@ def selectExpr(self, *expr: Union[str, List[str]]) -> "DataFrame": return DataFrame(plan.Project(self._plan, sql_expr), session=self._session) - selectExpr.__doc__ = PySparkDataFrame.selectExpr.__doc__ - - def agg(self, *exprs: Union[Column, Dict[str, str]]) -> "DataFrame": + def agg(self, *exprs: Union[Column, Dict[str, str]]) -> ParentDataFrame: if not exprs: raise PySparkValueError( error_class="CANNOT_BE_EMPTY", @@ -241,87 +266,79 @@ def agg(self, *exprs: Union[Column, Dict[str, str]]) -> "DataFrame": exprs = cast(Tuple[Column, ...], exprs) return self.groupBy().agg(*exprs) - agg.__doc__ = PySparkDataFrame.agg.__doc__ - - def alias(self, alias: str) -> "DataFrame": - return DataFrame(plan.SubqueryAlias(self._plan, alias), session=self._session) - - alias.__doc__ = PySparkDataFrame.alias.__doc__ + def alias(self, alias: str) -> ParentDataFrame: + res = DataFrame(plan.SubqueryAlias(self._plan, alias), session=self._session) + res._cached_schema = self._cached_schema + return res def colRegex(self, colName: str) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + if not isinstance(colName, str): raise PySparkTypeError( error_class="NOT_STR", message_parameters={"arg_name": "colName", "arg_type": type(colName).__name__}, ) - return Column(UnresolvedRegex(colName, self._plan._plan_id)) - - colRegex.__doc__ = PySparkDataFrame.colRegex.__doc__ + return ConnectColumn(UnresolvedRegex(colName, self._plan._plan_id)) @property def dtypes(self) -> List[Tuple[str, str]]: return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] - dtypes.__doc__ = PySparkDataFrame.dtypes.__doc__ - @property def columns(self) -> List[str]: return self.schema.names - columns.__doc__ = PySparkDataFrame.columns.__doc__ - @property def sparkSession(self) -> "SparkSession": return self._session - sparkSession.__doc__ = PySparkDataFrame.sparkSession.__doc__ - def count(self) -> int: - table, _ = self.agg(F._invoke_function("count", F.lit(1)))._to_table() + table, _ = self.agg( + F._invoke_function("count", F.lit(1)) + )._to_table() # type: ignore[operator] return table[0][0].as_py() - count.__doc__ = PySparkDataFrame.count.__doc__ - - def crossJoin(self, other: "DataFrame") -> "DataFrame": + def crossJoin(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) return DataFrame( - plan.Join(left=self._plan, right=other._plan, on=None, how="cross"), + plan.Join( + left=self._plan, right=other._plan, on=None, how="cross" # type: ignore[arg-type] + ), session=self._session, ) - crossJoin.__doc__ = PySparkDataFrame.crossJoin.__doc__ - - def _check_same_session(self, other: "DataFrame") -> None: - if self._session.session_id != other._session.session_id: + def 
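One reading of the __new__ override above, stated here as an assumption rather than a description of the parent class: when the parent type's construction involves factory-style dispatch, a subclass can override __new__ and call __init__ explicitly to guarantee it is instantiated directly. A standalone toy sketch of that pattern (Base and Remote are hypothetical names):

class Base:
    # Hypothetical parent whose __new__ acts as a factory.
    def __new__(cls, *args: object, **kwargs: object) -> "Base":
        if cls is Base:
            return object.__new__(Remote)
        return object.__new__(cls)


class Remote(Base):
    # Overriding __new__ and invoking __init__ explicitly, as the Connect
    # DataFrame does above, bypasses the parent's factory logic entirely.
    def __new__(cls, plan: str, session: str) -> "Remote":
        self = object.__new__(cls)
        self.__init__(plan, session)
        return self

    def __init__(self, plan: str, session: str) -> None:
        self.plan = plan
        self.session = session


r = Remote("some-plan", "some-session")
print(type(r).__name__, r.plan)  # Remote some-plan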
_check_same_session(self, other: ParentDataFrame) -> None: + if self._session.session_id != other._session.session_id: # type: ignore[attr-defined] raise SessionNotSameException( error_class="SESSION_NOT_SAME", message_parameters={}, ) - def coalesce(self, numPartitions: int) -> "DataFrame": + def coalesce(self, numPartitions: int) -> ParentDataFrame: if not numPartitions > 0: raise PySparkValueError( error_class="VALUE_NOT_POSITIVE", message_parameters={"arg_name": "numPartitions", "arg_value": str(numPartitions)}, ) - return DataFrame( + res = DataFrame( plan.Repartition(self._plan, num_partitions=numPartitions, shuffle=False), self._session, ) - - coalesce.__doc__ = PySparkDataFrame.coalesce.__doc__ + res._cached_schema = self._cached_schema + return res @overload - def repartition(self, numPartitions: int, *cols: "ColumnOrName") -> "DataFrame": + def repartition(self, numPartitions: int, *cols: "ColumnOrName") -> ParentDataFrame: ... @overload - def repartition(self, *cols: "ColumnOrName") -> "DataFrame": + def repartition(self, *cols: "ColumnOrName") -> ParentDataFrame: ... def repartition( # type: ignore[misc] self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName" - ) -> "DataFrame": + ) -> ParentDataFrame: if isinstance(numPartitions, int): if not numPartitions > 0: raise PySparkValueError( @@ -332,12 +349,12 @@ def repartition( # type: ignore[misc] }, ) if len(cols) == 0: - return DataFrame( + res = DataFrame( plan.Repartition(self._plan, numPartitions, shuffle=True), self._session, ) else: - return DataFrame( + res = DataFrame( plan.RepartitionByExpression( self._plan, numPartitions, [F._to_col(c) for c in cols] ), @@ -345,7 +362,7 @@ def repartition( # type: ignore[misc] ) elif isinstance(numPartitions, (str, Column)): cols = (numPartitions,) + cols - return DataFrame( + res = DataFrame( plan.RepartitionByExpression(self._plan, None, [F._to_col(c) for c in cols]), self.sparkSession, ) @@ -358,19 +375,20 @@ def repartition( # type: ignore[misc] }, ) - repartition.__doc__ = PySparkDataFrame.repartition.__doc__ + res._cached_schema = self._cached_schema + return res @overload - def repartitionByRange(self, numPartitions: int, *cols: "ColumnOrName") -> "DataFrame": + def repartitionByRange(self, numPartitions: int, *cols: "ColumnOrName") -> ParentDataFrame: ... @overload - def repartitionByRange(self, *cols: "ColumnOrName") -> "DataFrame": + def repartitionByRange(self, *cols: "ColumnOrName") -> ParentDataFrame: ... 
def repartitionByRange( # type: ignore[misc] self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName" - ) -> "DataFrame": + ) -> ParentDataFrame: if isinstance(numPartitions, int): if not numPartitions > 0: raise PySparkValueError( @@ -386,14 +404,14 @@ def repartitionByRange( # type: ignore[misc] message_parameters={"item": "cols"}, ) else: - return DataFrame( + res = DataFrame( plan.RepartitionByExpression( self._plan, numPartitions, [F._sort_col(c) for c in cols] ), self.sparkSession, ) elif isinstance(numPartitions, (str, Column)): - return DataFrame( + res = DataFrame( plan.RepartitionByExpression( self._plan, None, [F._sort_col(c) for c in [numPartitions] + list(cols)] ), @@ -408,58 +426,79 @@ def repartitionByRange( # type: ignore[misc] }, ) - repartitionByRange.__doc__ = PySparkDataFrame.repartitionByRange.__doc__ + res._cached_schema = self._cached_schema + return res - def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame": - if subset is not None and not isinstance(subset, (list, tuple)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) + def dropDuplicates(self, *subset: Union[str, List[str]]) -> ParentDataFrame: + # Acceptable args should be str, ... or a single List[str] + # So if subset length is 1, it can be either single str, or a list of str + # if subset length is greater than 1, it must be a sequence of str + if len(subset) > 1: + assert all(isinstance(c, str) for c in subset) - if subset is None: - return DataFrame( + if not subset: + res = DataFrame( plan.Deduplicate(child=self._plan, all_columns_as_keys=True), session=self._session ) + elif len(subset) == 1 and isinstance(subset[0], list): + res = DataFrame( + plan.Deduplicate(child=self._plan, column_names=subset[0]), + session=self._session, + ) else: - return DataFrame( - plan.Deduplicate(child=self._plan, column_names=subset), session=self._session + res = DataFrame( + plan.Deduplicate(child=self._plan, column_names=cast(List[str], subset)), + session=self._session, ) - dropDuplicates.__doc__ = PySparkDataFrame.dropDuplicates.__doc__ + res._cached_schema = self._cached_schema + return res drop_duplicates = dropDuplicates - def dropDuplicatesWithinWatermark(self, subset: Optional[List[str]] = None) -> "DataFrame": - if subset is not None and not isinstance(subset, (list, tuple)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) + def dropDuplicatesWithinWatermark(self, *subset: Union[str, List[str]]) -> ParentDataFrame: + # Acceptable args should be str, ... 
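With the widened signature above, dropDuplicates accepts the key columns either as a single list or as varargs; with no arguments it still deduplicates on all columns. Usage sketch, assuming a DataFrame `df` with `name` and `age` columns:

# Both spellings deduplicate on the same subset after this change.
df.dropDuplicates(["name", "age"]).show()
df.dropDuplicates("name", "age").show()

# No arguments: deduplicate on every column.
df.dropDuplicates().show()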
or a single List[str] + # So if subset length is 1, it can be either single str, or a list of str + # if subset length is greater than 1, it must be a sequence of str + if len(subset) > 1: + assert all(isinstance(c, str) for c in subset) - if subset is None: + if not subset: return DataFrame( plan.Deduplicate(child=self._plan, all_columns_as_keys=True, within_watermark=True), session=self._session, ) + elif len(subset) == 1 and isinstance(subset[0], list): + return DataFrame( + plan.Deduplicate(child=self._plan, column_names=subset[0], within_watermark=True), + session=self._session, + ) else: return DataFrame( - plan.Deduplicate(child=self._plan, column_names=subset, within_watermark=True), + plan.Deduplicate( + child=self._plan, + column_names=cast(List[str], subset), + within_watermark=True, + ), session=self._session, ) - dropDuplicatesWithinWatermark.__doc__ = PySparkDataFrame.dropDuplicatesWithinWatermark.__doc__ - - drop_duplicates_within_watermark = dropDuplicatesWithinWatermark - - def distinct(self) -> "DataFrame": - return DataFrame( + def distinct(self) -> ParentDataFrame: + res = DataFrame( plan.Deduplicate(child=self._plan, all_columns_as_keys=True), session=self._session ) + res._cached_schema = self._cached_schema + return res - distinct.__doc__ = PySparkDataFrame.distinct.__doc__ + @overload + def drop(self, cols: "ColumnOrName") -> ParentDataFrame: + ... - def drop(self, *cols: "ColumnOrName") -> "DataFrame": + @overload + def drop(self, *cols: str) -> ParentDataFrame: + ... + + def drop(self, *cols: "ColumnOrName") -> ParentDataFrame: # type: ignore[misc] _cols = list(cols) if any(not isinstance(c, (str, Column)) for c in _cols): raise PySparkTypeError( @@ -475,21 +514,25 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": session=self._session, ) - drop.__doc__ = PySparkDataFrame.drop.__doc__ - - def filter(self, condition: Union[Column, str]) -> "DataFrame": + def filter(self, condition: Union[Column, str]) -> ParentDataFrame: if isinstance(condition, str): expr = F.expr(condition) else: expr = condition - return DataFrame(plan.Filter(child=self._plan, filter=expr), session=self._session) - - filter.__doc__ = PySparkDataFrame.filter.__doc__ + res = DataFrame(plan.Filter(child=self._plan, filter=expr), session=self._session) + res._cached_schema = self._cached_schema + return res def first(self) -> Optional[Row]: return self.head() - first.__doc__ = PySparkDataFrame.first.__doc__ + @overload # type: ignore[no-overload-impl] + def groupby(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": + ... + + @overload + def groupby(self, __cols: Union[List[Column], List[str], List[int]]) -> "GroupedData": + ... def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> GroupedData: if len(cols) == 1 and isinstance(cols[0], list): @@ -516,11 +559,17 @@ def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> GroupedData: return GroupedData(df=self, group_type="groupby", grouping_cols=_cols) - groupBy.__doc__ = PySparkDataFrame.groupBy.__doc__ - - groupby = groupBy + groupby = groupBy # type: ignore[assignment] + @overload def rollup(self, *cols: "ColumnOrName") -> "GroupedData": + ... + + @overload + def rollup(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": + ... 
+ + def rollup(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] _cols: List[Column] = [] for c in cols: if isinstance(c, Column): @@ -542,9 +591,15 @@ def rollup(self, *cols: "ColumnOrName") -> "GroupedData": return GroupedData(df=self, group_type="rollup", grouping_cols=_cols) - rollup.__doc__ = PySparkDataFrame.rollup.__doc__ - + @overload def cube(self, *cols: "ColumnOrName") -> "GroupedData": + ... + + @overload + def cube(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": + ... + + def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] _cols: List[Column] = [] for c in cols: if isinstance(c, Column): @@ -566,8 +621,6 @@ def cube(self, *cols: "ColumnOrName") -> "GroupedData": return GroupedData(df=self, group_type="cube", grouping_cols=_cols) - cube.__doc__ = PySparkDataFrame.cube.__doc__ - def groupingSets( self, groupingSets: Sequence[Sequence["ColumnOrName"]], *cols: "ColumnOrName" ) -> "GroupedData": @@ -605,8 +658,6 @@ def groupingSets( df=self, group_type="grouping_sets", grouping_cols=gcols, grouping_sets=gsets ) - groupingSets.__doc__ = PySparkDataFrame.groupingSets.__doc__ - @overload def head(self) -> Optional[Row]: ... @@ -621,32 +672,26 @@ def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]: return rs[0] if rs else None return self.take(n) - head.__doc__ = PySparkDataFrame.head.__doc__ - def take(self, num: int) -> List[Row]: return self.limit(num).collect() - take.__doc__ = PySparkDataFrame.take.__doc__ - def join( self, - other: "DataFrame", + other: ParentDataFrame, on: Optional[Union[str, List[str], Column, List[Column]]] = None, how: Optional[str] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: self._check_same_session(other) if how is not None and isinstance(how, str): how = how.lower().replace("_", "") return DataFrame( - plan.Join(left=self._plan, right=other._plan, on=on, how=how), + plan.Join(left=self._plan, right=other._plan, on=on, how=how), # type: ignore[arg-type] session=self._session, ) - join.__doc__ = PySparkDataFrame.join.__doc__ - def _joinAsOf( self, - other: "DataFrame", + other: ParentDataFrame, leftAsOfColumn: Union[str, Column], rightAsOfColumn: Union[str, Column], on: Optional[Union[str, List[str], Column, List[Column]]] = None, @@ -655,7 +700,7 @@ def _joinAsOf( tolerance: Optional[Column] = None, allowExactMatches: bool = True, direction: str = "backward", - ) -> "DataFrame": + ) -> ParentDataFrame: self._check_same_session(other) if how is None: how = "inner" @@ -664,16 +709,16 @@ def _joinAsOf( if tolerance is not None: assert isinstance(tolerance, Column), "tolerance should be Column" - def _convert_col(df: "DataFrame", col: "ColumnOrName") -> Column: + def _convert_col(df: ParentDataFrame, col: "ColumnOrName") -> Column: if isinstance(col, Column): return col else: - return df._col(col) + return df._col(col) # type: ignore[operator] return DataFrame( plan.AsOfJoin( left=self._plan, - right=other._plan, + right=other._plan, # type: ignore[arg-type] left_as_of=_convert_col(self, leftAsOfColumn), right_as_of=_convert_col(other, rightAsOfColumn), on=on, @@ -685,18 +730,14 @@ def _convert_col(df: "DataFrame", col: "ColumnOrName") -> Column: session=self._session, ) - _joinAsOf.__doc__ = PySparkDataFrame._joinAsOf.__doc__ - - def limit(self, n: int) -> "DataFrame": - return DataFrame(plan.Limit(child=self._plan, limit=n), session=self._session) - - limit.__doc__ = PySparkDataFrame.limit.__doc__ + def limit(self, n: int) -> ParentDataFrame: + res = 
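The added overloads above preserve both existing call styles for the grouping APIs, varargs and a single list. Usage sketch, assuming a DataFrame `df` with `name` and `age` columns:

from pyspark.sql import functions as F

df.groupBy("name").agg(F.avg("age")).show()
df.groupBy(["name", "age"]).count().show()   # single-list form
df.rollup("name", "age").count().show()      # hierarchical subtotals
df.cube("name", "age").count().show()        # all grouping-set combinations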
DataFrame(plan.Limit(child=self._plan, limit=n), session=self._session) + res._cached_schema = self._cached_schema + return res def tail(self, num: int) -> List[Row]: return DataFrame(plan.Tail(child=self._plan, limit=num), session=self._session).collect() - tail.__doc__ = PySparkDataFrame.tail.__doc__ - def _sort_cols( self, cols: Sequence[Union[int, str, Column, List[Union[int, str, Column]]]], @@ -748,8 +789,8 @@ def sort( self, *cols: Union[int, str, Column, List[Union[int, str, Column]]], **kwargs: Any, - ) -> "DataFrame": - return DataFrame( + ) -> ParentDataFrame: + res = DataFrame( plan.Sort( self._plan, columns=self._sort_cols(cols, kwargs), @@ -757,8 +798,8 @@ def sort( ), session=self._session, ) - - sort.__doc__ = PySparkDataFrame.sort.__doc__ + res._cached_schema = self._cached_schema + return res orderBy = sort @@ -766,8 +807,8 @@ def sortWithinPartitions( self, *cols: Union[int, str, Column, List[Union[int, str, Column]]], **kwargs: Any, - ) -> "DataFrame": - return DataFrame( + ) -> ParentDataFrame: + res = DataFrame( plan.Sort( self._plan, columns=self._sort_cols(cols, kwargs), @@ -775,15 +816,15 @@ def sortWithinPartitions( ), session=self._session, ) - - sortWithinPartitions.__doc__ = PySparkDataFrame.sortWithinPartitions.__doc__ + res._cached_schema = self._cached_schema + return res def sample( self, withReplacement: Optional[Union[float, bool]] = None, fraction: Optional[Union[int, float]] = None, seed: Optional[int] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: # For the cases below: # sample(True, 0.5 [, seed]) # sample(True, fraction=0.5 [, seed]) @@ -822,9 +863,9 @@ def sample( if withReplacement is None: withReplacement = False - seed = int(seed) if seed is not None else None + seed = int(seed) if seed is not None else random.randint(0, sys.maxsize) - return DataFrame( + res = DataFrame( plan.Sample( child=self._plan, lower_bound=0.0, @@ -834,15 +875,13 @@ def sample( ), session=self._session, ) + res._cached_schema = self._cached_schema + return res - sample.__doc__ = PySparkDataFrame.sample.__doc__ - - def withColumnRenamed(self, existing: str, new: str) -> "DataFrame": + def withColumnRenamed(self, existing: str, new: str) -> ParentDataFrame: return self.withColumnsRenamed({existing: new}) - withColumnRenamed.__doc__ = PySparkDataFrame.withColumnRenamed.__doc__ - - def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame": + def withColumnsRenamed(self, colsMap: Dict[str, str]) -> ParentDataFrame: if not isinstance(colsMap, dict): raise PySparkTypeError( error_class="NOT_DICT", @@ -851,8 +890,6 @@ def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame": return DataFrame(plan.WithColumnsRenamed(self._plan, colsMap), self._session) - withColumnsRenamed.__doc__ = PySparkDataFrame.withColumnsRenamed.__doc__ - def _show_string( self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False ) -> str: @@ -893,7 +930,7 @@ def _show_string( )._to_table() return table[0][0].as_py() - def withColumns(self, colsMap: Dict[str, Column]) -> "DataFrame": + def withColumns(self, colsMap: Dict[str, Column]) -> ParentDataFrame: if not isinstance(colsMap, dict): raise PySparkTypeError( error_class="NOT_DICT", @@ -915,9 +952,7 @@ def withColumns(self, colsMap: Dict[str, Column]) -> "DataFrame": session=self._session, ) - withColumns.__doc__ = PySparkDataFrame.withColumns.__doc__ - - def withColumn(self, colName: str, col: Column) -> "DataFrame": + def withColumn(self, colName: str, col: Column) -> ParentDataFrame: if not isinstance(col, 
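Note the sample() change above: when no seed is passed, one is now drawn on the client while the plan is built, which I read as keeping re-executions of that same DataFrame consistent with each other; pass an explicit seed for reproducibility across separately built plans. Usage sketch, assuming a DataFrame `df`:

# Unseeded: a seed is chosen once, client-side, when the plan is constructed.
s1 = df.sample(fraction=0.1)

# Seeded: reproducible even across separately constructed plans.
s2 = df.sample(fraction=0.1, seed=42)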
Column): raise PySparkTypeError( error_class="NOT_COLUMN", @@ -932,9 +967,7 @@ def withColumn(self, colName: str, col: Column) -> "DataFrame": session=self._session, ) - withColumn.__doc__ = PySparkDataFrame.withColumn.__doc__ - - def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> "DataFrame": + def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> ParentDataFrame: if not isinstance(metadata, dict): raise PySparkTypeError( error_class="NOT_DICT", @@ -951,15 +984,13 @@ def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> "DataFrame" session=self._session, ) - withMetadata.__doc__ = PySparkDataFrame.withMetadata.__doc__ - def unpivot( self, ids: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]], values: Optional[Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]]], variableColumnName: str, valueColumnName: str, - ) -> "DataFrame": + ) -> ParentDataFrame: assert ids is not None, "ids must not be None" def _convert_cols( @@ -983,11 +1014,9 @@ def _convert_cols( self._session, ) - unpivot.__doc__ = PySparkDataFrame.unpivot.__doc__ - melt = unpivot - def withWatermark(self, eventTime: str, delayThreshold: str) -> "DataFrame": + def withWatermark(self, eventTime: str, delayThreshold: str) -> ParentDataFrame: # TODO: reuse error handling code in sql.DataFrame.withWatermark() if not eventTime or type(eventTime) is not str: raise PySparkTypeError( @@ -1012,11 +1041,9 @@ def withWatermark(self, eventTime: str, delayThreshold: str) -> "DataFrame": session=self._session, ) - withWatermark.__doc__ = PySparkDataFrame.withWatermark.__doc__ - def hint( self, name: str, *parameters: Union["PrimitiveType", "Column", List["PrimitiveType"]] - ) -> "DataFrame": + ) -> ParentDataFrame: if len(parameters) == 1 and isinstance(parameters[0], list): parameters = parameters[0] # type: ignore[assignment] @@ -1053,18 +1080,18 @@ def hint( }, ) - return DataFrame( + res = DataFrame( plan.Hint(self._plan, name, [F.lit(p) for p in list(parameters)]), session=self._session, ) - - hint.__doc__ = PySparkDataFrame.hint.__doc__ + res._cached_schema = self._cached_schema + return res def randomSplit( self, weights: List[float], seed: Optional[int] = None, - ) -> List["DataFrame"]: + ) -> List[ParentDataFrame]: for w in weights: if w < 0.0: raise PySparkValueError( @@ -1099,18 +1126,17 @@ def randomSplit( ), session=self._session, ) + samplePlan._cached_schema = self._cached_schema splits.append(samplePlan) j += 1 - return splits - - randomSplit.__doc__ = PySparkDataFrame.randomSplit.__doc__ + return splits # type: ignore[return-value] def observe( self, observation: Union["Observation", str], *exprs: Column, - ) -> "DataFrame": + ) -> ParentDataFrame: from pyspark.sql.connect.observation import Observation if len(exprs) == 0: @@ -1125,9 +1151,9 @@ def observe( ) if isinstance(observation, Observation): - return observation._on(self, *exprs) + res = observation._on(self, *exprs) elif isinstance(observation, str): - return DataFrame( + res = DataFrame( plan.CollectMetrics(self._plan, observation, list(exprs)), self._session, ) @@ -1140,78 +1166,96 @@ def observe( }, ) - observe.__doc__ = PySparkDataFrame.observe.__doc__ + res._cached_schema = self._cached_schema + return res def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False) -> None: print(self._show_string(n, truncate, vertical)) - show.__doc__ = PySparkDataFrame.show.__doc__ + def _merge_cached_schema(self, other: ParentDataFrame) -> 
Optional[StructType]: + # to avoid type coercion, only propagate the schema + # when the cached schemas are exactly the same + if self._cached_schema is not None and self._cached_schema == other._cached_schema: + return self.schema + return None - def union(self, other: "DataFrame") -> "DataFrame": + def union(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) return self.unionAll(other) - union.__doc__ = PySparkDataFrame.union.__doc__ - - def unionAll(self, other: "DataFrame") -> "DataFrame": + def unionAll(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) - return DataFrame( - plan.SetOperation(self._plan, other._plan, "union", is_all=True), session=self._session + res = DataFrame( + plan.SetOperation( + self._plan, other._plan, "union", is_all=True # type: ignore[arg-type] + ), + session=self._session, ) + res._cached_schema = self._merge_cached_schema(other) + return res - unionAll.__doc__ = PySparkDataFrame.unionAll.__doc__ - - def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame": + def unionByName( + self, other: ParentDataFrame, allowMissingColumns: bool = False + ) -> ParentDataFrame: self._check_same_session(other) - return DataFrame( + res = DataFrame( plan.SetOperation( self._plan, - other._plan, + other._plan, # type: ignore[arg-type] "union", by_name=True, allow_missing_columns=allowMissingColumns, ), session=self._session, ) + res._cached_schema = self._merge_cached_schema(other) + return res - unionByName.__doc__ = PySparkDataFrame.unionByName.__doc__ - - def subtract(self, other: "DataFrame") -> "DataFrame": + def subtract(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) - return DataFrame( - plan.SetOperation(self._plan, other._plan, "except", is_all=False), + res = DataFrame( + plan.SetOperation( + self._plan, other._plan, "except", is_all=False # type: ignore[arg-type] + ), session=self._session, ) + res._cached_schema = self._merge_cached_schema(other) + return res - subtract.__doc__ = PySparkDataFrame.subtract.__doc__ - - def exceptAll(self, other: "DataFrame") -> "DataFrame": + def exceptAll(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) - return DataFrame( - plan.SetOperation(self._plan, other._plan, "except", is_all=True), session=self._session + res = DataFrame( + plan.SetOperation( + self._plan, other._plan, "except", is_all=True # type: ignore[arg-type] + ), + session=self._session, ) + res._cached_schema = self._merge_cached_schema(other) + return res - exceptAll.__doc__ = PySparkDataFrame.exceptAll.__doc__ - - def intersect(self, other: "DataFrame") -> "DataFrame": + def intersect(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) - return DataFrame( - plan.SetOperation(self._plan, other._plan, "intersect", is_all=False), + res = DataFrame( + plan.SetOperation( + self._plan, other._plan, "intersect", is_all=False # type: ignore[arg-type] + ), session=self._session, ) + res._cached_schema = self._merge_cached_schema(other) + return res - intersect.__doc__ = PySparkDataFrame.intersect.__doc__ - - def intersectAll(self, other: "DataFrame") -> "DataFrame": + def intersectAll(self, other: ParentDataFrame) -> ParentDataFrame: self._check_same_session(other) - return DataFrame( - plan.SetOperation(self._plan, other._plan, "intersect", is_all=True), + res = DataFrame( + plan.SetOperation( + self._plan, other._plan, "intersect", is_all=True # type: ignore[arg-type] + ), 
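The set operations above only keep a cached schema when both inputs cached exactly the same one; anything else falls back to a server round trip, since the result type may involve coercion. A minimal standalone sketch of that rule, with plain dicts standing in for StructType:

from typing import Optional


def merge_cached_schema(left: Optional[dict], right: Optional[dict]) -> Optional[dict]:
    # Propagate only on exact equality; a mismatch (e.g. int vs double column)
    # means the server may coerce types, so the schema must be re-analyzed.
    if left is not None and left == right:
        return left
    return None


print(merge_cached_schema({"a": "int"}, {"a": "int"}))     # {'a': 'int'}
print(merge_cached_schema({"a": "int"}, {"a": "double"}))  # None
print(merge_cached_schema(None, None))                     # None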
session=self._session, ) + res._cached_schema = self._merge_cached_schema(other) + return res - intersectAll.__doc__ = PySparkDataFrame.intersectAll.__doc__ - - def where(self, condition: Union[Column, str]) -> "DataFrame": + def where(self, condition: Union[Column, str]) -> ParentDataFrame: if not isinstance(condition, (str, Column)): raise PySparkTypeError( error_class="NOT_COLUMN_OR_STR", @@ -1219,19 +1263,15 @@ def where(self, condition: Union[Column, str]) -> "DataFrame": ) return self.filter(condition) - where.__doc__ = PySparkDataFrame.where.__doc__ - @property - def na(self) -> "DataFrameNaFunctions": + def na(self) -> ParentDataFrameNaFunctions: return DataFrameNaFunctions(self) - na.__doc__ = PySparkDataFrame.na.__doc__ - def fillna( self, value: Union["LiteralType", Dict[str, "LiteralType"]], subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: if not isinstance(value, (float, int, str, bool, dict)): raise PySparkTypeError( error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR", @@ -1290,14 +1330,12 @@ def fillna( session=self._session, ) - fillna.__doc__ = PySparkDataFrame.fillna.__doc__ - def dropna( self, how: str = "any", thresh: Optional[int] = None, subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: min_non_nulls: Optional[int] = None if how is not None: @@ -1349,8 +1387,6 @@ def dropna( session=self._session, ) - dropna.__doc__ = PySparkDataFrame.dropna.__doc__ - def replace( self, to_replace: Union[ @@ -1360,7 +1396,7 @@ def replace( Union["OptionalPrimitiveType", List["OptionalPrimitiveType"], _NoValueType] ] = _NoValue, subset: Optional[List[str]] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: if value is _NoValue: if isinstance(to_replace, dict): value = None @@ -1478,15 +1514,11 @@ def _convert_int_to_float(v: Any) -> Any: session=self._session, ) - replace.__doc__ = PySparkDataFrame.replace.__doc__ - @property - def stat(self) -> "DataFrameStatFunctions": + def stat(self) -> ParentDataFrameStatFunctions: return DataFrameStatFunctions(self) - stat.__doc__ = PySparkDataFrame.stat.__doc__ - - def summary(self, *statistics: str) -> "DataFrame": + def summary(self, *statistics: str) -> ParentDataFrame: _statistics: List[str] = list(statistics) for s in _statistics: if not isinstance(s, str): @@ -1499,9 +1531,7 @@ def summary(self, *statistics: str) -> "DataFrame": session=self._session, ) - summary.__doc__ = PySparkDataFrame.summary.__doc__ - - def describe(self, *cols: Union[str, List[str]]) -> "DataFrame": + def describe(self, *cols: Union[str, List[str]]) -> ParentDataFrame: if len(cols) == 1 and isinstance(cols[0], list): cols = cols[0] # type: ignore[assignment] @@ -1516,8 +1546,6 @@ def describe(self, *cols: Union[str, List[str]]) -> "DataFrame": session=self._session, ) - describe.__doc__ = PySparkDataFrame.describe.__doc__ - def cov(self, col1: str, col2: str) -> float: if not isinstance(col1, str): raise PySparkTypeError( @@ -1535,8 +1563,6 @@ def cov(self, col1: str, col2: str) -> float: )._to_table() return table[0][0].as_py() - cov.__doc__ = PySparkDataFrame.cov.__doc__ - def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: if not isinstance(col1, str): raise PySparkTypeError( @@ -1561,8 +1587,6 @@ def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: )._to_table() return table[0][0].as_py() - corr.__doc__ = PySparkDataFrame.corr.__doc__ - def approxQuantile( self, col: Union[str, List[str], 
Tuple[str]], @@ -1639,9 +1663,7 @@ def approxQuantile( jaq_list = [list(j) for j in jaq] return jaq_list[0] if isStr else jaq_list - approxQuantile.__doc__ = PySparkDataFrame.approxQuantile.__doc__ - - def crosstab(self, col1: str, col2: str) -> "DataFrame": + def crosstab(self, col1: str, col2: str) -> ParentDataFrame: if not isinstance(col1, str): raise PySparkTypeError( error_class="NOT_STR", @@ -1657,11 +1679,9 @@ def crosstab(self, col1: str, col2: str) -> "DataFrame": session=self._session, ) - crosstab.__doc__ = PySparkDataFrame.crosstab.__doc__ - def freqItems( self, cols: Union[List[str], Tuple[str]], support: Optional[float] = None - ) -> "DataFrame": + ) -> ParentDataFrame: if isinstance(cols, tuple): cols = list(cols) if not isinstance(cols, list): @@ -1676,11 +1696,9 @@ def freqItems( session=self._session, ) - freqItems.__doc__ = PySparkDataFrame.freqItems.__doc__ - def sampleBy( self, col: "ColumnOrName", fractions: Dict[Any, float], seed: Optional[int] = None - ) -> "DataFrame": + ) -> ParentDataFrame: if not isinstance(col, (str, Column)): raise PySparkTypeError( error_class="NOT_COLUMN_OR_STR", @@ -1717,21 +1735,27 @@ def sampleBy( session=self._session, ) - sampleBy.__doc__ = PySparkDataFrame.sampleBy.__doc__ + def _ipython_key_completions_(self) -> List[str]: + """Returns the names of columns in this :class:`DataFrame`. + + Examples + -------- + >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"]) + >>> df._ipython_key_completions_() + ['age', 'name'] + + Would return illegal identifiers. + >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age 1", "name?1"]) + >>> df._ipython_key_completions_() + ['age 1', 'name?1'] + """ + return self.columns def __getattr__(self, name: str) -> "Column": if name in ["_jseq", "_jdf", "_jmap", "_jcols", "rdd", "toJSON"]: raise PySparkAttributeError( error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name} ) - elif name in [ - "checkpoint", - "localCheckpoint", - ]: - raise PySparkNotImplementedError( - error_class="NOT_IMPLEMENTED", - message_parameters={"feature": f"{name}()"}, - ) if name not in self.columns: raise PySparkAttributeError( @@ -1740,20 +1764,22 @@ def __getattr__(self, name: str) -> "Column": return self._col(name) - __getattr__.__doc__ = PySparkDataFrame.__getattr__.__doc__ - @overload def __getitem__(self, item: Union[int, str]) -> Column: ... @overload - def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame": + def __getitem__(self, item: Union[Column, List, Tuple]) -> ParentDataFrame: ... 
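The _ipython_key_completions_ hook added above is the standard IPython protocol for completing keys inside df["..."], which is why it can return names that are not valid attribute identifiers. A minimal standalone sketch of the protocol:

from typing import List


class ToyFrame:
    def __init__(self, columns: List[str]) -> None:
        self.columns = list(columns)

    def __getitem__(self, key: str) -> str:
        return f"column<{key}>"

    def _ipython_key_completions_(self) -> List[str]:
        # IPython consults this method when completing toy["<TAB>"].
        return self.columns


tf = ToyFrame(["age 1", "name?1"])
print(tf._ipython_key_completions_())  # ['age 1', 'name?1']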
- def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Column, "DataFrame"]: + def __getitem__( + self, item: Union[int, str, Column, List, Tuple] + ) -> Union[Column, ParentDataFrame]: + from pyspark.sql.connect.column import Column as ConnectColumn + if isinstance(item, str): if item == "*": - return Column( + return ConnectColumn( UnresolvedStar( unparsed_target=None, plan_id=self._plan._plan_id, @@ -1764,7 +1790,7 @@ def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Colum # if (sparkSession.sessionState.conf.supportQuotedRegexColumnName) { # colRegex(colName) # } else { - # Column(addDataFrameIdToCol(resolve(colName))) + # ConnectColumn(addDataFrameIdToCol(resolve(colName))) # } # validate the column name @@ -1790,7 +1816,9 @@ def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Colum ) def _col(self, name: str) -> Column: - return Column( + from pyspark.sql.connect.column import Column as ConnectColumn + + return ConnectColumn( ColumnReference( unparsed_identifier=name, plan_id=self._plan._plan_id, @@ -1798,34 +1826,40 @@ def _col(self, name: str) -> Column: ) def __dir__(self) -> List[str]: - attrs = set(super().__dir__()) + attrs = set(dir(DataFrame)) attrs.update(self.columns) return sorted(attrs) - __dir__.__doc__ = PySparkDataFrame.__dir__.__doc__ - def collect(self) -> List[Row]: table, schema = self._to_table() - schema = schema or from_arrow_schema(table.schema, prefer_timestamp_ntz=True) + # not all datatypes are supported in arrow based collect + # here always verify the schema by from_arrow_schema + schema2 = from_arrow_schema(table.schema, prefer_timestamp_ntz=True) + schema = schema or schema2 assert schema is not None and isinstance(schema, StructType) return ArrowTableToRowsConversion.convert(table, schema) - collect.__doc__ = PySparkDataFrame.collect.__doc__ - def _to_table(self) -> Tuple["pa.Table", Optional[StructType]]: query = self._plan.to_proto(self._session.client) - table, schema = self._session.client.to_table(query, self._plan.observations) + table, schema, self._execution_info = self._session.client.to_table( + query, self._plan.observations + ) assert table is not None return (table, schema) - def toPandas(self) -> "pandas.DataFrame": - query = self._plan.to_proto(self._session.client) - return self._session.client.to_pandas(query, self._plan.observations) + def toArrow(self) -> "pa.Table": + schema = to_arrow_schema(self.schema, error_on_duplicated_field_names_in_struct=True) + table, _ = self._to_table() + return table.cast(schema) - toPandas.__doc__ = PySparkDataFrame.toPandas.__doc__ + def toPandas(self) -> "PandasDataFrameLike": + query = self._plan.to_proto(self._session.client) + pdf, ei = self._session.client.to_pandas(query, self._plan.observations) + self._execution_info = ei + return pdf @property def schema(self) -> StructType: @@ -1837,58 +1871,45 @@ def schema(self) -> StructType: if self._cached_schema is None: query = self._plan.to_proto(self._session.client) self._cached_schema = self._session.client.schema(query) - return self._cached_schema - - schema.__doc__ = PySparkDataFrame.schema.__doc__ + return copy.deepcopy(self._cached_schema) + @functools.cache def isLocal(self) -> bool: query = self._plan.to_proto(self._session.client) result = self._session.client._analyze(method="is_local", plan=query).is_local assert result is not None return result - isLocal.__doc__ = PySparkDataFrame.isLocal.__doc__ - - @property + @functools.cached_property def isStreaming(self) -> bool: 
query = self._plan.to_proto(self._session.client) result = self._session.client._analyze(method="is_streaming", plan=query).is_streaming assert result is not None return result - isStreaming.__doc__ = PySparkDataFrame.isStreaming.__doc__ - - def _tree_string(self, level: Optional[int] = None) -> str: - query = self._plan.to_proto(self._session.client) - result = self._session.client._analyze( - method="tree_string", plan=query, level=level - ).tree_string - assert result is not None - return result - def printSchema(self, level: Optional[int] = None) -> None: - print(self._tree_string(level)) - - printSchema.__doc__ = PySparkDataFrame.printSchema.__doc__ + if level: + print(self.schema.treeString(level)) + else: + print(self.schema.treeString()) + @functools.cache def inputFiles(self) -> List[str]: query = self._plan.to_proto(self._session.client) result = self._session.client._analyze(method="input_files", plan=query).input_files assert result is not None return result - inputFiles.__doc__ = PySparkDataFrame.inputFiles.__doc__ - - def to(self, schema: StructType) -> "DataFrame": + def to(self, schema: StructType) -> ParentDataFrame: assert schema is not None - return DataFrame( + res = DataFrame( plan.ToSchema(child=self._plan, schema=schema), session=self._session, ) + res._cached_schema = schema + return res - to.__doc__ = PySparkDataFrame.to.__doc__ - - def toDF(self, *cols: str) -> "DataFrame": + def toDF(self, *cols: str) -> ParentDataFrame: for col_ in cols: if not isinstance(col_, str): raise PySparkTypeError( @@ -1897,17 +1918,15 @@ def toDF(self, *cols: str) -> "DataFrame": ) return DataFrame(plan.ToDF(self._plan, list(cols)), self._session) - toDF.__doc__ = PySparkDataFrame.toDF.__doc__ - - def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": + def transform( + self, func: Callable[..., ParentDataFrame], *args: Any, **kwargs: Any + ) -> ParentDataFrame: result = func(self, *args, **kwargs) assert isinstance( result, DataFrame ), "Func returned an instance of type [%s], " "should have been DataFrame." 
% type(result) return result - transform.__doc__ = PySparkDataFrame.transform.__doc__ - def _explain_string( self, extended: Optional[Union[bool, str]] = None, mode: Optional[str] = None ) -> str: @@ -1961,57 +1980,47 @@ def explain( ) -> None: print(self._explain_string(extended=extended, mode=mode)) - explain.__doc__ = PySparkDataFrame.explain.__doc__ - def createTempView(self, name: str) -> None: command = plan.CreateView( child=self._plan, name=name, is_global=False, replace=False ).command(session=self._session.client) - self._session.client.execute_command(command, self._plan.observations) - - createTempView.__doc__ = PySparkDataFrame.createTempView.__doc__ + _, _, ei = self._session.client.execute_command(command, self._plan.observations) + self._execution_info = ei def createOrReplaceTempView(self, name: str) -> None: command = plan.CreateView( child=self._plan, name=name, is_global=False, replace=True ).command(session=self._session.client) - self._session.client.execute_command(command, self._plan.observations) - - createOrReplaceTempView.__doc__ = PySparkDataFrame.createOrReplaceTempView.__doc__ + _, _, ei = self._session.client.execute_command(command, self._plan.observations) + self._execution_info = ei def createGlobalTempView(self, name: str) -> None: command = plan.CreateView( child=self._plan, name=name, is_global=True, replace=False ).command(session=self._session.client) - self._session.client.execute_command(command, self._plan.observations) - - createGlobalTempView.__doc__ = PySparkDataFrame.createGlobalTempView.__doc__ + _, _, ei = self._session.client.execute_command(command, self._plan.observations) + self._execution_info = ei def createOrReplaceGlobalTempView(self, name: str) -> None: command = plan.CreateView( child=self._plan, name=name, is_global=True, replace=True ).command(session=self._session.client) - self._session.client.execute_command(command, self._plan.observations) - - createOrReplaceGlobalTempView.__doc__ = PySparkDataFrame.createOrReplaceGlobalTempView.__doc__ + _, _, ei = self._session.client.execute_command(command, self._plan.observations) + self._execution_info = ei - def cache(self) -> "DataFrame": + def cache(self) -> ParentDataFrame: return self.persist() - cache.__doc__ = PySparkDataFrame.cache.__doc__ - def persist( self, storageLevel: StorageLevel = (StorageLevel.MEMORY_AND_DISK_DESER), - ) -> "DataFrame": + ) -> ParentDataFrame: relation = self._plan.plan(self._session.client) self._session.client._analyze( method="persist", relation=relation, storage_level=storageLevel ) return self - persist.__doc__ = PySparkDataFrame.persist.__doc__ - @property def storageLevel(self) -> StorageLevel: relation = self._plan.plan(self._session.client) @@ -2021,15 +2030,11 @@ def storageLevel(self) -> StorageLevel: assert storage_level is not None return storage_level - storageLevel.__doc__ = PySparkDataFrame.storageLevel.__doc__ - - def unpersist(self, blocking: bool = False) -> "DataFrame": + def unpersist(self, blocking: bool = False) -> ParentDataFrame: relation = self._plan.plan(self._session.client) self._session.client._analyze(method="unpersist", relation=relation, blocking=blocking) return self - unpersist.__doc__ = PySparkDataFrame.unpivot.__doc__ - @property def is_cached(self) -> bool: return self.storageLevel != StorageLevel.NONE @@ -2051,8 +2056,6 @@ def toLocalIterator(self, prefetchPartitions: bool = False) -> Iterator[Row]: schema = from_arrow_schema(table.schema, prefer_timestamp_ntz=True) yield from 
ArrowTableToRowsConversion.convert(table, schema) - toLocalIterator.__doc__ = PySparkDataFrame.toLocalIterator.__doc__ - def pandas_api( self, index_col: Optional[Union[str, List[str]]] = None ) -> "PandasOnSparkDataFrame": @@ -2060,22 +2063,18 @@ def pandas_api( from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame from pyspark.pandas.internal import InternalFrame - index_spark_columns, index_names = _get_index_map(self, index_col) # type: ignore[arg-type] + index_spark_columns, index_names = _get_index_map(self, index_col) internal = InternalFrame( - spark_frame=self, # type: ignore[arg-type] + spark_frame=self, index_spark_columns=index_spark_columns, index_names=index_names, # type: ignore[arg-type] ) return PandasOnSparkDataFrame(internal) - pandas_api.__doc__ = PySparkDataFrame.pandas_api.__doc__ - def registerTempTable(self, name: str) -> None: warnings.warn("Deprecated in 2.0, use createOrReplaceTempView instead.", FutureWarning) self.createOrReplaceTempView(name) - registerTempTable.__doc__ = PySparkDataFrame.registerTempTable.__doc__ - def _map_partitions( self, func: "PandasMapIterFunction", @@ -2083,16 +2082,17 @@ def _map_partitions( evalType: int, barrier: bool, profile: Optional[ResourceProfile], - ) -> "DataFrame": + ) -> ParentDataFrame: from pyspark.sql.connect.udf import UserDefinedFunction + _validate_pandas_udf(func, evalType) udf_obj = UserDefinedFunction( func, returnType=schema, evalType=evalType, ) - return DataFrame( + res = DataFrame( plan.MapPartitions( child=self._plan, function=udf_obj, @@ -2102,6 +2102,9 @@ def _map_partitions( ), session=self._session, ) + if isinstance(schema, StructType): + res._cached_schema = schema + return res def mapInPandas( self, @@ -2109,26 +2112,22 @@ def mapInPandas( schema: Union[StructType, str], barrier: bool = False, profile: Optional[ResourceProfile] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: return self._map_partitions( func, schema, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, barrier, profile ) - mapInPandas.__doc__ = PySparkDataFrame.mapInPandas.__doc__ - def mapInArrow( self, func: "ArrowMapIterFunction", schema: Union[StructType, str], barrier: bool = False, profile: Optional[ResourceProfile] = None, - ) -> "DataFrame": + ) -> ParentDataFrame: return self._map_partitions( func, schema, PythonEvalType.SQL_MAP_ARROW_ITER_UDF, barrier, profile ) - mapInArrow.__doc__ = PySparkDataFrame.mapInArrow.__doc__ - def foreach(self, f: Callable[[Row], None]) -> None: def foreach_func(row: Any) -> None: f(row) @@ -2137,8 +2136,6 @@ def foreach_func(row: Any) -> None: F.udf(foreach_func, StructType())("row") # type: ignore[arg-type] ).collect() - foreach.__doc__ = PySparkDataFrame.foreach.__doc__ - def foreachPartition(self, f: Callable[[Iterator[Row]], None]) -> None: schema = self.schema field_converters = [ @@ -2161,15 +2158,11 @@ def flatten() -> Iterator[Row]: self.mapInArrow(foreach_partition_func, schema=StructType()).collect() - foreachPartition.__doc__ = PySparkDataFrame.foreachPartition.__doc__ - @property def writeStream(self) -> DataStreamWriter: return DataStreamWriter(plan=self._plan, session=self._session) - writeStream.__doc__ = PySparkDataFrame.writeStream.__doc__ - - def sameSemantics(self, other: "DataFrame") -> bool: + def sameSemantics(self, other: ParentDataFrame) -> bool: if not isinstance(other, DataFrame): raise PySparkTypeError( error_class="NOT_DATAFRAME", @@ -2181,50 +2174,80 @@ def sameSemantics(self, other: "DataFrame") -> bool: other=other._plan.to_proto(other._session.client), ) - 
sameSemantics.__doc__ = PySparkDataFrame.sameSemantics.__doc__ - + @functools.cache def semanticHash(self) -> int: return self._session.client.semantic_hash( plan=self._plan.to_proto(self._session.client), ) - semanticHash.__doc__ = PySparkDataFrame.semanticHash.__doc__ - def writeTo(self, table: str) -> "DataFrameWriterV2": - return DataFrameWriterV2(self._plan, self._session, table) + def cb(ei: "ExecutionInfo") -> None: + self._execution_info = ei - writeTo.__doc__ = PySparkDataFrame.writeTo.__doc__ + return DataFrameWriterV2(self._plan, self._session, table, cb) - # SparkConnect specific API - def offset(self, n: int) -> "DataFrame": + def offset(self, n: int) -> ParentDataFrame: return DataFrame(plan.Offset(child=self._plan, offset=n), session=self._session) - offset.__doc__ = PySparkDataFrame.offset.__doc__ + def checkpoint(self, eager: bool = True) -> "DataFrame": + cmd = plan.Checkpoint(child=self._plan, local=False, eager=eager) + _, properties, self._execution_info = self._session.client.execute_command( + cmd.command(self._session.client) + ) + assert "checkpoint_command_result" in properties + checkpointed = properties["checkpoint_command_result"] + assert isinstance(checkpointed._plan, plan.CachedRemoteRelation) + return checkpointed + + def localCheckpoint(self, eager: bool = True) -> "DataFrame": + cmd = plan.Checkpoint(child=self._plan, local=True, eager=eager) + _, properties, self._execution_info = self._session.client.execute_command( + cmd.command(self._session.client) + ) + assert "checkpoint_command_result" in properties + checkpointed = properties["checkpoint_command_result"] + assert isinstance(checkpointed._plan, plan.CachedRemoteRelation) + return checkpointed + + if not is_remote_only(): + + def toJSON(self, use_unicode: bool = True) -> "RDD[str]": + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "toJSON()"}, + ) + + @property + def rdd(self) -> "RDD[Row]": + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "rdd"}, + ) + + @property + def executionInfo(self) -> Optional["ExecutionInfo"]: + return self._execution_info -class DataFrameNaFunctions: - def __init__(self, df: DataFrame): +class DataFrameNaFunctions(ParentDataFrameNaFunctions): + def __init__(self, df: ParentDataFrame): self.df = df def fill( self, value: Union["LiteralType", Dict[str, "LiteralType"]], subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, - ) -> DataFrame: - return self.df.fillna(value=value, subset=subset) - - fill.__doc__ = DataFrame.fillna.__doc__ + ) -> ParentDataFrame: + return self.df.fillna(value=value, subset=subset) # type: ignore[arg-type] def drop( self, how: str = "any", thresh: Optional[int] = None, subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, - ) -> DataFrame: + ) -> ParentDataFrame: return self.df.dropna(how=how, thresh=thresh, subset=subset) - drop.__doc__ = DataFrame.dropna.__doc__ - def replace( self, to_replace: Union[List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"]], @@ -2232,29 +2255,20 @@ def replace( Union["OptionalPrimitiveType", List["OptionalPrimitiveType"], _NoValueType] ] = _NoValue, subset: Optional[List[str]] = None, - ) -> DataFrame: - return self.df.replace(to_replace, value, subset) - - replace.__doc__ = DataFrame.replace.__doc__ - + ) -> ParentDataFrame: + return self.df.replace(to_replace, value, subset) # type: ignore[arg-type] -DataFrameNaFunctions.__doc__ = PySparkDataFrameNaFunctions.__doc__ - 
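# A minimal usage sketch of the checkpoint()/localCheckpoint() and executionInfo
# additions above, not part of the patch itself. It assumes a reachable Spark Connect
# endpoint ("sc://localhost" is a placeholder) and that pandas/pyarrow are installed.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df = spark.range(10)

# localCheckpoint() sends a CheckpointCommand; per the change above, the returned
# DataFrame is backed by a CachedRemoteRelation held on the server side.
checkpointed = df.localCheckpoint(eager=True)

# executionInfo is populated by actions such as toPandas()/collect(); before any
# action it would presumably still be unset.
pdf = checkpointed.toPandas()
print(checkpointed.executionInfo)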
-class DataFrameStatFunctions: - def __init__(self, df: DataFrame): +class DataFrameStatFunctions(ParentDataFrameStatFunctions): + def __init__(self, df: ParentDataFrame): self.df = df def cov(self, col1: str, col2: str) -> float: return self.df.cov(col1, col2) - cov.__doc__ = DataFrame.cov.__doc__ - def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: return self.df.corr(col1, col2, method) - corr.__doc__ = DataFrame.corr.__doc__ - def approxQuantile( self, col: Union[str, List[str], Tuple[str]], @@ -2263,41 +2277,37 @@ def approxQuantile( ) -> Union[List[float], List[List[float]]]: return self.df.approxQuantile(col, probabilities, relativeError) - approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__ - - def crosstab(self, col1: str, col2: str) -> DataFrame: + def crosstab(self, col1: str, col2: str) -> ParentDataFrame: return self.df.crosstab(col1, col2) - crosstab.__doc__ = DataFrame.crosstab.__doc__ - def freqItems( self, cols: Union[List[str], Tuple[str]], support: Optional[float] = None - ) -> DataFrame: + ) -> ParentDataFrame: return self.df.freqItems(cols, support) - freqItems.__doc__ = DataFrame.freqItems.__doc__ - def sampleBy( self, col: str, fractions: Dict[Any, float], seed: Optional[int] = None - ) -> DataFrame: + ) -> ParentDataFrame: return self.df.sampleBy(col, fractions, seed) - sampleBy.__doc__ = DataFrame.sampleBy.__doc__ - - -DataFrameStatFunctions.__doc__ = PySparkDataFrameStatFunctions.__doc__ - def _test() -> None: import os import sys import doctest + from pyspark.util import is_remote_only from pyspark.sql import SparkSession as PySparkSession - import pyspark.sql.connect.dataframe + import pyspark.sql.dataframe + # It inherits docstrings but doctests cannot detect them, so we run + # the parent class's doctests here directly. os.chdir(os.environ["SPARK_HOME"]) - globs = pyspark.sql.connect.dataframe.__dict__.copy() + globs = pyspark.sql.dataframe.__dict__.copy() + + if not is_remote_only(): + del pyspark.sql.dataframe.DataFrame.toJSON.__doc__ + del pyspark.sql.dataframe.DataFrame.rdd.__doc__ globs["spark"] = ( PySparkSession.builder.appName("sql.connect.dataframe tests") @@ -2306,7 +2316,7 @@ def _test() -> None: ) (failure_count, test_count) = doctest.testmod( - pyspark.sql.connect.dataframe, + pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index b1735f65f520e..c10bef56c3b83 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -15,7 +15,6 @@ # limitations under the License.
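# A small sketch of the delegation pattern in DataFrameStatFunctions above, not part of
# the patch: each method simply forwards to the corresponding DataFrame method, so
# df.stat.crosstab(...) and df.crosstab(...) should be interchangeable. The endpoint and
# data below are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df = spark.createDataFrame([(1, "x"), (2, "y"), (1, "y")], ["a", "b"])

# Both calls go through the same crosstab plan, so their results should match.
assert sorted(df.stat.crosstab("a", "b").collect()) == sorted(df.crosstab("a", "b").collect())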
# from pyspark.sql.connect.utils import check_dependencies -from pyspark.sql.utils import is_timestamp_ntz_preferred check_dependencies(__name__) @@ -61,7 +60,7 @@ ) import pyspark.sql.connect.proto as proto -from pyspark.sql.connect.types import ( +from pyspark.util import ( JVM_BYTE_MIN, JVM_BYTE_MAX, JVM_SHORT_MIN, @@ -70,11 +69,15 @@ JVM_INT_MAX, JVM_LONG_MIN, JVM_LONG_MAX, +) +from pyspark.sql.connect.types import ( UnparsedDataType, pyspark_types_to_proto_types, proto_schema_to_pyspark_data_type, ) from pyspark.errors import PySparkTypeError, PySparkValueError +from pyspark.errors.utils import current_origin +from pyspark.sql.utils import is_timestamp_ntz_preferred if TYPE_CHECKING: from pyspark.sql.connect.client import SparkConnectClient @@ -87,7 +90,16 @@ class Expression: """ def __init__(self) -> None: - pass + origin = current_origin() + fragment = origin.fragment + call_site = origin.call_site + self.origin = None + if fragment is not None and call_site is not None: + self.origin = proto.Origin( + python_origin=proto.PythonOrigin( + fragment=origin.fragment, call_site=origin.call_site + ) + ) def to_plan( # type: ignore[empty-body] self, session: "SparkConnectClient" @@ -110,6 +122,12 @@ def alias(self, *alias: str, **kwargs: Any) -> "ColumnAlias": def name(self) -> str: # type: ignore[empty-body] ... + def _create_proto_expression(self) -> proto.Expression: + plan = proto.Expression() + if self.origin is not None: + plan.common.origin.CopyFrom(self.origin) + return plan + class CaseWhen(Expression): def __init__( @@ -151,18 +169,18 @@ def __repr__(self) -> str: class ColumnAlias(Expression): - def __init__(self, parent: Expression, alias: Sequence[str], metadata: Any): + def __init__(self, child: Expression, alias: Sequence[str], metadata: Any): super().__init__() self._alias = alias self._metadata = metadata - self._parent = parent + self._child = child def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": if len(self._alias) == 1: - exp = proto.Expression() + exp = self._create_proto_expression() exp.alias.name.append(self._alias[0]) - exp.alias.expr.CopyFrom(self._parent.to_plan(session)) + exp.alias.expr.CopyFrom(self._child.to_plan(session)) if self._metadata: exp.alias.metadata = json.dumps(self._metadata) @@ -173,13 +191,13 @@ def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": error_class="CANNOT_PROVIDE_METADATA", message_parameters={}, ) - exp = proto.Expression() + exp = self._create_proto_expression() exp.alias.name.extend(self._alias) - exp.alias.expr.CopyFrom(self._parent.to_plan(session)) + exp.alias.expr.CopyFrom(self._child.to_plan(session)) return exp def __repr__(self) -> str: - return f"{self._parent} AS {','.join(self._alias)}" + return f"{self._child} AS {','.join(self._alias)}" class LiteralExpression(Expression): @@ -405,7 +423,7 @@ def _to_value( def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": """Converts the literal expression to the literal in proto.""" - expr = proto.Expression() + expr = self._create_proto_expression() if self._value is None: expr.literal.null.CopyFrom(pyspark_types_to_proto_types(self._dataType)) @@ -455,7 +473,10 @@ def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": return expr def __repr__(self) -> str: - return f"{self._value}" + if self._value is None: + return "NULL" + else: + return f"{self._value}" class ColumnReference(Expression): @@ -478,7 +499,7 @@ def name(self) -> str: def to_plan(self, session: "SparkConnectClient") -> 
proto.Expression: """Returns the Proto representation of the expression.""" - expr = proto.Expression() + expr = self._create_proto_expression() expr.unresolved_attribute.unparsed_identifier = self._unparsed_identifier if self._plan_id is not None: expr.unresolved_attribute.plan_id = self._plan_id @@ -507,7 +528,7 @@ def __init__(self, unparsed_target: Optional[str], plan_id: Optional[int] = None self._plan_id = plan_id def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": - expr = proto.Expression() + expr = self._create_proto_expression() expr.unresolved_star.SetInParent() if self._unparsed_target is not None: expr.unresolved_star.unparsed_target = self._unparsed_target @@ -536,17 +557,21 @@ class SQLExpression(Expression): def __init__(self, expr: str) -> None: super().__init__() + assert isinstance(expr, str) self._expr: str = expr def to_plan(self, session: "SparkConnectClient") -> proto.Expression: """Returns the Proto representation of the SQL expression.""" - expr = proto.Expression() + expr = self._create_proto_expression() expr.expression_string.expression = self._expr return expr def __eq__(self, other: Any) -> bool: return other is not None and isinstance(other, SQLExpression) and other._expr == self._expr + def __repr__(self) -> str: + return self._expr + class SortOrder(Expression): def __init__(self, child: Expression, ascending: bool = True, nullsFirst: bool = True) -> None: @@ -563,7 +588,7 @@ def __repr__(self) -> str: ) def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - sort = proto.Expression() + sort = self._create_proto_expression() sort.sort_order.child.CopyFrom(self._child.to_plan(session)) if self._ascending: @@ -602,7 +627,7 @@ def __init__( self._is_distinct = is_distinct def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - fun = proto.Expression() + fun = self._create_proto_expression() fun.unresolved_function.function_name = self._name if len(self._args) > 0: fun.unresolved_function.arguments.extend([arg.to_plan(session) for arg in self._args]) @@ -699,7 +724,7 @@ def __init__( self._function = function def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": - expr = proto.Expression() + expr = self._create_proto_expression() expr.common_inline_user_defined_function.function_name = self._function_name expr.common_inline_user_defined_function.deterministic = self._deterministic if len(self._arguments) > 0: @@ -753,7 +778,7 @@ def __init__( self._valueExpr = valueExpr def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.update_fields.struct_expression.CopyFrom(self._structExpr.to_plan(session)) expr.update_fields.field_name = self._fieldName expr.update_fields.value_expression.CopyFrom(self._valueExpr.to_plan(session)) @@ -778,7 +803,7 @@ def __init__( self._fieldName = fieldName def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.update_fields.struct_expression.CopyFrom(self._structExpr.to_plan(session)) expr.update_fields.field_name = self._fieldName return expr @@ -802,7 +827,7 @@ def __init__( self._extraction = extraction def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.unresolved_extract_value.child.CopyFrom(self._child.to_plan(session)) 
expr.unresolved_extract_value.extraction.CopyFrom(self._extraction.to_plan(session)) return expr @@ -822,7 +847,7 @@ def __init__(self, col_name: str, plan_id: Optional[int] = None) -> None: self._plan_id = plan_id def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.unresolved_regex.col_name = self.col_name if self._plan_id is not None: expr.unresolved_regex.plan_id = self._plan_id @@ -841,6 +866,7 @@ def __init__( ) -> None: super().__init__() self._expr = expr + assert isinstance(data_type, (DataType, str)) self._data_type = data_type if eval_mode is not None: assert isinstance(eval_mode, str) @@ -848,7 +874,7 @@ def __init__( self._eval_mode = eval_mode def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - fun = proto.Expression() + fun = self._create_proto_expression() fun.cast.expr.CopyFrom(self._expr.to_plan(session)) if isinstance(self._data_type, str): fun.cast.type_str = self._data_type @@ -866,7 +892,18 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: return fun def __repr__(self) -> str: - return f"({self._expr} ({self._data_type}))" + # We cannot guarantee the string representations be exactly the same, e.g. + # str(sf.col("a").cast("long")): + # Column<'CAST(a AS BIGINT)'> <- Spark Classic + # Column<'CAST(a AS LONG)'> <- Spark Connect + if isinstance(self._data_type, DataType): + str_data_type = self._data_type.simpleString().upper() + else: + str_data_type = str(self._data_type).upper() + if self._eval_mode is not None and self._eval_mode == "try": + return f"TRY_CAST({self._expr} AS {str_data_type})" + else: + return f"CAST({self._expr} AS {str_data_type})" class UnresolvedNamedLambdaVariable(Expression): @@ -888,12 +925,12 @@ def __init__( self._name_parts = name_parts def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.unresolved_named_lambda_variable.name_parts.extend(self._name_parts) return expr def __repr__(self) -> str: - return f"(UnresolvedNamedLambdaVariable({', '.join(self._name_parts)})" + return ", ".join(self._name_parts) @staticmethod def fresh_var_name(name: str) -> str: @@ -930,7 +967,7 @@ def __init__( self._arguments = arguments def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.lambda_function.function.CopyFrom(self._function.to_plan(session)) expr.lambda_function.arguments.extend( [arg.to_plan(session).unresolved_named_lambda_variable for arg in self._arguments] @@ -938,7 +975,10 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression: return expr def __repr__(self) -> str: - return f"(LambdaFunction({str(self._function)}, {', '.join(self._arguments)})" + return ( + f"LambdaFunction({str(self._function)}, " + + f"{', '.join([str(arg) for arg in self._arguments])})" + ) class WindowExpression(Expression): @@ -960,7 +1000,7 @@ def __init__( self._windowSpec = windowSpec def to_plan(self, session: "SparkConnectClient") -> proto.Expression: - expr = proto.Expression() + expr = self._create_proto_expression() expr.window.window_function.CopyFrom(self._windowFunction.to_plan(session)) @@ -1067,7 +1107,7 @@ def __init__(self, name: str, args: Sequence["Expression"]): self._args = args def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": - expr = proto.Expression() + expr = self._create_proto_expression() 
expr.call_function.function_name = self._name if len(self._args) > 0: expr.call_function.arguments.extend([arg.to_plan(session) for arg in self._args]) @@ -1091,7 +1131,7 @@ def __init__(self, key: str, value: Expression): self._value = value def to_plan(self, session: "SparkConnectClient") -> "proto.Expression": - expr = proto.Expression() + expr = self._create_proto_expression() expr.named_argument_expression.key = self._key expr.named_argument_expression.value.CopyFrom(self._value.to_plan(session)) return expr diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index 2b40b31b75280..8d3442b6496f7 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -42,7 +42,8 @@ import numpy as np from pyspark.errors import PySparkTypeError, PySparkValueError -from pyspark.sql.connect.column import Column +from pyspark.sql.dataframe import DataFrame as ParentDataFrame +from pyspark.sql import Column from pyspark.sql.connect.expressions import ( CaseWhen, SortOrder, @@ -80,7 +81,6 @@ DataTypeOrString, UserDefinedFunctionLike, ) - from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.connect.udtf import UserDefinedTableFunction @@ -112,14 +112,16 @@ def _invoke_function(name: str, *args: Union[Column, Expression]) -> Column: ------- :class:`Column` """ + from pyspark.sql.connect.column import Column as ConnectColumn + expressions: List[Expression] = [] for arg in args: assert isinstance(arg, (Column, Expression)) if isinstance(arg, Column): - expressions.append(arg._expr) + expressions.append(arg._expr) # type: ignore[arg-type] else: expressions.append(arg) - return Column(UnresolvedFunction(name, expressions)) + return ConnectColumn(UnresolvedFunction(name, expressions)) def _invoke_function_over_columns(name: str, *cols: "ColumnOrName") -> Column: @@ -180,6 +182,8 @@ def _create_lambda(f: Callable) -> LambdaFunction: - (Column, Column) -> Column: ... - (Column, Column, Column) -> Column: ... 
""" + from pyspark.sql.connect.column import Column as ConnectColumn + parameters = _get_lambda_parameters(f) arg_names = ["x", "y", "z"][: len(parameters)] @@ -187,7 +191,7 @@ def _create_lambda(f: Callable) -> LambdaFunction: UnresolvedNamedLambdaVariable([UnresolvedNamedLambdaVariable.fresh_var_name(arg_name)]) for arg_name in arg_names ] - arg_cols = [Column(arg_expr) for arg_expr in arg_exprs] + arg_cols = [ConnectColumn(arg_expr) for arg_expr in arg_exprs] result = f(*arg_cols) @@ -197,7 +201,7 @@ def _create_lambda(f: Callable) -> LambdaFunction: message_parameters={"func_name": f.__name__, "return_type": type(result).__name__}, ) - return LambdaFunction(result._expr, arg_exprs) + return LambdaFunction(result._expr, arg_exprs) # type: ignore[arg-type] def _invoke_higher_order_function( @@ -234,12 +238,14 @@ def _options_to_col(options: Dict[str, Any]) -> Column: def col(col: str) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + if col == "*": - return Column(UnresolvedStar(unparsed_target=None)) + return ConnectColumn(UnresolvedStar(unparsed_target=None)) elif col.endswith(".*"): - return Column(UnresolvedStar(unparsed_target=col)) + return ConnectColumn(UnresolvedStar(unparsed_target=col)) else: - return Column(ColumnReference(unparsed_identifier=col)) + return ConnectColumn(ColumnReference(unparsed_identifier=col)) col.__doc__ = pysparkfuncs.col.__doc__ @@ -249,6 +255,8 @@ def col(col: str) -> Column: def lit(col: Any) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + if isinstance(col, Column): return col elif isinstance(col, list): @@ -272,7 +280,7 @@ def lit(col: Any) -> Column: return array(*[lit(c) for c in col]) else: - return Column(LiteralExpression._from_value(col)) + return ConnectColumn(LiteralExpression._from_value(col)) lit.__doc__ = pysparkfuncs.lit.__doc__ @@ -314,7 +322,7 @@ def getbit(col: "ColumnOrName", pos: "ColumnOrName") -> Column: getbit.__doc__ = pysparkfuncs.getbit.__doc__ -def broadcast(df: "DataFrame") -> "DataFrame": +def broadcast(df: "ParentDataFrame") -> "ParentDataFrame": from pyspark.sql.connect.dataframe import DataFrame if not isinstance(df, DataFrame): @@ -336,7 +344,9 @@ def coalesce(*cols: "ColumnOrName") -> Column: def expr(str: str) -> Column: - return Column(SQLExpression(str)) + from pyspark.sql.connect.column import Column as ConnectColumn + + return ConnectColumn(SQLExpression(str)) expr.__doc__ = pysparkfuncs.expr.__doc__ @@ -429,6 +439,8 @@ def spark_partition_id() -> Column: def when(condition: Column, value: Any) -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + # Explicitly not using ColumnOrName type here to make reading condition less opaque if not isinstance(condition, Column): raise PySparkTypeError( @@ -438,7 +450,12 @@ def when(condition: Column, value: Any) -> Column: value_col = value if isinstance(value, Column) else lit(value) - return Column(CaseWhen(branches=[(condition._expr, value_col._expr)], else_value=None)) + return ConnectColumn( + CaseWhen( + branches=[(condition._expr, value_col._expr)], # type: ignore[list-item] + else_value=None, + ) + ) when.__doc__ = pysparkfuncs.when.__doc__ @@ -917,6 +934,13 @@ def try_divide(left: "ColumnOrName", right: "ColumnOrName") -> Column: try_divide.__doc__ = pysparkfuncs.try_divide.__doc__ +def try_remainder(left: "ColumnOrName", right: "ColumnOrName") -> Column: + return _invoke_function_over_columns("try_remainder", left, right) + + +try_remainder.__doc__ = pysparkfuncs.try_remainder.__doc__ + 
+ def try_multiply(left: "ColumnOrName", right: "ColumnOrName") -> Column: return _invoke_function_over_columns("try_multiply", left, right) @@ -1045,8 +1069,12 @@ def countDistinct(col: "ColumnOrName", *cols: "ColumnOrName") -> Column: def count_distinct(col: "ColumnOrName", *cols: "ColumnOrName") -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + _exprs = [_to_col(c)._expr for c in [col] + list(cols)] - return Column(UnresolvedFunction("count", _exprs, is_distinct=True)) + return ConnectColumn( + UnresolvedFunction("count", _exprs, is_distinct=True) # type: ignore[arg-type] + ) count_distinct.__doc__ = pysparkfuncs.count_distinct.__doc__ @@ -1167,20 +1195,10 @@ def percentile( percentage: Union[Column, float, List[float], Tuple[float]], frequency: Union[Column, int] = 1, ) -> Column: - if isinstance(percentage, Column): - _percentage = percentage - elif isinstance(percentage, (list, tuple)): - # Convert tuple to list - _percentage = lit(list(percentage)) - else: - # Probably scalar - _percentage = lit(percentage) + if isinstance(percentage, (list, tuple)): + percentage = list(percentage) - if isinstance(frequency, int): - _frequency = lit(frequency) - elif isinstance(frequency, Column): - _frequency = frequency - else: + if not isinstance(frequency, (int, Column)): raise PySparkTypeError( error_class="NOT_COLUMN_OR_INT", message_parameters={ @@ -1189,7 +1207,7 @@ def percentile( }, ) - return _invoke_function("percentile", _to_col(col), _percentage, _frequency) + return _invoke_function("percentile", _to_col(col), lit(percentage), lit(frequency)) percentile.__doc__ = pysparkfuncs.percentile.__doc__ @@ -1200,16 +1218,10 @@ def percentile_approx( percentage: Union[Column, float, List[float], Tuple[float]], accuracy: Union[Column, float] = 10000, ) -> Column: - if isinstance(percentage, Column): - percentage_col = percentage - elif isinstance(percentage, (list, tuple)): - # Convert tuple to list - percentage_col = lit(list(percentage)) - else: - # Probably scalar - percentage_col = lit(percentage) + if isinstance(percentage, (list, tuple)): + percentage = lit(list(percentage)) - return _invoke_function("percentile_approx", _to_col(col), percentage_col, lit(accuracy)) + return _invoke_function("percentile_approx", _to_col(col), lit(percentage), lit(accuracy)) percentile_approx.__doc__ = pysparkfuncs.percentile_approx.__doc__ @@ -1220,16 +1232,10 @@ def approx_percentile( percentage: Union[Column, float, List[float], Tuple[float]], accuracy: Union[Column, float] = 10000, ) -> Column: - if isinstance(percentage, Column): - percentage_col = percentage - elif isinstance(percentage, (list, tuple)): - # Convert tuple to list - percentage_col = lit(list(percentage)) - else: - # Probably scalar - percentage_col = lit(percentage) + if isinstance(percentage, (list, tuple)): + percentage = list(percentage) - return _invoke_function("approx_percentile", _to_col(col), percentage_col, lit(accuracy)) + return _invoke_function("approx_percentile", _to_col(col), lit(percentage), lit(accuracy)) approx_percentile.__doc__ = pysparkfuncs.approx_percentile.__doc__ @@ -1293,7 +1299,11 @@ def sumDistinct(col: "ColumnOrName") -> Column: def sum_distinct(col: "ColumnOrName") -> Column: - return Column(UnresolvedFunction("sum", [_to_col(col)._expr], is_distinct=True)) + from pyspark.sql.connect.column import Column as ConnectColumn + + return ConnectColumn( + UnresolvedFunction("sum", [_to_col(col)._expr], is_distinct=True) # type: ignore[list-item] + ) sum_distinct.__doc__ = 
pysparkfuncs.sum_distinct.__doc__ @@ -1853,12 +1863,10 @@ def from_json( schema: Union[ArrayType, StructType, Column, str], options: Optional[Dict[str, str]] = None, ) -> Column: - if isinstance(schema, Column): - _schema = schema + if isinstance(schema, (str, Column)): + _schema = lit(schema) elif isinstance(schema, DataType): _schema = lit(schema.json()) - elif isinstance(schema, str): - _schema = lit(schema) else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_DATATYPE_OR_STR", @@ -1879,12 +1887,10 @@ def from_xml( schema: Union[StructType, Column, str], options: Optional[Dict[str, str]] = None, ) -> Column: - if isinstance(schema, Column): - _schema = schema + if isinstance(schema, (str, Column)): + _schema = lit(schema) elif isinstance(schema, StructType): _schema = lit(schema.json()) - elif isinstance(schema, str): - _schema = lit(schema) else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_STR_OR_STRUCT", @@ -2041,6 +2047,13 @@ def str_to_map( str_to_map.__doc__ = pysparkfuncs.str_to_map.__doc__ +def try_parse_json(col: "ColumnOrName") -> Column: + return _invoke_function("try_parse_json", _to_col(col)) + + +try_parse_json.__doc__ = pysparkfuncs.try_parse_json.__doc__ + + def parse_json(col: "ColumnOrName") -> Column: return _invoke_function("parse_json", _to_col(col)) @@ -2048,6 +2061,41 @@ def parse_json(col: "ColumnOrName") -> Column: parse_json.__doc__ = pysparkfuncs.parse_json.__doc__ +def is_variant_null(v: "ColumnOrName") -> Column: + return _invoke_function("is_variant_null", _to_col(v)) + + +is_variant_null.__doc__ = pysparkfuncs.is_variant_null.__doc__ + + +def variant_get(v: "ColumnOrName", path: str, targetType: str) -> Column: + return _invoke_function("variant_get", _to_col(v), lit(path), lit(targetType)) + + +variant_get.__doc__ = pysparkfuncs.variant_get.__doc__ + + +def try_variant_get(v: "ColumnOrName", path: str, targetType: str) -> Column: + return _invoke_function("try_variant_get", _to_col(v), lit(path), lit(targetType)) + + +try_variant_get.__doc__ = pysparkfuncs.try_variant_get.__doc__ + + +def schema_of_variant(v: "ColumnOrName") -> Column: + return _invoke_function("schema_of_variant", _to_col(v)) + + +schema_of_variant.__doc__ = pysparkfuncs.schema_of_variant.__doc__ + + +def schema_of_variant_agg(v: "ColumnOrName") -> Column: + return _invoke_function("schema_of_variant_agg", _to_col(v)) + + +schema_of_variant_agg.__doc__ = pysparkfuncs.schema_of_variant_agg.__doc__ + + def posexplode(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("posexplode", col) @@ -2081,10 +2129,8 @@ def sequence( sequence.__doc__ = pysparkfuncs.sequence.__doc__ -def schema_of_csv(csv: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column: - if isinstance(csv, Column): - _csv = csv - elif isinstance(csv, str): +def schema_of_csv(csv: Union[str, Column], options: Optional[Dict[str, str]] = None) -> Column: + if isinstance(csv, (str, Column)): _csv = lit(csv) else: raise PySparkTypeError( @@ -2101,10 +2147,8 @@ def schema_of_csv(csv: "ColumnOrName", options: Optional[Dict[str, str]] = None) schema_of_csv.__doc__ = pysparkfuncs.schema_of_csv.__doc__ -def schema_of_json(json: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column: - if isinstance(json, Column): - _json = json - elif isinstance(json, str): +def schema_of_json(json: Union[str, Column], options: Optional[Dict[str, str]] = None) -> Column: + if isinstance(json, (str, Column)): _json = lit(json) else: raise PySparkTypeError( @@ -2121,10 +2165,8 @@ def 
schema_of_json(json: "ColumnOrName", options: Optional[Dict[str, str]] = Non schema_of_json.__doc__ = pysparkfuncs.schema_of_json.__doc__ -def schema_of_xml(xml: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column: - if isinstance(xml, Column): - _xml = xml - elif isinstance(xml, str): +def schema_of_xml(xml: Union[str, Column], options: Optional[Dict[str, str]] = None) -> Column: + if isinstance(xml, (str, Column)): _xml = lit(xml) else: raise PySparkTypeError( @@ -2476,8 +2518,13 @@ def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: repeat.__doc__ = pysparkfuncs.repeat.__doc__ -def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column: - return _invoke_function("split", _to_col(str), lit(pattern), lit(limit)) +def split( + str: "ColumnOrName", + pattern: Union[Column, str], + limit: Union["ColumnOrName", int] = -1, +) -> Column: + limit = lit(limit) if isinstance(limit, int) else _to_col(limit) + return _invoke_function("split", _to_col(str), lit(pattern), limit) split.__doc__ = pysparkfuncs.split.__doc__ @@ -3350,6 +3397,20 @@ def timestamp_micros(col: "ColumnOrName") -> Column: timestamp_micros.__doc__ = pysparkfuncs.timestamp_micros.__doc__ +def timestamp_diff(unit: str, start: "ColumnOrName", end: "ColumnOrName") -> Column: + return _invoke_function_over_columns("timestampdiff", lit(unit), start, end) + + +timestamp_diff.__doc__ = pysparkfuncs.timestamp_diff.__doc__ + + +def timestamp_add(unit: str, quantity: "ColumnOrName", ts: "ColumnOrName") -> Column: + return _invoke_function_over_columns("timestampadd", lit(unit), quantity, ts) + + +timestamp_add.__doc__ = pysparkfuncs.timestamp_add.__doc__ + + def window( timeColumn: "ColumnOrName", windowDuration: str, @@ -3764,27 +3825,27 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column: sha2.__doc__ = pysparkfuncs.sha2.__doc__ -def hll_sketch_agg(col: "ColumnOrName", lgConfigK: Optional[Union[int, Column]] = None) -> Column: +def hll_sketch_agg( + col: "ColumnOrName", + lgConfigK: Optional[Union[int, Column]] = None, +) -> Column: if lgConfigK is None: return _invoke_function_over_columns("hll_sketch_agg", col) else: - _lgConfigK = lit(lgConfigK) if isinstance(lgConfigK, int) else lgConfigK - return _invoke_function_over_columns("hll_sketch_agg", col, _lgConfigK) + return _invoke_function_over_columns("hll_sketch_agg", col, lit(lgConfigK)) hll_sketch_agg.__doc__ = pysparkfuncs.hll_sketch_agg.__doc__ -def hll_union_agg(col: "ColumnOrName", allowDifferentLgConfigK: Optional[bool] = None) -> Column: +def hll_union_agg( + col: "ColumnOrName", + allowDifferentLgConfigK: Optional[Union[bool, Column]] = None, +) -> Column: if allowDifferentLgConfigK is None: return _invoke_function_over_columns("hll_union_agg", col) else: - _allowDifferentLgConfigK = ( - lit(allowDifferentLgConfigK) - if isinstance(allowDifferentLgConfigK, bool) - else allowDifferentLgConfigK - ) - return _invoke_function_over_columns("hll_union_agg", col, _allowDifferentLgConfigK) + return _invoke_function_over_columns("hll_union_agg", col, lit(allowDifferentLgConfigK)) hll_union_agg.__doc__ = pysparkfuncs.hll_union_agg.__doc__ @@ -4061,8 +4122,10 @@ def udtf( def call_function(funcName: str, *cols: "ColumnOrName") -> Column: + from pyspark.sql.connect.column import Column as ConnectColumn + expressions = [_to_col(c)._expr for c in cols] - return Column(CallFunction(funcName, expressions)) + return ConnectColumn(CallFunction(funcName, expressions)) # type: ignore[arg-type] call_function.__doc__ = 
pysparkfuncs.call_function.__doc__ diff --git a/python/pyspark/sql/connect/functions/partitioning.py b/python/pyspark/sql/connect/functions/partitioning.py index bfeddad7d5686..5d2dd58313bb6 100644 --- a/python/pyspark/sql/connect/functions/partitioning.py +++ b/python/pyspark/sql/connect/functions/partitioning.py @@ -22,7 +22,7 @@ from pyspark.errors import PySparkTypeError from pyspark.sql import functions as pysparkfuncs -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.functions.builtin import _to_col, _invoke_function_over_columns from pyspark.sql.connect.functions.builtin import lit, _invoke_function diff --git a/python/pyspark/sql/connect/group.py b/python/pyspark/sql/connect/group.py index b866f61efe4ae..85806b1a265b0 100644 --- a/python/pyspark/sql/connect/group.py +++ b/python/pyspark/sql/connect/group.py @@ -34,11 +34,12 @@ from pyspark.util import PythonEvalType from pyspark.sql.group import GroupedData as PySparkGroupedData from pyspark.sql.pandas.group_ops import PandasCogroupedOps as PySparkPandasCogroupedOps +from pyspark.sql.pandas.functions import _validate_pandas_udf # type: ignore[attr-defined] from pyspark.sql.types import NumericType from pyspark.sql.types import StructType import pyspark.sql.connect.plan as plan -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.functions import builtin as F from pyspark.errors import PySparkNotImplementedError, PySparkTypeError @@ -61,10 +62,10 @@ def __init__( self, df: "DataFrame", group_type: str, - grouping_cols: Sequence["Column"], - pivot_col: Optional["Column"] = None, + grouping_cols: Sequence[Column], + pivot_col: Optional[Column] = None, pivot_values: Optional[Sequence["LiteralType"]] = None, - grouping_sets: Optional[Sequence[Sequence["Column"]]] = None, + grouping_sets: Optional[Sequence[Sequence[Column]]] = None, ) -> None: from pyspark.sql.connect.dataframe import DataFrame @@ -293,13 +294,14 @@ def applyInPandas( from pyspark.sql.connect.udf import UserDefinedFunction from pyspark.sql.connect.dataframe import DataFrame + _validate_pandas_udf(func, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) udf_obj = UserDefinedFunction( func, returnType=schema, evalType=PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, ) - return DataFrame( + res = DataFrame( plan.GroupMap( child=self._df._plan, grouping_cols=self._grouping_cols, @@ -308,6 +310,9 @@ def applyInPandas( ), session=self._df._session, ) + if isinstance(schema, StructType): + res._cached_schema = schema + return res applyInPandas.__doc__ = PySparkGroupedData.applyInPandas.__doc__ @@ -322,6 +327,7 @@ def applyInPandasWithState( from pyspark.sql.connect.udf import UserDefinedFunction from pyspark.sql.connect.dataframe import DataFrame + _validate_pandas_udf(func, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE) udf_obj = UserDefinedFunction( func, returnType=outputStructType, @@ -360,13 +366,14 @@ def applyInArrow( from pyspark.sql.connect.udf import UserDefinedFunction from pyspark.sql.connect.dataframe import DataFrame + _validate_pandas_udf(func, PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF) udf_obj = UserDefinedFunction( func, returnType=schema, evalType=PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF, ) - return DataFrame( + res = DataFrame( plan.GroupMap( child=self._df._plan, grouping_cols=self._grouping_cols, @@ -375,6 +382,9 @@ def applyInArrow( ), session=self._df._session, ) + if isinstance(schema, StructType): + res._cached_schema = schema + return res 
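# A short sketch of the applyInPandas change above, not part of the patch: when an
# explicit StructType is passed, the result DataFrame caches it (_cached_schema), so the
# intent appears to be that result.schema can be answered without an extra analyze round
# trip. Assumes a Spark Connect session plus pandas/pyarrow; names below are placeholders.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ["id", "v"])
schema = StructType([StructField("id", LongType()), StructField("mean_v", DoubleType())])

def mean_per_group(pdf: pd.DataFrame) -> pd.DataFrame:
    # One output row per group: the group key and the mean of "v".
    return pd.DataFrame({"id": [pdf["id"].iloc[0]], "mean_v": [pdf["v"].mean()]})

result = df.groupBy("id").applyInPandas(mean_per_group, schema=schema)
print(result.schema)  # with the change above, this can be served from the cached schema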
applyInArrow.__doc__ = PySparkGroupedData.applyInArrow.__doc__ @@ -399,13 +409,14 @@ def applyInPandas( from pyspark.sql.connect.udf import UserDefinedFunction from pyspark.sql.connect.dataframe import DataFrame + _validate_pandas_udf(func, PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF) udf_obj = UserDefinedFunction( func, returnType=schema, evalType=PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, ) - return DataFrame( + res = DataFrame( plan.CoGroupMap( input=self._gd1._df._plan, input_grouping_cols=self._gd1._grouping_cols, @@ -415,6 +426,9 @@ def applyInPandas( ), session=self._gd1._df._session, ) + if isinstance(schema, StructType): + res._cached_schema = schema + return res applyInPandas.__doc__ = PySparkPandasCogroupedOps.applyInPandas.__doc__ @@ -424,13 +438,14 @@ def applyInArrow( from pyspark.sql.connect.udf import UserDefinedFunction from pyspark.sql.connect.dataframe import DataFrame + _validate_pandas_udf(func, PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF) udf_obj = UserDefinedFunction( func, returnType=schema, evalType=PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF, ) - return DataFrame( + res = DataFrame( plan.CoGroupMap( input=self._gd1._df._plan, input_grouping_cols=self._gd1._grouping_cols, @@ -440,6 +455,9 @@ def applyInArrow( ), session=self._gd1._df._session, ) + if isinstance(schema, StructType): + res._cached_schema = schema + return res applyInArrow.__doc__ = PySparkPandasCogroupedOps.applyInArrow.__doc__ diff --git a/python/pyspark/sql/connect/observation.py b/python/pyspark/sql/connect/observation.py index 4fefb8aac41fb..2471cf04cfbe7 100644 --- a/python/pyspark/sql/connect/observation.py +++ b/python/pyspark/sql/connect/observation.py @@ -23,7 +23,7 @@ IllegalArgumentException, PySparkAssertionError, ) -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.observation import Observation as PySparkObservation import pyspark.sql.connect.plan as plan diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 239ee23c2061c..19377515ed28c 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
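# The cogrouped applyInPandas/applyInArrow changes above follow the same pattern. A brief
# usage sketch of the cogroup API they touch, not part of the patch, assuming a Spark
# Connect session and pandas; passing a StructType instead of the DDL string below would
# additionally let the result cache its schema as in the grouped case.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df1 = spark.createDataFrame([(1, 10), (2, 20)], ["id", "x"])
df2 = spark.createDataFrame([(1, 100), (2, 200)], ["id", "y"])

def join_groups(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    # Merge the two per-key groups on "id".
    return pd.merge(left, right, on="id")

joined = (
    df1.groupBy("id")
    .cogroup(df2.groupBy("id"))
    .applyInPandas(join_groups, schema="id long, x long, y long")
)
joined.show()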
# + +# mypy: disable-error-code="operator" + from pyspark.resource import ResourceProfile from pyspark.sql.connect.utils import check_dependencies @@ -37,6 +40,7 @@ import pickle from threading import Lock from inspect import signature, isclass +import warnings import pyarrow as pa @@ -45,20 +49,22 @@ from pyspark.sql.types import DataType import pyspark.sql.connect.proto as proto +from pyspark.sql.column import Column +from pyspark.sql.connect.proto import base_pb2 as spark_dot_connect_dot_base__pb2 from pyspark.sql.connect.conversion import storage_level_to_proto -from pyspark.sql.connect.column import Column from pyspark.sql.connect.expressions import Expression from pyspark.sql.connect.types import pyspark_types_to_proto_types, UnparsedDataType from pyspark.errors import ( + AnalysisException, PySparkValueError, PySparkPicklingError, - IllegalArgumentException, ) if TYPE_CHECKING: from pyspark.sql.connect.client import SparkConnectClient from pyspark.sql.connect.udf import UserDefinedFunction from pyspark.sql.connect.observation import Observation + from pyspark.sql.connect.session import SparkSession class LogicalPlan: @@ -544,14 +550,49 @@ class CachedRemoteRelation(LogicalPlan): """Logical plan object for a DataFrame reference which represents a DataFrame that's been cached on the server with a given id.""" - def __init__(self, relationId: str): + def __init__(self, relation_id: str, spark_session: "SparkSession"): super().__init__(None) - self._relationId = relationId - - def plan(self, session: "SparkConnectClient") -> proto.Relation: - plan = self._create_proto_relation() - plan.cached_remote_relation.relation_id = self._relationId - return plan + self._relation_id = relation_id + # Needs to hold the session to make a request itself. + self._spark_session = spark_session + + def plan(self, session: "SparkConnectClient") -> proto.Relation: + plan = self._create_proto_relation() + plan.cached_remote_relation.relation_id = self._relation_id + return plan + + def __del__(self) -> None: + session = self._spark_session + # If the session is already closed, all cached DataFrames should be released. + if session is not None and not session.client.is_closed and self._relation_id is not None: + try: + command = RemoveRemoteCachedRelation(self).command(session=session.client) + req = session.client._execute_plan_request_with_metadata() + if session.client._user_id: + req.user_context.user_id = session.client._user_id + req.plan.command.CopyFrom(command) + + for attempt in session.client._retrying(): + with attempt: + # !!HACK ALERT!! + # unary_stream does not work at Python exit for unknown reasons. + # Therefore, we open a unary_unary channel here instead. + # See also :class:`SparkConnectServiceStub`.
+ request_serializer = ( + spark_dot_connect_dot_base__pb2.ExecutePlanRequest.SerializeToString + ) + response_deserializer = ( + spark_dot_connect_dot_base__pb2.ExecutePlanResponse.FromString + ) + channel = session.client._channel.unary_unary( + "/spark.connect.SparkConnectService/ExecutePlan", + request_serializer=request_serializer, + response_deserializer=response_deserializer, + ) + metadata = session.client._builder.metadata() + channel(req, metadata=metadata) # type: ignore[arg-type] + except Exception as e: + warnings.warn(f"RemoveRemoteCachedRelation failed with exception: {e}.") class Hint(LogicalPlan): @@ -641,7 +682,7 @@ def __init__( self, child: Optional["LogicalPlan"], all_columns_as_keys: bool = False, - column_names: Optional[List[str]] = None, + column_names: Optional[Sequence[str]] = None, within_watermark: bool = False, ) -> None: super().__init__(child) @@ -714,7 +755,7 @@ def __init__( lower_bound: float, upper_bound: float, with_replacement: bool, - seed: Optional[int], + seed: int, deterministic_order: bool = False, ) -> None: super().__init__(child) @@ -731,8 +772,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.sample.lower_bound = self.lower_bound plan.sample.upper_bound = self.upper_bound plan.sample.with_replacement = self.with_replacement - if self.seed is not None: - plan.sample.seed = self.seed + plan.sample.seed = self.seed plan.sample.deterministic_order = self.deterministic_order return plan @@ -847,7 +887,7 @@ def __init__( elif how == "cross": join_type = proto.Join.JoinType.JOIN_TYPE_CROSS else: - raise IllegalArgumentException( + raise AnalysisException( error_class="UNSUPPORTED_JOIN_TYPE", message_parameters={"join_type": how}, ) @@ -1523,7 +1563,7 @@ def __init__( child: Optional["LogicalPlan"], col: Column, fractions: Sequence[Tuple[Column, float]], - seed: Optional[int], + seed: int, ) -> None: super().__init__(child) @@ -1551,8 +1591,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: fraction.stratum.CopyFrom(k.to_plan(session).literal) fraction.fraction = float(v) plan.sample_by.fractions.append(fraction) - if self._seed is not None: - plan.sample_by.seed = self._seed + plan.sample_by.seed = self._seed return plan @@ -1784,9 +1823,39 @@ def command(self, session: "SparkConnectClient") -> proto.Command: return cmd -# Catalog API (internal-only) +class RemoveRemoteCachedRelation(LogicalPlan): + def __init__(self, relation: CachedRemoteRelation) -> None: + super().__init__(None) + self._relation = relation + + def command(self, session: "SparkConnectClient") -> proto.Command: + plan = self._create_proto_relation() + plan.cached_remote_relation.relation_id = self._relation._relation_id + cmd = proto.Command() + cmd.remove_cached_remote_relation_command.relation.CopyFrom(plan.cached_remote_relation) + return cmd + + +class Checkpoint(LogicalPlan): + def __init__(self, child: Optional["LogicalPlan"], local: bool, eager: bool) -> None: + super().__init__(child) + self._local = local + self._eager = eager + + def command(self, session: "SparkConnectClient") -> proto.Command: + cmd = proto.Command() + assert self._child is not None + cmd.checkpoint_command.CopyFrom( + proto.CheckpointCommand( + relation=self._child.plan(session), + local=self._local, + eager=self._eager, + ) + ) + return cmd +# Catalog API (internal-only) class CurrentDatabase(LogicalPlan): def __init__(self) -> None: super().__init__(None) diff --git a/python/pyspark/sql/connect/proto/base_pb2.py 
b/python/pyspark/sql/connect/proto/base_pb2.py index 2a30ffe60a9f2..5243e55576f8e 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.py +++ b/python/pyspark/sql/connect/proto/base_pb2.py @@ -37,7 +37,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x18spark/connect/base.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1cspark/connect/commands.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x19spark/connect/types.proto"t\n\x04Plan\x12-\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x04root\x12\x32\n\x07\x63ommand\x18\x02 \x01(\x0b\x32\x16.spark.connect.CommandH\x00R\x07\x63ommandB\t\n\x07op_type"z\n\x0bUserContext\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12\x1b\n\tuser_name\x18\x02 \x01(\tR\x08userName\x12\x35\n\nextensions\x18\xe7\x07 \x03(\x0b\x32\x14.google.protobuf.AnyR\nextensions"\xf8\x13\n\x12\x41nalyzePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x11 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12\x42\n\x06schema\x18\x04 \x01(\x0b\x32(.spark.connect.AnalyzePlanRequest.SchemaH\x00R\x06schema\x12\x45\n\x07\x65xplain\x18\x05 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.ExplainH\x00R\x07\x65xplain\x12O\n\x0btree_string\x18\x06 \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.TreeStringH\x00R\ntreeString\x12\x46\n\x08is_local\x18\x07 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.IsLocalH\x00R\x07isLocal\x12R\n\x0cis_streaming\x18\x08 \x01(\x0b\x32-.spark.connect.AnalyzePlanRequest.IsStreamingH\x00R\x0bisStreaming\x12O\n\x0binput_files\x18\t \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.InputFilesH\x00R\ninputFiles\x12U\n\rspark_version\x18\n \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SparkVersionH\x00R\x0csparkVersion\x12I\n\tddl_parse\x18\x0b \x01(\x0b\x32*.spark.connect.AnalyzePlanRequest.DDLParseH\x00R\x08\x64\x64lParse\x12X\n\x0esame_semantics\x18\x0c \x01(\x0b\x32/.spark.connect.AnalyzePlanRequest.SameSemanticsH\x00R\rsameSemantics\x12U\n\rsemantic_hash\x18\r \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SemanticHashH\x00R\x0csemanticHash\x12\x45\n\x07persist\x18\x0e \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.PersistH\x00R\x07persist\x12K\n\tunpersist\x18\x0f \x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.UnpersistH\x00R\tunpersist\x12_\n\x11get_storage_level\x18\x10 \x01(\x0b\x32\x31.spark.connect.AnalyzePlanRequest.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x31\n\x06Schema\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\xbb\x02\n\x07\x45xplain\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12X\n\x0c\x65xplain_mode\x18\x02 \x01(\x0e\x32\x35.spark.connect.AnalyzePlanRequest.Explain.ExplainModeR\x0b\x65xplainMode"\xac\x01\n\x0b\x45xplainMode\x12\x1c\n\x18\x45XPLAIN_MODE_UNSPECIFIED\x10\x00\x12\x17\n\x13\x45XPLAIN_MODE_SIMPLE\x10\x01\x12\x19\n\x15\x45XPLAIN_MODE_EXTENDED\x10\x02\x12\x18\n\x14\x45XPLAIN_MODE_CODEGEN\x10\x03\x12\x15\n\x11\x45XPLAIN_MODE_COST\x10\x04\x12\x1a\n\x16\x45XPLAIN_MODE_FORMATTED\x10\x05\x1aZ\n\nTreeString\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12\x19\n\x05level\x18\x02 \x01(\x05H\x00R\x05level\x88\x01\x01\x42\x08\n\x06_level\x1a\x32\n\x07IsLocal\x12\'\n\x04plan\x18\x01 
\x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x36\n\x0bIsStreaming\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x35\n\nInputFiles\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x0e\n\x0cSparkVersion\x1a)\n\x08\x44\x44LParse\x12\x1d\n\nddl_string\x18\x01 \x01(\tR\tddlString\x1ay\n\rSameSemantics\x12\x34\n\x0btarget_plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\ntargetPlan\x12\x32\n\nother_plan\x18\x02 \x01(\x0b\x32\x13.spark.connect.PlanR\totherPlan\x1a\x37\n\x0cSemanticHash\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x97\x01\n\x07Persist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x45\n\rstorage_level\x18\x02 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level\x1an\n\tUnpersist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x1f\n\x08\x62locking\x18\x02 \x01(\x08H\x00R\x08\x62locking\x88\x01\x01\x42\x0b\n\t_blocking\x1a\x46\n\x0fGetStorageLevel\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relationB\t\n\x07\x61nalyzeB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xce\r\n\x13\x41nalyzePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12\x43\n\x06schema\x18\x02 \x01(\x0b\x32).spark.connect.AnalyzePlanResponse.SchemaH\x00R\x06schema\x12\x46\n\x07\x65xplain\x18\x03 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.ExplainH\x00R\x07\x65xplain\x12P\n\x0btree_string\x18\x04 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.TreeStringH\x00R\ntreeString\x12G\n\x08is_local\x18\x05 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.IsLocalH\x00R\x07isLocal\x12S\n\x0cis_streaming\x18\x06 \x01(\x0b\x32..spark.connect.AnalyzePlanResponse.IsStreamingH\x00R\x0bisStreaming\x12P\n\x0binput_files\x18\x07 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.InputFilesH\x00R\ninputFiles\x12V\n\rspark_version\x18\x08 \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SparkVersionH\x00R\x0csparkVersion\x12J\n\tddl_parse\x18\t \x01(\x0b\x32+.spark.connect.AnalyzePlanResponse.DDLParseH\x00R\x08\x64\x64lParse\x12Y\n\x0esame_semantics\x18\n \x01(\x0b\x32\x30.spark.connect.AnalyzePlanResponse.SameSemanticsH\x00R\rsameSemantics\x12V\n\rsemantic_hash\x18\x0b \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SemanticHashH\x00R\x0csemanticHash\x12\x46\n\x07persist\x18\x0c \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.PersistH\x00R\x07persist\x12L\n\tunpersist\x18\r \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.UnpersistH\x00R\tunpersist\x12`\n\x11get_storage_level\x18\x0e \x01(\x0b\x32\x32.spark.connect.AnalyzePlanResponse.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x39\n\x06Schema\x12/\n\x06schema\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1a\x30\n\x07\x45xplain\x12%\n\x0e\x65xplain_string\x18\x01 \x01(\tR\rexplainString\x1a-\n\nTreeString\x12\x1f\n\x0btree_string\x18\x01 \x01(\tR\ntreeString\x1a$\n\x07IsLocal\x12\x19\n\x08is_local\x18\x01 \x01(\x08R\x07isLocal\x1a\x30\n\x0bIsStreaming\x12!\n\x0cis_streaming\x18\x01 \x01(\x08R\x0bisStreaming\x1a"\n\nInputFiles\x12\x14\n\x05\x66iles\x18\x01 \x03(\tR\x05\x66iles\x1a(\n\x0cSparkVersion\x12\x18\n\x07version\x18\x01 \x01(\tR\x07version\x1a;\n\x08\x44\x44LParse\x12/\n\x06parsed\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06parsed\x1a\'\n\rSameSemantics\x12\x16\n\x06result\x18\x01 
\x01(\x08R\x06result\x1a&\n\x0cSemanticHash\x12\x16\n\x06result\x18\x01 \x01(\x05R\x06result\x1a\t\n\x07Persist\x1a\x0b\n\tUnpersist\x1aS\n\x0fGetStorageLevel\x12@\n\rstorage_level\x18\x01 \x01(\x0b\x32\x1b.spark.connect.StorageLevelR\x0cstorageLevelB\x08\n\x06result"\xa3\x05\n\x12\x45xecutePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12&\n\x0coperation_id\x18\x06 \x01(\tH\x01R\x0boperationId\x88\x01\x01\x12\'\n\x04plan\x18\x03 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12X\n\x0frequest_options\x18\x05 \x03(\x0b\x32/.spark.connect.ExecutePlanRequest.RequestOptionR\x0erequestOptions\x12\x12\n\x04tags\x18\x07 \x03(\tR\x04tags\x1a\xa5\x01\n\rRequestOption\x12K\n\x10reattach_options\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ReattachOptionsH\x00R\x0freattachOptions\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x10\n\x0erequest_optionB)\n\'_client_observed_server_side_session_idB\x0f\n\r_operation_idB\x0e\n\x0c_client_type"\xe6\x15\n\x13\x45xecutePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12!\n\x0coperation_id\x18\x0c \x01(\tR\x0boperationId\x12\x1f\n\x0bresponse_id\x18\r \x01(\tR\nresponseId\x12P\n\x0b\x61rrow_batch\x18\x02 \x01(\x0b\x32-.spark.connect.ExecutePlanResponse.ArrowBatchH\x00R\narrowBatch\x12\x63\n\x12sql_command_result\x18\x05 \x01(\x0b\x32\x33.spark.connect.ExecutePlanResponse.SqlCommandResultH\x00R\x10sqlCommandResult\x12~\n#write_stream_operation_start_result\x18\x08 \x01(\x0b\x32..spark.connect.WriteStreamOperationStartResultH\x00R\x1fwriteStreamOperationStartResult\x12q\n\x1estreaming_query_command_result\x18\t \x01(\x0b\x32*.spark.connect.StreamingQueryCommandResultH\x00R\x1bstreamingQueryCommandResult\x12k\n\x1cget_resources_command_result\x18\n \x01(\x0b\x32(.spark.connect.GetResourcesCommandResultH\x00R\x19getResourcesCommandResult\x12\x87\x01\n&streaming_query_manager_command_result\x18\x0b \x01(\x0b\x32\x31.spark.connect.StreamingQueryManagerCommandResultH\x00R"streamingQueryManagerCommandResult\x12\x87\x01\n&streaming_query_listener_events_result\x18\x10 \x01(\x0b\x32\x31.spark.connect.StreamingQueryListenerEventsResultH\x00R"streamingQueryListenerEventsResult\x12\\\n\x0fresult_complete\x18\x0e \x01(\x0b\x32\x31.spark.connect.ExecutePlanResponse.ResultCompleteH\x00R\x0eresultComplete\x12\x87\x01\n&create_resource_profile_command_result\x18\x11 \x01(\x0b\x32\x31.spark.connect.CreateResourceProfileCommandResultH\x00R"createResourceProfileCommandResult\x12\x65\n\x12\x65xecution_progress\x18\x12 \x01(\x0b\x32\x34.spark.connect.ExecutePlanResponse.ExecutionProgressH\x00R\x11\x65xecutionProgress\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x44\n\x07metrics\x18\x04 \x01(\x0b\x32*.spark.connect.ExecutePlanResponse.MetricsR\x07metrics\x12]\n\x10observed_metrics\x18\x06 \x03(\x0b\x32\x32.spark.connect.ExecutePlanResponse.ObservedMetricsR\x0fobservedMetrics\x12/\n\x06schema\x18\x07 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1aG\n\x10SqlCommandResult\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1av\n\nArrowBatch\x12\x1b\n\trow_count\x18\x01 
\x01(\x03R\x08rowCount\x12\x12\n\x04\x64\x61ta\x18\x02 \x01(\x0cR\x04\x64\x61ta\x12&\n\x0cstart_offset\x18\x03 \x01(\x03H\x00R\x0bstartOffset\x88\x01\x01\x42\x0f\n\r_start_offset\x1a\x85\x04\n\x07Metrics\x12Q\n\x07metrics\x18\x01 \x03(\x0b\x32\x37.spark.connect.ExecutePlanResponse.Metrics.MetricObjectR\x07metrics\x1a\xcc\x02\n\x0cMetricObject\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x17\n\x07plan_id\x18\x02 \x01(\x03R\x06planId\x12\x16\n\x06parent\x18\x03 \x01(\x03R\x06parent\x12z\n\x11\x65xecution_metrics\x18\x04 \x03(\x0b\x32M.spark.connect.ExecutePlanResponse.Metrics.MetricObject.ExecutionMetricsEntryR\x10\x65xecutionMetrics\x1a{\n\x15\x45xecutionMetricsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ExecutePlanResponse.Metrics.MetricValueR\x05value:\x02\x38\x01\x1aX\n\x0bMetricValue\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n\x05value\x18\x02 \x01(\x03R\x05value\x12\x1f\n\x0bmetric_type\x18\x03 \x01(\tR\nmetricType\x1at\n\x0fObservedMetrics\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x12\x12\n\x04keys\x18\x03 \x03(\tR\x04keys\x1a\x10\n\x0eResultComplete\x1a\xcd\x02\n\x11\x45xecutionProgress\x12V\n\x06stages\x18\x01 \x03(\x0b\x32>.spark.connect.ExecutePlanResponse.ExecutionProgress.StageInfoR\x06stages\x12,\n\x12num_inflight_tasks\x18\x02 \x01(\x03R\x10numInflightTasks\x1a\xb1\x01\n\tStageInfo\x12\x19\n\x08stage_id\x18\x01 \x01(\x03R\x07stageId\x12\x1b\n\tnum_tasks\x18\x02 \x01(\x03R\x08numTasks\x12.\n\x13num_completed_tasks\x18\x03 \x01(\x03R\x11numCompletedTasks\x12(\n\x10input_bytes_read\x18\x04 \x01(\x03R\x0einputBytesRead\x12\x12\n\x04\x64one\x18\x05 \x01(\x08R\x04\x64oneB\x0f\n\rresponse_type"A\n\x08KeyValue\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x19\n\x05value\x18\x02 \x01(\tH\x00R\x05value\x88\x01\x01\x42\x08\n\x06_value"\x87\t\n\rConfigRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x44\n\toperation\x18\x03 \x01(\x0b\x32&.spark.connect.ConfigRequest.OperationR\toperation\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x1a\xf2\x03\n\tOperation\x12\x34\n\x03set\x18\x01 \x01(\x0b\x32 .spark.connect.ConfigRequest.SetH\x00R\x03set\x12\x34\n\x03get\x18\x02 \x01(\x0b\x32 .spark.connect.ConfigRequest.GetH\x00R\x03get\x12W\n\x10get_with_default\x18\x03 \x01(\x0b\x32+.spark.connect.ConfigRequest.GetWithDefaultH\x00R\x0egetWithDefault\x12G\n\nget_option\x18\x04 \x01(\x0b\x32&.spark.connect.ConfigRequest.GetOptionH\x00R\tgetOption\x12>\n\x07get_all\x18\x05 \x01(\x0b\x32#.spark.connect.ConfigRequest.GetAllH\x00R\x06getAll\x12:\n\x05unset\x18\x06 \x01(\x0b\x32".spark.connect.ConfigRequest.UnsetH\x00R\x05unset\x12P\n\ris_modifiable\x18\x07 \x01(\x0b\x32).spark.connect.ConfigRequest.IsModifiableH\x00R\x0cisModifiableB\t\n\x07op_type\x1a\x34\n\x03Set\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x19\n\x03Get\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a?\n\x0eGetWithDefault\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x1f\n\tGetOption\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a\x30\n\x06GetAll\x12\x1b\n\x06prefix\x18\x01 \x01(\tH\x00R\x06prefix\x88\x01\x01\x42\t\n\x07_prefix\x1a\x1b\n\x05Unset\x12\x12\n\x04keys\x18\x01 
\x03(\tR\x04keys\x1a"\n\x0cIsModifiable\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keysB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xaf\x01\n\x0e\x43onfigResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x04 \x01(\tR\x13serverSideSessionId\x12-\n\x05pairs\x18\x02 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1a\n\x08warnings\x18\x03 \x03(\tR\x08warnings"\xea\x07\n\x13\x41\x64\x64\x41rtifactsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12$\n\x0b\x63lient_type\x18\x06 \x01(\tH\x02R\nclientType\x88\x01\x01\x12@\n\x05\x62\x61tch\x18\x03 \x01(\x0b\x32(.spark.connect.AddArtifactsRequest.BatchH\x00R\x05\x62\x61tch\x12Z\n\x0b\x62\x65gin_chunk\x18\x04 \x01(\x0b\x32\x37.spark.connect.AddArtifactsRequest.BeginChunkedArtifactH\x00R\nbeginChunk\x12H\n\x05\x63hunk\x18\x05 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkH\x00R\x05\x63hunk\x1a\x35\n\rArtifactChunk\x12\x12\n\x04\x64\x61ta\x18\x01 \x01(\x0cR\x04\x64\x61ta\x12\x10\n\x03\x63rc\x18\x02 \x01(\x03R\x03\x63rc\x1ao\n\x13SingleChunkArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x44\n\x04\x64\x61ta\x18\x02 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x04\x64\x61ta\x1a]\n\x05\x42\x61tch\x12T\n\tartifacts\x18\x01 \x03(\x0b\x32\x36.spark.connect.AddArtifactsRequest.SingleChunkArtifactR\tartifacts\x1a\xc1\x01\n\x14\x42\x65ginChunkedArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1f\n\x0btotal_bytes\x18\x02 \x01(\x03R\ntotalBytes\x12\x1d\n\nnum_chunks\x18\x03 \x01(\x03R\tnumChunks\x12U\n\rinitial_chunk\x18\x04 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x0cinitialChunkB\t\n\x07payloadB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x02\n\x14\x41\x64\x64\x41rtifactsResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\tartifacts\x18\x01 \x03(\x0b\x32\x33.spark.connect.AddArtifactsResponse.ArtifactSummaryR\tartifacts\x1aQ\n\x0f\x41rtifactSummary\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\x11is_crc_successful\x18\x02 \x01(\x08R\x0fisCrcSuccessful"\xc6\x02\n\x17\x41rtifactStatusesRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x01R\nclientType\x88\x01\x01\x12\x14\n\x05names\x18\x04 \x03(\tR\x05namesB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xe0\x02\n\x18\x41rtifactStatusesResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\x08statuses\x18\x01 \x03(\x0b\x32\x35.spark.connect.ArtifactStatusesResponse.StatusesEntryR\x08statuses\x1as\n\rStatusesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ArtifactStatusesResponse.ArtifactStatusR\x05value:\x02\x38\x01\x1a(\n\x0e\x41rtifactStatus\x12\x16\n\x06\x65xists\x18\x01 \x01(\x08R\x06\x65xists"\xdb\x04\n\x10InterruptRequest\x12\x1d\n\nsession_id\x18\x01 
\x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12T\n\x0einterrupt_type\x18\x04 \x01(\x0e\x32-.spark.connect.InterruptRequest.InterruptTypeR\rinterruptType\x12%\n\roperation_tag\x18\x05 \x01(\tH\x00R\x0coperationTag\x12#\n\x0coperation_id\x18\x06 \x01(\tH\x00R\x0boperationId"\x80\x01\n\rInterruptType\x12\x1e\n\x1aINTERRUPT_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12INTERRUPT_TYPE_ALL\x10\x01\x12\x16\n\x12INTERRUPT_TYPE_TAG\x10\x02\x12\x1f\n\x1bINTERRUPT_TYPE_OPERATION_ID\x10\x03\x42\x0b\n\tinterruptB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x01\n\x11InterruptResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\'\n\x0finterrupted_ids\x18\x02 \x03(\tR\x0einterruptedIds"5\n\x0fReattachOptions\x12"\n\x0creattachable\x18\x01 \x01(\x08R\x0creattachable"\x96\x03\n\x16ReattachExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x06 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x12-\n\x10last_response_id\x18\x05 \x01(\tH\x02R\x0elastResponseId\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_typeB\x13\n\x11_last_response_id"\xc9\x04\n\x15ReleaseExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12R\n\x0brelease_all\x18\x05 \x01(\x0b\x32/.spark.connect.ReleaseExecuteRequest.ReleaseAllH\x00R\nreleaseAll\x12X\n\rrelease_until\x18\x06 \x01(\x0b\x32\x31.spark.connect.ReleaseExecuteRequest.ReleaseUntilH\x00R\x0creleaseUntil\x1a\x0c\n\nReleaseAll\x1a/\n\x0cReleaseUntil\x12\x1f\n\x0bresponse_id\x18\x01 \x01(\tR\nresponseIdB\t\n\x07releaseB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xa5\x01\n\x16ReleaseExecuteResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12&\n\x0coperation_id\x18\x02 \x01(\tH\x00R\x0boperationId\x88\x01\x01\x42\x0f\n\r_operation_id"\xab\x01\n\x15ReleaseSessionRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x00R\nclientType\x88\x01\x01\x42\x0e\n\x0c_client_type"l\n\x16ReleaseSessionResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x02 \x01(\tR\x13serverSideSessionId"\xcc\x02\n\x18\x46\x65tchErrorDetailsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 
\x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x19\n\x08\x65rror_id\x18\x03 \x01(\tR\x07\x65rrorId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x93\x0c\n\x19\x46\x65tchErrorDetailsResponse\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\x1d\n\nsession_id\x18\x04 \x01(\tR\tsessionId\x12)\n\x0eroot_error_idx\x18\x01 \x01(\x05H\x00R\x0crootErrorIdx\x88\x01\x01\x12\x46\n\x06\x65rrors\x18\x02 \x03(\x0b\x32..spark.connect.FetchErrorDetailsResponse.ErrorR\x06\x65rrors\x1a\xae\x01\n\x11StackTraceElement\x12\'\n\x0f\x64\x65\x63laring_class\x18\x01 \x01(\tR\x0e\x64\x65\x63laringClass\x12\x1f\n\x0bmethod_name\x18\x02 \x01(\tR\nmethodName\x12 \n\tfile_name\x18\x03 \x01(\tH\x00R\x08\x66ileName\x88\x01\x01\x12\x1f\n\x0bline_number\x18\x04 \x01(\x05R\nlineNumberB\x0c\n\n_file_name\x1a\xf0\x02\n\x0cQueryContext\x12\x64\n\x0c\x63ontext_type\x18\n \x01(\x0e\x32\x41.spark.connect.FetchErrorDetailsResponse.QueryContext.ContextTypeR\x0b\x63ontextType\x12\x1f\n\x0bobject_type\x18\x01 \x01(\tR\nobjectType\x12\x1f\n\x0bobject_name\x18\x02 \x01(\tR\nobjectName\x12\x1f\n\x0bstart_index\x18\x03 \x01(\x05R\nstartIndex\x12\x1d\n\nstop_index\x18\x04 \x01(\x05R\tstopIndex\x12\x1a\n\x08\x66ragment\x18\x05 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x06 \x01(\tR\x08\x63\x61llSite\x12\x18\n\x07summary\x18\x07 \x01(\tR\x07summary"%\n\x0b\x43ontextType\x12\x07\n\x03SQL\x10\x00\x12\r\n\tDATAFRAME\x10\x01\x1a\x99\x03\n\x0eSparkThrowable\x12$\n\x0b\x65rror_class\x18\x01 \x01(\tH\x00R\nerrorClass\x88\x01\x01\x12}\n\x12message_parameters\x18\x02 \x03(\x0b\x32N.spark.connect.FetchErrorDetailsResponse.SparkThrowable.MessageParametersEntryR\x11messageParameters\x12\\\n\x0equery_contexts\x18\x03 \x03(\x0b\x32\x35.spark.connect.FetchErrorDetailsResponse.QueryContextR\rqueryContexts\x12 \n\tsql_state\x18\x04 \x01(\tH\x01R\x08sqlState\x88\x01\x01\x1a\x44\n\x16MessageParametersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0e\n\x0c_error_classB\x0c\n\n_sql_state\x1a\xdb\x02\n\x05\x45rror\x12\x30\n\x14\x65rror_type_hierarchy\x18\x01 \x03(\tR\x12\x65rrorTypeHierarchy\x12\x18\n\x07message\x18\x02 \x01(\tR\x07message\x12[\n\x0bstack_trace\x18\x03 \x03(\x0b\x32:.spark.connect.FetchErrorDetailsResponse.StackTraceElementR\nstackTrace\x12 \n\tcause_idx\x18\x04 \x01(\x05H\x00R\x08\x63\x61useIdx\x88\x01\x01\x12\x65\n\x0fspark_throwable\x18\x05 \x01(\x0b\x32\x37.spark.connect.FetchErrorDetailsResponse.SparkThrowableH\x01R\x0esparkThrowable\x88\x01\x01\x42\x0c\n\n_cause_idxB\x12\n\x10_spark_throwableB\x11\n\x0f_root_error_idx2\xb2\x07\n\x13SparkConnectService\x12X\n\x0b\x45xecutePlan\x12!.spark.connect.ExecutePlanRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12V\n\x0b\x41nalyzePlan\x12!.spark.connect.AnalyzePlanRequest\x1a".spark.connect.AnalyzePlanResponse"\x00\x12G\n\x06\x43onfig\x12\x1c.spark.connect.ConfigRequest\x1a\x1d.spark.connect.ConfigResponse"\x00\x12[\n\x0c\x41\x64\x64\x41rtifacts\x12".spark.connect.AddArtifactsRequest\x1a#.spark.connect.AddArtifactsResponse"\x00(\x01\x12\x63\n\x0e\x41rtifactStatus\x12&.spark.connect.ArtifactStatusesRequest\x1a\'.spark.connect.ArtifactStatusesResponse"\x00\x12P\n\tInterrupt\x12\x1f.spark.connect.InterruptRequest\x1a 
.spark.connect.InterruptResponse"\x00\x12`\n\x0fReattachExecute\x12%.spark.connect.ReattachExecuteRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12_\n\x0eReleaseExecute\x12$.spark.connect.ReleaseExecuteRequest\x1a%.spark.connect.ReleaseExecuteResponse"\x00\x12_\n\x0eReleaseSession\x12$.spark.connect.ReleaseSessionRequest\x1a%.spark.connect.ReleaseSessionResponse"\x00\x12h\n\x11\x46\x65tchErrorDetails\x12\'.spark.connect.FetchErrorDetailsRequest\x1a(.spark.connect.FetchErrorDetailsResponse"\x00\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x18spark/connect/base.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1cspark/connect/commands.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x19spark/connect/types.proto"t\n\x04Plan\x12-\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x04root\x12\x32\n\x07\x63ommand\x18\x02 \x01(\x0b\x32\x16.spark.connect.CommandH\x00R\x07\x63ommandB\t\n\x07op_type"z\n\x0bUserContext\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12\x1b\n\tuser_name\x18\x02 \x01(\tR\x08userName\x12\x35\n\nextensions\x18\xe7\x07 \x03(\x0b\x32\x14.google.protobuf.AnyR\nextensions"\xf8\x13\n\x12\x41nalyzePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x11 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12\x42\n\x06schema\x18\x04 \x01(\x0b\x32(.spark.connect.AnalyzePlanRequest.SchemaH\x00R\x06schema\x12\x45\n\x07\x65xplain\x18\x05 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.ExplainH\x00R\x07\x65xplain\x12O\n\x0btree_string\x18\x06 \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.TreeStringH\x00R\ntreeString\x12\x46\n\x08is_local\x18\x07 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.IsLocalH\x00R\x07isLocal\x12R\n\x0cis_streaming\x18\x08 \x01(\x0b\x32-.spark.connect.AnalyzePlanRequest.IsStreamingH\x00R\x0bisStreaming\x12O\n\x0binput_files\x18\t \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.InputFilesH\x00R\ninputFiles\x12U\n\rspark_version\x18\n \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SparkVersionH\x00R\x0csparkVersion\x12I\n\tddl_parse\x18\x0b \x01(\x0b\x32*.spark.connect.AnalyzePlanRequest.DDLParseH\x00R\x08\x64\x64lParse\x12X\n\x0esame_semantics\x18\x0c \x01(\x0b\x32/.spark.connect.AnalyzePlanRequest.SameSemanticsH\x00R\rsameSemantics\x12U\n\rsemantic_hash\x18\r \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SemanticHashH\x00R\x0csemanticHash\x12\x45\n\x07persist\x18\x0e \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.PersistH\x00R\x07persist\x12K\n\tunpersist\x18\x0f \x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.UnpersistH\x00R\tunpersist\x12_\n\x11get_storage_level\x18\x10 \x01(\x0b\x32\x31.spark.connect.AnalyzePlanRequest.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x31\n\x06Schema\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\xbb\x02\n\x07\x45xplain\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12X\n\x0c\x65xplain_mode\x18\x02 
\x01(\x0e\x32\x35.spark.connect.AnalyzePlanRequest.Explain.ExplainModeR\x0b\x65xplainMode"\xac\x01\n\x0b\x45xplainMode\x12\x1c\n\x18\x45XPLAIN_MODE_UNSPECIFIED\x10\x00\x12\x17\n\x13\x45XPLAIN_MODE_SIMPLE\x10\x01\x12\x19\n\x15\x45XPLAIN_MODE_EXTENDED\x10\x02\x12\x18\n\x14\x45XPLAIN_MODE_CODEGEN\x10\x03\x12\x15\n\x11\x45XPLAIN_MODE_COST\x10\x04\x12\x1a\n\x16\x45XPLAIN_MODE_FORMATTED\x10\x05\x1aZ\n\nTreeString\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12\x19\n\x05level\x18\x02 \x01(\x05H\x00R\x05level\x88\x01\x01\x42\x08\n\x06_level\x1a\x32\n\x07IsLocal\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x36\n\x0bIsStreaming\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x35\n\nInputFiles\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x0e\n\x0cSparkVersion\x1a)\n\x08\x44\x44LParse\x12\x1d\n\nddl_string\x18\x01 \x01(\tR\tddlString\x1ay\n\rSameSemantics\x12\x34\n\x0btarget_plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\ntargetPlan\x12\x32\n\nother_plan\x18\x02 \x01(\x0b\x32\x13.spark.connect.PlanR\totherPlan\x1a\x37\n\x0cSemanticHash\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x97\x01\n\x07Persist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x45\n\rstorage_level\x18\x02 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level\x1an\n\tUnpersist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x1f\n\x08\x62locking\x18\x02 \x01(\x08H\x00R\x08\x62locking\x88\x01\x01\x42\x0b\n\t_blocking\x1a\x46\n\x0fGetStorageLevel\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relationB\t\n\x07\x61nalyzeB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xce\r\n\x13\x41nalyzePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12\x43\n\x06schema\x18\x02 \x01(\x0b\x32).spark.connect.AnalyzePlanResponse.SchemaH\x00R\x06schema\x12\x46\n\x07\x65xplain\x18\x03 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.ExplainH\x00R\x07\x65xplain\x12P\n\x0btree_string\x18\x04 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.TreeStringH\x00R\ntreeString\x12G\n\x08is_local\x18\x05 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.IsLocalH\x00R\x07isLocal\x12S\n\x0cis_streaming\x18\x06 \x01(\x0b\x32..spark.connect.AnalyzePlanResponse.IsStreamingH\x00R\x0bisStreaming\x12P\n\x0binput_files\x18\x07 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.InputFilesH\x00R\ninputFiles\x12V\n\rspark_version\x18\x08 \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SparkVersionH\x00R\x0csparkVersion\x12J\n\tddl_parse\x18\t \x01(\x0b\x32+.spark.connect.AnalyzePlanResponse.DDLParseH\x00R\x08\x64\x64lParse\x12Y\n\x0esame_semantics\x18\n \x01(\x0b\x32\x30.spark.connect.AnalyzePlanResponse.SameSemanticsH\x00R\rsameSemantics\x12V\n\rsemantic_hash\x18\x0b \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SemanticHashH\x00R\x0csemanticHash\x12\x46\n\x07persist\x18\x0c \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.PersistH\x00R\x07persist\x12L\n\tunpersist\x18\r \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.UnpersistH\x00R\tunpersist\x12`\n\x11get_storage_level\x18\x0e \x01(\x0b\x32\x32.spark.connect.AnalyzePlanResponse.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x39\n\x06Schema\x12/\n\x06schema\x18\x01 
\x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1a\x30\n\x07\x45xplain\x12%\n\x0e\x65xplain_string\x18\x01 \x01(\tR\rexplainString\x1a-\n\nTreeString\x12\x1f\n\x0btree_string\x18\x01 \x01(\tR\ntreeString\x1a$\n\x07IsLocal\x12\x19\n\x08is_local\x18\x01 \x01(\x08R\x07isLocal\x1a\x30\n\x0bIsStreaming\x12!\n\x0cis_streaming\x18\x01 \x01(\x08R\x0bisStreaming\x1a"\n\nInputFiles\x12\x14\n\x05\x66iles\x18\x01 \x03(\tR\x05\x66iles\x1a(\n\x0cSparkVersion\x12\x18\n\x07version\x18\x01 \x01(\tR\x07version\x1a;\n\x08\x44\x44LParse\x12/\n\x06parsed\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06parsed\x1a\'\n\rSameSemantics\x12\x16\n\x06result\x18\x01 \x01(\x08R\x06result\x1a&\n\x0cSemanticHash\x12\x16\n\x06result\x18\x01 \x01(\x05R\x06result\x1a\t\n\x07Persist\x1a\x0b\n\tUnpersist\x1aS\n\x0fGetStorageLevel\x12@\n\rstorage_level\x18\x01 \x01(\x0b\x32\x1b.spark.connect.StorageLevelR\x0cstorageLevelB\x08\n\x06result"\xa3\x05\n\x12\x45xecutePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12&\n\x0coperation_id\x18\x06 \x01(\tH\x01R\x0boperationId\x88\x01\x01\x12\'\n\x04plan\x18\x03 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12X\n\x0frequest_options\x18\x05 \x03(\x0b\x32/.spark.connect.ExecutePlanRequest.RequestOptionR\x0erequestOptions\x12\x12\n\x04tags\x18\x07 \x03(\tR\x04tags\x1a\xa5\x01\n\rRequestOption\x12K\n\x10reattach_options\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ReattachOptionsH\x00R\x0freattachOptions\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x10\n\x0erequest_optionB)\n\'_client_observed_server_side_session_idB\x0f\n\r_operation_idB\x0e\n\x0c_client_type"\xe6\x16\n\x13\x45xecutePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12!\n\x0coperation_id\x18\x0c \x01(\tR\x0boperationId\x12\x1f\n\x0bresponse_id\x18\r \x01(\tR\nresponseId\x12P\n\x0b\x61rrow_batch\x18\x02 \x01(\x0b\x32-.spark.connect.ExecutePlanResponse.ArrowBatchH\x00R\narrowBatch\x12\x63\n\x12sql_command_result\x18\x05 \x01(\x0b\x32\x33.spark.connect.ExecutePlanResponse.SqlCommandResultH\x00R\x10sqlCommandResult\x12~\n#write_stream_operation_start_result\x18\x08 \x01(\x0b\x32..spark.connect.WriteStreamOperationStartResultH\x00R\x1fwriteStreamOperationStartResult\x12q\n\x1estreaming_query_command_result\x18\t \x01(\x0b\x32*.spark.connect.StreamingQueryCommandResultH\x00R\x1bstreamingQueryCommandResult\x12k\n\x1cget_resources_command_result\x18\n \x01(\x0b\x32(.spark.connect.GetResourcesCommandResultH\x00R\x19getResourcesCommandResult\x12\x87\x01\n&streaming_query_manager_command_result\x18\x0b \x01(\x0b\x32\x31.spark.connect.StreamingQueryManagerCommandResultH\x00R"streamingQueryManagerCommandResult\x12\x87\x01\n&streaming_query_listener_events_result\x18\x10 \x01(\x0b\x32\x31.spark.connect.StreamingQueryListenerEventsResultH\x00R"streamingQueryListenerEventsResult\x12\\\n\x0fresult_complete\x18\x0e \x01(\x0b\x32\x31.spark.connect.ExecutePlanResponse.ResultCompleteH\x00R\x0eresultComplete\x12\x87\x01\n&create_resource_profile_command_result\x18\x11 \x01(\x0b\x32\x31.spark.connect.CreateResourceProfileCommandResultH\x00R"createResourceProfileCommandResult\x12\x65\n\x12\x65xecution_progress\x18\x12 
\x01(\x0b\x32\x34.spark.connect.ExecutePlanResponse.ExecutionProgressH\x00R\x11\x65xecutionProgress\x12\x64\n\x19\x63heckpoint_command_result\x18\x13 \x01(\x0b\x32&.spark.connect.CheckpointCommandResultH\x00R\x17\x63heckpointCommandResult\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x44\n\x07metrics\x18\x04 \x01(\x0b\x32*.spark.connect.ExecutePlanResponse.MetricsR\x07metrics\x12]\n\x10observed_metrics\x18\x06 \x03(\x0b\x32\x32.spark.connect.ExecutePlanResponse.ObservedMetricsR\x0fobservedMetrics\x12/\n\x06schema\x18\x07 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1aG\n\x10SqlCommandResult\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1av\n\nArrowBatch\x12\x1b\n\trow_count\x18\x01 \x01(\x03R\x08rowCount\x12\x12\n\x04\x64\x61ta\x18\x02 \x01(\x0cR\x04\x64\x61ta\x12&\n\x0cstart_offset\x18\x03 \x01(\x03H\x00R\x0bstartOffset\x88\x01\x01\x42\x0f\n\r_start_offset\x1a\x85\x04\n\x07Metrics\x12Q\n\x07metrics\x18\x01 \x03(\x0b\x32\x37.spark.connect.ExecutePlanResponse.Metrics.MetricObjectR\x07metrics\x1a\xcc\x02\n\x0cMetricObject\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x17\n\x07plan_id\x18\x02 \x01(\x03R\x06planId\x12\x16\n\x06parent\x18\x03 \x01(\x03R\x06parent\x12z\n\x11\x65xecution_metrics\x18\x04 \x03(\x0b\x32M.spark.connect.ExecutePlanResponse.Metrics.MetricObject.ExecutionMetricsEntryR\x10\x65xecutionMetrics\x1a{\n\x15\x45xecutionMetricsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ExecutePlanResponse.Metrics.MetricValueR\x05value:\x02\x38\x01\x1aX\n\x0bMetricValue\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n\x05value\x18\x02 \x01(\x03R\x05value\x12\x1f\n\x0bmetric_type\x18\x03 \x01(\tR\nmetricType\x1a\x8d\x01\n\x0fObservedMetrics\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x12\x12\n\x04keys\x18\x03 \x03(\tR\x04keys\x12\x17\n\x07plan_id\x18\x04 \x01(\x03R\x06planId\x1a\x10\n\x0eResultComplete\x1a\xcd\x02\n\x11\x45xecutionProgress\x12V\n\x06stages\x18\x01 \x03(\x0b\x32>.spark.connect.ExecutePlanResponse.ExecutionProgress.StageInfoR\x06stages\x12,\n\x12num_inflight_tasks\x18\x02 \x01(\x03R\x10numInflightTasks\x1a\xb1\x01\n\tStageInfo\x12\x19\n\x08stage_id\x18\x01 \x01(\x03R\x07stageId\x12\x1b\n\tnum_tasks\x18\x02 \x01(\x03R\x08numTasks\x12.\n\x13num_completed_tasks\x18\x03 \x01(\x03R\x11numCompletedTasks\x12(\n\x10input_bytes_read\x18\x04 \x01(\x03R\x0einputBytesRead\x12\x12\n\x04\x64one\x18\x05 \x01(\x08R\x04\x64oneB\x0f\n\rresponse_type"A\n\x08KeyValue\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x19\n\x05value\x18\x02 \x01(\tH\x00R\x05value\x88\x01\x01\x42\x08\n\x06_value"\x87\t\n\rConfigRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x44\n\toperation\x18\x03 \x01(\x0b\x32&.spark.connect.ConfigRequest.OperationR\toperation\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x1a\xf2\x03\n\tOperation\x12\x34\n\x03set\x18\x01 \x01(\x0b\x32 .spark.connect.ConfigRequest.SetH\x00R\x03set\x12\x34\n\x03get\x18\x02 \x01(\x0b\x32 .spark.connect.ConfigRequest.GetH\x00R\x03get\x12W\n\x10get_with_default\x18\x03 \x01(\x0b\x32+.spark.connect.ConfigRequest.GetWithDefaultH\x00R\x0egetWithDefault\x12G\n\nget_option\x18\x04 
\x01(\x0b\x32&.spark.connect.ConfigRequest.GetOptionH\x00R\tgetOption\x12>\n\x07get_all\x18\x05 \x01(\x0b\x32#.spark.connect.ConfigRequest.GetAllH\x00R\x06getAll\x12:\n\x05unset\x18\x06 \x01(\x0b\x32".spark.connect.ConfigRequest.UnsetH\x00R\x05unset\x12P\n\ris_modifiable\x18\x07 \x01(\x0b\x32).spark.connect.ConfigRequest.IsModifiableH\x00R\x0cisModifiableB\t\n\x07op_type\x1a\x34\n\x03Set\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x19\n\x03Get\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a?\n\x0eGetWithDefault\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x1f\n\tGetOption\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a\x30\n\x06GetAll\x12\x1b\n\x06prefix\x18\x01 \x01(\tH\x00R\x06prefix\x88\x01\x01\x42\t\n\x07_prefix\x1a\x1b\n\x05Unset\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a"\n\x0cIsModifiable\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keysB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xaf\x01\n\x0e\x43onfigResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x04 \x01(\tR\x13serverSideSessionId\x12-\n\x05pairs\x18\x02 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1a\n\x08warnings\x18\x03 \x03(\tR\x08warnings"\xea\x07\n\x13\x41\x64\x64\x41rtifactsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12$\n\x0b\x63lient_type\x18\x06 \x01(\tH\x02R\nclientType\x88\x01\x01\x12@\n\x05\x62\x61tch\x18\x03 \x01(\x0b\x32(.spark.connect.AddArtifactsRequest.BatchH\x00R\x05\x62\x61tch\x12Z\n\x0b\x62\x65gin_chunk\x18\x04 \x01(\x0b\x32\x37.spark.connect.AddArtifactsRequest.BeginChunkedArtifactH\x00R\nbeginChunk\x12H\n\x05\x63hunk\x18\x05 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkH\x00R\x05\x63hunk\x1a\x35\n\rArtifactChunk\x12\x12\n\x04\x64\x61ta\x18\x01 \x01(\x0cR\x04\x64\x61ta\x12\x10\n\x03\x63rc\x18\x02 \x01(\x03R\x03\x63rc\x1ao\n\x13SingleChunkArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x44\n\x04\x64\x61ta\x18\x02 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x04\x64\x61ta\x1a]\n\x05\x42\x61tch\x12T\n\tartifacts\x18\x01 \x03(\x0b\x32\x36.spark.connect.AddArtifactsRequest.SingleChunkArtifactR\tartifacts\x1a\xc1\x01\n\x14\x42\x65ginChunkedArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1f\n\x0btotal_bytes\x18\x02 \x01(\x03R\ntotalBytes\x12\x1d\n\nnum_chunks\x18\x03 \x01(\x03R\tnumChunks\x12U\n\rinitial_chunk\x18\x04 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x0cinitialChunkB\t\n\x07payloadB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x02\n\x14\x41\x64\x64\x41rtifactsResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\tartifacts\x18\x01 \x03(\x0b\x32\x33.spark.connect.AddArtifactsResponse.ArtifactSummaryR\tartifacts\x1aQ\n\x0f\x41rtifactSummary\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\x11is_crc_successful\x18\x02 \x01(\x08R\x0fisCrcSuccessful"\xc6\x02\n\x17\x41rtifactStatusesRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 
\x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x01R\nclientType\x88\x01\x01\x12\x14\n\x05names\x18\x04 \x03(\tR\x05namesB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xe0\x02\n\x18\x41rtifactStatusesResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\x08statuses\x18\x01 \x03(\x0b\x32\x35.spark.connect.ArtifactStatusesResponse.StatusesEntryR\x08statuses\x1as\n\rStatusesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ArtifactStatusesResponse.ArtifactStatusR\x05value:\x02\x38\x01\x1a(\n\x0e\x41rtifactStatus\x12\x16\n\x06\x65xists\x18\x01 \x01(\x08R\x06\x65xists"\xdb\x04\n\x10InterruptRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12T\n\x0einterrupt_type\x18\x04 \x01(\x0e\x32-.spark.connect.InterruptRequest.InterruptTypeR\rinterruptType\x12%\n\roperation_tag\x18\x05 \x01(\tH\x00R\x0coperationTag\x12#\n\x0coperation_id\x18\x06 \x01(\tH\x00R\x0boperationId"\x80\x01\n\rInterruptType\x12\x1e\n\x1aINTERRUPT_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12INTERRUPT_TYPE_ALL\x10\x01\x12\x16\n\x12INTERRUPT_TYPE_TAG\x10\x02\x12\x1f\n\x1bINTERRUPT_TYPE_OPERATION_ID\x10\x03\x42\x0b\n\tinterruptB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x01\n\x11InterruptResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\'\n\x0finterrupted_ids\x18\x02 \x03(\tR\x0einterruptedIds"5\n\x0fReattachOptions\x12"\n\x0creattachable\x18\x01 \x01(\x08R\x0creattachable"\x96\x03\n\x16ReattachExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x06 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x12-\n\x10last_response_id\x18\x05 \x01(\tH\x02R\x0elastResponseId\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_typeB\x13\n\x11_last_response_id"\xc9\x04\n\x15ReleaseExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12R\n\x0brelease_all\x18\x05 \x01(\x0b\x32/.spark.connect.ReleaseExecuteRequest.ReleaseAllH\x00R\nreleaseAll\x12X\n\rrelease_until\x18\x06 \x01(\x0b\x32\x31.spark.connect.ReleaseExecuteRequest.ReleaseUntilH\x00R\x0creleaseUntil\x1a\x0c\n\nReleaseAll\x1a/\n\x0cReleaseUntil\x12\x1f\n\x0bresponse_id\x18\x01 \x01(\tR\nresponseIdB\t\n\x07releaseB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xa5\x01\n\x16ReleaseExecuteResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 
\x01(\tR\x13serverSideSessionId\x12&\n\x0coperation_id\x18\x02 \x01(\tH\x00R\x0boperationId\x88\x01\x01\x42\x0f\n\r_operation_id"\xab\x01\n\x15ReleaseSessionRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x00R\nclientType\x88\x01\x01\x42\x0e\n\x0c_client_type"l\n\x16ReleaseSessionResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x02 \x01(\tR\x13serverSideSessionId"\xcc\x02\n\x18\x46\x65tchErrorDetailsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x19\n\x08\x65rror_id\x18\x03 \x01(\tR\x07\x65rrorId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x93\x0c\n\x19\x46\x65tchErrorDetailsResponse\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\x1d\n\nsession_id\x18\x04 \x01(\tR\tsessionId\x12)\n\x0eroot_error_idx\x18\x01 \x01(\x05H\x00R\x0crootErrorIdx\x88\x01\x01\x12\x46\n\x06\x65rrors\x18\x02 \x03(\x0b\x32..spark.connect.FetchErrorDetailsResponse.ErrorR\x06\x65rrors\x1a\xae\x01\n\x11StackTraceElement\x12\'\n\x0f\x64\x65\x63laring_class\x18\x01 \x01(\tR\x0e\x64\x65\x63laringClass\x12\x1f\n\x0bmethod_name\x18\x02 \x01(\tR\nmethodName\x12 \n\tfile_name\x18\x03 \x01(\tH\x00R\x08\x66ileName\x88\x01\x01\x12\x1f\n\x0bline_number\x18\x04 \x01(\x05R\nlineNumberB\x0c\n\n_file_name\x1a\xf0\x02\n\x0cQueryContext\x12\x64\n\x0c\x63ontext_type\x18\n \x01(\x0e\x32\x41.spark.connect.FetchErrorDetailsResponse.QueryContext.ContextTypeR\x0b\x63ontextType\x12\x1f\n\x0bobject_type\x18\x01 \x01(\tR\nobjectType\x12\x1f\n\x0bobject_name\x18\x02 \x01(\tR\nobjectName\x12\x1f\n\x0bstart_index\x18\x03 \x01(\x05R\nstartIndex\x12\x1d\n\nstop_index\x18\x04 \x01(\x05R\tstopIndex\x12\x1a\n\x08\x66ragment\x18\x05 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x06 \x01(\tR\x08\x63\x61llSite\x12\x18\n\x07summary\x18\x07 \x01(\tR\x07summary"%\n\x0b\x43ontextType\x12\x07\n\x03SQL\x10\x00\x12\r\n\tDATAFRAME\x10\x01\x1a\x99\x03\n\x0eSparkThrowable\x12$\n\x0b\x65rror_class\x18\x01 \x01(\tH\x00R\nerrorClass\x88\x01\x01\x12}\n\x12message_parameters\x18\x02 \x03(\x0b\x32N.spark.connect.FetchErrorDetailsResponse.SparkThrowable.MessageParametersEntryR\x11messageParameters\x12\\\n\x0equery_contexts\x18\x03 \x03(\x0b\x32\x35.spark.connect.FetchErrorDetailsResponse.QueryContextR\rqueryContexts\x12 \n\tsql_state\x18\x04 \x01(\tH\x01R\x08sqlState\x88\x01\x01\x1a\x44\n\x16MessageParametersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0e\n\x0c_error_classB\x0c\n\n_sql_state\x1a\xdb\x02\n\x05\x45rror\x12\x30\n\x14\x65rror_type_hierarchy\x18\x01 \x03(\tR\x12\x65rrorTypeHierarchy\x12\x18\n\x07message\x18\x02 \x01(\tR\x07message\x12[\n\x0bstack_trace\x18\x03 \x03(\x0b\x32:.spark.connect.FetchErrorDetailsResponse.StackTraceElementR\nstackTrace\x12 \n\tcause_idx\x18\x04 \x01(\x05H\x00R\x08\x63\x61useIdx\x88\x01\x01\x12\x65\n\x0fspark_throwable\x18\x05 
\x01(\x0b\x32\x37.spark.connect.FetchErrorDetailsResponse.SparkThrowableH\x01R\x0esparkThrowable\x88\x01\x01\x42\x0c\n\n_cause_idxB\x12\n\x10_spark_throwableB\x11\n\x0f_root_error_idx"Z\n\x17\x43heckpointCommandResult\x12?\n\x08relation\x18\x01 \x01(\x0b\x32#.spark.connect.CachedRemoteRelationR\x08relation2\xb2\x07\n\x13SparkConnectService\x12X\n\x0b\x45xecutePlan\x12!.spark.connect.ExecutePlanRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12V\n\x0b\x41nalyzePlan\x12!.spark.connect.AnalyzePlanRequest\x1a".spark.connect.AnalyzePlanResponse"\x00\x12G\n\x06\x43onfig\x12\x1c.spark.connect.ConfigRequest\x1a\x1d.spark.connect.ConfigResponse"\x00\x12[\n\x0c\x41\x64\x64\x41rtifacts\x12".spark.connect.AddArtifactsRequest\x1a#.spark.connect.AddArtifactsResponse"\x00(\x01\x12\x63\n\x0e\x41rtifactStatus\x12&.spark.connect.ArtifactStatusesRequest\x1a\'.spark.connect.ArtifactStatusesResponse"\x00\x12P\n\tInterrupt\x12\x1f.spark.connect.InterruptRequest\x1a .spark.connect.InterruptResponse"\x00\x12`\n\x0fReattachExecute\x12%.spark.connect.ReattachExecuteRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12_\n\x0eReleaseExecute\x12$.spark.connect.ReleaseExecuteRequest\x1a%.spark.connect.ReleaseExecuteResponse"\x00\x12_\n\x0eReleaseSession\x12$.spark.connect.ReleaseSessionRequest\x1a%.spark.connect.ReleaseSessionResponse"\x00\x12h\n\x11\x46\x65tchErrorDetails\x12\'.spark.connect.FetchErrorDetailsRequest\x1a(.spark.connect.FetchErrorDetailsResponse"\x00\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -120,109 +120,111 @@ _EXECUTEPLANREQUEST_REQUESTOPTION._serialized_start = 5196 _EXECUTEPLANREQUEST_REQUESTOPTION._serialized_end = 5361 _EXECUTEPLANRESPONSE._serialized_start = 5440 - _EXECUTEPLANRESPONSE._serialized_end = 8230 - _EXECUTEPLANRESPONSE_SQLCOMMANDRESULT._serialized_start = 7030 - _EXECUTEPLANRESPONSE_SQLCOMMANDRESULT._serialized_end = 7101 - _EXECUTEPLANRESPONSE_ARROWBATCH._serialized_start = 7103 - _EXECUTEPLANRESPONSE_ARROWBATCH._serialized_end = 7221 - _EXECUTEPLANRESPONSE_METRICS._serialized_start = 7224 - _EXECUTEPLANRESPONSE_METRICS._serialized_end = 7741 - _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT._serialized_start = 7319 - _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT._serialized_end = 7651 - _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY._serialized_start = 7528 - _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY._serialized_end = 7651 - _EXECUTEPLANRESPONSE_METRICS_METRICVALUE._serialized_start = 7653 - _EXECUTEPLANRESPONSE_METRICS_METRICVALUE._serialized_end = 7741 - _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_start = 7743 - _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_end = 7859 - _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_start = 7861 - _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_end = 7877 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_start = 7880 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_end = 8213 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_start = 8036 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_end = 8213 - _KEYVALUE._serialized_start = 8232 - _KEYVALUE._serialized_end = 8297 - _CONFIGREQUEST._serialized_start = 8300 - _CONFIGREQUEST._serialized_end = 9459 - _CONFIGREQUEST_OPERATION._serialized_start = 8608 - _CONFIGREQUEST_OPERATION._serialized_end = 9106 - _CONFIGREQUEST_SET._serialized_start = 9108 - _CONFIGREQUEST_SET._serialized_end = 9160 - 
_CONFIGREQUEST_GET._serialized_start = 9162 - _CONFIGREQUEST_GET._serialized_end = 9187 - _CONFIGREQUEST_GETWITHDEFAULT._serialized_start = 9189 - _CONFIGREQUEST_GETWITHDEFAULT._serialized_end = 9252 - _CONFIGREQUEST_GETOPTION._serialized_start = 9254 - _CONFIGREQUEST_GETOPTION._serialized_end = 9285 - _CONFIGREQUEST_GETALL._serialized_start = 9287 - _CONFIGREQUEST_GETALL._serialized_end = 9335 - _CONFIGREQUEST_UNSET._serialized_start = 9337 - _CONFIGREQUEST_UNSET._serialized_end = 9364 - _CONFIGREQUEST_ISMODIFIABLE._serialized_start = 9366 - _CONFIGREQUEST_ISMODIFIABLE._serialized_end = 9400 - _CONFIGRESPONSE._serialized_start = 9462 - _CONFIGRESPONSE._serialized_end = 9637 - _ADDARTIFACTSREQUEST._serialized_start = 9640 - _ADDARTIFACTSREQUEST._serialized_end = 10642 - _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_start = 10115 - _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_end = 10168 - _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_start = 10170 - _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_end = 10281 - _ADDARTIFACTSREQUEST_BATCH._serialized_start = 10283 - _ADDARTIFACTSREQUEST_BATCH._serialized_end = 10376 - _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_start = 10379 - _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_end = 10572 - _ADDARTIFACTSRESPONSE._serialized_start = 10645 - _ADDARTIFACTSRESPONSE._serialized_end = 10917 - _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_start = 10836 - _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_end = 10917 - _ARTIFACTSTATUSESREQUEST._serialized_start = 10920 - _ARTIFACTSTATUSESREQUEST._serialized_end = 11246 - _ARTIFACTSTATUSESRESPONSE._serialized_start = 11249 - _ARTIFACTSTATUSESRESPONSE._serialized_end = 11601 - _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_start = 11444 - _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_end = 11559 - _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_start = 11561 - _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_end = 11601 - _INTERRUPTREQUEST._serialized_start = 11604 - _INTERRUPTREQUEST._serialized_end = 12207 - _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_start = 12007 - _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_end = 12135 - _INTERRUPTRESPONSE._serialized_start = 12210 - _INTERRUPTRESPONSE._serialized_end = 12354 - _REATTACHOPTIONS._serialized_start = 12356 - _REATTACHOPTIONS._serialized_end = 12409 - _REATTACHEXECUTEREQUEST._serialized_start = 12412 - _REATTACHEXECUTEREQUEST._serialized_end = 12818 - _RELEASEEXECUTEREQUEST._serialized_start = 12821 - _RELEASEEXECUTEREQUEST._serialized_end = 13406 - _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_start = 13275 - _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_end = 13287 - _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_start = 13289 - _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_end = 13336 - _RELEASEEXECUTERESPONSE._serialized_start = 13409 - _RELEASEEXECUTERESPONSE._serialized_end = 13574 - _RELEASESESSIONREQUEST._serialized_start = 13577 - _RELEASESESSIONREQUEST._serialized_end = 13748 - _RELEASESESSIONRESPONSE._serialized_start = 13750 - _RELEASESESSIONRESPONSE._serialized_end = 13858 - _FETCHERRORDETAILSREQUEST._serialized_start = 13861 - _FETCHERRORDETAILSREQUEST._serialized_end = 14193 - _FETCHERRORDETAILSRESPONSE._serialized_start = 14196 - _FETCHERRORDETAILSRESPONSE._serialized_end = 15751 - _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_start = 14425 - _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_end = 14599 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_start = 
14602 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_end = 14970 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_start = 14933 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_end = 14970 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_start = 14973 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_end = 15382 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_start = 15284 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_end = 15352 - _FETCHERRORDETAILSRESPONSE_ERROR._serialized_start = 15385 - _FETCHERRORDETAILSRESPONSE_ERROR._serialized_end = 15732 - _SPARKCONNECTSERVICE._serialized_start = 15754 - _SPARKCONNECTSERVICE._serialized_end = 16700 + _EXECUTEPLANRESPONSE._serialized_end = 8358 + _EXECUTEPLANRESPONSE_SQLCOMMANDRESULT._serialized_start = 7132 + _EXECUTEPLANRESPONSE_SQLCOMMANDRESULT._serialized_end = 7203 + _EXECUTEPLANRESPONSE_ARROWBATCH._serialized_start = 7205 + _EXECUTEPLANRESPONSE_ARROWBATCH._serialized_end = 7323 + _EXECUTEPLANRESPONSE_METRICS._serialized_start = 7326 + _EXECUTEPLANRESPONSE_METRICS._serialized_end = 7843 + _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT._serialized_start = 7421 + _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT._serialized_end = 7753 + _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY._serialized_start = 7630 + _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY._serialized_end = 7753 + _EXECUTEPLANRESPONSE_METRICS_METRICVALUE._serialized_start = 7755 + _EXECUTEPLANRESPONSE_METRICS_METRICVALUE._serialized_end = 7843 + _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_start = 7846 + _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_end = 7987 + _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_start = 7989 + _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_end = 8005 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_start = 8008 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_end = 8341 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_start = 8164 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_end = 8341 + _KEYVALUE._serialized_start = 8360 + _KEYVALUE._serialized_end = 8425 + _CONFIGREQUEST._serialized_start = 8428 + _CONFIGREQUEST._serialized_end = 9587 + _CONFIGREQUEST_OPERATION._serialized_start = 8736 + _CONFIGREQUEST_OPERATION._serialized_end = 9234 + _CONFIGREQUEST_SET._serialized_start = 9236 + _CONFIGREQUEST_SET._serialized_end = 9288 + _CONFIGREQUEST_GET._serialized_start = 9290 + _CONFIGREQUEST_GET._serialized_end = 9315 + _CONFIGREQUEST_GETWITHDEFAULT._serialized_start = 9317 + _CONFIGREQUEST_GETWITHDEFAULT._serialized_end = 9380 + _CONFIGREQUEST_GETOPTION._serialized_start = 9382 + _CONFIGREQUEST_GETOPTION._serialized_end = 9413 + _CONFIGREQUEST_GETALL._serialized_start = 9415 + _CONFIGREQUEST_GETALL._serialized_end = 9463 + _CONFIGREQUEST_UNSET._serialized_start = 9465 + _CONFIGREQUEST_UNSET._serialized_end = 9492 + _CONFIGREQUEST_ISMODIFIABLE._serialized_start = 9494 + _CONFIGREQUEST_ISMODIFIABLE._serialized_end = 9528 + _CONFIGRESPONSE._serialized_start = 9590 + _CONFIGRESPONSE._serialized_end = 9765 + _ADDARTIFACTSREQUEST._serialized_start = 9768 + _ADDARTIFACTSREQUEST._serialized_end = 10770 + _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_start = 10243 + _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_end = 10296 + _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_start = 10298 + _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_end = 10409 + 
_ADDARTIFACTSREQUEST_BATCH._serialized_start = 10411 + _ADDARTIFACTSREQUEST_BATCH._serialized_end = 10504 + _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_start = 10507 + _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_end = 10700 + _ADDARTIFACTSRESPONSE._serialized_start = 10773 + _ADDARTIFACTSRESPONSE._serialized_end = 11045 + _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_start = 10964 + _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_end = 11045 + _ARTIFACTSTATUSESREQUEST._serialized_start = 11048 + _ARTIFACTSTATUSESREQUEST._serialized_end = 11374 + _ARTIFACTSTATUSESRESPONSE._serialized_start = 11377 + _ARTIFACTSTATUSESRESPONSE._serialized_end = 11729 + _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_start = 11572 + _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_end = 11687 + _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_start = 11689 + _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_end = 11729 + _INTERRUPTREQUEST._serialized_start = 11732 + _INTERRUPTREQUEST._serialized_end = 12335 + _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_start = 12135 + _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_end = 12263 + _INTERRUPTRESPONSE._serialized_start = 12338 + _INTERRUPTRESPONSE._serialized_end = 12482 + _REATTACHOPTIONS._serialized_start = 12484 + _REATTACHOPTIONS._serialized_end = 12537 + _REATTACHEXECUTEREQUEST._serialized_start = 12540 + _REATTACHEXECUTEREQUEST._serialized_end = 12946 + _RELEASEEXECUTEREQUEST._serialized_start = 12949 + _RELEASEEXECUTEREQUEST._serialized_end = 13534 + _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_start = 13403 + _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_end = 13415 + _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_start = 13417 + _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_end = 13464 + _RELEASEEXECUTERESPONSE._serialized_start = 13537 + _RELEASEEXECUTERESPONSE._serialized_end = 13702 + _RELEASESESSIONREQUEST._serialized_start = 13705 + _RELEASESESSIONREQUEST._serialized_end = 13876 + _RELEASESESSIONRESPONSE._serialized_start = 13878 + _RELEASESESSIONRESPONSE._serialized_end = 13986 + _FETCHERRORDETAILSREQUEST._serialized_start = 13989 + _FETCHERRORDETAILSREQUEST._serialized_end = 14321 + _FETCHERRORDETAILSRESPONSE._serialized_start = 14324 + _FETCHERRORDETAILSRESPONSE._serialized_end = 15879 + _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_start = 14553 + _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_end = 14727 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_start = 14730 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_end = 15098 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_start = 15061 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_end = 15098 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_start = 15101 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_end = 15510 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_start = 15412 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_end = 15480 + _FETCHERRORDETAILSRESPONSE_ERROR._serialized_start = 15513 + _FETCHERRORDETAILSRESPONSE_ERROR._serialized_end = 15860 + _CHECKPOINTCOMMANDRESULT._serialized_start = 15881 + _CHECKPOINTCOMMANDRESULT._serialized_end = 15971 + _SPARKCONNECTSERVICE._serialized_start = 15974 + _SPARKCONNECTSERVICE._serialized_end = 16920 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/base_pb2.pyi b/python/pyspark/sql/connect/proto/base_pb2.pyi index 
d22502f8839db..1f9dfbb3294d0 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.pyi +++ b/python/pyspark/sql/connect/proto/base_pb2.pyi @@ -1406,6 +1406,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): NAME_FIELD_NUMBER: builtins.int VALUES_FIELD_NUMBER: builtins.int KEYS_FIELD_NUMBER: builtins.int + PLAN_ID_FIELD_NUMBER: builtins.int name: builtins.str @property def values( @@ -1417,6 +1418,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): def keys( self, ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... + plan_id: builtins.int def __init__( self, *, @@ -1426,11 +1428,12 @@ class ExecutePlanResponse(google.protobuf.message.Message): ] | None = ..., keys: collections.abc.Iterable[builtins.str] | None = ..., + plan_id: builtins.int = ..., ) -> None: ... def ClearField( self, field_name: typing_extensions.Literal[ - "keys", b"keys", "name", b"name", "values", b"values" + "keys", b"keys", "name", b"name", "plan_id", b"plan_id", "values", b"values" ], ) -> None: ... @@ -1530,6 +1533,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): RESULT_COMPLETE_FIELD_NUMBER: builtins.int CREATE_RESOURCE_PROFILE_COMMAND_RESULT_FIELD_NUMBER: builtins.int EXECUTION_PROGRESS_FIELD_NUMBER: builtins.int + CHECKPOINT_COMMAND_RESULT_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int METRICS_FIELD_NUMBER: builtins.int OBSERVED_METRICS_FIELD_NUMBER: builtins.int @@ -1591,6 +1595,9 @@ class ExecutePlanResponse(google.protobuf.message.Message): def execution_progress(self) -> global___ExecutePlanResponse.ExecutionProgress: """(Optional) Intermediate query progress reports.""" @property + def checkpoint_command_result(self) -> global___CheckpointCommandResult: + """Response for command that checkpoints a DataFrame.""" + @property def extension(self) -> google.protobuf.any_pb2.Any: """Support arbitrary result objects.""" @property @@ -1631,6 +1638,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): create_resource_profile_command_result: pyspark.sql.connect.proto.commands_pb2.CreateResourceProfileCommandResult | None = ..., execution_progress: global___ExecutePlanResponse.ExecutionProgress | None = ..., + checkpoint_command_result: global___CheckpointCommandResult | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., metrics: global___ExecutePlanResponse.Metrics | None = ..., observed_metrics: collections.abc.Iterable[global___ExecutePlanResponse.ObservedMetrics] @@ -1642,6 +1650,8 @@ class ExecutePlanResponse(google.protobuf.message.Message): field_name: typing_extensions.Literal[ "arrow_batch", b"arrow_batch", + "checkpoint_command_result", + b"checkpoint_command_result", "create_resource_profile_command_result", b"create_resource_profile_command_result", "execution_progress", @@ -1675,6 +1685,8 @@ class ExecutePlanResponse(google.protobuf.message.Message): field_name: typing_extensions.Literal[ "arrow_batch", b"arrow_batch", + "checkpoint_command_result", + b"checkpoint_command_result", "create_resource_profile_command_result", b"create_resource_profile_command_result", "execution_progress", @@ -1727,6 +1739,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): "result_complete", "create_resource_profile_command_result", "execution_progress", + "checkpoint_command_result", "extension", ] | None @@ -3700,3 +3713,24 @@ class FetchErrorDetailsResponse(google.protobuf.message.Message): ) -> typing_extensions.Literal["root_error_idx"] | None: ... 
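The stub additions above give `ExecutePlanResponse.ObservedMetrics` a `plan_id` and add a `checkpoint_command_result` arm to the response. A minimal client-side sketch of reading both, assuming the regenerated `pyspark.sql.connect.proto` modules from this patch are importable; `responses` is a hypothetical iterable of `ExecutePlanResponse` messages, not an API defined here:

```python
from pyspark.sql.connect.proto import base_pb2


def handle_responses(responses: "list[base_pb2.ExecutePlanResponse]") -> None:
    """Hypothetical handler over a stream of ExecutePlanResponse messages."""
    for resp in responses:
        # New in this patch: each observed metric now carries the plan id it belongs to.
        for metric in resp.observed_metrics:
            print(metric.name, metric.plan_id, list(metric.keys))
        # New response arm: the server's answer to a CheckpointCommand, carrying the
        # CachedRemoteRelation that now represents the checkpointed plan.
        if resp.HasField("checkpoint_command_result"):
            print("checkpointed:", resp.checkpoint_command_result.relation)
```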
global___FetchErrorDetailsResponse = FetchErrorDetailsResponse + +class CheckpointCommandResult(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + RELATION_FIELD_NUMBER: builtins.int + @property + def relation(self) -> pyspark.sql.connect.proto.relations_pb2.CachedRemoteRelation: + """(Required) The logical plan checkpointed.""" + def __init__( + self, + *, + relation: pyspark.sql.connect.proto.relations_pb2.CachedRemoteRelation | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["relation", b"relation"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["relation", b"relation"] + ) -> None: ... + +global___CheckpointCommandResult = CheckpointCommandResult diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index 50cd309dcd8da..8f67f817c3f00 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -35,7 +35,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xd5\n\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12t\n\x1fstreaming_query_manager_command\x18\t \x01(\x0b\x32+.spark.connect.StreamingQueryManagerCommandH\x00R\x1cstreamingQueryManagerCommand\x12m\n\x17register_table_function\x18\n \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R\x15registerTableFunction\x12\x81\x01\n$streaming_query_listener_bus_command\x18\x0b \x01(\x0b\x32/.spark.connect.StreamingQueryListenerBusCommandH\x00R streamingQueryListenerBusCommand\x12\x64\n\x14register_data_source\x18\x0c \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R\x12registerDataSource\x12t\n\x1f\x63reate_resource_profile_command\x18\r \x01(\x0b\x32+.spark.connect.CreateResourceProfileCommandH\x00R\x1c\x63reateResourceProfileCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xaa\x04\n\nSqlCommand\x12\x14\n\x03sql\x18\x01 \x01(\tB\x02\x18\x01R\x03sql\x12;\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12Z\n\x0fnamed_arguments\x18\x04 
\x03(\x0b\x32-.spark.connect.SqlCommand.NamedArgumentsEntryB\x02\x18\x01R\x0enamedArguments\x12\x42\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionB\x02\x18\x01R\x0cposArguments\x12-\n\x05input\x18\x06 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\xca\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x12-\n\x12\x63lustering_columns\x18\n \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xdc\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x12-\n\x12\x63lustering_columns\x18\t \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 
\x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\xa0\x06\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x12N\n\x0e\x66oreach_writer\x18\r \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\rforeachWriter\x12L\n\rforeach_batch\x18\x0e \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\x0c\x66oreachBatch\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"\xb3\x01\n\x18StreamingForeachFunction\x12\x43\n\x0fpython_function\x18\x01 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x0epythonFunction\x12\x46\n\x0escala_function\x18\x02 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\rscalaFunctionB\n\n\x08\x66unction"\xd4\x01\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12<\n\x18query_started_event_json\x18\x03 \x01(\tH\x00R\x15queryStartedEventJson\x88\x01\x01\x42\x1b\n\x19_query_started_event_json"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\xf5\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 
\x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1a\xc5\x01\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x12$\n\x0b\x65rror_class\x18\x02 \x01(\tH\x01R\nerrorClass\x88\x01\x01\x12$\n\x0bstack_trace\x18\x03 \x01(\tH\x02R\nstackTrace\x88\x01\x01\x42\x14\n\x12_exception_messageB\x0e\n\x0c_error_classB\x0e\n\x0c_stack_trace\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminatedB\r\n\x0bresult_type"\xbd\x06\n\x1cStreamingQueryManagerCommand\x12\x18\n\x06\x61\x63tive\x18\x01 \x01(\x08H\x00R\x06\x61\x63tive\x12\x1d\n\tget_query\x18\x02 \x01(\tH\x00R\x08getQuery\x12|\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32\x46.spark.connect.StreamingQueryManagerCommand.AwaitAnyTerminationCommandH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12n\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0b\x61\x64\x64Listener\x12t\n\x0fremove_listener\x18\x06 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0eremoveListener\x12\'\n\x0elist_listeners\x18\x07 \x01(\x08H\x00R\rlistListeners\x1aO\n\x1a\x41waitAnyTerminationCommand\x12"\n\ntimeout_ms\x18\x01 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_ms\x1a\xcd\x01\n\x1dStreamingQueryListenerCommand\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x12U\n\x17python_listener_payload\x18\x02 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x15pythonListenerPayload\x88\x01\x01\x12\x0e\n\x02id\x18\x03 \x01(\tR\x02idB\x1a\n\x18_python_listener_payloadB\t\n\x07\x63ommand"\xb4\x08\n"StreamingQueryManagerCommandResult\x12X\n\x06\x61\x63tive\x18\x01 \x01(\x0b\x32>.spark.connect.StreamingQueryManagerCommandResult.ActiveResultH\x00R\x06\x61\x63tive\x12`\n\x05query\x18\x02 \x01(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceH\x00R\x05query\x12\x81\x01\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32K.spark.connect.StreamingQueryManagerCommandResult.AwaitAnyTerminationResultH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12#\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x08H\x00R\x0b\x61\x64\x64Listener\x12)\n\x0fremove_listener\x18\x06 \x01(\x08H\x00R\x0eremoveListener\x12{\n\x0elist_listeners\x18\x07 
\x01(\x0b\x32R.spark.connect.StreamingQueryManagerCommandResult.ListStreamingQueryListenerResultH\x00R\rlistListeners\x1a\x7f\n\x0c\x41\x63tiveResult\x12o\n\x0e\x61\x63tive_queries\x18\x01 \x03(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceR\ractiveQueries\x1as\n\x16StreamingQueryInstance\x12\x37\n\x02id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x02id\x12\x17\n\x04name\x18\x02 \x01(\tH\x00R\x04name\x88\x01\x01\x42\x07\n\x05_name\x1a;\n\x19\x41waitAnyTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminated\x1aK\n\x1eStreamingQueryListenerInstance\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x1a\x45\n ListStreamingQueryListenerResult\x12!\n\x0clistener_ids\x18\x01 \x03(\tR\x0blistenerIdsB\r\n\x0bresult_type"\xad\x01\n StreamingQueryListenerBusCommand\x12;\n\x19\x61\x64\x64_listener_bus_listener\x18\x01 \x01(\x08H\x00R\x16\x61\x64\x64ListenerBusListener\x12\x41\n\x1cremove_listener_bus_listener\x18\x02 \x01(\x08H\x00R\x19removeListenerBusListenerB\t\n\x07\x63ommand"\x83\x01\n\x1bStreamingQueryListenerEvent\x12\x1d\n\nevent_json\x18\x01 \x01(\tR\teventJson\x12\x45\n\nevent_type\x18\x02 \x01(\x0e\x32&.spark.connect.StreamingQueryEventTypeR\teventType"\xcc\x01\n"StreamingQueryListenerEventsResult\x12\x42\n\x06\x65vents\x18\x01 \x03(\x0b\x32*.spark.connect.StreamingQueryListenerEventR\x06\x65vents\x12\x42\n\x1blistener_bus_listener_added\x18\x02 \x01(\x08H\x00R\x18listenerBusListenerAdded\x88\x01\x01\x42\x1e\n\x1c_listener_bus_listener_added"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01"X\n\x1c\x43reateResourceProfileCommand\x12\x38\n\x07profile\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ResourceProfileR\x07profile"C\n"CreateResourceProfileCommandResult\x12\x1d\n\nprofile_id\x18\x01 \x01(\x05R\tprofileId*\x85\x01\n\x17StreamingQueryEventType\x12\x1e\n\x1aQUERY_PROGRESS_UNSPECIFIED\x10\x00\x12\x18\n\x14QUERY_PROGRESS_EVENT\x10\x01\x12\x1a\n\x16QUERY_TERMINATED_EVENT\x10\x02\x12\x14\n\x10QUERY_IDLE_EVENT\x10\x03\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xaf\x0c\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 
\x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12t\n\x1fstreaming_query_manager_command\x18\t \x01(\x0b\x32+.spark.connect.StreamingQueryManagerCommandH\x00R\x1cstreamingQueryManagerCommand\x12m\n\x17register_table_function\x18\n \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R\x15registerTableFunction\x12\x81\x01\n$streaming_query_listener_bus_command\x18\x0b \x01(\x0b\x32/.spark.connect.StreamingQueryListenerBusCommandH\x00R streamingQueryListenerBusCommand\x12\x64\n\x14register_data_source\x18\x0c \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R\x12registerDataSource\x12t\n\x1f\x63reate_resource_profile_command\x18\r \x01(\x0b\x32+.spark.connect.CreateResourceProfileCommandH\x00R\x1c\x63reateResourceProfileCommand\x12Q\n\x12\x63heckpoint_command\x18\x0e \x01(\x0b\x32 .spark.connect.CheckpointCommandH\x00R\x11\x63heckpointCommand\x12\x84\x01\n%remove_cached_remote_relation_command\x18\x0f \x01(\x0b\x32\x30.spark.connect.RemoveCachedRemoteRelationCommandH\x00R!removeCachedRemoteRelationCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xaa\x04\n\nSqlCommand\x12\x14\n\x03sql\x18\x01 \x01(\tB\x02\x18\x01R\x03sql\x12;\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12Z\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32-.spark.connect.SqlCommand.NamedArgumentsEntryB\x02\x18\x01R\x0enamedArguments\x12\x42\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionB\x02\x18\x01R\x0cposArguments\x12-\n\x05input\x18\x06 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\xca\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x12-\n\x12\x63lustering_columns\x18\n \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 
\x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xdc\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x12-\n\x12\x63lustering_columns\x18\t \x03(\tR\x11\x63lusteringColumns\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\xa0\x06\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x12N\n\x0e\x66oreach_writer\x18\r \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\rforeachWriter\x12L\n\rforeach_batch\x18\x0e \x01(\x0b\x32\'.spark.connect.StreamingForeachFunctionR\x0c\x66oreachBatch\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"\xb3\x01\n\x18StreamingForeachFunction\x12\x43\n\x0fpython_function\x18\x01 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x0epythonFunction\x12\x46\n\x0escala_function\x18\x02 
\x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\rscalaFunctionB\n\n\x08\x66unction"\xd4\x01\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12<\n\x18query_started_event_json\x18\x03 \x01(\tH\x00R\x15queryStartedEventJson\x88\x01\x01\x42\x1b\n\x19_query_started_event_json"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\xf5\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1a\xc5\x01\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x12$\n\x0b\x65rror_class\x18\x02 \x01(\tH\x01R\nerrorClass\x88\x01\x01\x12$\n\x0bstack_trace\x18\x03 \x01(\tH\x02R\nstackTrace\x88\x01\x01\x42\x14\n\x12_exception_messageB\x0e\n\x0c_error_classB\x0e\n\x0c_stack_trace\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminatedB\r\n\x0bresult_type"\xbd\x06\n\x1cStreamingQueryManagerCommand\x12\x18\n\x06\x61\x63tive\x18\x01 \x01(\x08H\x00R\x06\x61\x63tive\x12\x1d\n\tget_query\x18\x02 \x01(\tH\x00R\x08getQuery\x12|\n\x15\x61wait_any_termination\x18\x03 
\x01(\x0b\x32\x46.spark.connect.StreamingQueryManagerCommand.AwaitAnyTerminationCommandH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12n\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0b\x61\x64\x64Listener\x12t\n\x0fremove_listener\x18\x06 \x01(\x0b\x32I.spark.connect.StreamingQueryManagerCommand.StreamingQueryListenerCommandH\x00R\x0eremoveListener\x12\'\n\x0elist_listeners\x18\x07 \x01(\x08H\x00R\rlistListeners\x1aO\n\x1a\x41waitAnyTerminationCommand\x12"\n\ntimeout_ms\x18\x01 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_ms\x1a\xcd\x01\n\x1dStreamingQueryListenerCommand\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x12U\n\x17python_listener_payload\x18\x02 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\x15pythonListenerPayload\x88\x01\x01\x12\x0e\n\x02id\x18\x03 \x01(\tR\x02idB\x1a\n\x18_python_listener_payloadB\t\n\x07\x63ommand"\xb4\x08\n"StreamingQueryManagerCommandResult\x12X\n\x06\x61\x63tive\x18\x01 \x01(\x0b\x32>.spark.connect.StreamingQueryManagerCommandResult.ActiveResultH\x00R\x06\x61\x63tive\x12`\n\x05query\x18\x02 \x01(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceH\x00R\x05query\x12\x81\x01\n\x15\x61wait_any_termination\x18\x03 \x01(\x0b\x32K.spark.connect.StreamingQueryManagerCommandResult.AwaitAnyTerminationResultH\x00R\x13\x61waitAnyTermination\x12+\n\x10reset_terminated\x18\x04 \x01(\x08H\x00R\x0fresetTerminated\x12#\n\x0c\x61\x64\x64_listener\x18\x05 \x01(\x08H\x00R\x0b\x61\x64\x64Listener\x12)\n\x0fremove_listener\x18\x06 \x01(\x08H\x00R\x0eremoveListener\x12{\n\x0elist_listeners\x18\x07 \x01(\x0b\x32R.spark.connect.StreamingQueryManagerCommandResult.ListStreamingQueryListenerResultH\x00R\rlistListeners\x1a\x7f\n\x0c\x41\x63tiveResult\x12o\n\x0e\x61\x63tive_queries\x18\x01 \x03(\x0b\x32H.spark.connect.StreamingQueryManagerCommandResult.StreamingQueryInstanceR\ractiveQueries\x1as\n\x16StreamingQueryInstance\x12\x37\n\x02id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x02id\x12\x17\n\x04name\x18\x02 \x01(\tH\x00R\x04name\x88\x01\x01\x42\x07\n\x05_name\x1a;\n\x19\x41waitAnyTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminated\x1aK\n\x1eStreamingQueryListenerInstance\x12)\n\x10listener_payload\x18\x01 \x01(\x0cR\x0flistenerPayload\x1a\x45\n ListStreamingQueryListenerResult\x12!\n\x0clistener_ids\x18\x01 \x03(\tR\x0blistenerIdsB\r\n\x0bresult_type"\xad\x01\n StreamingQueryListenerBusCommand\x12;\n\x19\x61\x64\x64_listener_bus_listener\x18\x01 \x01(\x08H\x00R\x16\x61\x64\x64ListenerBusListener\x12\x41\n\x1cremove_listener_bus_listener\x18\x02 \x01(\x08H\x00R\x19removeListenerBusListenerB\t\n\x07\x63ommand"\x83\x01\n\x1bStreamingQueryListenerEvent\x12\x1d\n\nevent_json\x18\x01 \x01(\tR\teventJson\x12\x45\n\nevent_type\x18\x02 \x01(\x0e\x32&.spark.connect.StreamingQueryEventTypeR\teventType"\xcc\x01\n"StreamingQueryListenerEventsResult\x12\x42\n\x06\x65vents\x18\x01 \x03(\x0b\x32*.spark.connect.StreamingQueryListenerEventR\x06\x65vents\x12\x42\n\x1blistener_bus_listener_added\x18\x02 \x01(\x08H\x00R\x18listenerBusListenerAdded\x88\x01\x01\x42\x1e\n\x1c_listener_bus_listener_added"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 
\x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01"X\n\x1c\x43reateResourceProfileCommand\x12\x38\n\x07profile\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ResourceProfileR\x07profile"C\n"CreateResourceProfileCommandResult\x12\x1d\n\nprofile_id\x18\x01 \x01(\x05R\tprofileId"d\n!RemoveCachedRemoteRelationCommand\x12?\n\x08relation\x18\x01 \x01(\x0b\x32#.spark.connect.CachedRemoteRelationR\x08relation"t\n\x11\x43heckpointCommand\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x14\n\x05local\x18\x02 \x01(\x08R\x05local\x12\x14\n\x05\x65\x61ger\x18\x03 \x01(\x08R\x05\x65\x61ger*\x85\x01\n\x17StreamingQueryEventType\x12\x1e\n\x1aQUERY_PROGRESS_UNSPECIFIED\x10\x00\x12\x18\n\x14QUERY_PROGRESS_EVENT\x10\x01\x12\x1a\n\x16QUERY_TERMINATED_EVENT\x10\x02\x12\x14\n\x10QUERY_IDLE_EVENT\x10\x03\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -71,98 +71,102 @@ _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_options = b"8\001" _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._options = None _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_options = b"8\001" - _STREAMINGQUERYEVENTTYPE._serialized_start = 10080 - _STREAMINGQUERYEVENTTYPE._serialized_end = 10213 + _STREAMINGQUERYEVENTTYPE._serialized_start = 10518 + _STREAMINGQUERYEVENTTYPE._serialized_end = 10651 _COMMAND._serialized_start = 167 - _COMMAND._serialized_end = 1532 - _SQLCOMMAND._serialized_start = 1535 - _SQLCOMMAND._serialized_end = 2089 - _SQLCOMMAND_ARGSENTRY._serialized_start = 1905 - _SQLCOMMAND_ARGSENTRY._serialized_end = 1995 - _SQLCOMMAND_NAMEDARGUMENTSENTRY._serialized_start = 1997 - _SQLCOMMAND_NAMEDARGUMENTSENTRY._serialized_end = 2089 - _CREATEDATAFRAMEVIEWCOMMAND._serialized_start = 2092 - _CREATEDATAFRAMEVIEWCOMMAND._serialized_end = 2242 - _WRITEOPERATION._serialized_start = 2245 - _WRITEOPERATION._serialized_end = 3343 - _WRITEOPERATION_OPTIONSENTRY._serialized_start = 2767 - _WRITEOPERATION_OPTIONSENTRY._serialized_end = 2825 - _WRITEOPERATION_SAVETABLE._serialized_start = 2828 - _WRITEOPERATION_SAVETABLE._serialized_end = 3086 - _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_start = 2962 - _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_end = 3086 - _WRITEOPERATION_BUCKETBY._serialized_start = 3088 - _WRITEOPERATION_BUCKETBY._serialized_end = 3179 - _WRITEOPERATION_SAVEMODE._serialized_start = 3182 - _WRITEOPERATION_SAVEMODE._serialized_end = 3319 - _WRITEOPERATIONV2._serialized_start = 3346 - _WRITEOPERATIONV2._serialized_end = 4206 - _WRITEOPERATIONV2_OPTIONSENTRY._serialized_start = 2767 - _WRITEOPERATIONV2_OPTIONSENTRY._serialized_end = 2825 - _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_start = 3965 - _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_end = 4031 - _WRITEOPERATIONV2_MODE._serialized_start = 4034 - _WRITEOPERATIONV2_MODE._serialized_end = 4193 - _WRITESTREAMOPERATIONSTART._serialized_start = 4209 - _WRITESTREAMOPERATIONSTART._serialized_end = 5009 - _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_start = 2767 - _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_end = 2825 - _STREAMINGFOREACHFUNCTION._serialized_start = 5012 - _STREAMINGFOREACHFUNCTION._serialized_end = 5191 - _WRITESTREAMOPERATIONSTARTRESULT._serialized_start = 5194 - _WRITESTREAMOPERATIONSTARTRESULT._serialized_end = 5406 - _STREAMINGQUERYINSTANCEID._serialized_start = 5408 - 
_STREAMINGQUERYINSTANCEID._serialized_end = 5473 - _STREAMINGQUERYCOMMAND._serialized_start = 5476 - _STREAMINGQUERYCOMMAND._serialized_end = 6108 - _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 5975 - _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 6019 - _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 6021 - _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 6097 - _STREAMINGQUERYCOMMANDRESULT._serialized_start = 6111 - _STREAMINGQUERYCOMMANDRESULT._serialized_end = 7252 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 6694 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 6864 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 6866 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 6938 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 6940 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 6979 - _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 6982 - _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 7179 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 7181 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 7237 - _STREAMINGQUERYMANAGERCOMMAND._serialized_start = 7255 - _STREAMINGQUERYMANAGERCOMMAND._serialized_end = 8084 - _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_start = 7786 - _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_end = 7865 - _STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_start = 7868 - _STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_end = 8073 - _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_start = 8087 - _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_end = 9163 - _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_start = 8695 - _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_end = 8822 - _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_start = 8824 - _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_end = 8939 - _STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_start = 8941 - _STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_end = 9000 - _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_start = 9002 - _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_end = 9077 - _STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_start = 9079 - _STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_end = 9148 - _STREAMINGQUERYLISTENERBUSCOMMAND._serialized_start = 9166 - _STREAMINGQUERYLISTENERBUSCOMMAND._serialized_end = 9339 - _STREAMINGQUERYLISTENEREVENT._serialized_start = 9342 - _STREAMINGQUERYLISTENEREVENT._serialized_end = 9473 - _STREAMINGQUERYLISTENEREVENTSRESULT._serialized_start = 9476 - _STREAMINGQUERYLISTENEREVENTSRESULT._serialized_end = 9680 - _GETRESOURCESCOMMAND._serialized_start = 9682 - _GETRESOURCESCOMMAND._serialized_end = 9703 - _GETRESOURCESCOMMANDRESULT._serialized_start = 9706 - _GETRESOURCESCOMMANDRESULT._serialized_end = 9918 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 9822 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 9918 - _CREATERESOURCEPROFILECOMMAND._serialized_start = 9920 - _CREATERESOURCEPROFILECOMMAND._serialized_end = 10008 - 
_CREATERESOURCEPROFILECOMMANDRESULT._serialized_start = 10010 - _CREATERESOURCEPROFILECOMMANDRESULT._serialized_end = 10077 + _COMMAND._serialized_end = 1750 + _SQLCOMMAND._serialized_start = 1753 + _SQLCOMMAND._serialized_end = 2307 + _SQLCOMMAND_ARGSENTRY._serialized_start = 2123 + _SQLCOMMAND_ARGSENTRY._serialized_end = 2213 + _SQLCOMMAND_NAMEDARGUMENTSENTRY._serialized_start = 2215 + _SQLCOMMAND_NAMEDARGUMENTSENTRY._serialized_end = 2307 + _CREATEDATAFRAMEVIEWCOMMAND._serialized_start = 2310 + _CREATEDATAFRAMEVIEWCOMMAND._serialized_end = 2460 + _WRITEOPERATION._serialized_start = 2463 + _WRITEOPERATION._serialized_end = 3561 + _WRITEOPERATION_OPTIONSENTRY._serialized_start = 2985 + _WRITEOPERATION_OPTIONSENTRY._serialized_end = 3043 + _WRITEOPERATION_SAVETABLE._serialized_start = 3046 + _WRITEOPERATION_SAVETABLE._serialized_end = 3304 + _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_start = 3180 + _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_end = 3304 + _WRITEOPERATION_BUCKETBY._serialized_start = 3306 + _WRITEOPERATION_BUCKETBY._serialized_end = 3397 + _WRITEOPERATION_SAVEMODE._serialized_start = 3400 + _WRITEOPERATION_SAVEMODE._serialized_end = 3537 + _WRITEOPERATIONV2._serialized_start = 3564 + _WRITEOPERATIONV2._serialized_end = 4424 + _WRITEOPERATIONV2_OPTIONSENTRY._serialized_start = 2985 + _WRITEOPERATIONV2_OPTIONSENTRY._serialized_end = 3043 + _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_start = 4183 + _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_end = 4249 + _WRITEOPERATIONV2_MODE._serialized_start = 4252 + _WRITEOPERATIONV2_MODE._serialized_end = 4411 + _WRITESTREAMOPERATIONSTART._serialized_start = 4427 + _WRITESTREAMOPERATIONSTART._serialized_end = 5227 + _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_start = 2985 + _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_end = 3043 + _STREAMINGFOREACHFUNCTION._serialized_start = 5230 + _STREAMINGFOREACHFUNCTION._serialized_end = 5409 + _WRITESTREAMOPERATIONSTARTRESULT._serialized_start = 5412 + _WRITESTREAMOPERATIONSTARTRESULT._serialized_end = 5624 + _STREAMINGQUERYINSTANCEID._serialized_start = 5626 + _STREAMINGQUERYINSTANCEID._serialized_end = 5691 + _STREAMINGQUERYCOMMAND._serialized_start = 5694 + _STREAMINGQUERYCOMMAND._serialized_end = 6326 + _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 6193 + _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 6237 + _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 6239 + _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 6315 + _STREAMINGQUERYCOMMANDRESULT._serialized_start = 6329 + _STREAMINGQUERYCOMMANDRESULT._serialized_end = 7470 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 6912 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 7082 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 7084 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 7156 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 7158 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 7197 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 7200 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 7397 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 7399 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 7455 + _STREAMINGQUERYMANAGERCOMMAND._serialized_start = 7473 + _STREAMINGQUERYMANAGERCOMMAND._serialized_end = 8302 + 
_STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_start = 8004 + _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_end = 8083 + _STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_start = 8086 + _STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_end = 8291 + _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_start = 8305 + _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_end = 9381 + _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_start = 8913 + _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_end = 9040 + _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_start = 9042 + _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_end = 9157 + _STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_start = 9159 + _STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_end = 9218 + _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_start = 9220 + _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_end = 9295 + _STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_start = 9297 + _STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_end = 9366 + _STREAMINGQUERYLISTENERBUSCOMMAND._serialized_start = 9384 + _STREAMINGQUERYLISTENERBUSCOMMAND._serialized_end = 9557 + _STREAMINGQUERYLISTENEREVENT._serialized_start = 9560 + _STREAMINGQUERYLISTENEREVENT._serialized_end = 9691 + _STREAMINGQUERYLISTENEREVENTSRESULT._serialized_start = 9694 + _STREAMINGQUERYLISTENEREVENTSRESULT._serialized_end = 9898 + _GETRESOURCESCOMMAND._serialized_start = 9900 + _GETRESOURCESCOMMAND._serialized_end = 9921 + _GETRESOURCESCOMMANDRESULT._serialized_start = 9924 + _GETRESOURCESCOMMANDRESULT._serialized_end = 10136 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 10040 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 10136 + _CREATERESOURCEPROFILECOMMAND._serialized_start = 10138 + _CREATERESOURCEPROFILECOMMAND._serialized_end = 10226 + _CREATERESOURCEPROFILECOMMANDRESULT._serialized_start = 10228 + _CREATERESOURCEPROFILECOMMANDRESULT._serialized_end = 10295 + _REMOVECACHEDREMOTERELATIONCOMMAND._serialized_start = 10297 + _REMOVECACHEDREMOTERELATIONCOMMAND._serialized_end = 10397 + _CHECKPOINTCOMMAND._serialized_start = 10399 + _CHECKPOINTCOMMAND._serialized_end = 10515 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi index f86ae653508e3..04d50d5b5e4f4 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.pyi +++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi @@ -101,6 +101,8 @@ class Command(google.protobuf.message.Message): STREAMING_QUERY_LISTENER_BUS_COMMAND_FIELD_NUMBER: builtins.int REGISTER_DATA_SOURCE_FIELD_NUMBER: builtins.int CREATE_RESOURCE_PROFILE_COMMAND_FIELD_NUMBER: builtins.int + CHECKPOINT_COMMAND_FIELD_NUMBER: builtins.int + REMOVE_CACHED_REMOTE_RELATION_COMMAND_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int @property def register_function( @@ -135,6 +137,12 @@ class Command(google.protobuf.message.Message): @property def create_resource_profile_command(self) -> global___CreateResourceProfileCommand: ... @property + def checkpoint_command(self) -> global___CheckpointCommand: ... 
+ @property + def remove_cached_remote_relation_command( + self, + ) -> global___RemoveCachedRemoteRelationCommand: ... + @property + def extension(self) -> google.protobuf.any_pb2.Any: """This field is used to mark extensions to the protocol. When plugins generate arbitrary Commands they can add them here. During the planning the correct resolution is done. """ @@ -159,11 +167,16 @@ class Command(google.protobuf.message.Message): register_data_source: pyspark.sql.connect.proto.relations_pb2.CommonInlineUserDefinedDataSource | None = ..., create_resource_profile_command: global___CreateResourceProfileCommand | None = ..., + checkpoint_command: global___CheckpointCommand | None = ..., + remove_cached_remote_relation_command: global___RemoveCachedRemoteRelationCommand + | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal[ + "checkpoint_command", + b"checkpoint_command", "command_type", b"command_type", "create_dataframe_view", @@ -180,6 +193,8 @@ class Command(google.protobuf.message.Message): b"register_function", "register_table_function", b"register_table_function", + "remove_cached_remote_relation_command", + b"remove_cached_remote_relation_command", "sql_command", b"sql_command", "streaming_query_command", @@ -199,6 +214,8 @@ class Command(google.protobuf.message.Message): def ClearField( self, field_name: typing_extensions.Literal[ + "checkpoint_command", + b"checkpoint_command", "command_type", b"command_type", "create_dataframe_view", @@ -215,6 +232,8 @@ class Command(google.protobuf.message.Message): b"register_function", "register_table_function", b"register_table_function", + "remove_cached_remote_relation_command", + b"remove_cached_remote_relation_command", "sql_command", b"sql_command", "streaming_query_command", @@ -248,6 +267,8 @@ class Command(google.protobuf.message.Message): "streaming_query_listener_bus_command", "register_data_source", "create_resource_profile_command", + "checkpoint_command", + "remove_cached_remote_relation_command", "extension", ] | None @@ -2119,3 +2140,60 @@ class CreateResourceProfileCommandResult(google.protobuf.message.Message): ) -> None: ... global___CreateResourceProfileCommandResult = CreateResourceProfileCommandResult + +class RemoveCachedRemoteRelationCommand(google.protobuf.message.Message): + """Command to remove `CachedRemoteRelation`""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + RELATION_FIELD_NUMBER: builtins.int + @property + def relation(self) -> pyspark.sql.connect.proto.relations_pb2.CachedRemoteRelation: + """(Required) The cached remote relation to be removed.""" + def __init__( + self, + *, + relation: pyspark.sql.connect.proto.relations_pb2.CachedRemoteRelation | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["relation", b"relation"] + ) -> builtins.bool: ... + def ClearField( + self, field_name: typing_extensions.Literal["relation", b"relation"] + ) -> None: ...
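Together with the `CheckpointCommand` stub that follows, these entries add two new `command_type` arms on `Command`. A minimal construction sketch, assuming the regenerated proto modules; `plan` stands in for an already-resolved `relations_pb2.Relation`, and the empty `CachedRemoteRelation` would normally be the one returned in a `CheckpointCommandResult`:

```python
from pyspark.sql.connect.proto import commands_pb2, relations_pb2

plan = relations_pb2.Relation()  # placeholder for the DataFrame's resolved plan

# Ask the server to checkpoint the plan: local=False -> reliable checkpoint directory,
# eager=True -> materialize immediately rather than on first use.
checkpoint_cmd = commands_pb2.Command(
    checkpoint_command=commands_pb2.CheckpointCommand(relation=plan, local=False, eager=True)
)
assert checkpoint_cmd.WhichOneof("command_type") == "checkpoint_command"

# Later, release the server-side cached relation produced by the checkpoint.
release_cmd = commands_pb2.Command(
    remove_cached_remote_relation_command=commands_pb2.RemoveCachedRemoteRelationCommand(
        relation=relations_pb2.CachedRemoteRelation()  # placeholder for the returned relation
    )
)
```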
+ +global___RemoveCachedRemoteRelationCommand = RemoveCachedRemoteRelationCommand + +class CheckpointCommand(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + RELATION_FIELD_NUMBER: builtins.int + LOCAL_FIELD_NUMBER: builtins.int + EAGER_FIELD_NUMBER: builtins.int + @property + def relation(self) -> pyspark.sql.connect.proto.relations_pb2.Relation: + """(Required) The logical plan to checkpoint.""" + local: builtins.bool + """(Required) Locally checkpoint using a local temporary + directory in Spark Connect server (Spark Driver) + """ + eager: builtins.bool + """(Required) Whether to checkpoint this dataframe immediately.""" + def __init__( + self, + *, + relation: pyspark.sql.connect.proto.relations_pb2.Relation | None = ..., + local: builtins.bool = ..., + eager: builtins.bool = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["relation", b"relation"] + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "eager", b"eager", "local", b"local", "relation", b"relation" + ], + ) -> None: ... + +global___CheckpointCommand = CheckpointCommand diff --git a/python/pyspark/sql/connect/proto/common_pb2.py b/python/pyspark/sql/connect/proto/common_pb2.py index a77d1463e51da..fd528fae33691 100644 --- a/python/pyspark/sql/connect/proto/common_pb2.py +++ b/python/pyspark/sql/connect/proto/common_pb2.py @@ -29,7 +29,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1aspark/connect/common.proto\x12\rspark.connect"\xb0\x01\n\x0cStorageLevel\x12\x19\n\x08use_disk\x18\x01 \x01(\x08R\x07useDisk\x12\x1d\n\nuse_memory\x18\x02 \x01(\x08R\tuseMemory\x12 \n\x0cuse_off_heap\x18\x03 \x01(\x08R\nuseOffHeap\x12"\n\x0c\x64\x65serialized\x18\x04 \x01(\x08R\x0c\x64\x65serialized\x12 \n\x0breplication\x18\x05 \x01(\x05R\x0breplication"G\n\x13ResourceInformation\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1c\n\taddresses\x18\x02 \x03(\tR\taddresses"\xc3\x01\n\x17\x45xecutorResourceRequest\x12#\n\rresource_name\x18\x01 \x01(\tR\x0cresourceName\x12\x16\n\x06\x61mount\x18\x02 \x01(\x03R\x06\x61mount\x12.\n\x10\x64iscovery_script\x18\x03 \x01(\tH\x00R\x0f\x64iscoveryScript\x88\x01\x01\x12\x1b\n\x06vendor\x18\x04 \x01(\tH\x01R\x06vendor\x88\x01\x01\x42\x13\n\x11_discovery_scriptB\t\n\x07_vendor"R\n\x13TaskResourceRequest\x12#\n\rresource_name\x18\x01 \x01(\tR\x0cresourceName\x12\x16\n\x06\x61mount\x18\x02 \x01(\x01R\x06\x61mount"\xa5\x03\n\x0fResourceProfile\x12\x64\n\x12\x65xecutor_resources\x18\x01 \x03(\x0b\x32\x35.spark.connect.ResourceProfile.ExecutorResourcesEntryR\x11\x65xecutorResources\x12X\n\x0etask_resources\x18\x02 \x03(\x0b\x32\x31.spark.connect.ResourceProfile.TaskResourcesEntryR\rtaskResources\x1al\n\x16\x45xecutorResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12<\n\x05value\x18\x02 \x01(\x0b\x32&.spark.connect.ExecutorResourceRequestR\x05value:\x02\x38\x01\x1a\x64\n\x12TaskResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.TaskResourceRequestR\x05value:\x02\x38\x01\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1aspark/connect/common.proto\x12\rspark.connect"\xb0\x01\n\x0cStorageLevel\x12\x19\n\x08use_disk\x18\x01 \x01(\x08R\x07useDisk\x12\x1d\n\nuse_memory\x18\x02 \x01(\x08R\tuseMemory\x12 \n\x0cuse_off_heap\x18\x03 \x01(\x08R\nuseOffHeap\x12"\n\x0c\x64\x65serialized\x18\x04 \x01(\x08R\x0c\x64\x65serialized\x12 \n\x0breplication\x18\x05 
\x01(\x05R\x0breplication"G\n\x13ResourceInformation\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1c\n\taddresses\x18\x02 \x03(\tR\taddresses"\xc3\x01\n\x17\x45xecutorResourceRequest\x12#\n\rresource_name\x18\x01 \x01(\tR\x0cresourceName\x12\x16\n\x06\x61mount\x18\x02 \x01(\x03R\x06\x61mount\x12.\n\x10\x64iscovery_script\x18\x03 \x01(\tH\x00R\x0f\x64iscoveryScript\x88\x01\x01\x12\x1b\n\x06vendor\x18\x04 \x01(\tH\x01R\x06vendor\x88\x01\x01\x42\x13\n\x11_discovery_scriptB\t\n\x07_vendor"R\n\x13TaskResourceRequest\x12#\n\rresource_name\x18\x01 \x01(\tR\x0cresourceName\x12\x16\n\x06\x61mount\x18\x02 \x01(\x01R\x06\x61mount"\xa5\x03\n\x0fResourceProfile\x12\x64\n\x12\x65xecutor_resources\x18\x01 \x03(\x0b\x32\x35.spark.connect.ResourceProfile.ExecutorResourcesEntryR\x11\x65xecutorResources\x12X\n\x0etask_resources\x18\x02 \x03(\x0b\x32\x31.spark.connect.ResourceProfile.TaskResourcesEntryR\rtaskResources\x1al\n\x16\x45xecutorResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12<\n\x05value\x18\x02 \x01(\x0b\x32&.spark.connect.ExecutorResourceRequestR\x05value:\x02\x38\x01\x1a\x64\n\x12TaskResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.TaskResourceRequestR\x05value:\x02\x38\x01"X\n\x06Origin\x12\x42\n\rpython_origin\x18\x01 \x01(\x0b\x32\x1b.spark.connect.PythonOriginH\x00R\x0cpythonOriginB\n\n\x08\x66unction"G\n\x0cPythonOrigin\x12\x1a\n\x08\x66ragment\x18\x01 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x02 \x01(\tR\x08\x63\x61llSiteB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -59,4 +59,8 @@ _RESOURCEPROFILE_EXECUTORRESOURCESENTRY._serialized_end = 899 _RESOURCEPROFILE_TASKRESOURCESENTRY._serialized_start = 901 _RESOURCEPROFILE_TASKRESOURCESENTRY._serialized_end = 1001 + _ORIGIN._serialized_start = 1003 + _ORIGIN._serialized_end = 1091 + _PYTHONORIGIN._serialized_start = 1093 + _PYTHONORIGIN._serialized_end = 1164 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/common_pb2.pyi b/python/pyspark/sql/connect/proto/common_pb2.pyi index 163781b41998f..eda172e26cf4e 100644 --- a/python/pyspark/sql/connect/proto/common_pb2.pyi +++ b/python/pyspark/sql/connect/proto/common_pb2.pyi @@ -296,3 +296,54 @@ class ResourceProfile(google.protobuf.message.Message): ) -> None: ... global___ResourceProfile = ResourceProfile + +class Origin(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + PYTHON_ORIGIN_FIELD_NUMBER: builtins.int + @property + def python_origin(self) -> global___PythonOrigin: ... + def __init__( + self, + *, + python_origin: global___PythonOrigin | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "function", b"function", "python_origin", b"python_origin" + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "function", b"function", "python_origin", b"python_origin" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["function", b"function"] + ) -> typing_extensions.Literal["python_origin"] | None: ... 
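The new `Origin` message wraps a `PythonOrigin` (defined just below) so the client can attach user-code call-site information. A sketch of filling it from the current stack, assuming the regenerated `common_pb2` module; the use of `traceback` and the example `fragment` value are illustrative choices, not prescribed by this patch:

```python
import traceback

from pyspark.sql.connect.proto import common_pb2


def current_origin(fragment: str) -> common_pb2.Origin:
    """Build an Origin describing where in user code a plan node was created."""
    return common_pb2.Origin(
        python_origin=common_pb2.PythonOrigin(
            fragment=fragment,  # e.g. the API method name that created the node
            call_site="".join(traceback.format_stack(limit=5)),  # shown to users in errors
        )
    )


origin = current_origin("DataFrame.checkpoint")
assert origin.WhichOneof("function") == "python_origin"
```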
+ +global___Origin = Origin + +class PythonOrigin(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + FRAGMENT_FIELD_NUMBER: builtins.int + CALL_SITE_FIELD_NUMBER: builtins.int + fragment: builtins.str + """(Required) Name of the origin, for example, the name of the function""" + call_site: builtins.str + """(Required) Callsite to show to end users, for example, stacktrace.""" + def __init__( + self, + *, + fragment: builtins.str = ..., + call_site: builtins.str = ..., + ) -> None: ... + def ClearField( + self, + field_name: typing_extensions.Literal["call_site", b"call_site", "fragment", b"fragment"], + ) -> None: ... + +global___PythonOrigin = PythonOrigin diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.py b/python/pyspark/sql/connect/proto/expressions_pb2.py index e42acbf49a7df..c8a183105fd11 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.py +++ b/python/pyspark/sql/connect/proto/expressions_pb2.py @@ -30,10 +30,11 @@ from google.protobuf import any_pb2 as google_dot_protobuf_dot_any__pb2 from pyspark.sql.connect.proto import types_pb2 as spark_dot_connect_dot_types__pb2 +from pyspark.sql.connect.proto import common_pb2 as spark_dot_connect_dot_common__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1fspark/connect/expressions.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x19spark/connect/types.proto"\xde.\n\nExpression\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12\x62\n\x14unresolved_attribute\x18\x02 \x01(\x0b\x32-.spark.connect.Expression.UnresolvedAttributeH\x00R\x13unresolvedAttribute\x12_\n\x13unresolved_function\x18\x03 \x01(\x0b\x32,.spark.connect.Expression.UnresolvedFunctionH\x00R\x12unresolvedFunction\x12Y\n\x11\x65xpression_string\x18\x04 \x01(\x0b\x32*.spark.connect.Expression.ExpressionStringH\x00R\x10\x65xpressionString\x12S\n\x0funresolved_star\x18\x05 \x01(\x0b\x32(.spark.connect.Expression.UnresolvedStarH\x00R\x0eunresolvedStar\x12\x37\n\x05\x61lias\x18\x06 \x01(\x0b\x32\x1f.spark.connect.Expression.AliasH\x00R\x05\x61lias\x12\x34\n\x04\x63\x61st\x18\x07 \x01(\x0b\x32\x1e.spark.connect.Expression.CastH\x00R\x04\x63\x61st\x12V\n\x10unresolved_regex\x18\x08 \x01(\x0b\x32).spark.connect.Expression.UnresolvedRegexH\x00R\x0funresolvedRegex\x12\x44\n\nsort_order\x18\t \x01(\x0b\x32#.spark.connect.Expression.SortOrderH\x00R\tsortOrder\x12S\n\x0flambda_function\x18\n \x01(\x0b\x32(.spark.connect.Expression.LambdaFunctionH\x00R\x0elambdaFunction\x12:\n\x06window\x18\x0b \x01(\x0b\x32 .spark.connect.Expression.WindowH\x00R\x06window\x12l\n\x18unresolved_extract_value\x18\x0c \x01(\x0b\x32\x30.spark.connect.Expression.UnresolvedExtractValueH\x00R\x16unresolvedExtractValue\x12M\n\rupdate_fields\x18\r \x01(\x0b\x32&.spark.connect.Expression.UpdateFieldsH\x00R\x0cupdateFields\x12\x82\x01\n unresolved_named_lambda_variable\x18\x0e \x01(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableH\x00R\x1dunresolvedNamedLambdaVariable\x12~\n#common_inline_user_defined_function\x18\x0f \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x1f\x63ommonInlineUserDefinedFunction\x12\x42\n\rcall_function\x18\x10 \x01(\x0b\x32\x1b.spark.connect.CallFunctionH\x00R\x0c\x63\x61llFunction\x12\x64\n\x19named_argument_expression\x18\x11 \x01(\x0b\x32&.spark.connect.NamedArgumentExpressionH\x00R\x17namedArgumentExpression\x12\x35\n\textension\x18\xe7\x07 
\x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x1a\x8f\x06\n\x06Window\x12\x42\n\x0fwindow_function\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0ewindowFunction\x12@\n\x0epartition_spec\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\rpartitionSpec\x12\x42\n\norder_spec\x18\x03 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\torderSpec\x12K\n\nframe_spec\x18\x04 \x01(\x0b\x32,.spark.connect.Expression.Window.WindowFrameR\tframeSpec\x1a\xed\x03\n\x0bWindowFrame\x12U\n\nframe_type\x18\x01 \x01(\x0e\x32\x36.spark.connect.Expression.Window.WindowFrame.FrameTypeR\tframeType\x12P\n\x05lower\x18\x02 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05lower\x12P\n\x05upper\x18\x03 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05upper\x1a\x91\x01\n\rFrameBoundary\x12!\n\x0b\x63urrent_row\x18\x01 \x01(\x08H\x00R\ncurrentRow\x12\x1e\n\tunbounded\x18\x02 \x01(\x08H\x00R\tunbounded\x12\x31\n\x05value\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\x05valueB\n\n\x08\x62oundary"O\n\tFrameType\x12\x18\n\x14\x46RAME_TYPE_UNDEFINED\x10\x00\x12\x12\n\x0e\x46RAME_TYPE_ROW\x10\x01\x12\x14\n\x10\x46RAME_TYPE_RANGE\x10\x02\x1a\xa9\x03\n\tSortOrder\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12O\n\tdirection\x18\x02 \x01(\x0e\x32\x31.spark.connect.Expression.SortOrder.SortDirectionR\tdirection\x12U\n\rnull_ordering\x18\x03 \x01(\x0e\x32\x30.spark.connect.Expression.SortOrder.NullOrderingR\x0cnullOrdering"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"U\n\x0cNullOrdering\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x1a\xbb\x02\n\x04\x43\x61st\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12-\n\x04type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04type\x12\x1b\n\x08type_str\x18\x03 \x01(\tH\x00R\x07typeStr\x12\x44\n\teval_mode\x18\x04 \x01(\x0e\x32\'.spark.connect.Expression.Cast.EvalModeR\x08\x65valMode"b\n\x08\x45valMode\x12\x19\n\x15\x45VAL_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x45VAL_MODE_LEGACY\x10\x01\x12\x12\n\x0e\x45VAL_MODE_ANSI\x10\x02\x12\x11\n\rEVAL_MODE_TRY\x10\x03\x42\x0e\n\x0c\x63\x61st_to_type\x1a\x9b\x0c\n\x07Literal\x12-\n\x04null\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04null\x12\x18\n\x06\x62inary\x18\x02 \x01(\x0cH\x00R\x06\x62inary\x12\x1a\n\x07\x62oolean\x18\x03 \x01(\x08H\x00R\x07\x62oolean\x12\x14\n\x04\x62yte\x18\x04 \x01(\x05H\x00R\x04\x62yte\x12\x16\n\x05short\x18\x05 \x01(\x05H\x00R\x05short\x12\x1a\n\x07integer\x18\x06 \x01(\x05H\x00R\x07integer\x12\x14\n\x04long\x18\x07 \x01(\x03H\x00R\x04long\x12\x16\n\x05\x66loat\x18\n \x01(\x02H\x00R\x05\x66loat\x12\x18\n\x06\x64ouble\x18\x0b \x01(\x01H\x00R\x06\x64ouble\x12\x45\n\x07\x64\x65\x63imal\x18\x0c \x01(\x0b\x32).spark.connect.Expression.Literal.DecimalH\x00R\x07\x64\x65\x63imal\x12\x18\n\x06string\x18\r \x01(\tH\x00R\x06string\x12\x14\n\x04\x64\x61te\x18\x10 \x01(\x05H\x00R\x04\x64\x61te\x12\x1e\n\ttimestamp\x18\x11 \x01(\x03H\x00R\ttimestamp\x12%\n\rtimestamp_ntz\x18\x12 \x01(\x03H\x00R\x0ctimestampNtz\x12\x61\n\x11\x63\x61lendar_interval\x18\x13 \x01(\x0b\x32\x32.spark.connect.Expression.Literal.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12\x30\n\x13year_month_interval\x18\x14 \x01(\x05H\x00R\x11yearMonthInterval\x12,\n\x11\x64\x61y_time_interval\x18\x15 
\x01(\x03H\x00R\x0f\x64\x61yTimeInterval\x12?\n\x05\x61rray\x18\x16 \x01(\x0b\x32\'.spark.connect.Expression.Literal.ArrayH\x00R\x05\x61rray\x12\x39\n\x03map\x18\x17 \x01(\x0b\x32%.spark.connect.Expression.Literal.MapH\x00R\x03map\x12\x42\n\x06struct\x18\x18 \x01(\x0b\x32(.spark.connect.Expression.Literal.StructH\x00R\x06struct\x1au\n\x07\x44\x65\x63imal\x12\x14\n\x05value\x18\x01 \x01(\tR\x05value\x12!\n\tprecision\x18\x02 \x01(\x05H\x00R\tprecision\x88\x01\x01\x12\x19\n\x05scale\x18\x03 \x01(\x05H\x01R\x05scale\x88\x01\x01\x42\x0c\n\n_precisionB\x08\n\x06_scale\x1a\x62\n\x10\x43\x61lendarInterval\x12\x16\n\x06months\x18\x01 \x01(\x05R\x06months\x12\x12\n\x04\x64\x61ys\x18\x02 \x01(\x05R\x04\x64\x61ys\x12"\n\x0cmicroseconds\x18\x03 \x01(\x03R\x0cmicroseconds\x1a\x82\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lements\x1a\xe3\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12\x35\n\x04keys\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x04keys\x12\x39\n\x06values\x18\x04 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1a\x81\x01\n\x06Struct\x12\x38\n\x0bstruct_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\nstructType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lementsB\x0e\n\x0cliteral_type\x1a\xba\x01\n\x13UnresolvedAttribute\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12\x31\n\x12is_metadata_column\x18\x03 \x01(\x08H\x01R\x10isMetadataColumn\x88\x01\x01\x42\n\n\x08_plan_idB\x15\n\x13_is_metadata_column\x1a\xcc\x01\n\x12UnresolvedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x1f\n\x0bis_distinct\x18\x03 \x01(\x08R\nisDistinct\x12\x37\n\x18is_user_defined_function\x18\x04 \x01(\x08R\x15isUserDefinedFunction\x1a\x32\n\x10\x45xpressionString\x12\x1e\n\nexpression\x18\x01 \x01(\tR\nexpression\x1a|\n\x0eUnresolvedStar\x12,\n\x0funparsed_target\x18\x01 \x01(\tH\x00R\x0eunparsedTarget\x88\x01\x01\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x01R\x06planId\x88\x01\x01\x42\x12\n\x10_unparsed_targetB\n\n\x08_plan_id\x1aV\n\x0fUnresolvedRegex\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\x84\x01\n\x16UnresolvedExtractValue\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12\x39\n\nextraction\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nextraction\x1a\xbb\x01\n\x0cUpdateFields\x12\x46\n\x11struct_expression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x10structExpression\x12\x1d\n\nfield_name\x18\x02 \x01(\tR\tfieldName\x12\x44\n\x10value_expression\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0fvalueExpression\x1ax\n\x05\x41lias\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12\x12\n\x04name\x18\x02 \x03(\tR\x04name\x12\x1f\n\x08metadata\x18\x03 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x9e\x01\n\x0eLambdaFunction\x12\x35\n\x08\x66unction\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08\x66unction\x12U\n\targuments\x18\x02 
\x03(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableR\targuments\x1a>\n\x1dUnresolvedNamedLambdaVariable\x12\x1d\n\nname_parts\x18\x01 \x03(\tR\tnamePartsB\x0b\n\texpr_type"\xec\x02\n\x1f\x43ommonInlineUserDefinedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x39\n\npython_udf\x18\x04 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\tpythonUdf\x12I\n\x10scalar_scala_udf\x18\x05 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\x0escalarScalaUdf\x12\x33\n\x08java_udf\x18\x06 \x01(\x0b\x32\x16.spark.connect.JavaUDFH\x00R\x07javaUdfB\n\n\x08\x66unction"\x9b\x01\n\tPythonUDF\x12\x38\n\x0boutput_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVer"\xb8\x01\n\x0eScalarScalaUDF\x12\x18\n\x07payload\x18\x01 \x01(\x0cR\x07payload\x12\x37\n\ninputTypes\x18\x02 \x03(\x0b\x32\x17.spark.connect.DataTypeR\ninputTypes\x12\x37\n\noutputType\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1a\n\x08nullable\x18\x04 \x01(\x08R\x08nullable"\x95\x01\n\x07JavaUDF\x12\x1d\n\nclass_name\x18\x01 \x01(\tR\tclassName\x12=\n\x0boutput_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\noutputType\x88\x01\x01\x12\x1c\n\taggregate\x18\x03 \x01(\x08R\taggregateB\x0e\n\x0c_output_type"l\n\x0c\x43\x61llFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"\\\n\x17NamedArgumentExpression\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05valueB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1fspark/connect/expressions.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x19spark/connect/types.proto\x1a\x1aspark/connect/common.proto"\x97/\n\nExpression\x12\x37\n\x06\x63ommon\x18\x12 \x01(\x0b\x32\x1f.spark.connect.ExpressionCommonR\x06\x63ommon\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12\x62\n\x14unresolved_attribute\x18\x02 \x01(\x0b\x32-.spark.connect.Expression.UnresolvedAttributeH\x00R\x13unresolvedAttribute\x12_\n\x13unresolved_function\x18\x03 \x01(\x0b\x32,.spark.connect.Expression.UnresolvedFunctionH\x00R\x12unresolvedFunction\x12Y\n\x11\x65xpression_string\x18\x04 \x01(\x0b\x32*.spark.connect.Expression.ExpressionStringH\x00R\x10\x65xpressionString\x12S\n\x0funresolved_star\x18\x05 \x01(\x0b\x32(.spark.connect.Expression.UnresolvedStarH\x00R\x0eunresolvedStar\x12\x37\n\x05\x61lias\x18\x06 \x01(\x0b\x32\x1f.spark.connect.Expression.AliasH\x00R\x05\x61lias\x12\x34\n\x04\x63\x61st\x18\x07 \x01(\x0b\x32\x1e.spark.connect.Expression.CastH\x00R\x04\x63\x61st\x12V\n\x10unresolved_regex\x18\x08 \x01(\x0b\x32).spark.connect.Expression.UnresolvedRegexH\x00R\x0funresolvedRegex\x12\x44\n\nsort_order\x18\t \x01(\x0b\x32#.spark.connect.Expression.SortOrderH\x00R\tsortOrder\x12S\n\x0flambda_function\x18\n \x01(\x0b\x32(.spark.connect.Expression.LambdaFunctionH\x00R\x0elambdaFunction\x12:\n\x06window\x18\x0b \x01(\x0b\x32 .spark.connect.Expression.WindowH\x00R\x06window\x12l\n\x18unresolved_extract_value\x18\x0c 
\x01(\x0b\x32\x30.spark.connect.Expression.UnresolvedExtractValueH\x00R\x16unresolvedExtractValue\x12M\n\rupdate_fields\x18\r \x01(\x0b\x32&.spark.connect.Expression.UpdateFieldsH\x00R\x0cupdateFields\x12\x82\x01\n unresolved_named_lambda_variable\x18\x0e \x01(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableH\x00R\x1dunresolvedNamedLambdaVariable\x12~\n#common_inline_user_defined_function\x18\x0f \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x1f\x63ommonInlineUserDefinedFunction\x12\x42\n\rcall_function\x18\x10 \x01(\x0b\x32\x1b.spark.connect.CallFunctionH\x00R\x0c\x63\x61llFunction\x12\x64\n\x19named_argument_expression\x18\x11 \x01(\x0b\x32&.spark.connect.NamedArgumentExpressionH\x00R\x17namedArgumentExpression\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x1a\x8f\x06\n\x06Window\x12\x42\n\x0fwindow_function\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0ewindowFunction\x12@\n\x0epartition_spec\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\rpartitionSpec\x12\x42\n\norder_spec\x18\x03 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\torderSpec\x12K\n\nframe_spec\x18\x04 \x01(\x0b\x32,.spark.connect.Expression.Window.WindowFrameR\tframeSpec\x1a\xed\x03\n\x0bWindowFrame\x12U\n\nframe_type\x18\x01 \x01(\x0e\x32\x36.spark.connect.Expression.Window.WindowFrame.FrameTypeR\tframeType\x12P\n\x05lower\x18\x02 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05lower\x12P\n\x05upper\x18\x03 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05upper\x1a\x91\x01\n\rFrameBoundary\x12!\n\x0b\x63urrent_row\x18\x01 \x01(\x08H\x00R\ncurrentRow\x12\x1e\n\tunbounded\x18\x02 \x01(\x08H\x00R\tunbounded\x12\x31\n\x05value\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\x05valueB\n\n\x08\x62oundary"O\n\tFrameType\x12\x18\n\x14\x46RAME_TYPE_UNDEFINED\x10\x00\x12\x12\n\x0e\x46RAME_TYPE_ROW\x10\x01\x12\x14\n\x10\x46RAME_TYPE_RANGE\x10\x02\x1a\xa9\x03\n\tSortOrder\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12O\n\tdirection\x18\x02 \x01(\x0e\x32\x31.spark.connect.Expression.SortOrder.SortDirectionR\tdirection\x12U\n\rnull_ordering\x18\x03 \x01(\x0e\x32\x30.spark.connect.Expression.SortOrder.NullOrderingR\x0cnullOrdering"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"U\n\x0cNullOrdering\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x1a\xbb\x02\n\x04\x43\x61st\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12-\n\x04type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04type\x12\x1b\n\x08type_str\x18\x03 \x01(\tH\x00R\x07typeStr\x12\x44\n\teval_mode\x18\x04 \x01(\x0e\x32\'.spark.connect.Expression.Cast.EvalModeR\x08\x65valMode"b\n\x08\x45valMode\x12\x19\n\x15\x45VAL_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x45VAL_MODE_LEGACY\x10\x01\x12\x12\n\x0e\x45VAL_MODE_ANSI\x10\x02\x12\x11\n\rEVAL_MODE_TRY\x10\x03\x42\x0e\n\x0c\x63\x61st_to_type\x1a\x9b\x0c\n\x07Literal\x12-\n\x04null\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04null\x12\x18\n\x06\x62inary\x18\x02 \x01(\x0cH\x00R\x06\x62inary\x12\x1a\n\x07\x62oolean\x18\x03 \x01(\x08H\x00R\x07\x62oolean\x12\x14\n\x04\x62yte\x18\x04 \x01(\x05H\x00R\x04\x62yte\x12\x16\n\x05short\x18\x05 \x01(\x05H\x00R\x05short\x12\x1a\n\x07integer\x18\x06 
\x01(\x05H\x00R\x07integer\x12\x14\n\x04long\x18\x07 \x01(\x03H\x00R\x04long\x12\x16\n\x05\x66loat\x18\n \x01(\x02H\x00R\x05\x66loat\x12\x18\n\x06\x64ouble\x18\x0b \x01(\x01H\x00R\x06\x64ouble\x12\x45\n\x07\x64\x65\x63imal\x18\x0c \x01(\x0b\x32).spark.connect.Expression.Literal.DecimalH\x00R\x07\x64\x65\x63imal\x12\x18\n\x06string\x18\r \x01(\tH\x00R\x06string\x12\x14\n\x04\x64\x61te\x18\x10 \x01(\x05H\x00R\x04\x64\x61te\x12\x1e\n\ttimestamp\x18\x11 \x01(\x03H\x00R\ttimestamp\x12%\n\rtimestamp_ntz\x18\x12 \x01(\x03H\x00R\x0ctimestampNtz\x12\x61\n\x11\x63\x61lendar_interval\x18\x13 \x01(\x0b\x32\x32.spark.connect.Expression.Literal.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12\x30\n\x13year_month_interval\x18\x14 \x01(\x05H\x00R\x11yearMonthInterval\x12,\n\x11\x64\x61y_time_interval\x18\x15 \x01(\x03H\x00R\x0f\x64\x61yTimeInterval\x12?\n\x05\x61rray\x18\x16 \x01(\x0b\x32\'.spark.connect.Expression.Literal.ArrayH\x00R\x05\x61rray\x12\x39\n\x03map\x18\x17 \x01(\x0b\x32%.spark.connect.Expression.Literal.MapH\x00R\x03map\x12\x42\n\x06struct\x18\x18 \x01(\x0b\x32(.spark.connect.Expression.Literal.StructH\x00R\x06struct\x1au\n\x07\x44\x65\x63imal\x12\x14\n\x05value\x18\x01 \x01(\tR\x05value\x12!\n\tprecision\x18\x02 \x01(\x05H\x00R\tprecision\x88\x01\x01\x12\x19\n\x05scale\x18\x03 \x01(\x05H\x01R\x05scale\x88\x01\x01\x42\x0c\n\n_precisionB\x08\n\x06_scale\x1a\x62\n\x10\x43\x61lendarInterval\x12\x16\n\x06months\x18\x01 \x01(\x05R\x06months\x12\x12\n\x04\x64\x61ys\x18\x02 \x01(\x05R\x04\x64\x61ys\x12"\n\x0cmicroseconds\x18\x03 \x01(\x03R\x0cmicroseconds\x1a\x82\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lements\x1a\xe3\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12\x35\n\x04keys\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x04keys\x12\x39\n\x06values\x18\x04 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1a\x81\x01\n\x06Struct\x12\x38\n\x0bstruct_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\nstructType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lementsB\x0e\n\x0cliteral_type\x1a\xba\x01\n\x13UnresolvedAttribute\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12\x31\n\x12is_metadata_column\x18\x03 \x01(\x08H\x01R\x10isMetadataColumn\x88\x01\x01\x42\n\n\x08_plan_idB\x15\n\x13_is_metadata_column\x1a\xcc\x01\n\x12UnresolvedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x1f\n\x0bis_distinct\x18\x03 \x01(\x08R\nisDistinct\x12\x37\n\x18is_user_defined_function\x18\x04 \x01(\x08R\x15isUserDefinedFunction\x1a\x32\n\x10\x45xpressionString\x12\x1e\n\nexpression\x18\x01 \x01(\tR\nexpression\x1a|\n\x0eUnresolvedStar\x12,\n\x0funparsed_target\x18\x01 \x01(\tH\x00R\x0eunparsedTarget\x88\x01\x01\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x01R\x06planId\x88\x01\x01\x42\x12\n\x10_unparsed_targetB\n\n\x08_plan_id\x1aV\n\x0fUnresolvedRegex\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\x84\x01\n\x16UnresolvedExtractValue\x12/\n\x05\x63hild\x18\x01 
\x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12\x39\n\nextraction\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nextraction\x1a\xbb\x01\n\x0cUpdateFields\x12\x46\n\x11struct_expression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x10structExpression\x12\x1d\n\nfield_name\x18\x02 \x01(\tR\tfieldName\x12\x44\n\x10value_expression\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0fvalueExpression\x1ax\n\x05\x41lias\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12\x12\n\x04name\x18\x02 \x03(\tR\x04name\x12\x1f\n\x08metadata\x18\x03 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x9e\x01\n\x0eLambdaFunction\x12\x35\n\x08\x66unction\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08\x66unction\x12U\n\targuments\x18\x02 \x03(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableR\targuments\x1a>\n\x1dUnresolvedNamedLambdaVariable\x12\x1d\n\nname_parts\x18\x01 \x03(\tR\tnamePartsB\x0b\n\texpr_type"A\n\x10\x45xpressionCommon\x12-\n\x06origin\x18\x01 \x01(\x0b\x32\x15.spark.connect.OriginR\x06origin"\xec\x02\n\x1f\x43ommonInlineUserDefinedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x39\n\npython_udf\x18\x04 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\tpythonUdf\x12I\n\x10scalar_scala_udf\x18\x05 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\x0escalarScalaUdf\x12\x33\n\x08java_udf\x18\x06 \x01(\x0b\x32\x16.spark.connect.JavaUDFH\x00R\x07javaUdfB\n\n\x08\x66unction"\xcc\x01\n\tPythonUDF\x12\x38\n\x0boutput_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVer\x12/\n\x13\x61\x64\x64itional_includes\x18\x05 \x03(\tR\x12\x61\x64\x64itionalIncludes"\xd6\x01\n\x0eScalarScalaUDF\x12\x18\n\x07payload\x18\x01 \x01(\x0cR\x07payload\x12\x37\n\ninputTypes\x18\x02 \x03(\x0b\x32\x17.spark.connect.DataTypeR\ninputTypes\x12\x37\n\noutputType\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1a\n\x08nullable\x18\x04 \x01(\x08R\x08nullable\x12\x1c\n\taggregate\x18\x05 \x01(\x08R\taggregate"\x95\x01\n\x07JavaUDF\x12\x1d\n\nclass_name\x18\x01 \x01(\tR\tclassName\x12=\n\x0boutput_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\noutputType\x88\x01\x01\x12\x1c\n\taggregate\x18\x03 \x01(\x08R\taggregateB\x0e\n\x0c_output_type"l\n\x0c\x43\x61llFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"\\\n\x17NamedArgumentExpression\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05valueB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -45,68 +46,70 @@ DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" ) - _EXPRESSION._serialized_start = 105 - _EXPRESSION._serialized_end = 6087 - _EXPRESSION_WINDOW._serialized_start = 1645 - _EXPRESSION_WINDOW._serialized_end = 2428 - _EXPRESSION_WINDOW_WINDOWFRAME._serialized_start = 1935 - _EXPRESSION_WINDOW_WINDOWFRAME._serialized_end = 2428 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_start = 2202 - 
_EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_end = 2347 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_start = 2349 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_end = 2428 - _EXPRESSION_SORTORDER._serialized_start = 2431 - _EXPRESSION_SORTORDER._serialized_end = 2856 - _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_start = 2661 - _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_end = 2769 - _EXPRESSION_SORTORDER_NULLORDERING._serialized_start = 2771 - _EXPRESSION_SORTORDER_NULLORDERING._serialized_end = 2856 - _EXPRESSION_CAST._serialized_start = 2859 - _EXPRESSION_CAST._serialized_end = 3174 - _EXPRESSION_CAST_EVALMODE._serialized_start = 3060 - _EXPRESSION_CAST_EVALMODE._serialized_end = 3158 - _EXPRESSION_LITERAL._serialized_start = 3177 - _EXPRESSION_LITERAL._serialized_end = 4740 - _EXPRESSION_LITERAL_DECIMAL._serialized_start = 4012 - _EXPRESSION_LITERAL_DECIMAL._serialized_end = 4129 - _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_start = 4131 - _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_end = 4229 - _EXPRESSION_LITERAL_ARRAY._serialized_start = 4232 - _EXPRESSION_LITERAL_ARRAY._serialized_end = 4362 - _EXPRESSION_LITERAL_MAP._serialized_start = 4365 - _EXPRESSION_LITERAL_MAP._serialized_end = 4592 - _EXPRESSION_LITERAL_STRUCT._serialized_start = 4595 - _EXPRESSION_LITERAL_STRUCT._serialized_end = 4724 - _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_start = 4743 - _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_end = 4929 - _EXPRESSION_UNRESOLVEDFUNCTION._serialized_start = 4932 - _EXPRESSION_UNRESOLVEDFUNCTION._serialized_end = 5136 - _EXPRESSION_EXPRESSIONSTRING._serialized_start = 5138 - _EXPRESSION_EXPRESSIONSTRING._serialized_end = 5188 - _EXPRESSION_UNRESOLVEDSTAR._serialized_start = 5190 - _EXPRESSION_UNRESOLVEDSTAR._serialized_end = 5314 - _EXPRESSION_UNRESOLVEDREGEX._serialized_start = 5316 - _EXPRESSION_UNRESOLVEDREGEX._serialized_end = 5402 - _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_start = 5405 - _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_end = 5537 - _EXPRESSION_UPDATEFIELDS._serialized_start = 5540 - _EXPRESSION_UPDATEFIELDS._serialized_end = 5727 - _EXPRESSION_ALIAS._serialized_start = 5729 - _EXPRESSION_ALIAS._serialized_end = 5849 - _EXPRESSION_LAMBDAFUNCTION._serialized_start = 5852 - _EXPRESSION_LAMBDAFUNCTION._serialized_end = 6010 - _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_start = 6012 - _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_end = 6074 - _COMMONINLINEUSERDEFINEDFUNCTION._serialized_start = 6090 - _COMMONINLINEUSERDEFINEDFUNCTION._serialized_end = 6454 - _PYTHONUDF._serialized_start = 6457 - _PYTHONUDF._serialized_end = 6612 - _SCALARSCALAUDF._serialized_start = 6615 - _SCALARSCALAUDF._serialized_end = 6799 - _JAVAUDF._serialized_start = 6802 - _JAVAUDF._serialized_end = 6951 - _CALLFUNCTION._serialized_start = 6953 - _CALLFUNCTION._serialized_end = 7061 - _NAMEDARGUMENTEXPRESSION._serialized_start = 7063 - _NAMEDARGUMENTEXPRESSION._serialized_end = 7155 + _EXPRESSION._serialized_start = 133 + _EXPRESSION._serialized_end = 6172 + _EXPRESSION_WINDOW._serialized_start = 1730 + _EXPRESSION_WINDOW._serialized_end = 2513 + _EXPRESSION_WINDOW_WINDOWFRAME._serialized_start = 2020 + _EXPRESSION_WINDOW_WINDOWFRAME._serialized_end = 2513 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_start = 2287 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_end = 2432 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_start = 2434 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_end 
= 2513 + _EXPRESSION_SORTORDER._serialized_start = 2516 + _EXPRESSION_SORTORDER._serialized_end = 2941 + _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_start = 2746 + _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_end = 2854 + _EXPRESSION_SORTORDER_NULLORDERING._serialized_start = 2856 + _EXPRESSION_SORTORDER_NULLORDERING._serialized_end = 2941 + _EXPRESSION_CAST._serialized_start = 2944 + _EXPRESSION_CAST._serialized_end = 3259 + _EXPRESSION_CAST_EVALMODE._serialized_start = 3145 + _EXPRESSION_CAST_EVALMODE._serialized_end = 3243 + _EXPRESSION_LITERAL._serialized_start = 3262 + _EXPRESSION_LITERAL._serialized_end = 4825 + _EXPRESSION_LITERAL_DECIMAL._serialized_start = 4097 + _EXPRESSION_LITERAL_DECIMAL._serialized_end = 4214 + _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_start = 4216 + _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_end = 4314 + _EXPRESSION_LITERAL_ARRAY._serialized_start = 4317 + _EXPRESSION_LITERAL_ARRAY._serialized_end = 4447 + _EXPRESSION_LITERAL_MAP._serialized_start = 4450 + _EXPRESSION_LITERAL_MAP._serialized_end = 4677 + _EXPRESSION_LITERAL_STRUCT._serialized_start = 4680 + _EXPRESSION_LITERAL_STRUCT._serialized_end = 4809 + _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_start = 4828 + _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_end = 5014 + _EXPRESSION_UNRESOLVEDFUNCTION._serialized_start = 5017 + _EXPRESSION_UNRESOLVEDFUNCTION._serialized_end = 5221 + _EXPRESSION_EXPRESSIONSTRING._serialized_start = 5223 + _EXPRESSION_EXPRESSIONSTRING._serialized_end = 5273 + _EXPRESSION_UNRESOLVEDSTAR._serialized_start = 5275 + _EXPRESSION_UNRESOLVEDSTAR._serialized_end = 5399 + _EXPRESSION_UNRESOLVEDREGEX._serialized_start = 5401 + _EXPRESSION_UNRESOLVEDREGEX._serialized_end = 5487 + _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_start = 5490 + _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_end = 5622 + _EXPRESSION_UPDATEFIELDS._serialized_start = 5625 + _EXPRESSION_UPDATEFIELDS._serialized_end = 5812 + _EXPRESSION_ALIAS._serialized_start = 5814 + _EXPRESSION_ALIAS._serialized_end = 5934 + _EXPRESSION_LAMBDAFUNCTION._serialized_start = 5937 + _EXPRESSION_LAMBDAFUNCTION._serialized_end = 6095 + _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_start = 6097 + _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_end = 6159 + _EXPRESSIONCOMMON._serialized_start = 6174 + _EXPRESSIONCOMMON._serialized_end = 6239 + _COMMONINLINEUSERDEFINEDFUNCTION._serialized_start = 6242 + _COMMONINLINEUSERDEFINEDFUNCTION._serialized_end = 6606 + _PYTHONUDF._serialized_start = 6609 + _PYTHONUDF._serialized_end = 6813 + _SCALARSCALAUDF._serialized_start = 6816 + _SCALARSCALAUDF._serialized_end = 7030 + _JAVAUDF._serialized_start = 7033 + _JAVAUDF._serialized_end = 7182 + _CALLFUNCTION._serialized_start = 7184 + _CALLFUNCTION._serialized_end = 7292 + _NAMEDARGUMENTEXPRESSION._serialized_start = 7294 + _NAMEDARGUMENTEXPRESSION._serialized_end = 7386 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.pyi b/python/pyspark/sql/connect/proto/expressions_pb2.pyi index 183a839da9204..42031d47bb851 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.pyi +++ b/python/pyspark/sql/connect/proto/expressions_pb2.pyi @@ -40,6 +40,7 @@ import google.protobuf.descriptor import google.protobuf.internal.containers import google.protobuf.internal.enum_type_wrapper import google.protobuf.message +import pyspark.sql.connect.proto.common_pb2 import pyspark.sql.connect.proto.types_pb2 import sys import typing @@ -1163,6 +1164,7 @@ class 
Expression(google.protobuf.message.Message): self, field_name: typing_extensions.Literal["name_parts", b"name_parts"] ) -> None: ... + COMMON_FIELD_NUMBER: builtins.int LITERAL_FIELD_NUMBER: builtins.int UNRESOLVED_ATTRIBUTE_FIELD_NUMBER: builtins.int UNRESOLVED_FUNCTION_FIELD_NUMBER: builtins.int @@ -1182,6 +1184,8 @@ class Expression(google.protobuf.message.Message): NAMED_ARGUMENT_EXPRESSION_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int @property + def common(self) -> global___ExpressionCommon: ... + @property def literal(self) -> global___Expression.Literal: ... @property def unresolved_attribute(self) -> global___Expression.UnresolvedAttribute: ... @@ -1225,6 +1229,7 @@ class Expression(google.protobuf.message.Message): def __init__( self, *, + common: global___ExpressionCommon | None = ..., literal: global___Expression.Literal | None = ..., unresolved_attribute: global___Expression.UnresolvedAttribute | None = ..., unresolved_function: global___Expression.UnresolvedFunction | None = ..., @@ -1254,6 +1259,8 @@ class Expression(google.protobuf.message.Message): b"call_function", "cast", b"cast", + "common", + b"common", "common_inline_user_defined_function", b"common_inline_user_defined_function", "expr_type", @@ -1297,6 +1304,8 @@ class Expression(google.protobuf.message.Message): b"call_function", "cast", b"cast", + "common", + b"common", "common_inline_user_defined_function", b"common_inline_user_defined_function", "expr_type", @@ -1359,6 +1368,25 @@ class Expression(google.protobuf.message.Message): global___Expression = Expression +class ExpressionCommon(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ORIGIN_FIELD_NUMBER: builtins.int + @property + def origin(self) -> pyspark.sql.connect.proto.common_pb2.Origin: + """(Required) Keep the information of the origin for this expression such as stacktrace.""" + def __init__( + self, + *, + origin: pyspark.sql.connect.proto.common_pb2.Origin | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["origin", b"origin"] + ) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["origin", b"origin"]) -> None: ... + +global___ExpressionCommon = ExpressionCommon + class CommonInlineUserDefinedFunction(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -1438,6 +1466,7 @@ class PythonUDF(google.protobuf.message.Message): EVAL_TYPE_FIELD_NUMBER: builtins.int COMMAND_FIELD_NUMBER: builtins.int PYTHON_VER_FIELD_NUMBER: builtins.int + ADDITIONAL_INCLUDES_FIELD_NUMBER: builtins.int @property def output_type(self) -> pyspark.sql.connect.proto.types_pb2.DataType: """(Required) Output type of the Python UDF""" @@ -1447,6 +1476,11 @@ class PythonUDF(google.protobuf.message.Message): """(Required) The encoded commands of the Python UDF""" python_ver: builtins.str """(Required) Python version being used in the client.""" + @property + def additional_includes( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: + """(Optional) Additional includes for the Python UDF.""" def __init__( self, *, @@ -1454,6 +1488,7 @@ class PythonUDF(google.protobuf.message.Message): eval_type: builtins.int = ..., command: builtins.bytes = ..., python_ver: builtins.str = ..., + additional_includes: collections.abc.Iterable[builtins.str] | None = ..., ) -> None: ... 
def HasField( self, field_name: typing_extensions.Literal["output_type", b"output_type"] @@ -1461,6 +1496,8 @@ class PythonUDF(google.protobuf.message.Message): def ClearField( self, field_name: typing_extensions.Literal[ + "additional_includes", + b"additional_includes", "command", b"command", "eval_type", @@ -1481,6 +1518,7 @@ class ScalarScalaUDF(google.protobuf.message.Message): INPUTTYPES_FIELD_NUMBER: builtins.int OUTPUTTYPE_FIELD_NUMBER: builtins.int NULLABLE_FIELD_NUMBER: builtins.int + AGGREGATE_FIELD_NUMBER: builtins.int payload: builtins.bytes """(Required) Serialized JVM object containing UDF definition, input encoders and output encoder""" @property @@ -1495,6 +1533,8 @@ class ScalarScalaUDF(google.protobuf.message.Message): """(Required) Output type of the UDF""" nullable: builtins.bool """(Required) True if the UDF can return null value""" + aggregate: builtins.bool + """(Required) Indicate if the UDF is an aggregate function""" def __init__( self, *, @@ -1503,6 +1543,7 @@ class ScalarScalaUDF(google.protobuf.message.Message): | None = ..., outputType: pyspark.sql.connect.proto.types_pb2.DataType | None = ..., nullable: builtins.bool = ..., + aggregate: builtins.bool = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal["outputType", b"outputType"] @@ -1510,6 +1551,8 @@ class ScalarScalaUDF(google.protobuf.message.Message): def ClearField( self, field_name: typing_extensions.Literal[ + "aggregate", + b"aggregate", "inputTypes", b"inputTypes", "nullable", diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 467d0610bbc60..9f4d1e717a28d 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -36,7 +36,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto\x1a\x1aspark/connect/common.proto"\xe9\x1a\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 
\x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12*\n\x05to_df\x18\x12 \x01(\x0b\x32\x13.spark.connect.ToDFH\x00R\x04toDf\x12U\n\x14with_columns_renamed\x18\x13 \x01(\x0b\x32!.spark.connect.WithColumnsRenamedH\x00R\x12withColumnsRenamed\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12)\n\x04tail\x18\x16 \x01(\x0b\x32\x13.spark.connect.TailH\x00R\x04tail\x12?\n\x0cwith_columns\x18\x17 \x01(\x0b\x32\x1a.spark.connect.WithColumnsH\x00R\x0bwithColumns\x12)\n\x04hint\x18\x18 \x01(\x0b\x32\x13.spark.connect.HintH\x00R\x04hint\x12\x32\n\x07unpivot\x18\x19 \x01(\x0b\x32\x16.spark.connect.UnpivotH\x00R\x07unpivot\x12\x36\n\tto_schema\x18\x1a \x01(\x0b\x32\x17.spark.connect.ToSchemaH\x00R\x08toSchema\x12\x64\n\x19repartition_by_expression\x18\x1b \x01(\x0b\x32&.spark.connect.RepartitionByExpressionH\x00R\x17repartitionByExpression\x12\x45\n\x0emap_partitions\x18\x1c \x01(\x0b\x32\x1c.spark.connect.MapPartitionsH\x00R\rmapPartitions\x12H\n\x0f\x63ollect_metrics\x18\x1d \x01(\x0b\x32\x1d.spark.connect.CollectMetricsH\x00R\x0e\x63ollectMetrics\x12,\n\x05parse\x18\x1e \x01(\x0b\x32\x14.spark.connect.ParseH\x00R\x05parse\x12\x36\n\tgroup_map\x18\x1f \x01(\x0b\x32\x17.spark.connect.GroupMapH\x00R\x08groupMap\x12=\n\x0c\x63o_group_map\x18 \x01(\x0b\x32\x19.spark.connect.CoGroupMapH\x00R\ncoGroupMap\x12\x45\n\x0ewith_watermark\x18! \x01(\x0b\x32\x1c.spark.connect.WithWatermarkH\x00R\rwithWatermark\x12\x63\n\x1a\x61pply_in_pandas_with_state\x18" \x01(\x0b\x32%.spark.connect.ApplyInPandasWithStateH\x00R\x16\x61pplyInPandasWithState\x12<\n\x0bhtml_string\x18# \x01(\x0b\x32\x19.spark.connect.HtmlStringH\x00R\nhtmlString\x12X\n\x15\x63\x61\x63hed_local_relation\x18$ \x01(\x0b\x32".spark.connect.CachedLocalRelationH\x00R\x13\x63\x61\x63hedLocalRelation\x12[\n\x16\x63\x61\x63hed_remote_relation\x18% \x01(\x0b\x32#.spark.connect.CachedRemoteRelationH\x00R\x14\x63\x61\x63hedRemoteRelation\x12\x8e\x01\n)common_inline_user_defined_table_function\x18& \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R$commonInlineUserDefinedTableFunction\x12\x37\n\nas_of_join\x18\' \x01(\x0b\x32\x17.spark.connect.AsOfJoinH\x00R\x08\x61sOfJoin\x12\x85\x01\n&common_inline_user_defined_data_source\x18( \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R!commonInlineUserDefinedDataSource\x12\x45\n\x0ewith_relations\x18) \x01(\x0b\x32\x1c.spark.connect.WithRelationsH\x00R\rwithRelations\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x30\n\x07\x64rop_na\x18[ \x01(\x0b\x32\x15.spark.connect.NADropH\x00R\x06\x64ropNa\x12\x34\n\x07replace\x18\\ \x01(\x0b\x32\x18.spark.connect.NAReplaceH\x00R\x07replace\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x39\n\x08\x64\x65scribe\x18\x66 \x01(\x0b\x32\x1b.spark.connect.StatDescribeH\x00R\x08\x64\x65scribe\x12*\n\x03\x63ov\x18g \x01(\x0b\x32\x16.spark.connect.StatCovH\x00R\x03\x63ov\x12-\n\x04\x63orr\x18h \x01(\x0b\x32\x17.spark.connect.StatCorrH\x00R\x04\x63orr\x12L\n\x0f\x61pprox_quantile\x18i \x01(\x0b\x32!.spark.connect.StatApproxQuantileH\x00R\x0e\x61pproxQuantile\x12=\n\nfreq_items\x18j \x01(\x0b\x32\x1c.spark.connect.StatFreqItemsH\x00R\tfreqItems\x12:\n\tsample_by\x18k 
\x01(\x0b\x32\x1b.spark.connect.StatSampleByH\x00R\x08sampleBy\x12\x33\n\x07\x63\x61talog\x18\xc8\x01 \x01(\x0b\x32\x16.spark.connect.CatalogH\x00R\x07\x63\x61talog\x12\x35\n\textension\x18\xe6\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\t\n\x07Unknown"[\n\x0eRelationCommon\x12\x1f\n\x0bsource_info\x18\x01 \x01(\tR\nsourceInfo\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id"\xde\x03\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32\x1c.spark.connect.SQL.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12O\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32&.spark.connect.SQL.NamedArgumentsEntryR\x0enamedArguments\x12>\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cposArguments\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"u\n\rWithRelations\x12+\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04root\x12\x37\n\nreferences\x18\x02 \x03(\x0b\x32\x17.spark.connect.RelationR\nreferences"\x97\x05\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x12!\n\x0cis_streaming\x18\x03 \x01(\x08R\x0bisStreaming\x1a\xc0\x01\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x45\n\x07options\x18\x02 \x03(\x0b\x32+.spark.connect.Read.NamedTable.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x95\x02\n\nDataSource\x12\x1b\n\x06\x66ormat\x18\x01 \x01(\tH\x00R\x06\x66ormat\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x12\x14\n\x05paths\x18\x04 \x03(\tR\x05paths\x12\x1e\n\npredicates\x18\x05 \x03(\tR\npredicates\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_formatB\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\x95\x05\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns\x12K\n\x0ejoin_data_type\x18\x06 \x01(\x0b\x32 .spark.connect.Join.JoinDataTypeH\x00R\x0cjoinDataType\x88\x01\x01\x1a\\\n\x0cJoinDataType\x12$\n\x0eis_left_struct\x18\x01 
\x01(\x08R\x0cisLeftStruct\x12&\n\x0fis_right_struct\x18\x02 \x01(\x08R\risRightStruct"\xd0\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06\x12\x13\n\x0fJOIN_TYPE_CROSS\x10\x07\x42\x11\n\x0f_join_data_type"\xdf\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01\x12\x37\n\x15\x61llow_missing_columns\x18\x06 \x01(\x08H\x02R\x13\x61llowMissingColumns\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_nameB\x18\n\x16_allow_missing_columns"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"K\n\x04Tail\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"\xfe\x05\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x41\n\ngroup_type\x18\x02 \x01(\x0e\x32".spark.connect.Aggregate.GroupTypeR\tgroupType\x12L\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12N\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x14\x61ggregateExpressions\x12\x34\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.spark.connect.Aggregate.PivotR\x05pivot\x12J\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.spark.connect.Aggregate.GroupingSetsR\x0cgroupingSets\x1ao\n\x05Pivot\x12+\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1aL\n\x0cGroupingSets\x12<\n\x0cgrouping_set\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0bgroupingSet"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05"\xa0\x01\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x05order\x18\x02 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\x05order\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x42\x0c\n\n_is_global"\x8d\x01\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x33\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07\x63olumns\x12!\n\x0c\x63olumn_names\x18\x03 \x03(\tR\x0b\x63olumnNames"\xf0\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 
\x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x12.\n\x10within_watermark\x18\x04 \x01(\x08H\x01R\x0fwithinWatermark\x88\x01\x01\x42\x16\n\x14_all_columns_as_keysB\x13\n\x11_within_watermark"Y\n\rLocalRelation\x12\x17\n\x04\x64\x61ta\x18\x01 \x01(\x0cH\x00R\x04\x64\x61ta\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x42\x07\n\x05_dataB\t\n\x07_schema"H\n\x13\x43\x61\x63hedLocalRelation\x12\x12\n\x04hash\x18\x03 \x01(\tR\x04hashJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03R\x06userIdR\tsessionId"7\n\x14\x43\x61\x63hedRemoteRelation\x12\x1f\n\x0brelation_id\x18\x01 \x01(\tR\nrelationId"\x91\x02\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x12/\n\x13\x64\x65terministic_order\x18\x06 \x01(\x08R\x12\x64\x65terministicOrderB\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 \x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8e\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 \x01(\x08R\x08vertical"r\n\nHtmlString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"Q\n\x0cStatDescribe\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"`\n\x07StatCov\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x89\x01\n\x08StatCorr\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2\x12\x1b\n\x06method\x18\x04 \x01(\tH\x00R\x06method\x88\x01\x01\x42\t\n\x07_method"\xa4\x01\n\x12StatApproxQuantile\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12$\n\rprobabilities\x18\x03 \x03(\x01R\rprobabilities\x12%\n\x0erelative_error\x18\x04 
\x01(\x01R\rrelativeError"}\n\rStatFreqItems\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x1d\n\x07support\x18\x03 \x01(\x01H\x00R\x07support\x88\x01\x01\x42\n\n\x08_support"\xb5\x02\n\x0cStatSampleBy\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03\x63ol\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x42\n\tfractions\x18\x03 \x03(\x0b\x32$.spark.connect.StatSampleBy.FractionR\tfractions\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x00R\x04seed\x88\x01\x01\x1a\x63\n\x08\x46raction\x12;\n\x07stratum\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x07stratum\x12\x1a\n\x08\x66raction\x18\x02 \x01(\x01R\x08\x66ractionB\x07\n\x05_seed"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"\x86\x01\n\x06NADrop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\'\n\rmin_non_nulls\x18\x03 \x01(\x05H\x00R\x0bminNonNulls\x88\x01\x01\x42\x10\n\x0e_min_non_nulls"\xa8\x02\n\tNAReplace\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12H\n\x0creplacements\x18\x03 \x03(\x0b\x32$.spark.connect.NAReplace.ReplacementR\x0creplacements\x1a\x8d\x01\n\x0bReplacement\x12>\n\told_value\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08oldValue\x12>\n\tnew_value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08newValue"X\n\x04ToDF\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\xfe\x02\n\x12WithColumnsRenamed\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12i\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x37.spark.connect.WithColumnsRenamed.RenameColumnsMapEntryB\x02\x18\x01R\x10renameColumnsMap\x12\x42\n\x07renames\x18\x03 \x03(\x0b\x32(.spark.connect.WithColumnsRenamed.RenameR\x07renames\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x45\n\x06Rename\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12 \n\x0cnew_col_name\x18\x02 \x01(\tR\nnewColName"w\n\x0bWithColumns\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x07\x61liases\x18\x02 \x03(\x0b\x32\x1f.spark.connect.Expression.AliasR\x07\x61liases"\x86\x01\n\rWithWatermark\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\nevent_time\x18\x02 \x01(\tR\teventTime\x12\'\n\x0f\x64\x65lay_threshold\x18\x03 \x01(\tR\x0e\x64\x65layThreshold"\x84\x01\n\x04Hint\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x39\n\nparameters\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\nparameters"\xc7\x02\n\x07Unpivot\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03ids\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x03ids\x12:\n\x06values\x18\x03 \x01(\x0b\x32\x1d.spark.connect.Unpivot.ValuesH\x00R\x06values\x88\x01\x01\x12\x30\n\x14variable_column_name\x18\x04 \x01(\tR\x12variableColumnName\x12*\n\x11value_column_name\x18\x05 \x01(\tR\x0fvalueColumnName\x1a;\n\x06Values\x12\x31\n\x06values\x18\x01 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x06valuesB\t\n\x07_values"j\n\x08ToSchema\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06schema\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema"\xcb\x01\n\x17RepartitionByExpression\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x0fpartition_exprs\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0epartitionExprs\x12*\n\x0enum_partitions\x18\x03 \x01(\x05H\x00R\rnumPartitions\x88\x01\x01\x42\x11\n\x0f_num_partitions"\xe8\x01\n\rMapPartitions\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x04\x66unc\x18\x02 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12"\n\nis_barrier\x18\x03 \x01(\x08H\x00R\tisBarrier\x88\x01\x01\x12"\n\nprofile_id\x18\x04 \x01(\x05H\x01R\tprofileId\x88\x01\x01\x42\r\n\x0b_is_barrierB\r\n\x0b_profile_id"\xfb\x04\n\x08GroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12J\n\x13sorting_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x12sortingExpressions\x12<\n\rinitial_input\x18\x05 \x01(\x0b\x32\x17.spark.connect.RelationR\x0cinitialInput\x12[\n\x1cinitial_grouping_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x1ainitialGroupingExpressions\x12;\n\x18is_map_groups_with_state\x18\x07 \x01(\x08H\x00R\x14isMapGroupsWithState\x88\x01\x01\x12$\n\x0boutput_mode\x18\x08 \x01(\tH\x01R\noutputMode\x88\x01\x01\x12&\n\x0ctimeout_conf\x18\t \x01(\tH\x02R\x0btimeoutConf\x88\x01\x01\x42\x1b\n\x19_is_map_groups_with_stateB\x0e\n\x0c_output_modeB\x0f\n\r_timeout_conf"\x8e\x04\n\nCoGroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12W\n\x1ainput_grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18inputGroupingExpressions\x12-\n\x05other\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05other\x12W\n\x1aother_grouping_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18otherGroupingExpressions\x12\x42\n\x04\x66unc\x18\x05 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12U\n\x19input_sorting_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17inputSortingExpressions\x12U\n\x19other_sorting_expressions\x18\x07 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17otherSortingExpressions"\xe5\x02\n\x16\x41pplyInPandasWithState\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12#\n\routput_schema\x18\x04 \x01(\tR\x0coutputSchema\x12!\n\x0cstate_schema\x18\x05 \x01(\tR\x0bstateSchema\x12\x1f\n\x0boutput_mode\x18\x06 \x01(\tR\noutputMode\x12!\n\x0ctimeout_conf\x18\x07 \x01(\tR\x0btimeoutConf"\xf4\x01\n$CommonInlineUserDefinedTableFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12<\n\x0bpython_udtf\x18\x04 
\x01(\x0b\x32\x19.spark.connect.PythonUDTFH\x00R\npythonUdtfB\n\n\x08\x66unction"\xb1\x01\n\nPythonUDTF\x12=\n\x0breturn_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\nreturnType\x88\x01\x01\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVerB\x0e\n\x0c_return_type"\x97\x01\n!CommonInlineUserDefinedDataSource\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12O\n\x12python_data_source\x18\x02 \x01(\x0b\x32\x1f.spark.connect.PythonDataSourceH\x00R\x10pythonDataSourceB\r\n\x0b\x64\x61ta_source"K\n\x10PythonDataSource\x12\x18\n\x07\x63ommand\x18\x01 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x02 \x01(\tR\tpythonVer"\x88\x01\n\x0e\x43ollectMetrics\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x33\n\x07metrics\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07metrics"\x84\x03\n\x05Parse\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x38\n\x06\x66ormat\x18\x02 \x01(\x0e\x32 .spark.connect.Parse.ParseFormatR\x06\x66ormat\x12\x34\n\x06schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x06schema\x88\x01\x01\x12;\n\x07options\x18\x04 \x03(\x0b\x32!.spark.connect.Parse.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"X\n\x0bParseFormat\x12\x1c\n\x18PARSE_FORMAT_UNSPECIFIED\x10\x00\x12\x14\n\x10PARSE_FORMAT_CSV\x10\x01\x12\x15\n\x11PARSE_FORMAT_JSON\x10\x02\x42\t\n\x07_schema"\xdb\x03\n\x08\x41sOfJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12\x37\n\nleft_as_of\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08leftAsOf\x12\x39\n\x0bright_as_of\x18\x04 \x01(\x0b\x32\x19.spark.connect.ExpressionR\trightAsOf\x12\x36\n\tjoin_expr\x18\x05 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08joinExpr\x12#\n\rusing_columns\x18\x06 \x03(\tR\x0cusingColumns\x12\x1b\n\tjoin_type\x18\x07 \x01(\tR\x08joinType\x12\x37\n\ttolerance\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\ttolerance\x12.\n\x13\x61llow_exact_matches\x18\t \x01(\x08R\x11\x61llowExactMatches\x12\x1c\n\tdirection\x18\n \x01(\tR\tdirectionB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto\x1a\x1aspark/connect/common.proto"\xe9\x1a\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b 
\x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 \x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12*\n\x05to_df\x18\x12 \x01(\x0b\x32\x13.spark.connect.ToDFH\x00R\x04toDf\x12U\n\x14with_columns_renamed\x18\x13 \x01(\x0b\x32!.spark.connect.WithColumnsRenamedH\x00R\x12withColumnsRenamed\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12)\n\x04tail\x18\x16 \x01(\x0b\x32\x13.spark.connect.TailH\x00R\x04tail\x12?\n\x0cwith_columns\x18\x17 \x01(\x0b\x32\x1a.spark.connect.WithColumnsH\x00R\x0bwithColumns\x12)\n\x04hint\x18\x18 \x01(\x0b\x32\x13.spark.connect.HintH\x00R\x04hint\x12\x32\n\x07unpivot\x18\x19 \x01(\x0b\x32\x16.spark.connect.UnpivotH\x00R\x07unpivot\x12\x36\n\tto_schema\x18\x1a \x01(\x0b\x32\x17.spark.connect.ToSchemaH\x00R\x08toSchema\x12\x64\n\x19repartition_by_expression\x18\x1b \x01(\x0b\x32&.spark.connect.RepartitionByExpressionH\x00R\x17repartitionByExpression\x12\x45\n\x0emap_partitions\x18\x1c \x01(\x0b\x32\x1c.spark.connect.MapPartitionsH\x00R\rmapPartitions\x12H\n\x0f\x63ollect_metrics\x18\x1d \x01(\x0b\x32\x1d.spark.connect.CollectMetricsH\x00R\x0e\x63ollectMetrics\x12,\n\x05parse\x18\x1e \x01(\x0b\x32\x14.spark.connect.ParseH\x00R\x05parse\x12\x36\n\tgroup_map\x18\x1f \x01(\x0b\x32\x17.spark.connect.GroupMapH\x00R\x08groupMap\x12=\n\x0c\x63o_group_map\x18 \x01(\x0b\x32\x19.spark.connect.CoGroupMapH\x00R\ncoGroupMap\x12\x45\n\x0ewith_watermark\x18! 
\x01(\x0b\x32\x1c.spark.connect.WithWatermarkH\x00R\rwithWatermark\x12\x63\n\x1a\x61pply_in_pandas_with_state\x18" \x01(\x0b\x32%.spark.connect.ApplyInPandasWithStateH\x00R\x16\x61pplyInPandasWithState\x12<\n\x0bhtml_string\x18# \x01(\x0b\x32\x19.spark.connect.HtmlStringH\x00R\nhtmlString\x12X\n\x15\x63\x61\x63hed_local_relation\x18$ \x01(\x0b\x32".spark.connect.CachedLocalRelationH\x00R\x13\x63\x61\x63hedLocalRelation\x12[\n\x16\x63\x61\x63hed_remote_relation\x18% \x01(\x0b\x32#.spark.connect.CachedRemoteRelationH\x00R\x14\x63\x61\x63hedRemoteRelation\x12\x8e\x01\n)common_inline_user_defined_table_function\x18& \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R$commonInlineUserDefinedTableFunction\x12\x37\n\nas_of_join\x18\' \x01(\x0b\x32\x17.spark.connect.AsOfJoinH\x00R\x08\x61sOfJoin\x12\x85\x01\n&common_inline_user_defined_data_source\x18( \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R!commonInlineUserDefinedDataSource\x12\x45\n\x0ewith_relations\x18) \x01(\x0b\x32\x1c.spark.connect.WithRelationsH\x00R\rwithRelations\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x30\n\x07\x64rop_na\x18[ \x01(\x0b\x32\x15.spark.connect.NADropH\x00R\x06\x64ropNa\x12\x34\n\x07replace\x18\\ \x01(\x0b\x32\x18.spark.connect.NAReplaceH\x00R\x07replace\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x39\n\x08\x64\x65scribe\x18\x66 \x01(\x0b\x32\x1b.spark.connect.StatDescribeH\x00R\x08\x64\x65scribe\x12*\n\x03\x63ov\x18g \x01(\x0b\x32\x16.spark.connect.StatCovH\x00R\x03\x63ov\x12-\n\x04\x63orr\x18h \x01(\x0b\x32\x17.spark.connect.StatCorrH\x00R\x04\x63orr\x12L\n\x0f\x61pprox_quantile\x18i \x01(\x0b\x32!.spark.connect.StatApproxQuantileH\x00R\x0e\x61pproxQuantile\x12=\n\nfreq_items\x18j \x01(\x0b\x32\x1c.spark.connect.StatFreqItemsH\x00R\tfreqItems\x12:\n\tsample_by\x18k \x01(\x0b\x32\x1b.spark.connect.StatSampleByH\x00R\x08sampleBy\x12\x33\n\x07\x63\x61talog\x18\xc8\x01 \x01(\x0b\x32\x16.spark.connect.CatalogH\x00R\x07\x63\x61talog\x12\x35\n\textension\x18\xe6\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\t\n\x07Unknown"\x8e\x01\n\x0eRelationCommon\x12#\n\x0bsource_info\x18\x01 \x01(\tB\x02\x18\x01R\nsourceInfo\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12-\n\x06origin\x18\x03 \x01(\x0b\x32\x15.spark.connect.OriginR\x06originB\n\n\x08_plan_id"\xde\x03\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32\x1c.spark.connect.SQL.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12O\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32&.spark.connect.SQL.NamedArgumentsEntryR\x0enamedArguments\x12>\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cposArguments\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"u\n\rWithRelations\x12+\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04root\x12\x37\n\nreferences\x18\x02 
\x03(\x0b\x32\x17.spark.connect.RelationR\nreferences"\x97\x05\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x12!\n\x0cis_streaming\x18\x03 \x01(\x08R\x0bisStreaming\x1a\xc0\x01\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x45\n\x07options\x18\x02 \x03(\x0b\x32+.spark.connect.Read.NamedTable.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x95\x02\n\nDataSource\x12\x1b\n\x06\x66ormat\x18\x01 \x01(\tH\x00R\x06\x66ormat\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x12\x14\n\x05paths\x18\x04 \x03(\tR\x05paths\x12\x1e\n\npredicates\x18\x05 \x03(\tR\npredicates\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_formatB\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\x95\x05\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns\x12K\n\x0ejoin_data_type\x18\x06 \x01(\x0b\x32 .spark.connect.Join.JoinDataTypeH\x00R\x0cjoinDataType\x88\x01\x01\x1a\\\n\x0cJoinDataType\x12$\n\x0eis_left_struct\x18\x01 \x01(\x08R\x0cisLeftStruct\x12&\n\x0fis_right_struct\x18\x02 \x01(\x08R\risRightStruct"\xd0\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06\x12\x13\n\x0fJOIN_TYPE_CROSS\x10\x07\x42\x11\n\x0f_join_data_type"\xdf\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01\x12\x37\n\x15\x61llow_missing_columns\x18\x06 \x01(\x08H\x02R\x13\x61llowMissingColumns\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_nameB\x18\n\x16_allow_missing_columns"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"K\n\x04Tail\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"\xfe\x05\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x41\n\ngroup_type\x18\x02 \x01(\x0e\x32".spark.connect.Aggregate.GroupTypeR\tgroupType\x12L\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12N\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x14\x61ggregateExpressions\x12\x34\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.spark.connect.Aggregate.PivotR\x05pivot\x12J\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.spark.connect.Aggregate.GroupingSetsR\x0cgroupingSets\x1ao\n\x05Pivot\x12+\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1aL\n\x0cGroupingSets\x12<\n\x0cgrouping_set\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0bgroupingSet"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05"\xa0\x01\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x05order\x18\x02 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\x05order\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x42\x0c\n\n_is_global"\x8d\x01\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x33\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07\x63olumns\x12!\n\x0c\x63olumn_names\x18\x03 \x03(\tR\x0b\x63olumnNames"\xf0\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x12.\n\x10within_watermark\x18\x04 \x01(\x08H\x01R\x0fwithinWatermark\x88\x01\x01\x42\x16\n\x14_all_columns_as_keysB\x13\n\x11_within_watermark"Y\n\rLocalRelation\x12\x17\n\x04\x64\x61ta\x18\x01 \x01(\x0cH\x00R\x04\x64\x61ta\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x42\x07\n\x05_dataB\t\n\x07_schema"H\n\x13\x43\x61\x63hedLocalRelation\x12\x12\n\x04hash\x18\x03 \x01(\tR\x04hashJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03R\x06userIdR\tsessionId"7\n\x14\x43\x61\x63hedRemoteRelation\x12\x1f\n\x0brelation_id\x18\x01 \x01(\tR\nrelationId"\x91\x02\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x12/\n\x13\x64\x65terministic_order\x18\x06 \x01(\x08R\x12\x64\x65terministicOrderB\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 
\x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8e\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 \x01(\x08R\x08vertical"r\n\nHtmlString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"Q\n\x0cStatDescribe\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"`\n\x07StatCov\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x89\x01\n\x08StatCorr\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2\x12\x1b\n\x06method\x18\x04 \x01(\tH\x00R\x06method\x88\x01\x01\x42\t\n\x07_method"\xa4\x01\n\x12StatApproxQuantile\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12$\n\rprobabilities\x18\x03 \x03(\x01R\rprobabilities\x12%\n\x0erelative_error\x18\x04 \x01(\x01R\rrelativeError"}\n\rStatFreqItems\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x1d\n\x07support\x18\x03 \x01(\x01H\x00R\x07support\x88\x01\x01\x42\n\n\x08_support"\xb5\x02\n\x0cStatSampleBy\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03\x63ol\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x42\n\tfractions\x18\x03 \x03(\x0b\x32$.spark.connect.StatSampleBy.FractionR\tfractions\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x00R\x04seed\x88\x01\x01\x1a\x63\n\x08\x46raction\x12;\n\x07stratum\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x07stratum\x12\x1a\n\x08\x66raction\x18\x02 \x01(\x01R\x08\x66ractionB\x07\n\x05_seed"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"\x86\x01\n\x06NADrop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\'\n\rmin_non_nulls\x18\x03 \x01(\x05H\x00R\x0bminNonNulls\x88\x01\x01\x42\x10\n\x0e_min_non_nulls"\xa8\x02\n\tNAReplace\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12H\n\x0creplacements\x18\x03 \x03(\x0b\x32$.spark.connect.NAReplace.ReplacementR\x0creplacements\x1a\x8d\x01\n\x0bReplacement\x12>\n\told_value\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08oldValue\x12>\n\tnew_value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08newValue"X\n\x04ToDF\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\xfe\x02\n\x12WithColumnsRenamed\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12i\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x37.spark.connect.WithColumnsRenamed.RenameColumnsMapEntryB\x02\x18\x01R\x10renameColumnsMap\x12\x42\n\x07renames\x18\x03 \x03(\x0b\x32(.spark.connect.WithColumnsRenamed.RenameR\x07renames\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x45\n\x06Rename\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12 \n\x0cnew_col_name\x18\x02 \x01(\tR\nnewColName"w\n\x0bWithColumns\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x07\x61liases\x18\x02 \x03(\x0b\x32\x1f.spark.connect.Expression.AliasR\x07\x61liases"\x86\x01\n\rWithWatermark\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\nevent_time\x18\x02 \x01(\tR\teventTime\x12\'\n\x0f\x64\x65lay_threshold\x18\x03 \x01(\tR\x0e\x64\x65layThreshold"\x84\x01\n\x04Hint\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x39\n\nparameters\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\nparameters"\xc7\x02\n\x07Unpivot\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03ids\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x03ids\x12:\n\x06values\x18\x03 \x01(\x0b\x32\x1d.spark.connect.Unpivot.ValuesH\x00R\x06values\x88\x01\x01\x12\x30\n\x14variable_column_name\x18\x04 \x01(\tR\x12variableColumnName\x12*\n\x11value_column_name\x18\x05 \x01(\tR\x0fvalueColumnName\x1a;\n\x06Values\x12\x31\n\x06values\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x06valuesB\t\n\x07_values"j\n\x08ToSchema\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06schema\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema"\xcb\x01\n\x17RepartitionByExpression\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x0fpartition_exprs\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0epartitionExprs\x12*\n\x0enum_partitions\x18\x03 \x01(\x05H\x00R\rnumPartitions\x88\x01\x01\x42\x11\n\x0f_num_partitions"\xe8\x01\n\rMapPartitions\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x04\x66unc\x18\x02 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12"\n\nis_barrier\x18\x03 \x01(\x08H\x00R\tisBarrier\x88\x01\x01\x12"\n\nprofile_id\x18\x04 \x01(\x05H\x01R\tprofileId\x88\x01\x01\x42\r\n\x0b_is_barrierB\r\n\x0b_profile_id"\xfb\x04\n\x08GroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12J\n\x13sorting_expressions\x18\x04 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x12sortingExpressions\x12<\n\rinitial_input\x18\x05 \x01(\x0b\x32\x17.spark.connect.RelationR\x0cinitialInput\x12[\n\x1cinitial_grouping_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x1ainitialGroupingExpressions\x12;\n\x18is_map_groups_with_state\x18\x07 \x01(\x08H\x00R\x14isMapGroupsWithState\x88\x01\x01\x12$\n\x0boutput_mode\x18\x08 \x01(\tH\x01R\noutputMode\x88\x01\x01\x12&\n\x0ctimeout_conf\x18\t \x01(\tH\x02R\x0btimeoutConf\x88\x01\x01\x42\x1b\n\x19_is_map_groups_with_stateB\x0e\n\x0c_output_modeB\x0f\n\r_timeout_conf"\x8e\x04\n\nCoGroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12W\n\x1ainput_grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18inputGroupingExpressions\x12-\n\x05other\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05other\x12W\n\x1aother_grouping_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18otherGroupingExpressions\x12\x42\n\x04\x66unc\x18\x05 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12U\n\x19input_sorting_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17inputSortingExpressions\x12U\n\x19other_sorting_expressions\x18\x07 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17otherSortingExpressions"\xe5\x02\n\x16\x41pplyInPandasWithState\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12#\n\routput_schema\x18\x04 \x01(\tR\x0coutputSchema\x12!\n\x0cstate_schema\x18\x05 \x01(\tR\x0bstateSchema\x12\x1f\n\x0boutput_mode\x18\x06 \x01(\tR\noutputMode\x12!\n\x0ctimeout_conf\x18\x07 \x01(\tR\x0btimeoutConf"\xf4\x01\n$CommonInlineUserDefinedTableFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12<\n\x0bpython_udtf\x18\x04 \x01(\x0b\x32\x19.spark.connect.PythonUDTFH\x00R\npythonUdtfB\n\n\x08\x66unction"\xb1\x01\n\nPythonUDTF\x12=\n\x0breturn_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\nreturnType\x88\x01\x01\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVerB\x0e\n\x0c_return_type"\x97\x01\n!CommonInlineUserDefinedDataSource\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12O\n\x12python_data_source\x18\x02 \x01(\x0b\x32\x1f.spark.connect.PythonDataSourceH\x00R\x10pythonDataSourceB\r\n\x0b\x64\x61ta_source"K\n\x10PythonDataSource\x12\x18\n\x07\x63ommand\x18\x01 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x02 \x01(\tR\tpythonVer"\x88\x01\n\x0e\x43ollectMetrics\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x33\n\x07metrics\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07metrics"\x84\x03\n\x05Parse\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x38\n\x06\x66ormat\x18\x02 \x01(\x0e\x32 .spark.connect.Parse.ParseFormatR\x06\x66ormat\x12\x34\n\x06schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x06schema\x88\x01\x01\x12;\n\x07options\x18\x04 \x03(\x0b\x32!.spark.connect.Parse.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 
\x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"X\n\x0bParseFormat\x12\x1c\n\x18PARSE_FORMAT_UNSPECIFIED\x10\x00\x12\x14\n\x10PARSE_FORMAT_CSV\x10\x01\x12\x15\n\x11PARSE_FORMAT_JSON\x10\x02\x42\t\n\x07_schema"\xdb\x03\n\x08\x41sOfJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12\x37\n\nleft_as_of\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08leftAsOf\x12\x39\n\x0bright_as_of\x18\x04 \x01(\x0b\x32\x19.spark.connect.ExpressionR\trightAsOf\x12\x36\n\tjoin_expr\x18\x05 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08joinExpr\x12#\n\rusing_columns\x18\x06 \x03(\tR\x0cusingColumns\x12\x1b\n\tjoin_type\x18\x07 \x01(\tR\x08joinType\x12\x37\n\ttolerance\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\ttolerance\x12.\n\x13\x61llow_exact_matches\x18\t \x01(\x08R\x11\x61llowExactMatches\x12\x1c\n\tdirection\x18\n \x01(\tR\tdirectionB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -48,6 +48,8 @@ DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" ) + _RELATIONCOMMON.fields_by_name["source_info"]._options = None + _RELATIONCOMMON.fields_by_name["source_info"]._serialized_options = b"\030\001" _SQL_ARGSENTRY._options = None _SQL_ARGSENTRY._serialized_options = b"8\001" _SQL_NAMEDARGUMENTSENTRY._options = None @@ -70,150 +72,150 @@ _RELATION._serialized_end = 3626 _UNKNOWN._serialized_start = 3628 _UNKNOWN._serialized_end = 3637 - _RELATIONCOMMON._serialized_start = 3639 - _RELATIONCOMMON._serialized_end = 3730 - _SQL._serialized_start = 3733 - _SQL._serialized_end = 4211 - _SQL_ARGSENTRY._serialized_start = 4027 - _SQL_ARGSENTRY._serialized_end = 4117 - _SQL_NAMEDARGUMENTSENTRY._serialized_start = 4119 - _SQL_NAMEDARGUMENTSENTRY._serialized_end = 4211 - _WITHRELATIONS._serialized_start = 4213 - _WITHRELATIONS._serialized_end = 4330 - _READ._serialized_start = 4333 - _READ._serialized_end = 4996 - _READ_NAMEDTABLE._serialized_start = 4511 - _READ_NAMEDTABLE._serialized_end = 4703 - _READ_NAMEDTABLE_OPTIONSENTRY._serialized_start = 4645 - _READ_NAMEDTABLE_OPTIONSENTRY._serialized_end = 4703 - _READ_DATASOURCE._serialized_start = 4706 - _READ_DATASOURCE._serialized_end = 4983 - _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 4645 - _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 4703 - _PROJECT._serialized_start = 4998 - _PROJECT._serialized_end = 5115 - _FILTER._serialized_start = 5117 - _FILTER._serialized_end = 5229 - _JOIN._serialized_start = 5232 - _JOIN._serialized_end = 5893 - _JOIN_JOINDATATYPE._serialized_start = 5571 - _JOIN_JOINDATATYPE._serialized_end = 5663 - _JOIN_JOINTYPE._serialized_start = 5666 - _JOIN_JOINTYPE._serialized_end = 5874 - _SETOPERATION._serialized_start = 5896 - _SETOPERATION._serialized_end = 6375 - _SETOPERATION_SETOPTYPE._serialized_start = 6212 - _SETOPERATION_SETOPTYPE._serialized_end = 6326 - _LIMIT._serialized_start = 6377 - _LIMIT._serialized_end = 6453 - _OFFSET._serialized_start = 6455 - _OFFSET._serialized_end = 6534 - _TAIL._serialized_start = 6536 - _TAIL._serialized_end = 6611 - _AGGREGATE._serialized_start = 6614 - _AGGREGATE._serialized_end = 7380 - _AGGREGATE_PIVOT._serialized_start = 7029 - _AGGREGATE_PIVOT._serialized_end = 7140 - _AGGREGATE_GROUPINGSETS._serialized_start = 7142 - _AGGREGATE_GROUPINGSETS._serialized_end = 7218 - _AGGREGATE_GROUPTYPE._serialized_start 
= 7221 - _AGGREGATE_GROUPTYPE._serialized_end = 7380 - _SORT._serialized_start = 7383 - _SORT._serialized_end = 7543 - _DROP._serialized_start = 7546 - _DROP._serialized_end = 7687 - _DEDUPLICATE._serialized_start = 7690 - _DEDUPLICATE._serialized_end = 7930 - _LOCALRELATION._serialized_start = 7932 - _LOCALRELATION._serialized_end = 8021 - _CACHEDLOCALRELATION._serialized_start = 8023 - _CACHEDLOCALRELATION._serialized_end = 8095 - _CACHEDREMOTERELATION._serialized_start = 8097 - _CACHEDREMOTERELATION._serialized_end = 8152 - _SAMPLE._serialized_start = 8155 - _SAMPLE._serialized_end = 8428 - _RANGE._serialized_start = 8431 - _RANGE._serialized_end = 8576 - _SUBQUERYALIAS._serialized_start = 8578 - _SUBQUERYALIAS._serialized_end = 8692 - _REPARTITION._serialized_start = 8695 - _REPARTITION._serialized_end = 8837 - _SHOWSTRING._serialized_start = 8840 - _SHOWSTRING._serialized_end = 8982 - _HTMLSTRING._serialized_start = 8984 - _HTMLSTRING._serialized_end = 9098 - _STATSUMMARY._serialized_start = 9100 - _STATSUMMARY._serialized_end = 9192 - _STATDESCRIBE._serialized_start = 9194 - _STATDESCRIBE._serialized_end = 9275 - _STATCROSSTAB._serialized_start = 9277 - _STATCROSSTAB._serialized_end = 9378 - _STATCOV._serialized_start = 9380 - _STATCOV._serialized_end = 9476 - _STATCORR._serialized_start = 9479 - _STATCORR._serialized_end = 9616 - _STATAPPROXQUANTILE._serialized_start = 9619 - _STATAPPROXQUANTILE._serialized_end = 9783 - _STATFREQITEMS._serialized_start = 9785 - _STATFREQITEMS._serialized_end = 9910 - _STATSAMPLEBY._serialized_start = 9913 - _STATSAMPLEBY._serialized_end = 10222 - _STATSAMPLEBY_FRACTION._serialized_start = 10114 - _STATSAMPLEBY_FRACTION._serialized_end = 10213 - _NAFILL._serialized_start = 10225 - _NAFILL._serialized_end = 10359 - _NADROP._serialized_start = 10362 - _NADROP._serialized_end = 10496 - _NAREPLACE._serialized_start = 10499 - _NAREPLACE._serialized_end = 10795 - _NAREPLACE_REPLACEMENT._serialized_start = 10654 - _NAREPLACE_REPLACEMENT._serialized_end = 10795 - _TODF._serialized_start = 10797 - _TODF._serialized_end = 10885 - _WITHCOLUMNSRENAMED._serialized_start = 10888 - _WITHCOLUMNSRENAMED._serialized_end = 11270 - _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 11132 - _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 11199 - _WITHCOLUMNSRENAMED_RENAME._serialized_start = 11201 - _WITHCOLUMNSRENAMED_RENAME._serialized_end = 11270 - _WITHCOLUMNS._serialized_start = 11272 - _WITHCOLUMNS._serialized_end = 11391 - _WITHWATERMARK._serialized_start = 11394 - _WITHWATERMARK._serialized_end = 11528 - _HINT._serialized_start = 11531 - _HINT._serialized_end = 11663 - _UNPIVOT._serialized_start = 11666 - _UNPIVOT._serialized_end = 11993 - _UNPIVOT_VALUES._serialized_start = 11923 - _UNPIVOT_VALUES._serialized_end = 11982 - _TOSCHEMA._serialized_start = 11995 - _TOSCHEMA._serialized_end = 12101 - _REPARTITIONBYEXPRESSION._serialized_start = 12104 - _REPARTITIONBYEXPRESSION._serialized_end = 12307 - _MAPPARTITIONS._serialized_start = 12310 - _MAPPARTITIONS._serialized_end = 12542 - _GROUPMAP._serialized_start = 12545 - _GROUPMAP._serialized_end = 13180 - _COGROUPMAP._serialized_start = 13183 - _COGROUPMAP._serialized_end = 13709 - _APPLYINPANDASWITHSTATE._serialized_start = 13712 - _APPLYINPANDASWITHSTATE._serialized_end = 14069 - _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 14072 - _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 14316 - _PYTHONUDTF._serialized_start = 14319 - _PYTHONUDTF._serialized_end = 14496 - 
_COMMONINLINEUSERDEFINEDDATASOURCE._serialized_start = 14499 - _COMMONINLINEUSERDEFINEDDATASOURCE._serialized_end = 14650 - _PYTHONDATASOURCE._serialized_start = 14652 - _PYTHONDATASOURCE._serialized_end = 14727 - _COLLECTMETRICS._serialized_start = 14730 - _COLLECTMETRICS._serialized_end = 14866 - _PARSE._serialized_start = 14869 - _PARSE._serialized_end = 15257 - _PARSE_OPTIONSENTRY._serialized_start = 4645 - _PARSE_OPTIONSENTRY._serialized_end = 4703 - _PARSE_PARSEFORMAT._serialized_start = 15158 - _PARSE_PARSEFORMAT._serialized_end = 15246 - _ASOFJOIN._serialized_start = 15260 - _ASOFJOIN._serialized_end = 15735 + _RELATIONCOMMON._serialized_start = 3640 + _RELATIONCOMMON._serialized_end = 3782 + _SQL._serialized_start = 3785 + _SQL._serialized_end = 4263 + _SQL_ARGSENTRY._serialized_start = 4079 + _SQL_ARGSENTRY._serialized_end = 4169 + _SQL_NAMEDARGUMENTSENTRY._serialized_start = 4171 + _SQL_NAMEDARGUMENTSENTRY._serialized_end = 4263 + _WITHRELATIONS._serialized_start = 4265 + _WITHRELATIONS._serialized_end = 4382 + _READ._serialized_start = 4385 + _READ._serialized_end = 5048 + _READ_NAMEDTABLE._serialized_start = 4563 + _READ_NAMEDTABLE._serialized_end = 4755 + _READ_NAMEDTABLE_OPTIONSENTRY._serialized_start = 4697 + _READ_NAMEDTABLE_OPTIONSENTRY._serialized_end = 4755 + _READ_DATASOURCE._serialized_start = 4758 + _READ_DATASOURCE._serialized_end = 5035 + _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 4697 + _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 4755 + _PROJECT._serialized_start = 5050 + _PROJECT._serialized_end = 5167 + _FILTER._serialized_start = 5169 + _FILTER._serialized_end = 5281 + _JOIN._serialized_start = 5284 + _JOIN._serialized_end = 5945 + _JOIN_JOINDATATYPE._serialized_start = 5623 + _JOIN_JOINDATATYPE._serialized_end = 5715 + _JOIN_JOINTYPE._serialized_start = 5718 + _JOIN_JOINTYPE._serialized_end = 5926 + _SETOPERATION._serialized_start = 5948 + _SETOPERATION._serialized_end = 6427 + _SETOPERATION_SETOPTYPE._serialized_start = 6264 + _SETOPERATION_SETOPTYPE._serialized_end = 6378 + _LIMIT._serialized_start = 6429 + _LIMIT._serialized_end = 6505 + _OFFSET._serialized_start = 6507 + _OFFSET._serialized_end = 6586 + _TAIL._serialized_start = 6588 + _TAIL._serialized_end = 6663 + _AGGREGATE._serialized_start = 6666 + _AGGREGATE._serialized_end = 7432 + _AGGREGATE_PIVOT._serialized_start = 7081 + _AGGREGATE_PIVOT._serialized_end = 7192 + _AGGREGATE_GROUPINGSETS._serialized_start = 7194 + _AGGREGATE_GROUPINGSETS._serialized_end = 7270 + _AGGREGATE_GROUPTYPE._serialized_start = 7273 + _AGGREGATE_GROUPTYPE._serialized_end = 7432 + _SORT._serialized_start = 7435 + _SORT._serialized_end = 7595 + _DROP._serialized_start = 7598 + _DROP._serialized_end = 7739 + _DEDUPLICATE._serialized_start = 7742 + _DEDUPLICATE._serialized_end = 7982 + _LOCALRELATION._serialized_start = 7984 + _LOCALRELATION._serialized_end = 8073 + _CACHEDLOCALRELATION._serialized_start = 8075 + _CACHEDLOCALRELATION._serialized_end = 8147 + _CACHEDREMOTERELATION._serialized_start = 8149 + _CACHEDREMOTERELATION._serialized_end = 8204 + _SAMPLE._serialized_start = 8207 + _SAMPLE._serialized_end = 8480 + _RANGE._serialized_start = 8483 + _RANGE._serialized_end = 8628 + _SUBQUERYALIAS._serialized_start = 8630 + _SUBQUERYALIAS._serialized_end = 8744 + _REPARTITION._serialized_start = 8747 + _REPARTITION._serialized_end = 8889 + _SHOWSTRING._serialized_start = 8892 + _SHOWSTRING._serialized_end = 9034 + _HTMLSTRING._serialized_start = 9036 + _HTMLSTRING._serialized_end = 9150 + 
_STATSUMMARY._serialized_start = 9152 + _STATSUMMARY._serialized_end = 9244 + _STATDESCRIBE._serialized_start = 9246 + _STATDESCRIBE._serialized_end = 9327 + _STATCROSSTAB._serialized_start = 9329 + _STATCROSSTAB._serialized_end = 9430 + _STATCOV._serialized_start = 9432 + _STATCOV._serialized_end = 9528 + _STATCORR._serialized_start = 9531 + _STATCORR._serialized_end = 9668 + _STATAPPROXQUANTILE._serialized_start = 9671 + _STATAPPROXQUANTILE._serialized_end = 9835 + _STATFREQITEMS._serialized_start = 9837 + _STATFREQITEMS._serialized_end = 9962 + _STATSAMPLEBY._serialized_start = 9965 + _STATSAMPLEBY._serialized_end = 10274 + _STATSAMPLEBY_FRACTION._serialized_start = 10166 + _STATSAMPLEBY_FRACTION._serialized_end = 10265 + _NAFILL._serialized_start = 10277 + _NAFILL._serialized_end = 10411 + _NADROP._serialized_start = 10414 + _NADROP._serialized_end = 10548 + _NAREPLACE._serialized_start = 10551 + _NAREPLACE._serialized_end = 10847 + _NAREPLACE_REPLACEMENT._serialized_start = 10706 + _NAREPLACE_REPLACEMENT._serialized_end = 10847 + _TODF._serialized_start = 10849 + _TODF._serialized_end = 10937 + _WITHCOLUMNSRENAMED._serialized_start = 10940 + _WITHCOLUMNSRENAMED._serialized_end = 11322 + _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 11184 + _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 11251 + _WITHCOLUMNSRENAMED_RENAME._serialized_start = 11253 + _WITHCOLUMNSRENAMED_RENAME._serialized_end = 11322 + _WITHCOLUMNS._serialized_start = 11324 + _WITHCOLUMNS._serialized_end = 11443 + _WITHWATERMARK._serialized_start = 11446 + _WITHWATERMARK._serialized_end = 11580 + _HINT._serialized_start = 11583 + _HINT._serialized_end = 11715 + _UNPIVOT._serialized_start = 11718 + _UNPIVOT._serialized_end = 12045 + _UNPIVOT_VALUES._serialized_start = 11975 + _UNPIVOT_VALUES._serialized_end = 12034 + _TOSCHEMA._serialized_start = 12047 + _TOSCHEMA._serialized_end = 12153 + _REPARTITIONBYEXPRESSION._serialized_start = 12156 + _REPARTITIONBYEXPRESSION._serialized_end = 12359 + _MAPPARTITIONS._serialized_start = 12362 + _MAPPARTITIONS._serialized_end = 12594 + _GROUPMAP._serialized_start = 12597 + _GROUPMAP._serialized_end = 13232 + _COGROUPMAP._serialized_start = 13235 + _COGROUPMAP._serialized_end = 13761 + _APPLYINPANDASWITHSTATE._serialized_start = 13764 + _APPLYINPANDASWITHSTATE._serialized_end = 14121 + _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 14124 + _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 14368 + _PYTHONUDTF._serialized_start = 14371 + _PYTHONUDTF._serialized_end = 14548 + _COMMONINLINEUSERDEFINEDDATASOURCE._serialized_start = 14551 + _COMMONINLINEUSERDEFINEDDATASOURCE._serialized_end = 14702 + _PYTHONDATASOURCE._serialized_start = 14704 + _PYTHONDATASOURCE._serialized_end = 14779 + _COLLECTMETRICS._serialized_start = 14782 + _COLLECTMETRICS._serialized_end = 14918 + _PARSE._serialized_start = 14921 + _PARSE._serialized_end = 15309 + _PARSE_OPTIONSENTRY._serialized_start = 4697 + _PARSE_OPTIONSENTRY._serialized_end = 4755 + _PARSE_PARSEFORMAT._serialized_start = 15210 + _PARSE_PARSEFORMAT._serialized_end = 15298 + _ASOFJOIN._serialized_start = 15312 + _ASOFJOIN._serialized_end = 15787 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 5dfb47da67a97..864803fd33084 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -41,6 +41,7 @@ import 
google.protobuf.internal.containers import google.protobuf.internal.enum_type_wrapper import google.protobuf.message import pyspark.sql.connect.proto.catalog_pb2 +import pyspark.sql.connect.proto.common_pb2 import pyspark.sql.connect.proto.expressions_pb2 import pyspark.sql.connect.proto.types_pb2 import sys @@ -614,23 +615,38 @@ class RelationCommon(google.protobuf.message.Message): SOURCE_INFO_FIELD_NUMBER: builtins.int PLAN_ID_FIELD_NUMBER: builtins.int + ORIGIN_FIELD_NUMBER: builtins.int source_info: builtins.str """(Required) Shared relation metadata.""" plan_id: builtins.int """(Optional) A per-client globally unique id for a given connect plan.""" + @property + def origin(self) -> pyspark.sql.connect.proto.common_pb2.Origin: + """(Optional) Keep the information of the origin for this expression such as stacktrace.""" def __init__( self, *, source_info: builtins.str = ..., plan_id: builtins.int | None = ..., + origin: pyspark.sql.connect.proto.common_pb2.Origin | None = ..., ) -> None: ... def HasField( - self, field_name: typing_extensions.Literal["_plan_id", b"_plan_id", "plan_id", b"plan_id"] + self, + field_name: typing_extensions.Literal[ + "_plan_id", b"_plan_id", "origin", b"origin", "plan_id", b"plan_id" + ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "_plan_id", b"_plan_id", "plan_id", b"plan_id", "source_info", b"source_info" + "_plan_id", + b"_plan_id", + "origin", + b"origin", + "plan_id", + b"plan_id", + "source_info", + b"source_info", ], ) -> None: ... def WhichOneof( @@ -1865,7 +1881,10 @@ class Sample(google.protobuf.message.Message): with_replacement: builtins.bool """(Optional) Whether to sample with replacement.""" seed: builtins.int - """(Optional) The random seed.""" + """(Required) The random seed. + This field is required to avoid generating mutable dataframes (see SPARK-48184 for details), + however, still keep it 'optional' here for backward compatibility. + """ deterministic_order: builtins.bool """(Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the @@ -2545,7 +2564,10 @@ class StatSampleBy(google.protobuf.message.Message): If a stratum is not specified, we treat its fraction as zero. """ seed: builtins.int - """(Optional) The random seed.""" + """(Required) The random seed. + This field is required to avoid generating mutable dataframes (see SPARK-48184 for details), + however, still keep it 'optional' here for backward compatibility. 
+ """ def __init__( self, *, diff --git a/python/pyspark/sql/connect/proto/types_pb2.py b/python/pyspark/sql/connect/proto/types_pb2.py index 65e5860b5dc60..1022605fb160d 100644 --- a/python/pyspark/sql/connect/proto/types_pb2.py +++ b/python/pyspark/sql/connect/proto/types_pb2.py @@ -29,7 +29,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b"\n\x19spark/connect/types.proto\x12\rspark.connect\"\xec!\n\x08\x44\x61taType\x12\x32\n\x04null\x18\x01 \x01(\x0b\x32\x1c.spark.connect.DataType.NULLH\x00R\x04null\x12\x38\n\x06\x62inary\x18\x02 \x01(\x0b\x32\x1e.spark.connect.DataType.BinaryH\x00R\x06\x62inary\x12;\n\x07\x62oolean\x18\x03 \x01(\x0b\x32\x1f.spark.connect.DataType.BooleanH\x00R\x07\x62oolean\x12\x32\n\x04\x62yte\x18\x04 \x01(\x0b\x32\x1c.spark.connect.DataType.ByteH\x00R\x04\x62yte\x12\x35\n\x05short\x18\x05 \x01(\x0b\x32\x1d.spark.connect.DataType.ShortH\x00R\x05short\x12;\n\x07integer\x18\x06 \x01(\x0b\x32\x1f.spark.connect.DataType.IntegerH\x00R\x07integer\x12\x32\n\x04long\x18\x07 \x01(\x0b\x32\x1c.spark.connect.DataType.LongH\x00R\x04long\x12\x35\n\x05\x66loat\x18\x08 \x01(\x0b\x32\x1d.spark.connect.DataType.FloatH\x00R\x05\x66loat\x12\x38\n\x06\x64ouble\x18\t \x01(\x0b\x32\x1e.spark.connect.DataType.DoubleH\x00R\x06\x64ouble\x12;\n\x07\x64\x65\x63imal\x18\n \x01(\x0b\x32\x1f.spark.connect.DataType.DecimalH\x00R\x07\x64\x65\x63imal\x12\x38\n\x06string\x18\x0b \x01(\x0b\x32\x1e.spark.connect.DataType.StringH\x00R\x06string\x12\x32\n\x04\x63har\x18\x0c \x01(\x0b\x32\x1c.spark.connect.DataType.CharH\x00R\x04\x63har\x12<\n\x08var_char\x18\r \x01(\x0b\x32\x1f.spark.connect.DataType.VarCharH\x00R\x07varChar\x12\x32\n\x04\x64\x61te\x18\x0e \x01(\x0b\x32\x1c.spark.connect.DataType.DateH\x00R\x04\x64\x61te\x12\x41\n\ttimestamp\x18\x0f \x01(\x0b\x32!.spark.connect.DataType.TimestampH\x00R\ttimestamp\x12K\n\rtimestamp_ntz\x18\x10 \x01(\x0b\x32$.spark.connect.DataType.TimestampNTZH\x00R\x0ctimestampNtz\x12W\n\x11\x63\x61lendar_interval\x18\x11 \x01(\x0b\x32(.spark.connect.DataType.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12[\n\x13year_month_interval\x18\x12 \x01(\x0b\x32).spark.connect.DataType.YearMonthIntervalH\x00R\x11yearMonthInterval\x12U\n\x11\x64\x61y_time_interval\x18\x13 \x01(\x0b\x32'.spark.connect.DataType.DayTimeIntervalH\x00R\x0f\x64\x61yTimeInterval\x12\x35\n\x05\x61rray\x18\x14 \x01(\x0b\x32\x1d.spark.connect.DataType.ArrayH\x00R\x05\x61rray\x12\x38\n\x06struct\x18\x15 \x01(\x0b\x32\x1e.spark.connect.DataType.StructH\x00R\x06struct\x12/\n\x03map\x18\x16 \x01(\x0b\x32\x1b.spark.connect.DataType.MapH\x00R\x03map\x12;\n\x07variant\x18\x19 \x01(\x0b\x32\x1f.spark.connect.DataType.VariantH\x00R\x07variant\x12/\n\x03udt\x18\x17 \x01(\x0b\x32\x1b.spark.connect.DataType.UDTH\x00R\x03udt\x12>\n\x08unparsed\x18\x18 \x01(\x0b\x32 .spark.connect.DataType.UnparsedH\x00R\x08unparsed\x1a\x43\n\x07\x42oolean\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04\x42yte\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x41\n\x05Short\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x43\n\x07Integer\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04Long\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x41\n\x05\x46loat\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x42\n\x06\x44ouble\x12\x38\n\x18type_variation_reference\x18\x01 
\x01(\rR\x16typeVariationReference\x1a\x65\n\x06String\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x12!\n\x0c\x63ollation_id\x18\x02 \x01(\rR\x0b\x63ollationId\x1a\x42\n\x06\x42inary\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04NULL\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x45\n\tTimestamp\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04\x44\x61te\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1aH\n\x0cTimestampNTZ\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1aL\n\x10\x43\x61lendarInterval\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\xb3\x01\n\x11YearMonthInterval\x12$\n\x0bstart_field\x18\x01 \x01(\x05H\x00R\nstartField\x88\x01\x01\x12 \n\tend_field\x18\x02 \x01(\x05H\x01R\x08\x65ndField\x88\x01\x01\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReferenceB\x0e\n\x0c_start_fieldB\x0c\n\n_end_field\x1a\xb1\x01\n\x0f\x44\x61yTimeInterval\x12$\n\x0bstart_field\x18\x01 \x01(\x05H\x00R\nstartField\x88\x01\x01\x12 \n\tend_field\x18\x02 \x01(\x05H\x01R\x08\x65ndField\x88\x01\x01\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReferenceB\x0e\n\x0c_start_fieldB\x0c\n\n_end_field\x1aX\n\x04\x43har\x12\x16\n\x06length\x18\x01 \x01(\x05R\x06length\x12\x38\n\x18type_variation_reference\x18\x02 \x01(\rR\x16typeVariationReference\x1a[\n\x07VarChar\x12\x16\n\x06length\x18\x01 \x01(\x05R\x06length\x12\x38\n\x18type_variation_reference\x18\x02 \x01(\rR\x16typeVariationReference\x1a\x99\x01\n\x07\x44\x65\x63imal\x12\x19\n\x05scale\x18\x01 \x01(\x05H\x00R\x05scale\x88\x01\x01\x12!\n\tprecision\x18\x02 \x01(\x05H\x01R\tprecision\x88\x01\x01\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReferenceB\x08\n\x06_scaleB\x0c\n\n_precision\x1a\xa1\x01\n\x0bStructField\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x34\n\tdata_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x08\x64\x61taType\x12\x1a\n\x08nullable\x18\x03 \x01(\x08R\x08nullable\x12\x1f\n\x08metadata\x18\x04 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x7f\n\x06Struct\x12;\n\x06\x66ields\x18\x01 \x03(\x0b\x32#.spark.connect.DataType.StructFieldR\x06\x66ields\x12\x38\n\x18type_variation_reference\x18\x02 \x01(\rR\x16typeVariationReference\x1a\xa2\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12#\n\rcontains_null\x18\x02 \x01(\x08R\x0c\x63ontainsNull\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReference\x1a\xdb\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12.\n\x13value_contains_null\x18\x03 \x01(\x08R\x11valueContainsNull\x12\x38\n\x18type_variation_reference\x18\x04 \x01(\rR\x16typeVariationReference\x1a\x43\n\x07Variant\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x8f\x02\n\x03UDT\x12\x12\n\x04type\x18\x01 \x01(\tR\x04type\x12 \n\tjvm_class\x18\x02 \x01(\tH\x00R\x08jvmClass\x88\x01\x01\x12&\n\x0cpython_class\x18\x03 \x01(\tH\x01R\x0bpythonClass\x88\x01\x01\x12;\n\x17serialized_python_class\x18\x04 \x01(\tH\x02R\x15serializedPythonClass\x88\x01\x01\x12\x32\n\x08sql_type\x18\x05 
\x01(\x0b\x32\x17.spark.connect.DataTypeR\x07sqlTypeB\x0c\n\n_jvm_classB\x0f\n\r_python_classB\x1a\n\x18_serialized_python_class\x1a\x34\n\x08Unparsed\x12(\n\x10\x64\x61ta_type_string\x18\x01 \x01(\tR\x0e\x64\x61taTypeStringB\x06\n\x04kindB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3" + b"\n\x19spark/connect/types.proto\x12\rspark.connect\"\xe7!\n\x08\x44\x61taType\x12\x32\n\x04null\x18\x01 \x01(\x0b\x32\x1c.spark.connect.DataType.NULLH\x00R\x04null\x12\x38\n\x06\x62inary\x18\x02 \x01(\x0b\x32\x1e.spark.connect.DataType.BinaryH\x00R\x06\x62inary\x12;\n\x07\x62oolean\x18\x03 \x01(\x0b\x32\x1f.spark.connect.DataType.BooleanH\x00R\x07\x62oolean\x12\x32\n\x04\x62yte\x18\x04 \x01(\x0b\x32\x1c.spark.connect.DataType.ByteH\x00R\x04\x62yte\x12\x35\n\x05short\x18\x05 \x01(\x0b\x32\x1d.spark.connect.DataType.ShortH\x00R\x05short\x12;\n\x07integer\x18\x06 \x01(\x0b\x32\x1f.spark.connect.DataType.IntegerH\x00R\x07integer\x12\x32\n\x04long\x18\x07 \x01(\x0b\x32\x1c.spark.connect.DataType.LongH\x00R\x04long\x12\x35\n\x05\x66loat\x18\x08 \x01(\x0b\x32\x1d.spark.connect.DataType.FloatH\x00R\x05\x66loat\x12\x38\n\x06\x64ouble\x18\t \x01(\x0b\x32\x1e.spark.connect.DataType.DoubleH\x00R\x06\x64ouble\x12;\n\x07\x64\x65\x63imal\x18\n \x01(\x0b\x32\x1f.spark.connect.DataType.DecimalH\x00R\x07\x64\x65\x63imal\x12\x38\n\x06string\x18\x0b \x01(\x0b\x32\x1e.spark.connect.DataType.StringH\x00R\x06string\x12\x32\n\x04\x63har\x18\x0c \x01(\x0b\x32\x1c.spark.connect.DataType.CharH\x00R\x04\x63har\x12<\n\x08var_char\x18\r \x01(\x0b\x32\x1f.spark.connect.DataType.VarCharH\x00R\x07varChar\x12\x32\n\x04\x64\x61te\x18\x0e \x01(\x0b\x32\x1c.spark.connect.DataType.DateH\x00R\x04\x64\x61te\x12\x41\n\ttimestamp\x18\x0f \x01(\x0b\x32!.spark.connect.DataType.TimestampH\x00R\ttimestamp\x12K\n\rtimestamp_ntz\x18\x10 \x01(\x0b\x32$.spark.connect.DataType.TimestampNTZH\x00R\x0ctimestampNtz\x12W\n\x11\x63\x61lendar_interval\x18\x11 \x01(\x0b\x32(.spark.connect.DataType.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12[\n\x13year_month_interval\x18\x12 \x01(\x0b\x32).spark.connect.DataType.YearMonthIntervalH\x00R\x11yearMonthInterval\x12U\n\x11\x64\x61y_time_interval\x18\x13 \x01(\x0b\x32'.spark.connect.DataType.DayTimeIntervalH\x00R\x0f\x64\x61yTimeInterval\x12\x35\n\x05\x61rray\x18\x14 \x01(\x0b\x32\x1d.spark.connect.DataType.ArrayH\x00R\x05\x61rray\x12\x38\n\x06struct\x18\x15 \x01(\x0b\x32\x1e.spark.connect.DataType.StructH\x00R\x06struct\x12/\n\x03map\x18\x16 \x01(\x0b\x32\x1b.spark.connect.DataType.MapH\x00R\x03map\x12;\n\x07variant\x18\x19 \x01(\x0b\x32\x1f.spark.connect.DataType.VariantH\x00R\x07variant\x12/\n\x03udt\x18\x17 \x01(\x0b\x32\x1b.spark.connect.DataType.UDTH\x00R\x03udt\x12>\n\x08unparsed\x18\x18 \x01(\x0b\x32 .spark.connect.DataType.UnparsedH\x00R\x08unparsed\x1a\x43\n\x07\x42oolean\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04\x42yte\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x41\n\x05Short\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x43\n\x07Integer\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04Long\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x41\n\x05\x46loat\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x42\n\x06\x44ouble\x12\x38\n\x18type_variation_reference\x18\x01 
\x01(\rR\x16typeVariationReference\x1a`\n\x06String\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x12\x1c\n\tcollation\x18\x02 \x01(\tR\tcollation\x1a\x42\n\x06\x42inary\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04NULL\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x45\n\tTimestamp\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a@\n\x04\x44\x61te\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1aH\n\x0cTimestampNTZ\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1aL\n\x10\x43\x61lendarInterval\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\xb3\x01\n\x11YearMonthInterval\x12$\n\x0bstart_field\x18\x01 \x01(\x05H\x00R\nstartField\x88\x01\x01\x12 \n\tend_field\x18\x02 \x01(\x05H\x01R\x08\x65ndField\x88\x01\x01\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReferenceB\x0e\n\x0c_start_fieldB\x0c\n\n_end_field\x1a\xb1\x01\n\x0f\x44\x61yTimeInterval\x12$\n\x0bstart_field\x18\x01 \x01(\x05H\x00R\nstartField\x88\x01\x01\x12 \n\tend_field\x18\x02 \x01(\x05H\x01R\x08\x65ndField\x88\x01\x01\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReferenceB\x0e\n\x0c_start_fieldB\x0c\n\n_end_field\x1aX\n\x04\x43har\x12\x16\n\x06length\x18\x01 \x01(\x05R\x06length\x12\x38\n\x18type_variation_reference\x18\x02 \x01(\rR\x16typeVariationReference\x1a[\n\x07VarChar\x12\x16\n\x06length\x18\x01 \x01(\x05R\x06length\x12\x38\n\x18type_variation_reference\x18\x02 \x01(\rR\x16typeVariationReference\x1a\x99\x01\n\x07\x44\x65\x63imal\x12\x19\n\x05scale\x18\x01 \x01(\x05H\x00R\x05scale\x88\x01\x01\x12!\n\tprecision\x18\x02 \x01(\x05H\x01R\tprecision\x88\x01\x01\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReferenceB\x08\n\x06_scaleB\x0c\n\n_precision\x1a\xa1\x01\n\x0bStructField\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x34\n\tdata_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x08\x64\x61taType\x12\x1a\n\x08nullable\x18\x03 \x01(\x08R\x08nullable\x12\x1f\n\x08metadata\x18\x04 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x7f\n\x06Struct\x12;\n\x06\x66ields\x18\x01 \x03(\x0b\x32#.spark.connect.DataType.StructFieldR\x06\x66ields\x12\x38\n\x18type_variation_reference\x18\x02 \x01(\rR\x16typeVariationReference\x1a\xa2\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12#\n\rcontains_null\x18\x02 \x01(\x08R\x0c\x63ontainsNull\x12\x38\n\x18type_variation_reference\x18\x03 \x01(\rR\x16typeVariationReference\x1a\xdb\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12.\n\x13value_contains_null\x18\x03 \x01(\x08R\x11valueContainsNull\x12\x38\n\x18type_variation_reference\x18\x04 \x01(\rR\x16typeVariationReference\x1a\x43\n\x07Variant\x12\x38\n\x18type_variation_reference\x18\x01 \x01(\rR\x16typeVariationReference\x1a\x8f\x02\n\x03UDT\x12\x12\n\x04type\x18\x01 \x01(\tR\x04type\x12 \n\tjvm_class\x18\x02 \x01(\tH\x00R\x08jvmClass\x88\x01\x01\x12&\n\x0cpython_class\x18\x03 \x01(\tH\x01R\x0bpythonClass\x88\x01\x01\x12;\n\x17serialized_python_class\x18\x04 \x01(\tH\x02R\x15serializedPythonClass\x88\x01\x01\x12\x32\n\x08sql_type\x18\x05 
\x01(\x0b\x32\x17.spark.connect.DataTypeR\x07sqlTypeB\x0c\n\n_jvm_classB\x0f\n\r_python_classB\x1a\n\x18_serialized_python_class\x1a\x34\n\x08Unparsed\x12(\n\x10\x64\x61ta_type_string\x18\x01 \x01(\tR\x0e\x64\x61taTypeStringB\x06\n\x04kindB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3" ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -42,7 +42,7 @@ b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" ) _DATATYPE._serialized_start = 45 - _DATATYPE._serialized_end = 4377 + _DATATYPE._serialized_end = 4372 _DATATYPE_BOOLEAN._serialized_start = 1595 _DATATYPE_BOOLEAN._serialized_end = 1662 _DATATYPE_BYTE._serialized_start = 1664 @@ -58,41 +58,41 @@ _DATATYPE_DOUBLE._serialized_start = 1999 _DATATYPE_DOUBLE._serialized_end = 2065 _DATATYPE_STRING._serialized_start = 2067 - _DATATYPE_STRING._serialized_end = 2168 - _DATATYPE_BINARY._serialized_start = 2170 - _DATATYPE_BINARY._serialized_end = 2236 - _DATATYPE_NULL._serialized_start = 2238 - _DATATYPE_NULL._serialized_end = 2302 - _DATATYPE_TIMESTAMP._serialized_start = 2304 - _DATATYPE_TIMESTAMP._serialized_end = 2373 - _DATATYPE_DATE._serialized_start = 2375 - _DATATYPE_DATE._serialized_end = 2439 - _DATATYPE_TIMESTAMPNTZ._serialized_start = 2441 - _DATATYPE_TIMESTAMPNTZ._serialized_end = 2513 - _DATATYPE_CALENDARINTERVAL._serialized_start = 2515 - _DATATYPE_CALENDARINTERVAL._serialized_end = 2591 - _DATATYPE_YEARMONTHINTERVAL._serialized_start = 2594 - _DATATYPE_YEARMONTHINTERVAL._serialized_end = 2773 - _DATATYPE_DAYTIMEINTERVAL._serialized_start = 2776 - _DATATYPE_DAYTIMEINTERVAL._serialized_end = 2953 - _DATATYPE_CHAR._serialized_start = 2955 - _DATATYPE_CHAR._serialized_end = 3043 - _DATATYPE_VARCHAR._serialized_start = 3045 - _DATATYPE_VARCHAR._serialized_end = 3136 - _DATATYPE_DECIMAL._serialized_start = 3139 - _DATATYPE_DECIMAL._serialized_end = 3292 - _DATATYPE_STRUCTFIELD._serialized_start = 3295 - _DATATYPE_STRUCTFIELD._serialized_end = 3456 - _DATATYPE_STRUCT._serialized_start = 3458 - _DATATYPE_STRUCT._serialized_end = 3585 - _DATATYPE_ARRAY._serialized_start = 3588 - _DATATYPE_ARRAY._serialized_end = 3750 - _DATATYPE_MAP._serialized_start = 3753 - _DATATYPE_MAP._serialized_end = 3972 - _DATATYPE_VARIANT._serialized_start = 3974 - _DATATYPE_VARIANT._serialized_end = 4041 - _DATATYPE_UDT._serialized_start = 4044 - _DATATYPE_UDT._serialized_end = 4315 - _DATATYPE_UNPARSED._serialized_start = 4317 - _DATATYPE_UNPARSED._serialized_end = 4369 + _DATATYPE_STRING._serialized_end = 2163 + _DATATYPE_BINARY._serialized_start = 2165 + _DATATYPE_BINARY._serialized_end = 2231 + _DATATYPE_NULL._serialized_start = 2233 + _DATATYPE_NULL._serialized_end = 2297 + _DATATYPE_TIMESTAMP._serialized_start = 2299 + _DATATYPE_TIMESTAMP._serialized_end = 2368 + _DATATYPE_DATE._serialized_start = 2370 + _DATATYPE_DATE._serialized_end = 2434 + _DATATYPE_TIMESTAMPNTZ._serialized_start = 2436 + _DATATYPE_TIMESTAMPNTZ._serialized_end = 2508 + _DATATYPE_CALENDARINTERVAL._serialized_start = 2510 + _DATATYPE_CALENDARINTERVAL._serialized_end = 2586 + _DATATYPE_YEARMONTHINTERVAL._serialized_start = 2589 + _DATATYPE_YEARMONTHINTERVAL._serialized_end = 2768 + _DATATYPE_DAYTIMEINTERVAL._serialized_start = 2771 + _DATATYPE_DAYTIMEINTERVAL._serialized_end = 2948 + _DATATYPE_CHAR._serialized_start = 2950 + _DATATYPE_CHAR._serialized_end = 3038 + _DATATYPE_VARCHAR._serialized_start = 3040 + _DATATYPE_VARCHAR._serialized_end = 3131 + _DATATYPE_DECIMAL._serialized_start = 3134 + _DATATYPE_DECIMAL._serialized_end 
= 3287 + _DATATYPE_STRUCTFIELD._serialized_start = 3290 + _DATATYPE_STRUCTFIELD._serialized_end = 3451 + _DATATYPE_STRUCT._serialized_start = 3453 + _DATATYPE_STRUCT._serialized_end = 3580 + _DATATYPE_ARRAY._serialized_start = 3583 + _DATATYPE_ARRAY._serialized_end = 3745 + _DATATYPE_MAP._serialized_start = 3748 + _DATATYPE_MAP._serialized_end = 3967 + _DATATYPE_VARIANT._serialized_start = 3969 + _DATATYPE_VARIANT._serialized_end = 4036 + _DATATYPE_UDT._serialized_start = 4039 + _DATATYPE_UDT._serialized_end = 4310 + _DATATYPE_UNPARSED._serialized_start = 4312 + _DATATYPE_UNPARSED._serialized_end = 4364 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/types_pb2.pyi b/python/pyspark/sql/connect/proto/types_pb2.pyi index e6b34d3485c2f..b376211045377 100644 --- a/python/pyspark/sql/connect/proto/types_pb2.pyi +++ b/python/pyspark/sql/connect/proto/types_pb2.pyi @@ -178,22 +178,19 @@ class DataType(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor TYPE_VARIATION_REFERENCE_FIELD_NUMBER: builtins.int - COLLATION_ID_FIELD_NUMBER: builtins.int + COLLATION_FIELD_NUMBER: builtins.int type_variation_reference: builtins.int - collation_id: builtins.int + collation: builtins.str def __init__( self, *, type_variation_reference: builtins.int = ..., - collation_id: builtins.int = ..., + collation: builtins.str = ..., ) -> None: ... def ClearField( self, field_name: typing_extensions.Literal[ - "collation_id", - b"collation_id", - "type_variation_reference", - b"type_variation_reference", + "collation", b"collation", "type_variation_reference", b"type_variation_reference" ], ) -> None: ... diff --git a/python/pyspark/sql/connect/protobuf/functions.py b/python/pyspark/sql/connect/protobuf/functions.py index fcf1ed1ee02ee..07e9b4b8c6861 100644 --- a/python/pyspark/sql/connect/protobuf/functions.py +++ b/python/pyspark/sql/connect/protobuf/functions.py @@ -27,7 +27,7 @@ from pyspark.sql.protobuf import functions as PyProtobufFunctions -from pyspark.sql.connect.column import Column +from pyspark.sql.column import Column from pyspark.sql.connect.functions.builtin import _invoke_function, _to_col, _options_to_col, lit if TYPE_CHECKING: @@ -120,7 +120,6 @@ def _read_descriptor_set_file(filePath: str) -> bytes: def _test() -> None: import os import sys - from pyspark.util import is_remote_only from pyspark.testing.utils import search_jar protobuf_jar = search_jar("connector/protobuf", "spark-protobuf-assembly-", "spark-protobuf") @@ -142,12 +141,6 @@ def _test() -> None: import pyspark.sql.connect.protobuf.functions globs = pyspark.sql.connect.protobuf.functions.__dict__.copy() - - # TODO(SPARK-47763): Reeanble Protobuf function doctests - if is_remote_only(): - del pyspark.sql.connect.protobuf.functions.from_protobuf - del pyspark.sql.connect.protobuf.functions.to_protobuf - globs["spark"] = ( PySparkSession.builder.appName("sql.protobuf.functions tests") .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[2]")) diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py index bf7dc4d369057..de62cf65b01ed 100644 --- a/python/pyspark/sql/connect/readwriter.py +++ b/python/pyspark/sql/connect/readwriter.py @@ -19,7 +19,7 @@ check_dependencies(__name__) from typing import Dict -from typing import Optional, Union, List, overload, Tuple, cast +from typing import Optional, Union, List, overload, Tuple, cast, Callable from typing import TYPE_CHECKING from pyspark.sql.connect.plan import Read, 
DataSource, LogicalPlan, WriteOperation, WriteOperationV2 @@ -37,6 +37,7 @@ from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.connect._typing import ColumnOrName, OptionalPrimitiveType from pyspark.sql.connect.session import SparkSession + from pyspark.sql.metrics import ExecutionInfo __all__ = ["DataFrameReader", "DataFrameWriter"] @@ -486,11 +487,18 @@ def _jreader(self) -> None: class DataFrameWriter(OptionUtils): - def __init__(self, plan: "LogicalPlan", session: "SparkSession"): + def __init__( + self, + plan: "LogicalPlan", + session: "SparkSession", + callback: Optional[Callable[["ExecutionInfo"], None]] = None, + ): self._df: "LogicalPlan" = plan self._spark: "SparkSession" = session self._write: "WriteOperation" = WriteOperation(self._df) + self._callback = callback if callback is not None else lambda _: None + def mode(self, saveMode: Optional[str]) -> "DataFrameWriter": # At the JVM side, the default value of mode is already set to "error". # So, if the given saveMode is None, we will not call JVM-side's mode method. @@ -649,9 +657,10 @@ def save( if format is not None: self.format(format) self._write.path = path - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) save.__doc__ = PySparkDataFrameWriter.save.__doc__ @@ -660,9 +669,10 @@ def insertInto(self, tableName: str, overwrite: Optional[bool] = None) -> None: self.mode("overwrite" if overwrite else "append") self._write.table_name = tableName self._write.table_save_method = "insert_into" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) insertInto.__doc__ = PySparkDataFrameWriter.insertInto.__doc__ @@ -681,9 +691,10 @@ def saveAsTable( self.format(format) self._write.table_name = name self._write.table_save_method = "save_as_table" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) saveAsTable.__doc__ = PySparkDataFrameWriter.saveAsTable.__doc__ @@ -845,11 +856,18 @@ def jdbc( class DataFrameWriterV2(OptionUtils): - def __init__(self, plan: "LogicalPlan", session: "SparkSession", table: str): + def __init__( + self, + plan: "LogicalPlan", + session: "SparkSession", + table: str, + callback: Optional[Callable[["ExecutionInfo"], None]] = None, + ): self._df: "LogicalPlan" = plan self._spark: "SparkSession" = session self._table_name: str = table self._write: "WriteOperationV2" = WriteOperationV2(self._df, self._table_name) + self._callback = callback if callback is not None else lambda _: None def using(self, provider: str) -> "DataFrameWriterV2": self._write.provider = provider @@ -884,50 +902,56 @@ def partitionedBy(self, col: "ColumnOrName", *cols: "ColumnOrName") -> "DataFram def create(self) -> None: self._write.mode = "create" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) create.__doc__ = PySparkDataFrameWriterV2.create.__doc__ def replace(self) -> None: self._write.mode = "replace" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) replace.__doc__ = 
PySparkDataFrameWriterV2.replace.__doc__ def createOrReplace(self) -> None: self._write.mode = "create_or_replace" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) createOrReplace.__doc__ = PySparkDataFrameWriterV2.createOrReplace.__doc__ def append(self) -> None: self._write.mode = "append" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) append.__doc__ = PySparkDataFrameWriterV2.append.__doc__ def overwrite(self, condition: "ColumnOrName") -> None: self._write.mode = "overwrite" self._write.overwrite_condition = F._to_col(condition) - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) overwrite.__doc__ = PySparkDataFrameWriterV2.overwrite.__doc__ def overwritePartitions(self) -> None: self._write.mode = "overwrite_partitions" - self._spark.client.execute_command( + _, _, ei = self._spark.client.execute_command( self._write.command(self._spark.client), self._write.observations ) + self._callback(ei) overwritePartitions.__doc__ = PySparkDataFrameWriterV2.overwritePartitions.__doc__ diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index 3be6c83cf13ba..8e277b3fc63aa 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -15,6 +15,7 @@ # limitations under the License. # from pyspark.sql.connect.utils import check_dependencies +from pyspark.sql.utils import is_timestamp_ntz_preferred check_dependencies(__name__) @@ -22,7 +23,7 @@ import os import warnings from collections.abc import Sized -from functools import reduce +import functools from threading import RLock from typing import ( Optional, @@ -48,10 +49,11 @@ ) import urllib +from pyspark.sql.connect.dataframe import DataFrame +from pyspark.sql.dataframe import DataFrame as ParentDataFrame from pyspark.loose_version import LooseVersion from pyspark.sql.connect.client import SparkConnectClient, DefaultChannelBuilder from pyspark.sql.connect.conf import RuntimeConf -from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.connect.plan import ( SQL, Range, @@ -72,7 +74,9 @@ to_arrow_schema, to_arrow_type, _deduplicate_field_names, + from_arrow_schema, from_arrow_type, + _check_arrow_table_timestamps_localize, ) from pyspark.sql.profiler import Profile from pyspark.sql.session import classproperty, SparkSession as PySparkSession @@ -107,7 +111,6 @@ from pyspark.sql.connect.shell.progress import ProgressHandler from pyspark.sql.connect.datasource import DataSourceRegistration - try: import memory_profiler # noqa: F401 @@ -238,7 +241,7 @@ def getOrCreate(self) -> "SparkSession": with SparkSession._lock: session = SparkSession.getActiveSession() if session is None: - session = SparkSession._default_session + session = SparkSession._get_default_session() if session is None: session = self.create() self._apply_options(session) @@ -285,9 +288,19 @@ def _set_default_and_active_session(cls, session: "SparkSession") -> None: if getattr(cls._active_session, "session", None) is None: cls._active_session.session = session + @classmethod + def _get_default_session(cls) -> Optional["SparkSession"]: + s = cls._default_session + if s is not None and not s.is_stopped: + return s + return None + 
@classmethod def getActiveSession(cls) -> Optional["SparkSession"]: - return getattr(cls._active_session, "session", None) + s = getattr(cls._active_session, "session", None) + if s is not None and not s.is_stopped: + return s + return None @classmethod def _getActiveSessionIfMatches(cls, session_id: str) -> "SparkSession": @@ -315,7 +328,7 @@ def _getActiveSessionIfMatches(cls, session_id: str) -> "SparkSession": def active(cls) -> "SparkSession": session = cls.getActiveSession() if session is None: - session = cls._default_session + session = cls._get_default_session() if session is None: raise PySparkRuntimeError( error_class="NO_ACTIVE_OR_DEFAULT_SESSION", @@ -325,7 +338,7 @@ def active(cls) -> "SparkSession": active.__doc__ = PySparkSession.active.__doc__ - def table(self, tableName: str) -> DataFrame: + def table(self, tableName: str) -> ParentDataFrame: if not isinstance(tableName, str): raise PySparkTypeError( error_class="NOT_STR", @@ -378,13 +391,15 @@ def _inferSchemaFromList( ( infer_dict_as_struct, infer_array_from_first_element, + infer_map_from_first_pair, prefer_timestamp_ntz, ) = self._client.get_configs( "spark.sql.pyspark.inferNestedDictAsStruct.enabled", "spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled", + "spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled", "spark.sql.timestampType", ) - return reduce( + return functools.reduce( _merge_type, ( _infer_schema( @@ -392,6 +407,7 @@ def _inferSchemaFromList( names, infer_dict_as_struct=(infer_dict_as_struct == "true"), infer_array_from_first_element=(infer_array_from_first_element == "true"), + infer_map_from_first_pair=(infer_map_from_first_pair == "true"), prefer_timestamp_ntz=(prefer_timestamp_ntz == "TIMESTAMP_NTZ"), ) for row in data @@ -400,9 +416,11 @@ def _inferSchemaFromList( def createDataFrame( self, - data: Union["pd.DataFrame", "np.ndarray", Iterable[Any]], + data: Union["pd.DataFrame", "np.ndarray", "pa.Table", Iterable[Any]], schema: Optional[Union[AtomicType, StructType, str, List[str], Tuple[str, ...]]] = None, - ) -> "DataFrame": + samplingRatio: Optional[float] = None, + verifySchema: Optional[bool] = None, + ) -> "ParentDataFrame": assert data is not None if isinstance(data, DataFrame): raise PySparkTypeError( @@ -410,6 +428,12 @@ def createDataFrame( message_parameters={"arg_name": "data", "arg_type": "DataFrame"}, ) + if samplingRatio is not None: + warnings.warn("'samplingRatio' is ignored. It is not supported with Spark Connect.") + + if verifySchema is not None: + warnings.warn("'verifySchema' is ignored. 
It is not supported with Spark Connect.") + _schema: Optional[Union[AtomicType, StructType]] = None _cols: Optional[List[str]] = None _num_cols: Optional[int] = None @@ -455,6 +479,7 @@ def createDataFrame( ) _table: Optional[pa.Table] = None + timezone: Optional[str] = None if isinstance(data, pd.DataFrame): # Logic was borrowed from `_create_from_pandas_with_arrow` in @@ -540,6 +565,28 @@ def createDataFrame( cast(StructType, _deduplicate_field_names(schema)).names ).cast(arrow_schema) + elif isinstance(data, pa.Table): + prefer_timestamp_ntz = is_timestamp_ntz_preferred() + + (timezone,) = self._client.get_configs("spark.sql.session.timeZone") + + # If no schema supplied by user then get the names of columns only + if schema is None: + _cols = data.column_names + if isinstance(schema, (list, tuple)) and cast(int, _num_cols) < len(data.columns): + assert isinstance(_cols, list) + _cols.extend([f"_{i + 1}" for i in range(cast(int, _num_cols), len(data.columns))]) + _num_cols = len(_cols) + + if not isinstance(schema, StructType): + schema = from_arrow_schema(data.schema, prefer_timestamp_ntz=prefer_timestamp_ntz) + + _table = ( + _check_arrow_table_timestamps_localize(data, schema, True, timezone) + .cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True)) + .rename_columns(schema.names) + ) + elif isinstance(data, np.ndarray): if _cols is None: if data.ndim == 1 or data.shape[1] == 1: @@ -636,7 +683,7 @@ def createDataFrame( df = DataFrame(plan, self) if _cols is not None and len(_cols) > 0: - df = df.toDF(*_cols) + df = df.toDF(*_cols) # type: ignore[assignment] return df createDataFrame.__doc__ = PySparkSession.createDataFrame.__doc__ @@ -646,7 +693,7 @@ def sql( sqlQuery: str, args: Optional[Union[Dict[str, Any], List]] = None, **kwargs: Any, - ) -> "DataFrame": + ) -> "ParentDataFrame": _args = [] _named_args = {} if args is not None: @@ -673,9 +720,12 @@ def sql( _views.append(SubqueryAlias(df._plan, name)) cmd = SQL(sqlQuery, _args, _named_args, _views) - data, properties = self.client.execute_command(cmd.command(self._client)) + data, properties, ei = self.client.execute_command(cmd.command(self._client)) if "sql_command_result" in properties: - return DataFrame(CachedRelation(properties["sql_command_result"]), self) + df = DataFrame(CachedRelation(properties["sql_command_result"]), self) + # A command result contains the execution. + df._execution_info = ei + return df else: return DataFrame(cmd, self) @@ -687,7 +737,7 @@ def range( end: Optional[int] = None, step: int = 1, numPartitions: Optional[int] = None, - ) -> DataFrame: + ) -> ParentDataFrame: if end is None: actual_end = start start = 0 @@ -718,6 +768,9 @@ def catalog(self) -> "Catalog": def __del__(self) -> None: try: + # StreamingQueryManager has client states that needs to be cleaned up + if hasattr(self, "_sqm"): + self._sqm.close() # Try its best to close. 
self.client.close() except Exception: @@ -848,7 +901,7 @@ def dataSource(self) -> "DataSourceRegistration": dataSource.__doc__ = PySparkSession.dataSource.__doc__ - @property + @functools.cached_property def version(self) -> str: result = self._client._analyze(method="spark_version").spark_version assert result is not None @@ -896,13 +949,13 @@ def copyFromLocalToFs(self, local_path: str, dest_path: str) -> None: copyFromLocalToFs.__doc__ = PySparkSession.copyFromLocalToFs.__doc__ - def _create_remote_dataframe(self, remote_id: str) -> "DataFrame": + def _create_remote_dataframe(self, remote_id: str) -> "ParentDataFrame": """ In internal API to reference a runtime DataFrame on the server side. This is used in ForeachBatch() runner, where the remote DataFrame refers to the output of a micro batch. """ - return DataFrame(CachedRemoteRelation(remote_id), self) + return DataFrame(CachedRemoteRelation(remote_id, spark_session=self), self) @staticmethod def _start_connect_server(master: str, opts: Dict[str, Any]) -> None: diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 0624f8943ac40..13458d650fa9f 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -181,7 +181,7 @@ def _execute_streaming_query_cmd( cmd.query_id.run_id = self._run_id exec_cmd = pb2.Command() exec_cmd.streaming_query_command.CopyFrom(cmd) - (_, properties) = self._session.client.execute_command(exec_cmd) + (_, properties, _) = self._session.client.execute_command(exec_cmd) return cast(pb2.StreamingQueryCommandResult, properties["streaming_query_command_result"]) @@ -190,6 +190,9 @@ def __init__(self, session: "SparkSession") -> None: self._session = session self._sqlb = StreamingQueryListenerBus(self) + def close(self) -> None: + self._sqlb.close() + @property def active(self) -> List[StreamingQuery]: cmd = pb2.StreamingQueryManagerCommand() @@ -257,7 +260,7 @@ def _execute_streaming_query_manager_cmd( ) -> pb2.StreamingQueryManagerCommandResult: exec_cmd = pb2.Command() exec_cmd.streaming_query_manager_command.CopyFrom(cmd) - (_, properties) = self._session.client.execute_command(exec_cmd) + (_, properties, _) = self._session.client.execute_command(exec_cmd) return cast( pb2.StreamingQueryManagerCommandResult, properties["streaming_query_manager_command_result"], @@ -276,6 +279,10 @@ def __init__(self, sqm: "StreamingQueryManager") -> None: self._execution_thread: Optional[Thread] = None self._lock = Lock() + def close(self) -> None: + for listener in self._listener_bus: + self.remove(listener) + def append(self, listener: StreamingQueryListener) -> None: """ Append a listener to the local listener bus. 
When the added listener is diff --git a/python/pyspark/sql/connect/streaming/readwriter.py b/python/pyspark/sql/connect/streaming/readwriter.py index 4973bb5b6cf73..9b11bf328b853 100644 --- a/python/pyspark/sql/connect/streaming/readwriter.py +++ b/python/pyspark/sql/connect/streaming/readwriter.py @@ -446,6 +446,11 @@ def partitionBy(self, *cols: str) -> "DataStreamWriter": # type: ignore[misc] partitionBy.__doc__ = PySparkDataStreamWriter.partitionBy.__doc__ def queryName(self, queryName: str) -> "DataStreamWriter": + if not queryName or type(queryName) != str or len(queryName.strip()) == 0: + raise PySparkValueError( + error_class="VALUE_NOT_NON_EMPTY_STR", + message_parameters={"arg_name": "queryName", "arg_value": str(queryName)}, + ) self._write_proto.query_name = queryName return self @@ -596,7 +601,7 @@ def _start_internal( self._write_proto.table_name = tableName cmd = self._write_stream.command(self._session.client) - (_, properties) = self._session.client.execute_command(cmd) + (_, properties, _) = self._session.client.execute_command(cmd) start_result = cast( pb2.WriteStreamOperationStartResult, properties["write_stream_operation_start_result"] @@ -605,7 +610,9 @@ def _start_internal( session=self._session, queryId=start_result.query_id.id, runId=start_result.query_id.run_id, - name=start_result.name, + # A Streaming Query cannot have empty string as name + # Spark throws error in that case, so this cast is safe + name=start_result.name if start_result.name != "" else None, ) if start_result.HasField("query_started_event_json"): diff --git a/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py b/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py index 92ed7a4aaff53..0c92de6372b6f 100644 --- a/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +++ b/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py @@ -62,13 +62,6 @@ def main(infile: IO, outfile: IO) -> None: assert spark_connect_session.session_id == session_id spark = spark_connect_session - # TODO(SPARK-44461): Enable Process Isolation - - func = worker.read_command(pickle_ser, infile) - write_int(0, outfile) # Indicate successful initialization - - outfile.flush() - log_name = "Streaming ForeachBatch worker" def process(df_id, batch_id): # type: ignore[no-untyped-def] @@ -78,16 +71,21 @@ def process(df_id, batch_id): # type: ignore[no-untyped-def] func(batch_df, batch_id) print(f"{log_name} Completed batch {batch_id} with DF id {df_id}") - while True: - df_ref_id = utf8_deserializer.loads(infile) - batch_id = read_long(infile) - # Handle errors inside Python worker. Write 0 to outfile if no errors and write -2 with - # traceback string if error occurs. - try: + try: + func = worker.read_command(pickle_ser, infile) + write_int(0, outfile) + outfile.flush() + + while True: + df_ref_id = utf8_deserializer.loads(infile) + batch_id = read_long(infile) + # Handle errors inside Python worker. Write 0 to outfile if no errors and write -2 with + # traceback string if error occurs. process(df_ref_id, int(batch_id)) write_int(0, outfile) - except BaseException as e: - handle_worker_exception(e, outfile) + outfile.flush() + except Exception as e: + handle_worker_exception(e, outfile) outfile.flush() @@ -98,4 +96,6 @@ def process(df_id, batch_id): # type: ignore[no-untyped-def] (sock_file, sock) = local_connect_and_auth(java_port, auth_secret) # There could be a long time between each micro batch. 
sock.settimeout(None) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/connect/streaming/worker/listener_worker.py b/python/pyspark/sql/connect/streaming/worker/listener_worker.py index d3efb5894fc02..a7a5066ca0d77 100644 --- a/python/pyspark/sql/connect/streaming/worker/listener_worker.py +++ b/python/pyspark/sql/connect/streaming/worker/listener_worker.py @@ -70,8 +70,6 @@ def main(infile: IO, outfile: IO) -> None: assert spark_connect_session.session_id == session_id spark = spark_connect_session - # TODO(SPARK-44461): Enable Process Isolation - listener = worker.read_command(pickle_ser, infile) write_int(0, outfile) # Indicate successful initialization @@ -112,4 +110,6 @@ def process(listener_event_str, listener_event_type): # type: ignore[no-untyped (sock_file, sock) = local_connect_and_auth(java_port, auth_secret) # There could be a long time between each listener event. sock.settimeout(None) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/connect/types.py b/python/pyspark/sql/connect/types.py index f058c6390612a..885ce62e7db6f 100644 --- a/python/pyspark/sql/connect/types.py +++ b/python/pyspark/sql/connect/types.py @@ -55,16 +55,6 @@ import pyspark.sql.connect.proto as pb2 -JVM_BYTE_MIN: int = -(1 << 7) -JVM_BYTE_MAX: int = (1 << 7) - 1 -JVM_SHORT_MIN: int = -(1 << 15) -JVM_SHORT_MAX: int = (1 << 15) - 1 -JVM_INT_MIN: int = -(1 << 31) -JVM_INT_MAX: int = (1 << 31) - 1 -JVM_LONG_MIN: int = -(1 << 63) -JVM_LONG_MAX: int = (1 << 63) - 1 - - class UnparsedDataType(DataType): """ Unparsed data type. @@ -139,7 +129,7 @@ def pyspark_types_to_proto_types(data_type: DataType) -> pb2.DataType: if isinstance(data_type, NullType): ret.null.CopyFrom(pb2.DataType.NULL()) elif isinstance(data_type, StringType): - ret.string.collation_id = data_type.collationId + ret.string.collation = data_type.collation elif isinstance(data_type, BooleanType): ret.boolean.CopyFrom(pb2.DataType.Boolean()) elif isinstance(data_type, BinaryType): @@ -239,7 +229,8 @@ def proto_schema_to_pyspark_data_type(schema: pb2.DataType) -> DataType: s = schema.decimal.scale if schema.decimal.HasField("scale") else 0 return DecimalType(precision=p, scale=s) elif schema.HasField("string"): - return StringType.fromCollationId(schema.string.collation_id) + collation = schema.string.collation if schema.string.collation != "" else "UTF8_BINARY" + return StringType(collation) elif schema.HasField("char"): return CharType(schema.char.length) elif schema.HasField("var_char"): diff --git a/python/pyspark/sql/connect/udf.py b/python/pyspark/sql/connect/udf.py index f3aa719b2bb64..f5daf3ff841fd 100644 --- a/python/pyspark/sql/connect/udf.py +++ b/python/pyspark/sql/connect/udf.py @@ -36,9 +36,11 @@ PythonUDF, ) from pyspark.sql.connect.column import Column -from pyspark.sql.connect.types import UnparsedDataType -from pyspark.sql.types import DataType, StringType -from pyspark.sql.udf import UDFRegistration as PySparkUDFRegistration +from pyspark.sql.types import DataType, StringType, _parse_datatype_string +from pyspark.sql.udf import ( + UDFRegistration as PySparkUDFRegistration, + UserDefinedFunction as PySparkUserDefinedFunction, +) from pyspark.errors import PySparkTypeError, PySparkRuntimeError if TYPE_CHECKING: @@ -148,20 +150,36 @@ def __init__( ) self.func = func - self.returnType: DataType = ( - UnparsedDataType(returnType) if isinstance(returnType, str) else returnType - ) + self._returnType = 
returnType + self._returnType_placeholder: Optional[DataType] = None self._name = name or ( func.__name__ if hasattr(func, "__name__") else func.__class__.__name__ ) self.evalType = evalType self.deterministic = deterministic + @property + def returnType(self) -> DataType: + # Make sure this is called after Connect Session is initialized. + # ``_parse_datatype_string`` accesses to Connect Server for parsing a DDL formatted string. + # TODO: PythonEvalType.SQL_BATCHED_UDF + if self._returnType_placeholder is None: + if isinstance(self._returnType, DataType): + self._returnType_placeholder = self._returnType + else: + self._returnType_placeholder = _parse_datatype_string(self._returnType) + + PySparkUserDefinedFunction._check_return_type(self._returnType_placeholder, self.evalType) + return self._returnType_placeholder + def _build_common_inline_user_defined_function( self, *args: "ColumnOrName", **kwargs: "ColumnOrName" ) -> CommonInlineUserDefinedFunction: def to_expr(col: "ColumnOrName") -> Expression: - return col._expr if isinstance(col, Column) else ColumnReference(col) + if isinstance(col, Column): + return col._expr + else: + return ColumnReference(col) # type: ignore[arg-type] arg_exprs: List[Expression] = [to_expr(arg) for arg in args] + [ NamedArgumentExpression(key, to_expr(value)) for key, value in kwargs.items() diff --git a/python/pyspark/sql/connect/udtf.py b/python/pyspark/sql/connect/udtf.py index 4ee39dc89b8e8..739289d72a3b1 100644 --- a/python/pyspark/sql/connect/udtf.py +++ b/python/pyspark/sql/connect/udtf.py @@ -141,7 +141,10 @@ def _build_common_inline_user_defined_table_function( self, *args: "ColumnOrName", **kwargs: "ColumnOrName" ) -> CommonInlineUserDefinedTableFunction: def to_expr(col: "ColumnOrName") -> Expression: - return col._expr if isinstance(col, Column) else ColumnReference(col) + if isinstance(col, Column): + return col._expr + else: + return ColumnReference(col) # type: ignore[arg-type] arg_exprs: List[Expression] = [to_expr(arg) for arg in args] + [ NamedArgumentExpression(key, to_expr(value)) for key, value in kwargs.items() diff --git a/python/pyspark/sql/connect/window.py b/python/pyspark/sql/connect/window.py index e30a5b7d7a9e2..cbca6886060cf 100644 --- a/python/pyspark/sql/connect/window.py +++ b/python/pyspark/sql/connect/window.py @@ -18,42 +18,37 @@ check_dependencies(__name__) -import sys -from typing import TYPE_CHECKING, Union, Sequence, List, Optional - -from pyspark.sql.connect.column import Column -from pyspark.sql.connect.expressions import ( - ColumnReference, - Expression, - SortOrder, -) -from pyspark.sql.connect.types import ( - JVM_LONG_MIN, - JVM_LONG_MAX, +from typing import TYPE_CHECKING, Union, Sequence, List, Optional, Tuple, cast, Iterable + +from pyspark.sql.column import Column +from pyspark.sql.window import ( + Window as ParentWindow, + WindowSpec as ParentWindowSpec, ) -from pyspark.sql.window import Window as PySparkWindow, WindowSpec as PySparkWindowSpec -from pyspark.errors import PySparkTypeError +from pyspark.sql.connect.expressions import Expression, SortOrder +from pyspark.sql.connect.functions import builtin as F if TYPE_CHECKING: - from pyspark.sql.connect._typing import ColumnOrName + from pyspark.sql.connect._typing import ColumnOrName, ColumnOrName_ __all__ = ["Window", "WindowSpec"] +def _to_cols(cols: Tuple[Union["ColumnOrName", List["ColumnOrName_"]], ...]) -> List[Column]: + if len(cols) == 1 and isinstance(cols[0], list): + cols = cols[0] # type: ignore[assignment] + return [F._to_col(c) for c in 
cast(Iterable["ColumnOrName"], cols)] + + class WindowFrame: def __init__(self, isRowFrame: bool, start: int, end: int) -> None: super().__init__() assert isinstance(isRowFrame, bool) - assert isinstance(start, int) - assert isinstance(end, int) - self._isRowFrame = isRowFrame - self._start = start - self._end = end def __repr__(self) -> str: @@ -63,7 +58,17 @@ def __repr__(self) -> str: return f"WindowFrame(RANGE_FRAME, {self._start}, {self._end})" -class WindowSpec: +class WindowSpec(ParentWindowSpec): + def __new__( + cls, + partitionSpec: Sequence[Expression], + orderSpec: Sequence[SortOrder], + frame: Optional[WindowFrame], + ) -> "WindowSpec": + self = object.__new__(cls) + self.__init__(partitionSpec, orderSpec, frame) # type: ignore[misc] + return self + def __init__( self, partitionSpec: Sequence[Expression], @@ -73,87 +78,27 @@ def __init__( assert isinstance(partitionSpec, list) and all( isinstance(p, Expression) for p in partitionSpec ) - assert isinstance(orderSpec, list) and all(isinstance(s, SortOrder) for s in orderSpec) - assert frame is None or isinstance(frame, WindowFrame) - self._partitionSpec = partitionSpec - self._orderSpec = orderSpec - self._frame = frame - def partitionBy(self, *cols: Union["ColumnOrName", List["ColumnOrName"]]) -> "WindowSpec": - _cols: List[ColumnOrName] = [] - for col in cols: - if isinstance(col, (str, Column)): - _cols.append(col) - elif isinstance(col, list): - for c in col: - if isinstance(c, (str, Column)): - _cols.append(c) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_LIST_OR_STR", - message_parameters={"arg_name": "cols", "arg_type": type(c).__name__}, - ) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_LIST_OR_STR", - message_parameters={"arg_name": "cols", "arg_type": type(col).__name__}, - ) - - newPartitionSpec: List[Expression] = [] - for c in _cols: - if isinstance(c, Column): - newPartitionSpec.append(c._expr) - else: - newPartitionSpec.append(ColumnReference(c)) - + def partitionBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: return WindowSpec( - partitionSpec=newPartitionSpec, + partitionSpec=[c._expr for c in _to_cols(cols)], # type: ignore[misc] orderSpec=self._orderSpec, frame=self._frame, ) - def orderBy(self, *cols: Union["ColumnOrName", List["ColumnOrName"]]) -> "WindowSpec": - _cols: List[ColumnOrName] = [] - for col in cols: - if isinstance(col, (str, Column)): - _cols.append(col) - elif isinstance(col, list): - for c in col: - if isinstance(c, (str, Column)): - _cols.append(c) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_LIST_OR_STR", - message_parameters={"arg_name": "cols", "arg_type": type(c).__name__}, - ) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_LIST_OR_STR", - message_parameters={"arg_name": "cols", "arg_type": type(col).__name__}, - ) - - newOrderSpec: List[SortOrder] = [] - for c in _cols: - if isinstance(c, Column): - if isinstance(c._expr, SortOrder): - newOrderSpec.append(c._expr) - else: - newOrderSpec.append(SortOrder(c._expr)) - else: - newOrderSpec.append(SortOrder(ColumnReference(c))) - + def orderBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: return WindowSpec( partitionSpec=self._partitionSpec, - orderSpec=newOrderSpec, + orderSpec=[cast(SortOrder, F._sort_col(c)._expr) for c in _to_cols(cols)], frame=self._frame, ) - def rowsBetween(self, start: int, end: int) -> "WindowSpec": + def rowsBetween(self, start: int, end: int) -> ParentWindowSpec: if start <= 
Window._PRECEDING_THRESHOLD: start = Window.unboundedPreceding if end >= Window._FOLLOWING_THRESHOLD: @@ -165,7 +110,7 @@ def rowsBetween(self, start: int, end: int) -> "WindowSpec": frame=WindowFrame(isRowFrame=True, start=start, end=end), ) - def rangeBetween(self, start: int, end: int) -> "WindowSpec": + def rangeBetween(self, start: int, end: int) -> ParentWindowSpec: if start <= Window._PRECEDING_THRESHOLD: start = Window.unboundedPreceding if end >= Window._FOLLOWING_THRESHOLD: @@ -190,57 +135,34 @@ def __repr__(self) -> str: return "WindowSpec(" + ", ".join(strs) + ")" -WindowSpec.rangeBetween.__doc__ = PySparkWindowSpec.rangeBetween.__doc__ -WindowSpec.rowsBetween.__doc__ = PySparkWindowSpec.rowsBetween.__doc__ -WindowSpec.orderBy.__doc__ = PySparkWindowSpec.orderBy.__doc__ -WindowSpec.partitionBy.__doc__ = PySparkWindowSpec.partitionBy.__doc__ -WindowSpec.__doc__ = PySparkWindowSpec.__doc__ - - -class Window: - _PRECEDING_THRESHOLD = max(-sys.maxsize, JVM_LONG_MIN) - _FOLLOWING_THRESHOLD = min(sys.maxsize, JVM_LONG_MAX) - - unboundedPreceding: int = JVM_LONG_MIN - - unboundedFollowing: int = JVM_LONG_MAX - - currentRow: int = 0 - +class Window(ParentWindow): _spec = WindowSpec(partitionSpec=[], orderSpec=[], frame=None) @staticmethod - def partitionBy(*cols: Union["ColumnOrName", List["ColumnOrName"]]) -> "WindowSpec": + def partitionBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: return Window._spec.partitionBy(*cols) @staticmethod - def orderBy(*cols: Union["ColumnOrName", List["ColumnOrName"]]) -> "WindowSpec": + def orderBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> ParentWindowSpec: return Window._spec.orderBy(*cols) @staticmethod - def rowsBetween(start: int, end: int) -> "WindowSpec": + def rowsBetween(start: int, end: int) -> ParentWindowSpec: return Window._spec.rowsBetween(start, end) @staticmethod - def rangeBetween(start: int, end: int) -> "WindowSpec": + def rangeBetween(start: int, end: int) -> ParentWindowSpec: return Window._spec.rangeBetween(start, end) -Window.orderBy.__doc__ = PySparkWindow.orderBy.__doc__ -Window.rowsBetween.__doc__ = PySparkWindow.rowsBetween.__doc__ -Window.rangeBetween.__doc__ = PySparkWindow.rangeBetween.__doc__ -Window.partitionBy.__doc__ = PySparkWindow.partitionBy.__doc__ -Window.__doc__ = PySparkWindow.__doc__ - - def _test() -> None: import os import sys import doctest from pyspark.sql import SparkSession as PySparkSession - import pyspark.sql.connect.window + import pyspark.sql.window - globs = pyspark.sql.connect.window.__dict__.copy() + globs = pyspark.sql.window.__dict__.copy() globs["spark"] = ( PySparkSession.builder.appName("sql.connect.window tests") .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]")) @@ -248,7 +170,7 @@ def _test() -> None: ) (failure_count, test_count) = doctest.testmod( - pyspark.sql.connect.window, + pyspark.sql.window, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index cbb0299e2195d..3fe47615b8761 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -46,6 +46,7 @@ if TYPE_CHECKING: from py4j.java_gateway import JavaObject + import pyarrow as pa from pyspark.core.rdd import RDD from pyspark.core.context import SparkContext from pyspark.sql._typing import ( @@ -343,14 +344,14 @@ def createDataFrame( @overload def createDataFrame( - self, data: "PandasDataFrameLike", samplingRatio: Optional[float] = ... 
+ self, data: Union["PandasDataFrameLike", "pa.Table"], samplingRatio: Optional[float] = ... ) -> DataFrame: ... @overload def createDataFrame( self, - data: "PandasDataFrameLike", + data: Union["PandasDataFrameLike", "pa.Table"], schema: Union[StructType, str], verifySchema: bool = ..., ) -> DataFrame: @@ -358,13 +359,14 @@ def createDataFrame( def createDataFrame( # type: ignore[misc] self, - data: Union["RDD[Any]", Iterable[Any], "PandasDataFrameLike"], + data: Union["RDD[Any]", Iterable[Any], "PandasDataFrameLike", "pa.Table"], schema: Optional[Union[AtomicType, StructType, str]] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True, ) -> DataFrame: """ - Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. + Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`, or + a :class:`pyarrow.Table`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. @@ -393,12 +395,15 @@ def createDataFrame( # type: ignore[misc] .. versionchanged:: 2.1.0 Added verifySchema. + .. versionchanged:: 4.0.0 + Added support for :class:`pyarrow.Table`. + Parameters ---------- data : :class:`RDD` or iterable an RDD of any kind of SQL data representation (:class:`Row`, - :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, or - :class:`pandas.DataFrame`. + :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, + :class:`pandas.DataFrame`, or :class:`pyarrow.Table`. schema : :class:`pyspark.sql.types.DataType`, str or list, optional a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is None. The data type string format equals to @@ -452,6 +457,12 @@ def createDataFrame( # type: ignore[misc] >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] + >>> sqlContext.createDataFrame(df.toArrow()).collect() # doctest: +SKIP + [Row(name='Alice', age=1)] + >>> table = pyarrow.table({'0': [1], '1': [2]}) # doctest: +SKIP + >>> sqlContext.createDataFrame(table).collect() # doctest: +SKIP + [Row(0=1, 1=2)] + >>> sqlContext.createDataFrame(rdd, "a: string, b: int").collect() [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index a599d04705184..625678588bf9e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -15,13 +15,8 @@ # limitations under the License. 
# -import json -import os -import sys -import random -import warnings -from collections.abc import Iterable -from functools import reduce +# mypy: disable-error-code="empty-body" + from typing import ( Any, Callable, @@ -31,45 +26,27 @@ Optional, Sequence, Tuple, - Type, Union, - cast, overload, TYPE_CHECKING, ) from pyspark import _NoValue from pyspark._globals import _NoValueType -from pyspark.errors import ( - PySparkTypeError, - PySparkValueError, - PySparkIndexError, - PySparkAttributeError, -) -from pyspark.util import ( - is_remote_only, - _load_from_socket, - _local_iterator_from_socket, -) -from pyspark.serializers import BatchedSerializer, CPickleSerializer, UTF8Deserializer +from pyspark.util import is_remote_only from pyspark.storagelevel import StorageLevel -from pyspark.traceback_utils import SCCallSiteSync -from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column +from pyspark.resource import ResourceProfile +from pyspark.sql.column import Column from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2 from pyspark.sql.streaming import DataStreamWriter -from pyspark.sql.types import ( - StructType, - Row, - _parse_datatype_json_string, -) -from pyspark.sql.utils import get_active_spark_context, toJArray -from pyspark.sql.pandas.conversion import PandasConversionMixin -from pyspark.sql.pandas.map_ops import PandasMapOpsMixin +from pyspark.sql.types import StructType, Row +from pyspark.sql.utils import dispatch_df_method if TYPE_CHECKING: from py4j.java_gateway import JavaObject - from pyspark.core.rdd import RDD + import pyarrow as pa from pyspark.core.context import SparkContext + from pyspark.core.rdd import RDD from pyspark._typing import PrimitiveType from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame from pyspark.sql._typing import ( @@ -82,12 +59,18 @@ from pyspark.sql.session import SparkSession from pyspark.sql.group import GroupedData from pyspark.sql.observation import Observation + from pyspark.sql.pandas._typing import ( + PandasMapIterFunction, + ArrowMapIterFunction, + DataFrameLike as PandasDataFrameLike, + ) + from pyspark.sql.metrics import ExecutionInfo __all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"] -class DataFrame(PandasMapOpsMixin, PandasConversionMixin): +class DataFrame: """A distributed collection of data grouped into named columns. .. versionadded:: 1.3.0 @@ -125,12 +108,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> people.filter(people.age > 30).join( ... department, people.deptId == department.id).groupBy( - ... department.name, "gender").agg({"salary": "avg", "age": "max"}).show() + ... department.name, "gender").agg( + ... {"salary": "avg", "age": "max"}).sort("max(age)").show() +-------+------+-----------+--------+ | name|gender|avg(salary)|max(age)| +-------+------+-----------+--------+ - | ML| F| 150.0| 60| |PySpark| M| 75.0| 50| + | ML| F| 150.0| 60| +-------+------+-----------+--------+ Notes @@ -139,49 +123,26 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): created via using the constructor. """ - def __init__( - self, + # HACK ALERT!! this is to reduce the backward compatibility concern, and returns + # Spark Classic DataFrame by default. This is NOT an API, and NOT supposed to + # be directly invoked. DO NOT use this constructor. 
+ _sql_ctx: Optional["SQLContext"] + _session: "SparkSession" + _sc: "SparkContext" + _jdf: "JavaObject" + is_cached: bool + _schema: Optional[StructType] + _lazy_rdd: Optional["RDD[Row]"] + _support_repr_html: bool + + def __new__( + cls, jdf: "JavaObject", sql_ctx: Union["SQLContext", "SparkSession"], - ): - from pyspark.sql.context import SQLContext - - self._sql_ctx: Optional["SQLContext"] = None - - if isinstance(sql_ctx, SQLContext): - assert not os.environ.get("SPARK_TESTING") # Sanity check for our internal usage. - assert isinstance(sql_ctx, SQLContext) - # We should remove this if-else branch in the future release, and rename - # sql_ctx to session in the constructor. This is an internal code path but - # was kept with a warning because it's used intensively by third-party libraries. - warnings.warn("DataFrame constructor is internal. Do not directly use it.") - self._sql_ctx = sql_ctx - session = sql_ctx.sparkSession - else: - session = sql_ctx - self._session: "SparkSession" = session - - self._sc: "SparkContext" = sql_ctx._sc - self._jdf: "JavaObject" = jdf - self.is_cached = False - # initialized lazily - self._schema: Optional[StructType] = None - self._lazy_rdd: Optional["RDD[Row]"] = None - # Check whether _repr_html is supported or not, we use it to avoid calling _jdf twice - # by __repr__ and _repr_html_ while eager evaluation opens. - self._support_repr_html = False - - @property - def sql_ctx(self) -> "SQLContext": - from pyspark.sql.context import SQLContext + ) -> "DataFrame": + from pyspark.sql.classic.dataframe import DataFrame - warnings.warn( - "DataFrame.sql_ctx is an internal property, and will be removed " - "in future releases. Use DataFrame.sparkSession instead." - ) - if self._sql_ctx is None: - self._sql_ctx = SQLContext._get_or_create(self._sc) - return self._sql_ctx + return DataFrame.__new__(DataFrame, jdf, sql_ctx) @property def sparkSession(self) -> "SparkSession": @@ -202,7 +163,7 @@ def sparkSession(self) -> "SparkSession": >>> type(df.sparkSession) """ - return self._session + ... if not is_remote_only(): @@ -222,14 +183,7 @@ def rdd(self) -> "RDD[Row]": >>> type(df.rdd) """ - from pyspark.core.rdd import RDD - - if self._lazy_rdd is None: - jrdd = self._jdf.javaToPython() - self._lazy_rdd = RDD( - jrdd, self.sparkSession._sc, BatchedSerializer(CPickleSerializer()) - ) - return self._lazy_rdd + ... @property def na(self) -> "DataFrameNaFunctions": @@ -259,7 +213,7 @@ def na(self) -> "DataFrameNaFunctions": | 1| 2| +---+---+ """ - return DataFrameNaFunctions(self) + ... @property def stat(self) -> "DataFrameStatFunctions": @@ -283,7 +237,7 @@ def stat(self) -> "DataFrameStatFunctions": >>> df.stat.corr("id", "c") 1.0 """ - return DataFrameStatFunctions(self) + ... if not is_remote_only(): @@ -309,11 +263,9 @@ def toJSON(self, use_unicode: bool = True) -> "RDD[str]": >>> df.toJSON().first() '{"age":2,"name":"Alice"}' """ - from pyspark.core.rdd import RDD - - rdd = self._jdf.toJSON() - return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) + ... + @dispatch_df_method def registerTempTable(self, name: str) -> None: """Registers this :class:`DataFrame` as a temporary table using the given name. @@ -344,9 +296,9 @@ def registerTempTable(self, name: str) -> None: True """ - warnings.warn("Deprecated in 2.0, use createOrReplaceTempView instead.", FutureWarning) - self._jdf.createOrReplaceTempView(name) + ... + @dispatch_df_method def createTempView(self, name: str) -> None: """Creates a local temporary view with this :class:`DataFrame`. 
@@ -410,8 +362,9 @@ def createTempView(self, name: str) -> None: | 4|Jill| +---+----+ """ - self._jdf.createTempView(name) + ... + @dispatch_df_method def createOrReplaceTempView(self, name: str) -> None: """Creates or replaces a local temporary view with this :class:`DataFrame`. @@ -453,8 +406,9 @@ def createOrReplaceTempView(self, name: str) -> None: ... spark.catalog.dropTempView("people") True """ - self._jdf.createOrReplaceTempView(name) + ... + @dispatch_df_method def createGlobalTempView(self, name: str) -> None: """Creates a global temporary view with this :class:`DataFrame`. @@ -501,8 +455,9 @@ def createGlobalTempView(self, name: str) -> None: >>> spark.catalog.dropGlobalTempView("people") True """ - self._jdf.createGlobalTempView(name) + ... + @dispatch_df_method def createOrReplaceGlobalTempView(self, name: str) -> None: """Creates or replaces a global temporary view using the given name. @@ -537,7 +492,7 @@ def createOrReplaceGlobalTempView(self, name: str) -> None: >>> spark.catalog.dropGlobalTempView("people") True """ - self._jdf.createOrReplaceGlobalTempView(name) + ... @property def write(self) -> DataFrameWriter: @@ -566,7 +521,7 @@ def write(self) -> DataFrameWriter: >>> df.write.saveAsTable("tab2") >>> _ = spark.sql("DROP TABLE tab2") """ - return DataFrameWriter(self) + ... @property def writeStream(self) -> DataStreamWriter: @@ -602,7 +557,7 @@ def writeStream(self) -> DataStreamWriter: ... time.sleep(3) ... query.stop() """ - return DataStreamWriter(self) + ... @property def schema(self) -> StructType: @@ -646,18 +601,9 @@ def schema(self) -> StructType: StructType([StructField('value', StringType(), False)]) """ - if self._schema is None: - try: - self._schema = cast( - StructType, _parse_datatype_json_string(self._jdf.schema().json()) - ) - except Exception as e: - raise PySparkValueError( - error_class="CANNOT_PARSE_DATATYPE", - message_parameters={"error": str(e)}, - ) - return self._schema + ... + @dispatch_df_method def printSchema(self, level: Optional[int] = None) -> None: """Prints out the schema in the tree format. Optionally allows to specify how many levels to print if schema is nested. @@ -710,11 +656,9 @@ def printSchema(self, level: Optional[int] = None) -> None: |-- nonnullable: long (nullable = false) |-- nullable: void (nullable = true) """ - if level: - print(self._jdf.schema().treeString(level)) - else: - print(self._jdf.schema().treeString()) + ... + @dispatch_df_method def explain( self, extended: Optional[Union[bool, str]] = None, mode: Optional[str] = None ) -> None: @@ -782,56 +726,9 @@ def explain( ...Statistics... ... """ + ... 
- if extended is not None and mode is not None: - raise PySparkValueError( - error_class="CANNOT_SET_TOGETHER", - message_parameters={"arg_list": "extended and mode"}, - ) - - # For the no argument case: df.explain() - is_no_argument = extended is None and mode is None - - # For the cases below: - # explain(True) - # explain(extended=False) - is_extended_case = isinstance(extended, bool) and mode is None - - # For the case when extended is mode: - # df.explain("formatted") - is_extended_as_mode = isinstance(extended, str) and mode is None - - # For the mode specified: - # df.explain(mode="formatted") - is_mode_case = extended is None and isinstance(mode, str) - - if not (is_no_argument or is_extended_case or is_extended_as_mode or is_mode_case): - if (extended is not None) and (not isinstance(extended, (bool, str))): - raise PySparkTypeError( - error_class="NOT_BOOL_OR_STR", - message_parameters={ - "arg_name": "extended", - "arg_type": type(extended).__name__, - }, - ) - if (mode is not None) and (not isinstance(mode, str)): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "mode", "arg_type": type(mode).__name__}, - ) - - # Sets an explain mode depending on a given argument - if is_no_argument: - explain_mode = "simple" - elif is_extended_case: - explain_mode = "extended" if extended else "simple" - elif is_mode_case: - explain_mode = cast(str, mode) - elif is_extended_as_mode: - explain_mode = cast(str, extended) - assert self._sc._jvm is not None - print(self._sc._jvm.PythonSQLUtils.explainString(self._jdf.queryExecution(), explain_mode)) - + @dispatch_df_method def exceptAll(self, other: "DataFrame") -> "DataFrame": """Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but not in another :class:`DataFrame` while preserving duplicates. @@ -869,8 +766,9 @@ def exceptAll(self, other: "DataFrame") -> "DataFrame": +---+---+ """ - return DataFrame(self._jdf.exceptAll(other._jdf), self.sparkSession) + ... + @dispatch_df_method def isLocal(self) -> bool: """Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally (without any Spark executors). @@ -890,7 +788,7 @@ def isLocal(self) -> bool: >>> df.isLocal() True """ - return self._jdf.isLocal() + ... @property def isStreaming(self) -> bool: @@ -921,8 +819,9 @@ def isStreaming(self) -> bool: >>> df.isStreaming True """ - return self._jdf.isStreaming() + ... + @dispatch_df_method def isEmpty(self) -> bool: """ Checks if the :class:`DataFrame` is empty and returns a boolean value. @@ -972,8 +871,9 @@ def isEmpty(self) -> bool: >>> df_no_rows.isEmpty() True """ - return self._jdf.isEmpty() + ... + @dispatch_df_method def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False) -> None: """ Prints the first ``n`` rows of the DataFrame to the console. @@ -1063,73 +963,32 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = age | 19 name | This is a super l... 
""" - print(self._show_string(n, truncate, vertical)) - - def _show_string( - self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False - ) -> str: - if not isinstance(n, int) or isinstance(n, bool): - raise PySparkTypeError( - error_class="NOT_INT", - message_parameters={"arg_name": "n", "arg_type": type(n).__name__}, - ) - - if not isinstance(vertical, bool): - raise PySparkTypeError( - error_class="NOT_BOOL", - message_parameters={"arg_name": "vertical", "arg_type": type(vertical).__name__}, - ) - - if isinstance(truncate, bool) and truncate: - return self._jdf.showString(n, 20, vertical) - else: - try: - int_truncate = int(truncate) - except ValueError: - raise PySparkTypeError( - error_class="NOT_BOOL", - message_parameters={ - "arg_name": "truncate", - "arg_type": type(truncate).__name__, - }, - ) - - return self._jdf.showString(n, int_truncate, vertical) + ... + @dispatch_df_method def __repr__(self) -> str: - if not self._support_repr_html and self.sparkSession._jconf.isReplEagerEvalEnabled(): - vertical = False - return self._jdf.showString( - self.sparkSession._jconf.replEagerEvalMaxNumRows(), - self.sparkSession._jconf.replEagerEvalTruncate(), - vertical, - ) - else: - return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) + ... + @dispatch_df_method def _repr_html_(self) -> Optional[str]: """Returns a :class:`DataFrame` with html code when you enabled eager evaluation by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are using support eager evaluation with HTML. """ - if not self._support_repr_html: - self._support_repr_html = True - if self.sparkSession._jconf.isReplEagerEvalEnabled(): - return self._jdf.htmlString( - self.sparkSession._jconf.replEagerEvalMaxNumRows(), - self.sparkSession._jconf.replEagerEvalTruncate(), - ) - else: - return None + ... def checkpoint(self, eager: bool = True) -> "DataFrame": - """Returns a checkpointed version of this :class:`DataFrame`. Checkpointing can be used to - truncate the logical plan of this :class:`DataFrame`, which is especially useful in - iterative algorithms where the plan may grow exponentially. It will be saved to files - inside the checkpoint directory set with :meth:`SparkContext.setCheckpointDir`. + """Returns a checkpointed version of this :class:`DataFrame`. Checkpointing can be + used to truncate the logical plan of this :class:`DataFrame`, which is especially + useful in iterative algorithms where the plan may grow exponentially. It will be + saved to files inside the checkpoint directory set with + :meth:`SparkContext.setCheckpointDir`, or `spark.checkpoint.dir` configuration. .. versionadded:: 2.1.0 + .. versionchanged:: 4.0.0 + Supports Spark Connect. + Parameters ---------- eager : bool, optional, default True @@ -1146,25 +1005,25 @@ def checkpoint(self, eager: bool = True) -> "DataFrame": Examples -------- - >>> import tempfile >>> df = spark.createDataFrame([ ... (14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) - >>> with tempfile.TemporaryDirectory(prefix="checkpoint") as d: - ... spark.sparkContext.setCheckpointDir("/tmp/bb") - ... df.checkpoint(False) + >>> df.checkpoint(False) # doctest: +SKIP DataFrame[age: bigint, name: string] """ - jdf = self._jdf.checkpoint(eager) - return DataFrame(jdf, self.sparkSession) + ... def localCheckpoint(self, eager: bool = True) -> "DataFrame": - """Returns a locally checkpointed version of this :class:`DataFrame`. 
Checkpointing can be - used to truncate the logical plan of this :class:`DataFrame`, which is especially useful in - iterative algorithms where the plan may grow exponentially. Local checkpoints are - stored in the executors using the caching subsystem and therefore they are not reliable. + """Returns a locally checkpointed version of this :class:`DataFrame`. Checkpointing can + be used to truncate the logical plan of this :class:`DataFrame`, which is especially + useful in iterative algorithms where the plan may grow exponentially. Local checkpoints + are stored in the executors using the caching subsystem and therefore they are not + reliable. .. versionadded:: 2.3.0 + .. versionchanged:: 4.0.0 + Supports Spark Connect. + Parameters ---------- eager : bool, optional, default True @@ -1186,9 +1045,9 @@ def localCheckpoint(self, eager: bool = True) -> "DataFrame": >>> df.localCheckpoint(False) DataFrame[age: bigint, name: string] """ - jdf = self._jdf.localCheckpoint(eager) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def withWatermark(self, eventTime: str, delayThreshold: str) -> "DataFrame": """Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point in time before which we assume no more late data is going to arrive. @@ -1252,22 +1111,9 @@ def withWatermark(self, eventTime: str, delayThreshold: str) -> "DataFrame": >>> time.sleep(3) >>> query.stop() """ - if not eventTime or type(eventTime) is not str: - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "eventTime", "arg_type": type(eventTime).__name__}, - ) - if not delayThreshold or type(delayThreshold) is not str: - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={ - "arg_name": "delayThreshold", - "arg_type": type(delayThreshold).__name__, - }, - ) - jdf = self._jdf.withWatermark(eventTime, delayThreshold) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def hint( self, name: str, *parameters: Union["PrimitiveType", "Column", List["PrimitiveType"]] ) -> "DataFrame": @@ -1308,67 +1154,9 @@ def hint( ... +- BroadcastHashJoin ... ... 
""" - if len(parameters) == 1 and isinstance(parameters[0], list): - parameters = parameters[0] # type: ignore[assignment] - - if not isinstance(name, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "name", "arg_type": type(name).__name__}, - ) - - allowed_types = (str, float, int, Column, list) - allowed_primitive_types = (str, float, int) - allowed_types_repr = ", ".join( - [t.__name__ for t in allowed_types[:-1]] - + ["list[" + t.__name__ + "]" for t in allowed_primitive_types] - ) - for p in parameters: - if not isinstance(p, allowed_types): - raise PySparkTypeError( - error_class="DISALLOWED_TYPE_FOR_CONTAINER", - message_parameters={ - "arg_name": "parameters", - "arg_type": type(parameters).__name__, - "allowed_types": allowed_types_repr, - "item_type": type(p).__name__, - }, - ) - if isinstance(p, list): - if not all(isinstance(e, allowed_primitive_types) for e in p): - raise PySparkTypeError( - error_class="DISALLOWED_TYPE_FOR_CONTAINER", - message_parameters={ - "arg_name": "parameters", - "arg_type": type(parameters).__name__, - "allowed_types": allowed_types_repr, - "item_type": type(p).__name__ + "[" + type(p[0]).__name__ + "]", - }, - ) - - def _converter(parameter: Union[str, list, float, int, Column]) -> Any: - if isinstance(parameter, Column): - return _to_java_column(parameter) - elif isinstance(parameter, list): - # for list input, we are assuming only one element type exist in the list. - # for empty list, we are converting it into an empty long[] in the JVM side. - gateway = self._sc._gateway - assert gateway is not None - jclass = gateway.jvm.long - if len(parameter) >= 1: - mapping = { - str: gateway.jvm.java.lang.String, - float: gateway.jvm.double, - int: gateway.jvm.long, - } - jclass = mapping[type(parameter[0])] - return toJArray(gateway, jclass, parameter) - else: - return parameter - - jdf = self._jdf.hint(name, self._jseq(parameters, _converter)) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def count(self) -> int: """Returns the number of rows in this :class:`DataFrame`. @@ -1392,8 +1180,9 @@ def count(self) -> int: >>> df.count() 3 """ - return int(self._jdf.count()) + ... + @dispatch_df_method def collect(self) -> List[Row]: """Returns all the records in the DataFrame as a list of :class:`Row`. @@ -1412,6 +1201,7 @@ def collect(self) -> List[Row]: DataFrame.take : Returns the first `n` rows. DataFrame.head : Returns the first `n` rows. DataFrame.toPandas : Returns the data as a pandas DataFrame. + DataFrame.toArrow : Returns the data as a PyArrow Table. Notes ----- @@ -1459,10 +1249,9 @@ def collect(self) -> List[Row]: >>> [row.asDict() for row in rows] [{'age': 14, 'name': 'Tom'}, {'age': 23, 'name': 'Alice'}, {'age': 16, 'name': 'Bob'}] """ - with SCCallSiteSync(self._sc): - sock_info = self._jdf.collectToPython() - return list(_load_from_socket(sock_info, BatchedSerializer(CPickleSerializer()))) + ... + @dispatch_df_method def toLocalIterator(self, prefetchPartitions: bool = False) -> Iterator[Row]: """ Returns an iterator that contains all of the rows in this :class:`DataFrame`. @@ -1495,10 +1284,9 @@ def toLocalIterator(self, prefetchPartitions: bool = False) -> Iterator[Row]: >>> list(df.toLocalIterator()) [Row(age=14, name='Tom'), Row(age=23, name='Alice'), Row(age=16, name='Bob')] """ - with SCCallSiteSync(self._sc): - sock_info = self._jdf.toPythonIterator(prefetchPartitions) - return _local_iterator_from_socket(sock_info, BatchedSerializer(CPickleSerializer())) + ... 
+ @dispatch_df_method def limit(self, num: int) -> "DataFrame": """Limits the result count to the number specified. @@ -1534,9 +1322,9 @@ def limit(self, num: int) -> "DataFrame": +---+----+ +---+----+ """ - jdf = self._jdf.limit(num) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def offset(self, num: int) -> "DataFrame": """Returns a new :class: `DataFrame` by skipping the first `n` rows. @@ -1572,9 +1360,9 @@ def offset(self, num: int) -> "DataFrame": +---+----+ +---+----+ """ - jdf = self._jdf.offset(num) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def take(self, num: int) -> List[Row]: """Returns the first ``num`` rows as a :class:`list` of :class:`Row`. @@ -1604,8 +1392,9 @@ def take(self, num: int) -> List[Row]: >>> df.take(2) [Row(age=14, name='Tom'), Row(age=23, name='Alice')] """ - return self.limit(num).collect() + ... + @dispatch_df_method def tail(self, num: int) -> List[Row]: """ Returns the last ``num`` rows as a :class:`list` of :class:`Row`. @@ -1637,10 +1426,9 @@ def tail(self, num: int) -> List[Row]: >>> df.tail(2) [Row(age=23, name='Alice'), Row(age=16, name='Bob')] """ - with SCCallSiteSync(self._sc): - sock_info = self._jdf.tailToPython(num) - return list(_load_from_socket(sock_info, BatchedSerializer(CPickleSerializer()))) + ... + @dispatch_df_method def foreach(self, f: Callable[[Row], None]) -> None: """Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`. @@ -1666,8 +1454,9 @@ def foreach(self, f: Callable[[Row], None]) -> None: ... >>> df.foreach(func) """ - self.rdd.foreach(f) + ... + @dispatch_df_method def foreachPartition(self, f: Callable[[Iterator[Row]], None]) -> None: """Applies the ``f`` function to each partition of this :class:`DataFrame`. @@ -1694,8 +1483,9 @@ def foreachPartition(self, f: Callable[[Iterator[Row]], None]) -> None: ... >>> df.foreachPartition(func) """ - self.rdd.foreachPartition(f) # type: ignore[arg-type] + ... + @dispatch_df_method def cache(self) -> "DataFrame": """Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`). @@ -1723,10 +1513,9 @@ def cache(self) -> "DataFrame": == Physical Plan == InMemoryTableScan ... """ - self.is_cached = True - self._jdf.cache() - return self + ... + @dispatch_df_method def persist( self, storageLevel: StorageLevel = (StorageLevel.MEMORY_AND_DISK_DESER), @@ -1771,10 +1560,7 @@ def persist( >>> df.persist(StorageLevel.DISK_ONLY) DataFrame[id: bigint] """ - self.is_cached = True - javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) - self._jdf.persist(javaStorageLevel) - return self + ... @property def storageLevel(self) -> StorageLevel: @@ -1802,16 +1588,9 @@ def storageLevel(self) -> StorageLevel: >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel StorageLevel(True, False, False, False, 2) """ - java_storage_level = self._jdf.storageLevel() - storage_level = StorageLevel( - java_storage_level.useDisk(), - java_storage_level.useMemory(), - java_storage_level.useOffHeap(), - java_storage_level.deserialized(), - java_storage_level.replication(), - ) - return storage_level + ... + @dispatch_df_method def unpersist(self, blocking: bool = False) -> "DataFrame": """Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from memory and disk. 
@@ -1850,6 +1629,7 @@ def unpersist(self, blocking: bool = False) -> "DataFrame": self._jdf.unpersist(blocking) return self + @dispatch_df_method def coalesce(self, numPartitions: int) -> "DataFrame": """ Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions. @@ -1915,7 +1695,8 @@ def repartition(self, numPartitions: int, *cols: "ColumnOrName") -> "DataFrame": def repartition(self, *cols: "ColumnOrName") -> "DataFrame": ... - def repartition( # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def repartition( self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName" ) -> "DataFrame": """ @@ -2020,25 +1801,7 @@ def repartition( # type: ignore[misc] | 2| +---------+ """ - if isinstance(numPartitions, int): - if len(cols) == 0: - return DataFrame(self._jdf.repartition(numPartitions), self.sparkSession) - else: - return DataFrame( - self._jdf.repartition(numPartitions, self._jcols(*cols)), - self.sparkSession, - ) - elif isinstance(numPartitions, (str, Column)): - cols = (numPartitions,) + cols - return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sparkSession) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_STR", - message_parameters={ - "arg_name": "numPartitions", - "arg_type": type(numPartitions).__name__, - }, - ) + ... @overload def repartitionByRange(self, numPartitions: int, *cols: "ColumnOrName") -> "DataFrame": @@ -2048,7 +1811,8 @@ def repartitionByRange(self, numPartitions: int, *cols: "ColumnOrName") -> "Data def repartitionByRange(self, *cols: "ColumnOrName") -> "DataFrame": ... - def repartitionByRange( # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def repartitionByRange( self, numPartitions: Union[int, "ColumnOrName"], *cols: "ColumnOrName" ) -> "DataFrame": """ @@ -2104,29 +1868,9 @@ def repartitionByRange( # type: ignore[misc] | 23|Alice| 1| +---+-----+--------------------+ """ - if isinstance(numPartitions, int): - if len(cols) == 0: - raise PySparkValueError( - error_class="CANNOT_BE_EMPTY", - message_parameters={"item": "partition-by expression"}, - ) - else: - return DataFrame( - self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), - self.sparkSession, - ) - elif isinstance(numPartitions, (str, Column)): - cols = (numPartitions,) + cols - return DataFrame(self._jdf.repartitionByRange(self._jcols(*cols)), self.sparkSession) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_INT_OR_STR", - message_parameters={ - "arg_name": "numPartitions", - "arg_type": type(numPartitions).__name__, - }, - ) + ... + @dispatch_df_method def distinct(self) -> "DataFrame": """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`. @@ -2225,7 +1969,7 @@ def distinct(self) -> "DataFrame": | 23|Alice| F| +---+-----+------+ """ - return DataFrame(self._jdf.distinct(), self.sparkSession) + ... @overload def sample(self, fraction: float, seed: Optional[int] = ...) -> "DataFrame": @@ -2240,7 +1984,8 @@ def sample( ) -> "DataFrame": ... - def sample( # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def sample( self, withReplacement: Optional[Union[float, bool]] = None, fraction: Optional[Union[int, float]] = None, @@ -2290,47 +2035,9 @@ def sample( # type: ignore[misc] >>> df.sample(False, fraction=1.0).count() 10 """ + ... 
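The new `sample` stub above drops the classic argument normalization, which is removed just below: as its comments note, `withReplacement` may be passed explicitly, omitted with `fraction` given as a keyword, or omitted entirely with the fraction passed positionally (in which case the positional arguments are shifted). A short, self-contained illustration of those call shapes:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(100)

    # Call shapes the removed validation accepted, per its own comments:
    df.sample(0.5)                    # fraction only, positional
    df.sample(0.5, 3)                 # fraction + seed; arguments are shifted internally
    df.sample(fraction=0.5, seed=3)   # fraction as a keyword
    df.sample(True, 0.5, 3)           # withReplacement, fraction, seed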
- # For the cases below: - # sample(True, 0.5 [, seed]) - # sample(True, fraction=0.5 [, seed]) - # sample(withReplacement=False, fraction=0.5 [, seed]) - is_withReplacement_set = type(withReplacement) == bool and isinstance(fraction, float) - - # For the case below: - # sample(faction=0.5 [, seed]) - is_withReplacement_omitted_kwargs = withReplacement is None and isinstance(fraction, float) - - # For the case below: - # sample(0.5 [, seed]) - is_withReplacement_omitted_args = isinstance(withReplacement, float) - - if not ( - is_withReplacement_set - or is_withReplacement_omitted_kwargs - or is_withReplacement_omitted_args - ): - argtypes = [type(arg).__name__ for arg in [withReplacement, fraction, seed]] - raise PySparkTypeError( - error_class="NOT_BOOL_OR_FLOAT_OR_INT", - message_parameters={ - "arg_name": "withReplacement (optional), " - + "fraction (required) and seed (optional)", - "arg_type": ", ".join(argtypes), - }, - ) - - if is_withReplacement_omitted_args: - if fraction is not None: - seed = cast(int, fraction) - fraction = withReplacement - withReplacement = None - - seed = int(seed) if seed is not None else None - args = [arg for arg in [withReplacement, fraction, seed] if arg is not None] - jdf = self._jdf.sample(*args) - return DataFrame(jdf, self.sparkSession) - + @dispatch_df_method def sampleBy( self, col: "ColumnOrName", fractions: Dict[Any, float], seed: Optional[int] = None ) -> "DataFrame": @@ -2375,36 +2082,9 @@ def sampleBy( >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count() 33 """ - if isinstance(col, str): - col = Column(col) - elif not isinstance(col, Column): - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_STR", - message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, - ) - if not isinstance(fractions, dict): - raise PySparkTypeError( - error_class="NOT_DICT", - message_parameters={"arg_name": "fractions", "arg_type": type(fractions).__name__}, - ) - for k, v in fractions.items(): - if not isinstance(k, (float, int, str)): - raise PySparkTypeError( - error_class="DISALLOWED_TYPE_FOR_CONTAINER", - message_parameters={ - "arg_name": "fractions", - "arg_type": type(fractions).__name__, - "allowed_types": "float, int, str", - "item_type": type(k).__name__, - }, - ) - fractions[k] = float(v) - col = col._jc - seed = seed if seed is not None else random.randint(0, sys.maxsize) - return DataFrame( - self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sparkSession - ) + ... + @dispatch_df_method def randomSplit(self, weights: List[float], seed: Optional[int] = None) -> List["DataFrame"]: """Randomly splits this :class:`DataFrame` with the provided weights. @@ -2442,17 +2122,7 @@ def randomSplit(self, weights: List[float], seed: Optional[int] = None) -> List[ >>> splits[1].count() 2 """ - for w in weights: - if w < 0.0: - raise PySparkValueError( - error_class="VALUE_NOT_POSITIVE", - message_parameters={"arg_name": "weights", "arg_value": str(w)}, - ) - seed = seed if seed is not None else random.randint(0, sys.maxsize) - df_array = self._jdf.randomSplit( - _to_list(self.sparkSession._sc, cast(List["ColumnOrName"], weights)), int(seed) - ) - return [DataFrame(df, self.sparkSession) for df in df_array] + ... @property def dtypes(self) -> List[Tuple[str, str]]: @@ -2475,7 +2145,7 @@ def dtypes(self) -> List[Tuple[str, str]]: >>> df.dtypes [('age', 'bigint'), ('name', 'string')] """ - return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] + ... 
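Throughout this hunk the py4j-backed bodies (direct `self._jdf` calls, `SCCallSiteSync`, socket-based collection) are deleted and each public method becomes a bare `...` stub under `@dispatch_df_method`, so the shared `DataFrame` API can be served by either the classic or the Spark Connect implementation. The decorator itself is defined outside this diff; the sketch below is only a hypothetical stand-in to illustrate the dispatch idea, not the actual `pyspark.sql.utils` code:

    import functools

    def dispatch_df_method(f):
        # Hypothetical stand-in: forward a stubbed base-class method to whichever
        # concrete DataFrame subclass (classic or Connect) overrides it.
        @functools.wraps(f)
        def wrapper(self, *args, **kwargs):
            impl = getattr(type(self), f.__name__, None)
            if impl is None or getattr(impl, "__wrapped__", None) is f:
                # No concrete override on the runtime class; fail loudly.
                raise NotImplementedError(f"{f.__name__} has no concrete implementation")
            return impl(self, *args, **kwargs)
        return wrapper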
@property def columns(self) -> List[str]: @@ -2552,8 +2222,9 @@ def columns(self) -> List[str]: >>> df.columns == df2.columns False """ - return [f.name for f in self.schema.fields] + ... + @dispatch_df_method def colRegex(self, colName: str) -> Column: """ Selects column based on the column name specified as a regex and returns it @@ -2585,14 +2256,9 @@ def colRegex(self, colName: str) -> Column: | 3| +----+ """ - if not isinstance(colName, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "colName", "arg_type": type(colName).__name__}, - ) - jc = self._jdf.colRegex(colName) - return Column(jc) + ... + @dispatch_df_method def to(self, schema: StructType) -> "DataFrame": """ Returns a new :class:`DataFrame` where each row is reconciled to match the specified @@ -2648,10 +2314,9 @@ def to(self, schema: StructType) -> "DataFrame": | 1| a| +---+---+ """ - assert schema is not None - jschema = self._jdf.sparkSession().parseDataType(schema.json()) - return DataFrame(self._jdf.to(jschema), self.sparkSession) + ... + @dispatch_df_method def alias(self, alias: str) -> "DataFrame": """Returns a new :class:`DataFrame` with an alias set. @@ -2688,9 +2353,9 @@ def alias(self, alias: str) -> "DataFrame": |Alice|Alice| 23| +-----+-----+---+ """ - assert isinstance(alias, str), "alias should be a string" - return DataFrame(getattr(self._jdf, "as")(alias), self.sparkSession) + ... + @dispatch_df_method def crossJoin(self, other: "DataFrame") -> "DataFrame": """Returns the cartesian product with another :class:`DataFrame`. @@ -2728,10 +2393,9 @@ def crossJoin(self, other: "DataFrame") -> "DataFrame": | 16| Bob| 85| +---+-----+------+ """ + ... - jdf = self._jdf.crossJoin(other._jdf) - return DataFrame(jdf, self.sparkSession) - + @dispatch_df_method def join( self, other: "DataFrame", @@ -2862,7 +2526,7 @@ def join( Outer join on multiple columns - >>> df.join(df3, ["name", "age"], "outer").show() + >>> df.join(df3, ["name", "age"], "outer").sort("name", "age").show() +-----+----+------+ | name| age|height| +-----+----+------+ @@ -2911,30 +2575,10 @@ def join( |Alice| 2| +-----+---+ """ - - if on is not None and not isinstance(on, list): - on = [on] # type: ignore[assignment] - - if on is not None: - if isinstance(on[0], str): - on = self._jseq(cast(List[str], on)) - else: - assert isinstance(on[0], Column), "on should be Column or list of Column" - on = reduce(lambda x, y: x.__and__(y), cast(List[Column], on)) - on = on._jc - - if on is None and how is None: - jdf = self._jdf.join(other._jdf) - else: - if how is None: - how = "inner" - if on is None: - on = self._jseq([]) - assert isinstance(how, str), "how should be a string" - jdf = self._jdf.join(other._jdf, on, how) - return DataFrame(jdf, self.sparkSession) + ... # TODO(SPARK-22947): Fix the DataFrame API. 
+ @dispatch_df_method def _joinAsOf( self, other: "DataFrame", @@ -3018,44 +2662,9 @@ def _joinAsOf( [Row(a=1, left_val='a', right_val=1), Row(a=5, left_val='b', right_val=6)] """ - if isinstance(leftAsOfColumn, str): - leftAsOfColumn = self[leftAsOfColumn] - left_as_of_jcol = leftAsOfColumn._jc - if isinstance(rightAsOfColumn, str): - rightAsOfColumn = other[rightAsOfColumn] - right_as_of_jcol = rightAsOfColumn._jc - - if on is not None and not isinstance(on, list): - on = [on] # type: ignore[assignment] - - if on is not None: - if isinstance(on[0], str): - on = self._jseq(cast(List[str], on)) - else: - assert isinstance(on[0], Column), "on should be Column or list of Column" - on = reduce(lambda x, y: x.__and__(y), cast(List[Column], on)) - on = on._jc - - if how is None: - how = "inner" - assert isinstance(how, str), "how should be a string" - - if tolerance is not None: - assert isinstance(tolerance, Column), "tolerance should be Column" - tolerance = tolerance._jc - - jdf = self._jdf.joinAsOf( - other._jdf, - left_as_of_jcol, - right_as_of_jcol, - on, - how, - tolerance, - allowExactMatches, - direction, - ) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def sortWithinPartitions( self, *cols: Union[int, str, Column, List[Union[int, str, Column]]], @@ -3117,9 +2726,9 @@ def sortWithinPartitions( | 2|Alice| +---+-----+ """ - jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs)) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def sort( self, *cols: Union[int, str, Column, List[Union[int, str, Column]]], @@ -3277,98 +2886,11 @@ def sort( | 2|Alice| +---+-----+ """ - jdf = self._jdf.sort(self._sort_cols(cols, kwargs)) - return DataFrame(jdf, self.sparkSession) + ... orderBy = sort - def _jseq( - self, - cols: Sequence, - converter: Optional[Callable[..., Union["PrimitiveType", "JavaObject"]]] = None, - ) -> "JavaObject": - """Return a JVM Seq of Columns from a list of Column or names""" - return _to_seq(self.sparkSession._sc, cols, converter) - - def _jmap(self, jm: Dict) -> "JavaObject": - """Return a JVM Scala Map from a dict""" - return _to_scala_map(self.sparkSession._sc, jm) - - def _jcols(self, *cols: "ColumnOrName") -> "JavaObject": - """Return a JVM Seq of Columns from a list of Column or column names - - If `cols` has only one list in it, cols[0] will be used as the list. - """ - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - return self._jseq(cols, _to_java_column) - - def _jcols_ordinal(self, *cols: "ColumnOrNameOrOrdinal") -> "JavaObject": - """Return a JVM Seq of Columns from a list of Column or column names or column ordinals. - - If `cols` has only one list in it, cols[0] will be used as the list. 
- """ - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - - _cols = [] - for c in cols: - if isinstance(c, int) and not isinstance(c, bool): - if c < 1: - raise PySparkIndexError( - error_class="INDEX_NOT_POSITIVE", message_parameters={"index": str(c)} - ) - # ordinal is 1-based - _cols.append(self[c - 1]) - else: - _cols.append(c) # type: ignore[arg-type] - return self._jseq(_cols, _to_java_column) - - def _sort_cols( - self, - cols: Sequence[Union[int, str, Column, List[Union[int, str, Column]]]], - kwargs: Dict[str, Any], - ) -> "JavaObject": - """Return a JVM Seq of Columns that describes the sort order""" - if not cols: - raise PySparkValueError( - error_class="CANNOT_BE_EMPTY", - message_parameters={"item": "column"}, - ) - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - - jcols = [] - for c in cols: - if isinstance(c, int) and not isinstance(c, bool): - # ordinal is 1-based - if c > 0: - _c = self[c - 1] - # negative ordinal means sort by desc - elif c < 0: - _c = self[-c - 1].desc() - else: - raise PySparkIndexError( - error_class="ZERO_INDEX", - message_parameters={}, - ) - else: - _c = c # type: ignore[assignment] - jcols.append(_to_java_column(cast("ColumnOrName", _c))) - - ascending = kwargs.get("ascending", True) - if isinstance(ascending, (bool, int)): - if not ascending: - jcols = [jc.desc() for jc in jcols] - elif isinstance(ascending, list): - jcols = [jc if asc else jc.desc() for asc, jc in zip(ascending, jcols)] - else: - raise PySparkTypeError( - error_class="NOT_BOOL_OR_LIST", - message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__}, - ) - return self._jseq(jcols) - + @dispatch_df_method def describe(self, *cols: Union[str, List[str]]) -> "DataFrame": """Computes basic statistics for numeric and string columns. @@ -3430,11 +2952,9 @@ def describe(self, *cols: Union[str, List[str]]) -> "DataFrame": -------- DataFrame.summary """ - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] # type: ignore[assignment] - jdf = self._jdf.describe(self._jseq(cols)) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def summary(self, *statistics: str) -> "DataFrame": """Computes specified statistics for numeric and string columns. Available statistics are: - count @@ -3503,10 +3023,7 @@ def summary(self, *statistics: str) -> "DataFrame": -------- DataFrame.display """ - if len(statistics) == 1 and isinstance(statistics[0], list): - statistics = statistics[0] - jdf = self._jdf.summary(self._jseq(statistics)) - return DataFrame(jdf, self.sparkSession) + ... @overload def head(self) -> Optional[Row]: @@ -3516,6 +3033,7 @@ def head(self) -> Optional[Row]: def head(self, n: int) -> List[Row]: ... + @dispatch_df_method def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]: """Returns the first ``n`` rows. @@ -3551,11 +3069,9 @@ def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]: >>> df.head(0) [] """ - if n is None: - rs = self.head(1) - return rs[0] if rs else None - return self.take(n) + ... + @dispatch_df_method def first(self) -> Optional[Row]: """Returns the first row as a :class:`Row`. @@ -3576,7 +3092,7 @@ def first(self) -> Optional[Row]: >>> df.first() Row(age=2, name='Alice') """ - return self.head() + ... @overload def __getitem__(self, item: Union[int, str]) -> Column: @@ -3586,6 +3102,7 @@ def __getitem__(self, item: Union[int, str]) -> Column: def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame": ... 
+ @dispatch_df_method def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Column, "DataFrame"]: """Returns the column as a :class:`Column`. @@ -3658,22 +3175,9 @@ def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Colum | 5| Bob| +---+----+ """ - if isinstance(item, str): - jc = self._jdf.apply(item) - return Column(jc) - elif isinstance(item, Column): - return self.filter(item) - elif isinstance(item, (list, tuple)): - return self.select(*item) - elif isinstance(item, int): - jc = self._jdf.apply(self.columns[item]) - return Column(jc) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_FLOAT_OR_INT_OR_LIST_OR_STR", - message_parameters={"arg_name": "item", "arg_type": type(item).__name__}, - ) + ... + @dispatch_df_method def __getattr__(self, name: str) -> Column: """Returns the :class:`Column` denoted by ``name``. @@ -3707,13 +3211,9 @@ def __getattr__(self, name: str) -> Column: | 5| +---+ """ - if name not in self.columns: - raise PySparkAttributeError( - error_class="ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name} - ) - jc = self._jdf.apply(name) - return Column(jc) + ... + @dispatch_df_method def __dir__(self) -> List[str]: """ Examples @@ -3751,9 +3251,7 @@ def __dir__(self) -> List[str]: >>> [attr for attr in dir(df) if attr[0] == 'i'][:7] # Doesn't include 1 or name 1 ['i_like_pancakes', 'id', 'id2', 'inputFiles', 'intersect', 'intersectAll', 'isEmpty'] """ - attrs = set(super().__dir__()) - attrs.update(filter(lambda s: s.isidentifier(), self.columns)) - return sorted(attrs) + ... @overload def select(self, *cols: "ColumnOrName") -> "DataFrame": @@ -3763,7 +3261,8 @@ def select(self, *cols: "ColumnOrName") -> "DataFrame": def select(self, __cols: Union[List[Column], List[str]]) -> "DataFrame": ... - def select(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def select(self, *cols: "ColumnOrName") -> "DataFrame": """Projects a set of expressions and returns a new :class:`DataFrame`. .. versionadded:: 1.3.0 @@ -3808,8 +3307,7 @@ def select(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] | Bob| 15| +-----+---+ """ - jdf = self._jdf.select(self._jcols(*cols)) - return DataFrame(jdf, self.sparkSession) + ... @overload def selectExpr(self, *expr: str) -> "DataFrame": @@ -3819,6 +3317,7 @@ def selectExpr(self, *expr: str) -> "DataFrame": def selectExpr(self, *expr: List[str]) -> "DataFrame": ... + @dispatch_df_method def selectExpr(self, *expr: Union[str, List[str]]) -> "DataFrame": """Projects a set of SQL expressions and returns a new :class:`DataFrame`. @@ -3846,11 +3345,9 @@ def selectExpr(self, *expr: Union[str, List[str]]) -> "DataFrame": | 10| 5| +---------+--------+ """ - if len(expr) == 1 and isinstance(expr[0], list): - expr = expr[0] # type: ignore[assignment] - jdf = self._jdf.selectExpr(self._jseq(expr)) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def filter(self, condition: "ColumnOrName") -> "DataFrame": """Filters rows using the given condition. 
@@ -4004,16 +3501,7 @@ def filter(self, condition: "ColumnOrName") -> "DataFrame": | 5| Bob|Physics| +---+-----+-------+ """ - if isinstance(condition, str): - jdf = self._jdf.filter(condition) - elif isinstance(condition, Column): - jdf = self._jdf.filter(condition._jc) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_STR", - message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__}, - ) - return DataFrame(jdf, self.sparkSession) + ... @overload def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": @@ -4023,7 +3511,8 @@ def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": def groupBy(self, __cols: Union[List[Column], List[str], List[int]]) -> "GroupedData": ... - def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": """ Groups the :class:`DataFrame` by the specified columns so that aggregation can be performed on them. @@ -4122,10 +3611,7 @@ def groupBy(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ign | Bob| 5| 1| +-----+---+-----+ """ - jgd = self._jdf.groupBy(self._jcols_ordinal(*cols)) - from pyspark.sql.group import GroupedData - - return GroupedData(jgd, self) + ... @overload def rollup(self, *cols: "ColumnOrName") -> "GroupedData": @@ -4135,7 +3621,8 @@ def rollup(self, *cols: "ColumnOrName") -> "GroupedData": def rollup(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": ... - def rollup(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def rollup(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": """ Create a multi-dimensional rollup for the current :class:`DataFrame` using the specified columns, allowing for aggregation on them. @@ -4207,10 +3694,7 @@ def rollup(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: igno | Bob| 5| 1| +-----+----+-----+ """ - jgd = self._jdf.rollup(self._jcols_ordinal(*cols)) - from pyspark.sql.group import GroupedData - - return GroupedData(jgd, self) + ... @overload def cube(self, *cols: "ColumnOrName") -> "GroupedData": @@ -4220,7 +3704,8 @@ def cube(self, *cols: "ColumnOrName") -> "GroupedData": def cube(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": ... - def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def cube(self, *cols: "ColumnOrName") -> "GroupedData": """ Create a multi-dimensional cube for the current :class:`DataFrame` using the specified columns, allowing aggregations to be performed on them. @@ -4297,11 +3782,9 @@ def cube(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] | Bob| 5| 1| +-----+----+-----+ """ - jgd = self._jdf.cube(self._jcols_ordinal(*cols)) - from pyspark.sql.group import GroupedData - - return GroupedData(jgd, self) + ... + @dispatch_df_method def groupingSets( self, groupingSets: Sequence[Sequence["ColumnOrName"]], *cols: "ColumnOrName" ) -> "GroupedData": @@ -4391,13 +3874,9 @@ def groupingSets( -------- GroupedData """ - from pyspark.sql.group import GroupedData - - jgrouping_sets = _to_seq(self._sc, [self._jcols(*inner) for inner in groupingSets]) - - jgd = self._jdf.groupingSets(jgrouping_sets, self._jcols(*cols)) - return GroupedData(jgd, self) + ... 
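`groupingSets` is likewise reduced to a stub; per the signature above it takes a sequence of grouping sets followed by the grouping columns and returns a `GroupedData`. A usage sketch based only on that signature (data and column names are invented, output omitted):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as sf

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2013, 48000)],
        ["course", "year", "earnings"],
    )

    # Roughly GROUP BY course, year GROUPING SETS ((course), (year)) in SQL.
    df.groupingSets([["course"], ["year"]], "course", "year").agg(
        sf.sum("earnings").alias("total")
    ).show()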
+ @dispatch_df_method def unpivot( self, ids: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]], @@ -4479,26 +3958,9 @@ def unpivot( -------- DataFrame.melt """ - assert ids is not None, "ids must not be None" - - def to_jcols( - cols: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]] - ) -> "JavaObject": - if isinstance(cols, list): - return self._jcols(*cols) - if isinstance(cols, tuple): - return self._jcols(*list(cols)) - return self._jcols(cols) - - jids = to_jcols(ids) - if values is None: - jdf = self._jdf.unpivotWithSeq(jids, variableColumnName, valueColumnName) - else: - jvals = to_jcols(values) - jdf = self._jdf.unpivotWithSeq(jids, jvals, variableColumnName, valueColumnName) - - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def melt( self, ids: Union["ColumnOrName", List["ColumnOrName"], Tuple["ColumnOrName", ...]], @@ -4542,8 +4004,9 @@ def melt( ----- Supports Spark Connect. """ - return self.unpivot(ids, values, variableColumnName, valueColumnName) + ... + @dispatch_df_method def agg(self, *exprs: Union[Column, Dict[str, str]]) -> "DataFrame": """Aggregate on the entire :class:`DataFrame` without groups (shorthand for ``df.groupBy().agg()``). @@ -4580,8 +4043,9 @@ def agg(self, *exprs: Union[Column, Dict[str, str]]) -> "DataFrame": | 2| +--------+ """ - return self.groupBy().agg(*exprs) # type: ignore[arg-type] + ... + @dispatch_df_method def observe( self, observation: Union["Observation", str], @@ -4651,6 +4115,7 @@ def observe( When ``observation`` is a string, streaming queries also work as below. >>> from pyspark.sql.streaming import StreamingQueryListener + >>> import time >>> class MyErrorListener(StreamingQueryListener): ... def onQueryStarted(self, event): ... pass @@ -4671,45 +4136,28 @@ def observe( ... def onQueryTerminated(self, event): ... pass ... - >>> spark.streams.addListener(MyErrorListener()) + >>> error_listener = MyErrorListener() + >>> spark.streams.addListener(error_listener) + >>> sdf = spark.readStream.format("rate").load().withColumn( + ... "error", col("value") + ... ) >>> # Observe row count (rc) and error row count (erc) in the streaming Dataset - ... observed_ds = df.observe( + ... observed_ds = sdf.observe( ... "my_event", ... count(lit(1)).alias("rc"), - ... count(col("error")).alias("erc")) # doctest: +SKIP - >>> observed_ds.writeStream.format("console").start() # doctest: +SKIP - """ - from pyspark.sql import Observation - - if len(exprs) == 0: - raise PySparkValueError( - error_class="CANNOT_BE_EMPTY", - message_parameters={"item": "exprs"}, - ) - if not all(isinstance(c, Column) for c in exprs): - raise PySparkTypeError( - error_class="NOT_LIST_OF_COLUMN", - message_parameters={"arg_name": "exprs"}, - ) - - if isinstance(observation, Observation): - return observation._on(self, *exprs) - elif isinstance(observation, str): - return DataFrame( - self._jdf.observe( - observation, exprs[0]._jc, _to_seq(self._sc, [c._jc for c in exprs[1:]]) - ), - self.sparkSession, - ) - else: - raise PySparkTypeError( - error_class="NOT_LIST_OF_COLUMN", - message_parameters={ - "arg_name": "observation", - "arg_type": type(observation).__name__, - }, - ) + ... count(col("error")).alias("erc")) + >>> try: + ... q = observed_ds.writeStream.format("console").start() + ... time.sleep(5) + ... + ... finally: + ... q.stop() + ... spark.streams.removeListener(error_listener) + ... + """ + ... 
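The streaming `observe` doctest above is reworked so it actually builds a rate-source stream, keeps a handle to the listener, and removes the listener in a `finally` block. The removed body also shows the other branch, where `observation` is an `Observation` object rather than a string; a minimal batch-side sketch of that path, assuming the standard `Observation` API:

    from pyspark.sql import Observation, SparkSession
    from pyspark.sql import functions as sf

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(10)

    observation = Observation("my_metrics")
    observed = df.observe(
        observation,
        sf.count(sf.lit(1)).alias("rc"),
        sf.max("id").alias("max_id"),
    )
    observed.count()        # metrics become available once an action runs
    print(observation.get)  # {'rc': 10, 'max_id': 9}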
+ @dispatch_df_method def union(self, other: "DataFrame") -> "DataFrame": """Return a new :class:`DataFrame` containing the union of rows in this and another :class:`DataFrame`. @@ -4806,8 +4254,9 @@ def union(self, other: "DataFrame") -> "DataFrame": | 4| D| +---+-----+ """ - return DataFrame(self._jdf.union(other._jdf), self.sparkSession) + ... + @dispatch_df_method def unionAll(self, other: "DataFrame") -> "DataFrame": """Return a new :class:`DataFrame` containing the union of rows in this and another :class:`DataFrame`. @@ -4840,8 +4289,9 @@ def unionAll(self, other: "DataFrame") -> "DataFrame": -------- DataFrame.union """ - return self.union(other) + ... + @dispatch_df_method def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame": """Returns a new :class:`DataFrame` containing union of rows in this and another :class:`DataFrame`. @@ -4920,8 +4370,9 @@ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> |NULL|NULL|NULL| 3| 4| 5| +----+----+----+----+----+----+ """ - return DataFrame(self._jdf.unionByName(other._jdf, allowMissingColumns), self.sparkSession) + ... + @dispatch_df_method def intersect(self, other: "DataFrame") -> "DataFrame": """Return a new :class:`DataFrame` containing rows only in both this :class:`DataFrame` and another :class:`DataFrame`. @@ -4986,8 +4437,9 @@ def intersect(self, other: "DataFrame") -> "DataFrame": | 1| 2| +---+---+ """ - return DataFrame(self._jdf.intersect(other._jdf), self.sparkSession) + ... + @dispatch_df_method def intersectAll(self, other: "DataFrame") -> "DataFrame": """Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame` and another :class:`DataFrame` while preserving duplicates. @@ -5051,8 +4503,9 @@ def intersectAll(self, other: "DataFrame") -> "DataFrame": | 1| 2| +---+---+ """ - return DataFrame(self._jdf.intersectAll(other._jdf), self.sparkSession) + ... + @dispatch_df_method def subtract(self, other: "DataFrame") -> "DataFrame": """Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but not in another :class:`DataFrame`. @@ -5113,9 +4566,10 @@ def subtract(self, other: "DataFrame") -> "DataFrame": +---+---+ +---+---+ """ - return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sparkSession) + ... - def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame": + @dispatch_df_method + def dropDuplicates(self, *subset: Union[str, List[str]]) -> "DataFrame": """Return a new :class:`DataFrame` with duplicate rows removed, optionally only considering certain columns. @@ -5132,6 +4586,9 @@ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame": .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + Supports variable-length argument + Parameters ---------- subset : list of column names, optional @@ -5163,26 +4620,17 @@ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame": Deduplicate values on 'name' and 'height' columns. 
- >>> df.dropDuplicates(['name', 'height']).show() + >>> df.dropDuplicates('name', 'height').show() +-----+---+------+ | name|age|height| +-----+---+------+ |Alice| 5| 80| +-----+---+------+ """ - if subset is not None and (not isinstance(subset, Iterable) or isinstance(subset, str)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) - - if subset is None: - jdf = self._jdf.dropDuplicates() - else: - jdf = self._jdf.dropDuplicates(self._jseq(subset)) - return DataFrame(jdf, self.sparkSession) + ... - def dropDuplicatesWithinWatermark(self, subset: Optional[List[str]] = None) -> "DataFrame": + @dispatch_df_method + def dropDuplicatesWithinWatermark(self, *subset: Union[str, List[str]]) -> "DataFrame": """Return a new :class:`DataFrame` with duplicate rows removed, optionally only considering certain columns, within watermark. @@ -5199,6 +4647,9 @@ def dropDuplicatesWithinWatermark(self, subset: Optional[List[str]] = None) -> " .. versionadded:: 3.5.0 + .. versionchanged:: 4.0.0 + Supports variable-length argument + Parameters ---------- subset : List of column names, optional @@ -5228,20 +4679,11 @@ def dropDuplicatesWithinWatermark(self, subset: Optional[List[str]] = None) -> " Deduplicate values on 'value' columns. - >>> df.dropDuplicatesWithinWatermark(['value']) # doctest: +SKIP + >>> df.dropDuplicatesWithinWatermark('value') # doctest: +SKIP """ - if subset is not None and (not isinstance(subset, Iterable) or isinstance(subset, str)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) - - if subset is None: - jdf = self._jdf.dropDuplicatesWithinWatermark() - else: - jdf = self._jdf.dropDuplicatesWithinWatermark(self._jseq(subset)) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def dropna( self, how: str = "any", @@ -5323,26 +4765,7 @@ def dropna( | 5| NULL| Bob| +---+------+-----+ """ - if how is not None and how not in ["any", "all"]: - raise PySparkValueError( - error_class="VALUE_NOT_ANY_OR_ALL", - message_parameters={"arg_name": "how", "arg_type": how}, - ) - - if subset is None: - subset = self.columns - elif isinstance(subset, str): - subset = [subset] - elif not isinstance(subset, (list, tuple)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_STR_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) - - if thresh is None: - thresh = len(subset) if how == "any" else 1 - - return DataFrame(self._jdf.na().drop(thresh, self._jseq(subset)), self.sparkSession) + ... @overload def fillna( @@ -5356,6 +4779,7 @@ def fillna( def fillna(self, value: Dict[str, "LiteralType"]) -> "DataFrame": ... 
+ @dispatch_df_method def fillna( self, value: Union["LiteralType", Dict[str, "LiteralType"]], @@ -5445,32 +4869,7 @@ def fillna( |NULL| NULL|Spark|true| +----+------+-----+----+ """ - if not isinstance(value, (float, int, str, bool, dict)): - raise PySparkTypeError( - error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_STR", - message_parameters={"arg_name": "value", "arg_type": type(value).__name__}, - ) - - # Note that bool validates isinstance(int), but we don't want to - # convert bools to floats - - if not isinstance(value, bool) and isinstance(value, int): - value = float(value) - - if isinstance(value, dict): - return DataFrame(self._jdf.na().fill(value), self.sparkSession) - elif subset is None: - return DataFrame(self._jdf.na().fill(value), self.sparkSession) - else: - if isinstance(subset, str): - subset = [subset] - elif not isinstance(subset, (list, tuple)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) - - return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sparkSession) + ... @overload def replace( @@ -5507,7 +4906,8 @@ def replace( ) -> "DataFrame": ... - def replace( # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def replace( self, to_replace: Union[ "LiteralType", List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"] @@ -5610,111 +5010,7 @@ def replace( # type: ignore[misc] |NULL| NULL| NULL| +----+------+-----+ """ - if value is _NoValue: - if isinstance(to_replace, dict): - value = None - else: - raise PySparkTypeError( - error_class="ARGUMENT_REQUIRED", - message_parameters={"arg_name": "value", "condition": "`to_replace` is dict"}, - ) - - # Helper functions - def all_of(types: Union[Type, Tuple[Type, ...]]) -> Callable[[Iterable], bool]: - """Given a type or tuple of types and a sequence of xs - check if each x is instance of type(s) - - >>> all_of(bool)([True, False]) - True - >>> all_of(str)(["a", 1]) - False - """ - - def all_of_(xs: Iterable) -> bool: - return all(isinstance(x, types) for x in xs) - - return all_of_ - - all_of_bool = all_of(bool) - all_of_str = all_of(str) - all_of_numeric = all_of((float, int)) - - # Validate input types - valid_types = (bool, float, int, str, list, tuple) - if not isinstance(to_replace, valid_types + (dict,)): - raise PySparkTypeError( - error_class="NOT_BOOL_OR_DICT_OR_FLOAT_OR_INT_OR_LIST_OR_STR_OR_TUPLE", - message_parameters={ - "arg_name": "to_replace", - "arg_type": type(to_replace).__name__, - }, - ) - - if ( - not isinstance(value, valid_types) - and value is not None - and not isinstance(to_replace, dict) - ): - raise PySparkTypeError( - error_class="NOT_BOOL_OR_FLOAT_OR_INT_OR_LIST_OR_NONE_OR_STR_OR_TUPLE", - message_parameters={ - "arg_name": "value", - "arg_type": type(value).__name__, - }, - ) - - if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): - if len(to_replace) != len(value): - raise PySparkValueError( - error_class="LENGTH_SHOULD_BE_THE_SAME", - message_parameters={ - "arg1": "to_replace", - "arg2": "value", - "arg1_length": str(len(to_replace)), - "arg2_length": str(len(value)), - }, - ) - - if not (subset is None or isinstance(subset, (list, tuple, str))): - raise PySparkTypeError( - error_class="NOT_LIST_OR_STR_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": type(subset).__name__}, - ) - - # Reshape input arguments if necessary - if isinstance(to_replace, (float, int, str)): - to_replace = [to_replace] - - if 
isinstance(to_replace, dict): - rep_dict = to_replace - if value is not None: - warnings.warn("to_replace is a dict and value is not None. value will be ignored.") - else: - if isinstance(value, (float, int, str)) or value is None: - value = [value for _ in range(len(to_replace))] - rep_dict = dict(zip(to_replace, cast("Iterable[Optional[Union[float, str]]]", value))) - - if isinstance(subset, str): - subset = [subset] - - # Verify we were not passed in mixed type generics. - if not any( - all_of_type(rep_dict.keys()) - and all_of_type(x for x in rep_dict.values() if x is not None) - for all_of_type in [all_of_bool, all_of_str, all_of_numeric] - ): - raise PySparkValueError( - error_class="MIXED_TYPE_REPLACEMENT", - message_parameters={}, - ) - - if subset is None: - return DataFrame(self._jdf.na().replace("*", rep_dict), self.sparkSession) - else: - return DataFrame( - self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), - self.sparkSession, - ) + ... @overload def approxQuantile( @@ -5734,6 +5030,7 @@ def approxQuantile( ) -> List[List[float]]: ... + @dispatch_df_method def approxQuantile( self, col: Union[str, List[str], Tuple[str]], @@ -5830,76 +5127,9 @@ def approxQuantile( >>> quantiles [1.0, 1.0, 5.0] """ + ... - if not isinstance(col, (str, list, tuple)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_STR_OR_TUPLE", - message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, - ) - - isStr = isinstance(col, str) - - if isinstance(col, tuple): - col = list(col) - elif isStr: - col = [cast(str, col)] - - for c in col: - if not isinstance(c, str): - raise PySparkTypeError( - error_class="DISALLOWED_TYPE_FOR_CONTAINER", - message_parameters={ - "arg_name": "col", - "arg_type": type(col).__name__, - "allowed_types": "str", - "item_type": type(c).__name__, - }, - ) - col = _to_list(self._sc, cast(List["ColumnOrName"], col)) - - if not isinstance(probabilities, (list, tuple)): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={ - "arg_name": "probabilities", - "arg_type": type(probabilities).__name__, - }, - ) - if isinstance(probabilities, tuple): - probabilities = list(probabilities) - for p in probabilities: - if not isinstance(p, (float, int)) or p < 0 or p > 1: - raise PySparkTypeError( - error_class="NOT_LIST_OF_FLOAT_OR_INT", - message_parameters={ - "arg_name": "probabilities", - "arg_type": type(p).__name__, - }, - ) - probabilities = _to_list(self._sc, cast(List["ColumnOrName"], probabilities)) - - if not isinstance(relativeError, (float, int)): - raise PySparkTypeError( - error_class="NOT_FLOAT_OR_INT", - message_parameters={ - "arg_name": "relativeError", - "arg_type": type(relativeError).__name__, - }, - ) - if relativeError < 0: - raise PySparkValueError( - error_class="NEGATIVE_VALUE", - message_parameters={ - "arg_name": "relativeError", - "arg_value": str(relativeError), - }, - ) - relativeError = float(relativeError) - - jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) - jaq_list = [list(j) for j in jaq] - return jaq_list[0] if isStr else jaq_list - + @dispatch_df_method def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: """ Calculates the correlation of two columns of a :class:`DataFrame` as a double value. 
@@ -5935,25 +5165,9 @@ def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: 1.0 """ - if not isinstance(col1, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__}, - ) - if not isinstance(col2, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__}, - ) - if not method: - method = "pearson" - if not method == "pearson": - raise PySparkValueError( - error_class="VALUE_NOT_PEARSON", - message_parameters={"arg_name": "method", "arg_value": method}, - ) - return self._jdf.stat().corr(col1, col2, method) + ... + @dispatch_df_method def cov(self, col1: str, col2: str) -> float: """ Calculate the sample covariance for the given columns, specified by their names, as a @@ -5986,18 +5200,9 @@ def cov(self, col1: str, col2: str) -> float: 1.0 """ - if not isinstance(col1, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__}, - ) - if not isinstance(col2, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__}, - ) - return self._jdf.stat().cov(col1, col2) + ... + @dispatch_df_method def crosstab(self, col1: str, col2: str) -> "DataFrame": """ Computes a pair-wise frequency table of the given columns. Also known as a contingency @@ -6039,18 +5244,9 @@ def crosstab(self, col1: str, col2: str) -> "DataFrame": +-----+---+---+---+ """ - if not isinstance(col1, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "col1", "arg_type": type(col1).__name__}, - ) - if not isinstance(col2, str): - raise PySparkTypeError( - error_class="NOT_STR", - message_parameters={"arg_name": "col2", "arg_type": type(col2).__name__}, - ) - return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sparkSession) + ... + @dispatch_df_method def freqItems( self, cols: Union[List[str], Tuple[str]], support: Optional[float] = None ) -> "DataFrame": @@ -6095,19 +5291,9 @@ def freqItems( | [4, 1, 3]| [8, 11, 10]| +------------+------------+ """ - if isinstance(cols, tuple): - cols = list(cols) - if not isinstance(cols, list): - raise PySparkTypeError( - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "cols", "arg_type": type(cols).__name__}, - ) - if not support: - support = 0.01 - return DataFrame( - self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sparkSession - ) + ... + @dispatch_df_method def _ipython_key_completions_(self) -> List[str]: """Returns the names of columns in this :class:`DataFrame`. @@ -6122,8 +5308,9 @@ def _ipython_key_completions_(self) -> List[str]: >>> df._ipython_key_completions_() ['age 1', 'name?1'] """ - return self.columns + ... + @dispatch_df_method def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame": """ Returns a new :class:`DataFrame` by adding multiple columns or replacing the @@ -6159,24 +5346,9 @@ def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame": | 5| Bob| 7| 8| +---+-----+----+----+ """ - # Below code is to help enable kwargs in future. 
- assert len(colsMap) == 1 - colsMap = colsMap[0] # type: ignore[assignment] - - if not isinstance(colsMap, dict): - raise PySparkTypeError( - error_class="NOT_DICT", - message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__}, - ) - - col_names = list(colsMap.keys()) - cols = list(colsMap.values()) - - return DataFrame( - self._jdf.withColumns(_to_seq(self._sc, col_names), self._jcols(*cols)), - self.sparkSession, - ) + ... + @dispatch_df_method def withColumn(self, colName: str, col: Column) -> "DataFrame": """ Returns a new :class:`DataFrame` by adding a column or replacing the @@ -6220,13 +5392,9 @@ def withColumn(self, colName: str, col: Column) -> "DataFrame": | 5| Bob| 7| +---+-----+----+ """ - if not isinstance(col, Column): - raise PySparkTypeError( - error_class="NOT_COLUMN", - message_parameters={"arg_name": "col", "arg_type": type(col).__name__}, - ) - return DataFrame(self._jdf.withColumn(colName, col._jc), self.sparkSession) + ... + @dispatch_df_method def withColumnRenamed(self, existing: str, new: str) -> "DataFrame": """ Returns a new :class:`DataFrame` by renaming an existing column. @@ -6287,8 +5455,9 @@ def withColumnRenamed(self, existing: str, new: str) -> "DataFrame": | 5| Bob| +----+-----+ """ - return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sparkSession) + ... + @dispatch_df_method def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame": """ Returns a new :class:`DataFrame` by renaming multiple columns. @@ -6356,25 +5525,9 @@ def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame": | 5| Bob| +---+-----+ """ - if not isinstance(colsMap, dict): - raise PySparkTypeError( - error_class="NOT_DICT", - message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__}, - ) - - col_names: List[str] = [] - new_col_names: List[str] = [] - for k, v in colsMap.items(): - col_names.append(k) - new_col_names.append(v) - - return DataFrame( - self._jdf.withColumnsRenamed( - _to_seq(self._sc, col_names), _to_seq(self._sc, new_col_names) - ), - self.sparkSession, - ) + ... + @dispatch_df_method def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> "DataFrame": """Returns a new :class:`DataFrame` by updating an existing column with metadata. @@ -6402,18 +5555,7 @@ def withMetadata(self, columnName: str, metadata: Dict[str, Any]) -> "DataFrame" >>> df_meta.schema['age'].metadata {'foo': 'bar'} """ - from py4j.java_gateway import JVMView - - if not isinstance(metadata, dict): - raise PySparkTypeError( - error_class="NOT_DICT", - message_parameters={"arg_name": "metadata", "arg_type": type(metadata).__name__}, - ) - sc = get_active_spark_context() - jmeta = cast(JVMView, sc._jvm).org.apache.spark.sql.types.Metadata.fromJson( - json.dumps(metadata) - ) - return DataFrame(self._jdf.withMetadata(columnName, jmeta), self.sparkSession) + ... @overload def drop(self, cols: "ColumnOrName") -> "DataFrame": @@ -6423,7 +5565,8 @@ def drop(self, cols: "ColumnOrName") -> "DataFrame": def drop(self, *cols: str) -> "DataFrame": ... - def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def drop(self, *cols: "ColumnOrName") -> "DataFrame": """ Returns a new :class:`DataFrame` without specified columns. This is a no-op if the schema doesn't contain the given column name(s). 
@@ -6553,29 +5696,9 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] | 16| Bob| 1| +---+-----+-----+ """ - column_names: List[str] = [] - java_columns: List["JavaObject"] = [] - - for c in cols: - if isinstance(c, str): - column_names.append(c) - elif isinstance(c, Column): - java_columns.append(c._jc) - else: - raise PySparkTypeError( - error_class="NOT_COLUMN_OR_STR", - message_parameters={"arg_name": "col", "arg_type": type(c).__name__}, - ) - - jdf = self._jdf - if len(java_columns) > 0: - first_column, *remaining_columns = java_columns - jdf = jdf.drop(first_column, self._jseq(remaining_columns)) - if len(column_names) > 0: - jdf = jdf.drop(self._jseq(column_names)) - - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def toDF(self, *cols: str) -> "DataFrame": """Returns a new :class:`DataFrame` that with new specified column names @@ -6609,15 +5732,9 @@ def toDF(self, *cols: str) -> "DataFrame": | 16| Bob| +---+-----+ """ - for col in cols: - if not isinstance(col, str): - raise PySparkTypeError( - error_class="NOT_LIST_OF_STR", - message_parameters={"arg_name": "cols", "arg_type": type(col).__name__}, - ) - jdf = self._jdf.toDF(self._jseq(cols)) - return DataFrame(jdf, self.sparkSession) + ... + @dispatch_df_method def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations. @@ -6673,12 +5790,9 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) | 13| 13.0| +---+-----+ """ - result = func(self, *args, **kwargs) - assert isinstance( - result, DataFrame - ), "Func returned an instance of type [%s], " "should have been DataFrame." % type(result) - return result + ... + @dispatch_df_method def sameSemantics(self, other: "DataFrame") -> bool: """ Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and @@ -6721,13 +5835,9 @@ def sameSemantics(self, other: "DataFrame") -> bool: >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2)) True """ - if not isinstance(other, DataFrame): - raise PySparkTypeError( - error_class="NOT_DATAFRAME", - message_parameters={"arg_name": "other", "arg_type": type(other).__name__}, - ) - return self._jdf.sameSemantics(other._jdf) + ... + @dispatch_df_method def semanticHash(self) -> int: """ Returns a hash code of the logical query plan against this :class:`DataFrame`. @@ -6756,8 +5866,9 @@ def semanticHash(self) -> int: >>> spark.range(10).selectExpr("id as col1").semanticHash() # doctest: +SKIP 1855039936 """ - return self._jdf.semanticHash() + ... + @dispatch_df_method def inputFiles(self) -> List[str]: """ Returns a best-effort snapshot of the files that compose this :class:`DataFrame`. @@ -6791,15 +5902,16 @@ def inputFiles(self) -> List[str]: ... len(df.inputFiles()) 1 """ - return list(self._jdf.inputFiles()) + ... + @dispatch_df_method def where(self, condition: "ColumnOrName") -> "DataFrame": """ :func:`where` is an alias for :func:`filter`. .. versionadded:: 1.3.0 """ - return self.filter(condition) + ... # Two aliases below were added for pandas compatibility many years ago. # There are too many differences compared to pandas and we cannot just @@ -6814,22 +5926,31 @@ def groupby(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": def groupby(self, __cols: Union[List[Column], List[str], List[int]]) -> "GroupedData": ... 
- def groupby(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def groupby(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": """ :func:`groupby` is an alias for :func:`groupBy`. .. versionadded:: 1.4.0 """ - return self.groupBy(*cols) + ... - def drop_duplicates(self, subset: Optional[List[str]] = None) -> "DataFrame": + @dispatch_df_method + def drop_duplicates(self, *subset: Union[str, List[str]]) -> "DataFrame": """ :func:`drop_duplicates` is an alias for :func:`dropDuplicates`. .. versionadded:: 1.4.0 + + .. versionchanged:: 3.4.0 + Supports Spark Connect + + .. versionchanged:: 4.0.0 + Supports variable-length argument """ - return self.dropDuplicates(subset) + ... + @dispatch_df_method def writeTo(self, table: str) -> DataFrameWriterV2: """ Create a write configuration builder for v2 sources. @@ -6862,8 +5983,9 @@ def writeTo(self, table: str) -> DataFrameWriterV2: ... "catalog.db.table" ... ).partitionedBy("col").createOrReplace() """ - return DataFrameWriterV2(self, table) + ... + @dispatch_df_method def pandas_api( self, index_col: Optional[Union[str, List[str]]] = None ) -> "PandasOnSparkDataFrame": @@ -6914,25 +6036,276 @@ def pandas_api( 23 Alice 16 Bob """ - from pyspark.pandas.namespace import _get_index_map - from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame - from pyspark.pandas.internal import InternalFrame + ... - index_spark_columns, index_names = _get_index_map(self, index_col) - internal = InternalFrame( - spark_frame=self, - index_spark_columns=index_spark_columns, - index_names=index_names, # type: ignore[arg-type] - ) - return PandasOnSparkDataFrame(internal) + @dispatch_df_method + def mapInPandas( + self, + func: "PandasMapIterFunction", + schema: Union[StructType, str], + barrier: bool = False, + profile: Optional[ResourceProfile] = None, + ) -> "DataFrame": + """ + Maps an iterator of batches in the current :class:`DataFrame` using a Python native + function that is performed on pandas DataFrames both as input and output, + and returns the result as a :class:`DataFrame`. + This method applies the specified Python function to an iterator of + `pandas.DataFrame`\\s, each representing a batch of rows from the original DataFrame. + The returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`. + The size of the function's input and output can be different. Each `pandas.DataFrame` + size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`. -def _to_scala_map(sc: "SparkContext", jm: Dict) -> "JavaObject": - """ - Convert a dict into a JVM Map. - """ - assert sc._jvm is not None - return sc._jvm.PythonUtils.toScalaMap(jm) + .. versionadded:: 3.0.0 + + .. versionchanged:: 3.4.0 + Supports Spark Connect. + + Parameters + ---------- + func : function + a Python native function that takes an iterator of `pandas.DataFrame`\\s, and + outputs an iterator of `pandas.DataFrame`\\s. + schema : :class:`pyspark.sql.types.DataType` or str + the return type of the `func` in PySpark. The value can be either a + :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. + barrier : bool, optional, default False + Use barrier mode execution, ensuring that all Python workers in the stage will be + launched concurrently. + + .. versionadded: 3.5.0 + + profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile + to be used for mapInPandas. + + .. 
versionadded: 4.0.0 + + + Examples + -------- + >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) + + Filter rows with id equal to 1: + + >>> def filter_func(iterator): + ... for pdf in iterator: + ... yield pdf[pdf.id == 1] + ... + >>> df.mapInPandas(filter_func, df.schema).show() # doctest: +SKIP + +---+---+ + | id|age| + +---+---+ + | 1| 21| + +---+---+ + + Compute the mean age for each id: + + >>> def mean_age(iterator): + ... for pdf in iterator: + ... yield pdf.groupby("id").mean().reset_index() + ... + >>> df.mapInPandas(mean_age, "id: bigint, age: double").show() # doctest: +SKIP + +---+----+ + | id| age| + +---+----+ + | 1|21.0| + | 2|30.0| + +---+----+ + + Add a new column with the double of the age: + + >>> def double_age(iterator): + ... for pdf in iterator: + ... pdf["double_age"] = pdf["age"] * 2 + ... yield pdf + ... + >>> df.mapInPandas( + ... double_age, "id: bigint, age: bigint, double_age: bigint").show() # doctest: +SKIP + +---+---+----------+ + | id|age|double_age| + +---+---+----------+ + | 1| 21| 42| + | 2| 30| 60| + +---+---+----------+ + + Set ``barrier`` to ``True`` to force the ``mapInPandas`` stage running in the + barrier mode, it ensures all Python workers in the stage will be + launched concurrently. + + >>> df.mapInPandas(filter_func, df.schema, barrier=True).show() # doctest: +SKIP + +---+---+ + | id|age| + +---+---+ + | 1| 21| + +---+---+ + + Notes + ----- + This API is experimental + + See Also + -------- + pyspark.sql.functions.pandas_udf + """ + ... + + @dispatch_df_method + def mapInArrow( + self, + func: "ArrowMapIterFunction", + schema: Union[StructType, str], + barrier: bool = False, + profile: Optional[ResourceProfile] = None, + ) -> "DataFrame": + """ + Maps an iterator of batches in the current :class:`DataFrame` using a Python native + function that is performed on `pyarrow.RecordBatch`\\s both as input and output, + and returns the result as a :class:`DataFrame`. + + This method applies the specified Python function to an iterator of + `pyarrow.RecordBatch`\\s, each representing a batch of rows from the original DataFrame. + The returned iterator of `pyarrow.RecordBatch`\\s are combined as a :class:`DataFrame`. + The size of the function's input and output can be different. Each `pyarrow.RecordBatch` + size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`. + + .. versionadded:: 3.3.0 + + Parameters + ---------- + func : function + a Python native function that takes an iterator of `pyarrow.RecordBatch`\\s, and + outputs an iterator of `pyarrow.RecordBatch`\\s. + schema : :class:`pyspark.sql.types.DataType` or str + the return type of the `func` in PySpark. The value can be either a + :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. + barrier : bool, optional, default False + Use barrier mode execution, ensuring that all Python workers in the stage will be + launched concurrently. + + .. versionadded: 3.5.0 + + profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile + to be used for mapInArrow. + + .. versionadded: 4.0.0 + + Examples + -------- + >>> import pyarrow # doctest: +SKIP + >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) + >>> def filter_func(iterator): + ... for batch in iterator: + ... pdf = batch.to_pandas() + ... 
yield pyarrow.RecordBatch.from_pandas(pdf[pdf.id == 1]) + >>> df.mapInArrow(filter_func, df.schema).show() # doctest: +SKIP + +---+---+ + | id|age| + +---+---+ + | 1| 21| + +---+---+ + + Set ``barrier`` to ``True`` to force the ``mapInArrow`` stage running in the + barrier mode, it ensures all Python workers in the stage will be + launched concurrently. + + >>> df.mapInArrow(filter_func, df.schema, barrier=True).show() # doctest: +SKIP + +---+---+ + | id|age| + +---+---+ + | 1| 21| + +---+---+ + + Notes + ----- + This API is unstable, and for developers. + + See Also + -------- + pyspark.sql.functions.pandas_udf + pyspark.sql.DataFrame.mapInPandas + """ + ... + + @dispatch_df_method + def toArrow(self) -> "pa.Table": + """ + Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``. + + This is only available if PyArrow is installed and available. + + .. versionadded:: 4.0.0 + + Notes + ----- + This method should only be used if the resulting PyArrow ``pyarrow.Table`` is + expected to be small, as all the data is loaded into the driver's memory. + + This API is a developer API. + + Examples + -------- + >>> df.toArrow() # doctest: +SKIP + pyarrow.Table + age: int64 + name: string + ---- + age: [[2,5]] + name: [["Alice","Bob"]] + """ + ... + + def toPandas(self) -> "PandasDataFrameLike": + """ + Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. + + This is only available if Pandas is installed and available. + + .. versionadded:: 1.3.0 + + .. versionchanged:: 3.4.0 + Supports Spark Connect. + + Notes + ----- + This method should only be used if the resulting Pandas ``pandas.DataFrame`` is + expected to be small, as all the data is loaded into the driver's memory. + + Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is experimental. + + Examples + -------- + >>> df.toPandas() # doctest: +SKIP + age name + 0 2 Alice + 1 5 Bob + """ + ... + + @property + def executionInfo(self) -> Optional["ExecutionInfo"]: + """ + Returns a QueryExecution object after the query was executed. + + The queryExecution method allows to introspect information about the actual + query execution after the successful execution. Accessing this member before + the query execution will return None. + + If the same DataFrame is executed multiple times, the execution info will be + overwritten by the latest operation. + + .. versionadded:: 4.0.0 + + Returns + ------- + An instance of QueryExecution or None when the value is not set yet. + + Notes + ----- + This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws + an exception. + """ + ... class DataFrameNaFunctions: @@ -6947,13 +6320,14 @@ class DataFrameNaFunctions: def __init__(self, df: DataFrame): self.df = df + @dispatch_df_method def drop( self, how: str = "any", thresh: Optional[int] = None, subset: Optional[Union[str, Tuple[str, ...], List[str]]] = None, ) -> DataFrame: - return self.df.dropna(how=how, thresh=thresh, subset=subset) + ... drop.__doc__ = DataFrame.dropna.__doc__ @@ -6965,12 +6339,13 @@ def fill(self, value: "LiteralType", subset: Optional[List[str]] = ...) -> DataF def fill(self, value: Dict[str, "LiteralType"]) -> DataFrame: ... + @dispatch_df_method def fill( self, value: Union["LiteralType", Dict[str, "LiteralType"]], subset: Optional[List[str]] = None, ) -> DataFrame: - return self.df.fillna(value=value, subset=subset) # type: ignore[arg-type] + ... fill.__doc__ = DataFrame.fillna.__doc__ @@ -7000,7 +6375,8 @@ def replace( ) -> DataFrame: ... 
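The `executionInfo` property added earlier in this hunk ships without a usage example in its docstring; a minimal sketch for a Spark Connect session follows (the query is made up for illustration, and on a classic session the property raises instead of returning a value).

df = spark.range(100).selectExpr("sum(id) AS total")
assert df.executionInfo is None   # nothing has been executed yet
df.collect()                      # run the query
info = df.executionInfo           # populated by the most recent execution; overwritten on re-execution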
- def replace( # type: ignore[misc] + @dispatch_df_method # type: ignore[misc] + def replace( self, to_replace: Union[List["LiteralType"], Dict["LiteralType", "OptionalPrimitiveType"]], value: Optional[ @@ -7008,7 +6384,7 @@ def replace( # type: ignore[misc] ] = _NoValue, subset: Optional[List[str]] = None, ) -> DataFrame: - return self.df.replace(to_replace, value, subset) # type: ignore[arg-type] + ... replace.__doc__ = DataFrame.replace.__doc__ @@ -7043,61 +6419,45 @@ def approxQuantile( ) -> List[List[float]]: ... + @dispatch_df_method def approxQuantile( self, col: Union[str, List[str], Tuple[str]], probabilities: Union[List[float], Tuple[float]], relativeError: float, ) -> Union[List[float], List[List[float]]]: - return self.df.approxQuantile(col, probabilities, relativeError) + ... approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__ + @dispatch_df_method def corr(self, col1: str, col2: str, method: Optional[str] = None) -> float: - return self.df.corr(col1, col2, method) + ... corr.__doc__ = DataFrame.corr.__doc__ + @dispatch_df_method def cov(self, col1: str, col2: str) -> float: - return self.df.cov(col1, col2) + ... cov.__doc__ = DataFrame.cov.__doc__ + @dispatch_df_method def crosstab(self, col1: str, col2: str) -> DataFrame: - return self.df.crosstab(col1, col2) + ... crosstab.__doc__ = DataFrame.crosstab.__doc__ + @dispatch_df_method def freqItems(self, cols: List[str], support: Optional[float] = None) -> DataFrame: - return self.df.freqItems(cols, support) + ... freqItems.__doc__ = DataFrame.freqItems.__doc__ + @dispatch_df_method def sampleBy( self, col: str, fractions: Dict[Any, float], seed: Optional[int] = None ) -> DataFrame: - return self.df.sampleBy(col, fractions, seed) + ... sampleBy.__doc__ = DataFrame.sampleBy.__doc__ - - -def _test() -> None: - import doctest - from pyspark.sql import SparkSession - import pyspark.sql.dataframe - - globs = pyspark.sql.dataframe.__dict__.copy() - spark = SparkSession.builder.master("local[4]").appName("sql.dataframe tests").getOrCreate() - globs["spark"] = spark - (failure_count, test_count) = doctest.testmod( - pyspark.sql.dataframe, - globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, - ) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/python/pyspark/sql/datasource.py b/python/pyspark/sql/datasource.py index c08b5b7af77fb..8ea36bb04fb68 100644 --- a/python/pyspark/sql/datasource.py +++ b/python/pyspark/sql/datasource.py @@ -16,7 +16,7 @@ # from abc import ABC, abstractmethod from collections import UserDict -from typing import Any, Dict, Iterator, List, Sequence, Tuple, Type, Union, TYPE_CHECKING +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Type, Union, TYPE_CHECKING from pyspark.sql import Row from pyspark.sql.types import StructType @@ -30,9 +30,12 @@ "DataSource", "DataSourceReader", "DataSourceStreamReader", + "SimpleDataSourceStreamReader", "DataSourceWriter", + "DataSourceStreamWriter", "DataSourceRegistration", "InputPartition", + "SimpleDataSourceStreamReader", "WriterCommitMessage", ] @@ -183,11 +186,36 @@ def streamWriter(self, schema: StructType, overwrite: bool) -> "DataSourceStream message_parameters={"feature": "streamWriter"}, ) + def simpleStreamReader(self, schema: StructType) -> "SimpleDataSourceStreamReader": + """ + Returns a :class:`SimpleDataSourceStreamReader` instance for reading data. 
+ + One of simpleStreamReader() and streamReader() must be implemented for readable streaming + data source. Spark will check whether streamReader() is implemented, if yes, create a + DataSourceStreamReader to read data. simpleStreamReader() will only be invoked when + streamReader() is not implemented. + + Parameters + ---------- + schema : :class:`StructType` + The schema of the data to be read. + + Returns + ------- + reader : :class:`SimpleDataSourceStreamReader` + A reader instance for this data source. + """ + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "simpleStreamReader"}, + ) + def streamReader(self, schema: StructType) -> "DataSourceStreamReader": """ Returns a :class:`DataSourceStreamReader` instance for reading streaming data. - The implementation is required for readable streaming data sources. + One of simpleStreamReader() and streamReader() must be implemented for readable streaming + data source. Parameters ---------- @@ -305,7 +333,7 @@ def partitions(self) -> Sequence[InputPartition]: ) @abstractmethod - def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator[Row]]: + def read(self, partition: InputPartition) -> Iterator[Tuple]: """ Generates data for a given partition and returns an iterator of tuples or rows. @@ -396,8 +424,10 @@ def latestOffset(self) -> dict: def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]: """ - Returns a list of InputPartition given the start and end offsets. Each InputPartition - represents a data split that can be processed by one Spark task. + Returns a list of InputPartition given the start and end offsets. Each InputPartition + represents a data split that can be processed by one Spark task. This may be called with + an empty offset range when start == end, in that case the method should return + an empty sequence of InputPartition. Parameters ---------- @@ -418,7 +448,7 @@ def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]: ) @abstractmethod - def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator[Row]]: + def read(self, partition: InputPartition) -> Iterator[Tuple]: """ Generates data for a given partition and returns an iterator of tuples or rows. @@ -469,6 +499,102 @@ def stop(self) -> None: ... +class SimpleDataSourceStreamReader(ABC): + """ + A base class for simplified streaming data source readers. + Compared to :class:`DataSourceStreamReader`, :class:`SimpleDataSourceStreamReader` doesn't + require planning data partition. Also, the read api of :class:`SimpleDataSourceStreamReader` + allows reading data and planning the latest offset at the same time. + + Because :class:`SimpleDataSourceStreamReader` read records in Spark driver node to determine + end offset of each batch without partitioning, it is only supposed to be used in + lightweight use cases where input rate and batch size is small. + Use :class:`DataSourceStreamReader` when read throughput is high and can't be handled + by a single process. + + .. versionadded: 4.0.0 + """ + + def initialOffset(self) -> dict: + """ + Return the initial offset of the streaming data source. + A new streaming query starts reading data from the initial offset. + If Spark is restarting an existing query, it will restart from the check-pointed offset + rather than the initial one. + + Returns + ------- + dict + A dict or recursive dict whose key and value are primitive types, which includes + Integer, String and Boolean. 
+ + Examples + -------- + >>> def initialOffset(self): + ... return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}} + """ + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "initialOffset"}, + ) + + def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]: + """ + Read all available data from start offset and return the offset that next read attempt + starts from. + + Parameters + ---------- + start : dict + The start offset to start reading from. + + Returns + ------- + A :class:`Tuple` of an iterator of :class:`Tuple` and a dict\\s + The iterator contains all the available records after start offset. + The dict is the end offset of this read attempt and the start of next read attempt. + """ + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "read"}, + ) + + def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]: + """ + Read all available data from specific start offset and end offset. + This is invoked during failure recovery to re-read a batch deterministically. + + Parameters + ---------- + start : dict + The start offset to start reading from. + + end : dict + The offset where the reading stop. + + Returns + ------- + iterator of :class:`Tuple`\\s + All the records between start offset and end offset. + """ + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "readBetweenOffsets"}, + ) + + def commit(self, end: dict) -> None: + """ + Informs the source that Spark has completed processing all data for offsets less than or + equal to `end` and will only request offsets greater than `end` in the future. + + Parameters + ---------- + end : dict + The latest offset that the streaming query has processed for this source. + """ + ... + + class DataSourceWriter(ABC): """ A base class for data source writers. Data source writers are responsible for saving @@ -503,7 +629,7 @@ def write(self, iterator: Iterator[Row]) -> "WriterCommitMessage": """ ... - def commit(self, messages: List["WriterCommitMessage"]) -> None: + def commit(self, messages: List[Optional["WriterCommitMessage"]]) -> None: """ Commits this writing job with a list of commit messages. @@ -515,11 +641,11 @@ def commit(self, messages: List["WriterCommitMessage"]) -> None: Parameters ---------- messages : list of :class:`WriterCommitMessage`\\s - A list of commit messages. + A list of commit messages. If a write task fails, the commit message will be `None`. """ ... - def abort(self, messages: List["WriterCommitMessage"]) -> None: + def abort(self, messages: List[Optional["WriterCommitMessage"]]) -> None: """ Aborts this writing job due to task failures. @@ -531,7 +657,7 @@ def abort(self, messages: List["WriterCommitMessage"]) -> None: Parameters ---------- messages : list of :class:`WriterCommitMessage`\\s - A list of commit messages. + A list of commit messages. If a write task fails, the commit message will be `None`. """ ... @@ -568,7 +694,7 @@ def write(self, iterator: Iterator[Row]) -> "WriterCommitMessage": """ ... - def commit(self, messages: List["WriterCommitMessage"], batchId: int) -> None: + def commit(self, messages: List[Optional["WriterCommitMessage"]], batchId: int) -> None: """ Commits this microbatch with a list of commit messages. 
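To make the `SimpleDataSourceStreamReader` contract described above concrete, here is a minimal counter source that is not part of this patch: offsets are plain dicts, `read` returns the prefetched rows together with the start offset of the next read attempt, and `readBetweenOffsets` replays an already planned range deterministically during recovery.

from typing import Iterator, Tuple
from pyspark.sql.datasource import SimpleDataSourceStreamReader

class CounterStreamReader(SimpleDataSourceStreamReader):
    # Illustrative sketch only: emits consecutive integers, five per micro-batch.
    def initialOffset(self) -> dict:
        return {"offset": 0}

    def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
        end = {"offset": start["offset"] + 5}
        rows = iter([(i,) for i in range(start["offset"], end["offset"])])
        return rows, end  # the data plus the start offset of the next read attempt

    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]:
        # Deterministic replay of a previously planned range, used on failure recovery.
        return iter([(i,) for i in range(start["offset"], end["offset"])])

Likewise, since `commit` and `abort` now receive `List[Optional[WriterCommitMessage]]`, writer implementations should tolerate `None` entries from failed tasks; a hedged sketch of that handling (the class name and behavior are illustrative, not from the patch):

from typing import Iterator, List, Optional
from pyspark.sql import Row
from pyspark.sql.datasource import DataSourceWriter, WriterCommitMessage

class LoggingWriter(DataSourceWriter):
    def write(self, iterator: Iterator[Row]) -> WriterCommitMessage:
        return WriterCommitMessage()

    def commit(self, messages: List[Optional[WriterCommitMessage]]) -> None:
        # Entries for failed tasks arrive as None after this change; skip them.
        completed = [m for m in messages if m is not None]
        print(f"committing {len(completed)} of {len(messages)} task outputs")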
@@ -579,15 +705,15 @@ def commit(self, messages: List["WriterCommitMessage"], batchId: int) -> None: Parameters ---------- - messages : List[WriterCommitMessage] - A list of commit messages. + messages : list of :class:`WriterCommitMessage`\\s + A list of commit messages. If a write task fails, the commit message will be `None`. batchId: int An integer that uniquely identifies a batch of data being written. The integer increase by 1 with each microbatch processed. """ ... - def abort(self, messages: List["WriterCommitMessage"], batchId: int) -> None: + def abort(self, messages: List[Optional["WriterCommitMessage"]], batchId: int) -> None: """ Aborts this microbatch due to task failures. @@ -598,8 +724,8 @@ def abort(self, messages: List["WriterCommitMessage"], batchId: int) -> None: Parameters ---------- - messages : List[WriterCommitMessage] - A list of commit messages. + messages : list of :class:`WriterCommitMessage`\\s + A list of commit messages. If a write task fails, the commit message will be `None`. batchId: int An integer that uniquely identifies a batch of data being written. The integer increase by 1 with each microbatch processed. diff --git a/python/pyspark/sql/datasource_internal.py b/python/pyspark/sql/datasource_internal.py new file mode 100644 index 0000000000000..6df0be4192ec8 --- /dev/null +++ b/python/pyspark/sql/datasource_internal.py @@ -0,0 +1,146 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import json +import copy +from itertools import chain +from typing import Iterator, List, Optional, Sequence, Tuple + +from pyspark.sql.datasource import ( + DataSource, + DataSourceStreamReader, + InputPartition, + SimpleDataSourceStreamReader, +) +from pyspark.sql.types import StructType +from pyspark.errors import PySparkNotImplementedError + + +def _streamReader(datasource: DataSource, schema: StructType) -> "DataSourceStreamReader": + """ + Fallback to simpleStreamReader() method when streamReader() is not implemented. + This should be invoked whenever a DataSourceStreamReader needs to be created instead of + invoking datasource.streamReader() directly. 
+ """ + try: + return datasource.streamReader(schema=schema) + except PySparkNotImplementedError: + return _SimpleStreamReaderWrapper(datasource.simpleStreamReader(schema=schema)) + + + class SimpleInputPartition(InputPartition): + def __init__(self, start: dict, end: dict): + self.start = start + self.end = end + + + class PrefetchedCacheEntry: + def __init__(self, start: dict, end: dict, iterator: Iterator[Tuple]): + self.start = start + self.end = end + self.iterator = iterator + + + class _SimpleStreamReaderWrapper(DataSourceStreamReader): + """ + A private class that wraps :class:`SimpleDataSourceStreamReader` in a prefetch-and-cache pattern, + so that :class:`SimpleDataSourceStreamReader` can integrate with the streaming engine like an + ordinary :class:`DataSourceStreamReader`. + + current_offset tracks the latest progress of the record prefetching; it is initialized to + initialOffset() when the query starts for the first time, or to the end offset of + the last planned batch when the query restarts. + + When the streaming engine calls latestOffset(), the wrapper calls read() starting from + current_offset, prefetches and caches the data, then updates current_offset to be + the end offset of the new data. + + When the streaming engine calls planInputPartitions(start, end), the wrapper gets the prefetched + data from the cache and sends it to the JVM along with the input partitions. + + When the query restarts, batches in the write-ahead offset log that have not been committed will + be replayed by reading the data between the start and end offsets through readBetweenOffsets(start, end). + """ + + def __init__(self, simple_reader: SimpleDataSourceStreamReader): + self.simple_reader = simple_reader + self.initial_offset: Optional[dict] = None + self.current_offset: Optional[dict] = None + self.cache: List[PrefetchedCacheEntry] = [] + + def initialOffset(self) -> dict: + if self.initial_offset is None: + self.initial_offset = self.simple_reader.initialOffset() + return self.initial_offset + + def latestOffset(self) -> dict: + # When the query starts for the first time, use the initial offset as the start offset. + if self.current_offset is None: + self.current_offset = self.initialOffset() + (iter, end) = self.simple_reader.read(self.current_offset) + self.cache.append(PrefetchedCacheEntry(self.current_offset, end, iter)) + self.current_offset = end + return end + + def commit(self, end: dict) -> None: + if self.current_offset is None: + self.current_offset = end + + end_idx = -1 + for idx, entry in enumerate(self.cache): + if json.dumps(entry.end) == json.dumps(end): + end_idx = idx + break + if end_idx > 0: + # Drop prefetched data for batches that have been committed. + self.cache = self.cache[end_idx:] + self.simple_reader.commit(end) + + def partitions(self, start: dict, end: dict) -> Sequence["InputPartition"]: + # When the query restarts from a checkpoint, use the last committed offset as the start offset. + # This depends on the streaming engine calling planInputPartitions() of the last batch + # in the offset log when the query restarts. + if self.current_offset is None: + self.current_offset = end + if len(self.cache) > 0: + assert self.cache[-1].end == end + return [SimpleInputPartition(start, end)] + + def getCache(self, start: dict, end: dict) -> Iterator[Tuple]: + start_idx = -1 + end_idx = -1 + for idx, entry in enumerate(self.cache): + # There is no convenient way to compare 2 offsets. + # Serialize into json string before comparison.
+ if json.dumps(entry.start) == json.dumps(start): + start_idx = idx + if json.dumps(entry.end) == json.dumps(end): + end_idx = idx + break + if start_idx == -1 or end_idx == -1: + return None # type: ignore[return-value] + # Chain all the data iterator between start offset and end offset + # need to copy here to avoid exhausting the original data iterator. + entries = [copy.copy(entry.iterator) for entry in self.cache[start_idx : end_idx + 1]] + it = chain(*entries) + return it + + def read( + self, input_partition: SimpleInputPartition # type: ignore[override] + ) -> Iterator[Tuple]: + return self.simple_reader.readBetweenOffsets(input_partition.start, input_partition.end) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 58eb136da216c..2a302d1e51125 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -40,7 +40,7 @@ ) from pyspark.errors import PySparkTypeError, PySparkValueError -from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_from_literal +from pyspark.sql.column import Column from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import ArrayType, DataType, StringType, StructType, _from_numpy_type @@ -86,7 +86,7 @@ def _get_jvm_function(name: str, sc: "SparkContext") -> Callable: Java gateway associated with sc. """ assert sc._jvm is not None - return getattr(sc._jvm.functions, name) + return getattr(getattr(sc._jvm, "org.apache.spark.sql.functions"), name) def _invoke_function(name: str, *args: Any) -> Column: @@ -106,6 +106,8 @@ def _invoke_function_over_columns(name: str, *cols: "ColumnOrName") -> Column: Invokes n-ary JVM function identified by name and wraps the result with :class:`~pyspark.sql.Column`. """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function(name, *(_to_java_column(col) for col in cols)) @@ -114,6 +116,8 @@ def _invoke_function_over_seq_of_columns(name: str, cols: "Iterable[ColumnOrName Invokes unary JVM function identified by name with and wraps the result with :class:`~pyspark.sql.Column`. """ + from pyspark.sql.classic.column import _to_java_column, _to_seq + sc = _get_active_spark_context() return _invoke_function(name, _to_seq(sc, cols, _to_java_column)) @@ -123,6 +127,7 @@ def _invoke_binary_math_function(name: str, col1: Any, col2: Any) -> Column: Invokes binary JVM math function identified by name and wraps the result with :class:`~pyspark.sql.Column`. """ + from pyspark.sql.classic.column import _to_java_column, _create_column_from_literal # For legacy reasons, the arguments here can be implicitly converted into column cols = [ @@ -633,7 +638,7 @@ def try_divide(left: "ColumnOrName", right: "ColumnOrName") -> Column: | 4 months| +--------------------------------------------------+ - Example 3: Exception druing division, resulting in NULL when ANSI mode is on + Example 3: Exception during division, resulting in NULL when ANSI mode is on >>> import pyspark.sql.functions as sf >>> origin = spark.conf.get("spark.sql.ansi.enabled") @@ -652,6 +657,56 @@ def try_divide(left: "ColumnOrName", right: "ColumnOrName") -> Column: return _invoke_function_over_columns("try_divide", left, right) +@_try_remote_functions +def try_remainder(left: "ColumnOrName", right: "ColumnOrName") -> Column: + """ + Returns the remainder after `dividend`/`divisor`. Its result is + always null if `divisor` is 0. + + .. 
versionadded:: 4.0.0 + + Parameters + ---------- + left : :class:`~pyspark.sql.Column` or str + dividend + right : :class:`~pyspark.sql.Column` or str + divisor + + Examples + -------- + Example 1: Integer divided by Integer. + + >>> import pyspark.sql.functions as sf + >>> spark.createDataFrame( + ... [(6000, 15), (3, 2), (1234, 0)], ["a", "b"] + ... ).select(sf.try_remainder("a", "b")).show() + +-------------------+ + |try_remainder(a, b)| + +-------------------+ + | 0| + | 1| + | NULL| + +-------------------+ + + Example 2: Exception during division, resulting in NULL when ANSI mode is on + + >>> import pyspark.sql.functions as sf + >>> origin = spark.conf.get("spark.sql.ansi.enabled") + >>> spark.conf.set("spark.sql.ansi.enabled", "true") + >>> try: + ... df = spark.range(1) + ... df.select(sf.try_remainder(df.id, sf.lit(0))).show() + ... finally: + ... spark.conf.set("spark.sql.ansi.enabled", origin) + +--------------------+ + |try_remainder(id, 0)| + +--------------------+ + | NULL| + +--------------------+ + """ + return _invoke_function_over_columns("try_remainder", left, right) + + @_try_remote_functions def try_multiply(left: "ColumnOrName", right: "ColumnOrName") -> Column: """ @@ -1005,6 +1060,8 @@ def mode(col: "ColumnOrName", deterministic: bool = False) -> Column: | -10| +---------------------------------------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("mode", _to_java_column(col), deterministic) @@ -1268,7 +1325,7 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([ ... ("Consult", "Eva", 6), ("Finance", "Frank", 5), - ... ("Finance", "George", 5), ("Consult", "Henry", 7)], + ... ("Finance", "George", 9), ("Consult", "Henry", 7)], ... schema=("department", "name", "years_in_dept")) >>> df.groupby("department").agg( ... sf.max_by("name", "years_in_dept") @@ -1349,7 +1406,7 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([ ... ("Consult", "Eva", 6), ("Finance", "Frank", 5), - ... ("Finance", "George", 5), ("Consult", "Henry", 7)], + ... ("Finance", "George", 9), ("Consult", "Henry", 7)], ... schema=("department", "name", "years_in_dept")) >>> df.groupby("department").agg( ... sf.min_by("name", "years_in_dept") @@ -1358,7 +1415,7 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: |department|min_by(name, years_in_dept)| +----------+---------------------------+ | Consult| Eva| - | Finance| George| + | Finance| Frank| +----------+---------------------------+ """ return _invoke_function_over_columns("min_by", col, ord) @@ -2061,7 +2118,7 @@ def ceil(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Col scale : :class:`~pyspark.sql.Column` or int, optional An optional parameter to control the rounding behavior. - .. versionadded:: 4.0.0 + .. versionadded:: 4.0.0 Returns ------- @@ -2114,7 +2171,7 @@ def ceiling(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> scale : :class:`~pyspark.sql.Column` or int An optional parameter to control the rounding behavior. - .. versionadded:: 4.0.0 + .. versionadded:: 4.0.0 Returns ------- @@ -2375,7 +2432,7 @@ def floor(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Co scale : :class:`~pyspark.sql.Column` or int, optional An optional parameter to control the rounding behavior. - .. versionadded:: 4.0.0 + .. 
versionadded:: 4.0.0 Returns @@ -5123,6 +5180,8 @@ def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> C | 95546| 102065| +----------------+------------+ """ + from pyspark.sql.classic.column import _to_java_column + if rsd is None: return _invoke_function_over_columns("approx_count_distinct", col) else: @@ -5386,6 +5445,8 @@ def count_distinct(col: "ColumnOrName", *cols: "ColumnOrName") -> Column: | 2| +------------------------------+ """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function( "count_distinct", _to_java_column(col), _to_seq(sc, cols, _to_java_column) @@ -5413,8 +5474,8 @@ def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: ---------- col : :class:`~pyspark.sql.Column` or str column to fetch first value for. - ignorenulls : :class:`~pyspark.sql.Column` or str - if first value is null then look for first non-null value. + ignorenulls : bool + if first value is null then look for first non-null value. ``False``` by default. Returns ------- @@ -5443,6 +5504,8 @@ def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: | Bob| 5| +-----+----------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("first", _to_java_column(col), ignorenulls) @@ -5684,8 +5747,8 @@ def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: ---------- col : :class:`~pyspark.sql.Column` or str column to fetch last value for. - ignorenulls : :class:`~pyspark.sql.Column` or str - if last value is null then look for non-null value. + ignorenulls : bool + if last value is null then look for non-null value. ``False``` by default. Returns ------- @@ -5714,6 +5777,8 @@ def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: | Bob| 5| +-----+---------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("last", _to_java_column(col), ignorenulls) @@ -5847,6 +5912,8 @@ def percentile( | 2| 19.967859769284075| +---+--------------------+ """ + from pyspark.sql.classic.column import _to_seq, _create_column_from_literal, _to_java_column + sc = _get_active_spark_context() if isinstance(percentage, (list, tuple)): @@ -5924,6 +5991,8 @@ def percentile_approx( |-- key: long (nullable = true) |-- median: double (nullable = true) """ + from pyspark.sql.classic.column import _to_seq, _create_column_from_literal, _to_java_column + sc = _get_active_spark_context() if isinstance(percentage, (list, tuple)): @@ -5998,6 +6067,8 @@ def approx_percentile( |-- key: long (nullable = true) |-- approx_percentile(value, 0.5, 1000000): double (nullable = true) """ + from pyspark.sql.classic.column import _to_seq, _create_column_from_literal, _to_java_column + sc = _get_active_spark_context() if isinstance(percentage, (list, tuple)): @@ -6145,8 +6216,8 @@ def round(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Co scale : :class:`~pyspark.sql.Column` or int, optional An optional parameter to control the rounding behavior. - .. versionchanged:: 4.0.0 - Support Column type. + .. versionchanged:: 4.0.0 + Support Column type. Returns ------- @@ -6200,8 +6271,8 @@ def bround(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> C scale : :class:`~pyspark.sql.Column` or int, optional An optional parameter to control the rounding behavior. - .. versionchanged:: 4.0.0 - Support Column type. + .. versionchanged:: 4.0.0 + Support Column type. 
Returns ------- @@ -6279,6 +6350,8 @@ def shiftleft(col: "ColumnOrName", numBits: int) -> Column: >>> spark.createDataFrame([(21,)], ['a']).select(shiftleft('a', 1).alias('r')).collect() [Row(r=42)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("shiftleft", _to_java_column(col), numBits) @@ -6324,6 +6397,8 @@ def shiftright(col: "ColumnOrName", numBits: int) -> Column: >>> spark.createDataFrame([(42,)], ['a']).select(shiftright('a', 1).alias('r')).collect() [Row(r=21)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("shiftright", _to_java_column(col), numBits) @@ -6370,6 +6445,8 @@ def shiftrightunsigned(col: "ColumnOrName", numBits: int) -> Column: >>> df.select(shiftrightunsigned('a', 1).alias('r')).collect() [Row(r=9223372036854775787)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("shiftrightunsigned", _to_java_column(col), numBits) @@ -6684,6 +6761,8 @@ def log(arg1: Union["ColumnOrName", float], arg2: Optional["ColumnOrName"] = Non |1.3862943611198906| +------------------+ """ + from pyspark.sql.classic.column import _to_java_column + if arg2 is None: return _invoke_function_over_columns("log", cast("ColumnOrName", arg1)) else: @@ -6781,6 +6860,8 @@ def conv(col: "ColumnOrName", fromBase: int, toBase: int) -> Column: >>> df.select(conv(df.n, 2, 16).alias('hex')).collect() [Row(hex='15')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("conv", _to_java_column(col), fromBase, toBase) @@ -6894,6 +6975,8 @@ def lag(col: "ColumnOrName", offset: int = 1, default: Optional[Any] = None) -> | b| 8| -1| +---+---+-------------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("lag", _to_java_column(col), offset, default) @@ -6975,6 +7058,8 @@ def lead(col: "ColumnOrName", offset: int = 1, default: Optional[Any] = None) -> | b| 8| -1| +---+---+----------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("lead", _to_java_column(col), offset, default) @@ -7049,6 +7134,8 @@ def nth_value(col: "ColumnOrName", offset: int, ignoreNulls: Optional[bool] = Fa | b| 8| 8| +---+---+---------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("nth_value", _to_java_column(col), offset, ignoreNulls) @@ -7561,6 +7648,8 @@ def date_format(date: "ColumnOrName", format: str) -> Column: >>> df.select(date_format('dt', 'MM/dd/yyyy').alias('date')).collect() [Row(date='04/08/2015')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("date_format", _to_java_column(date), format) @@ -8418,6 +8507,8 @@ def months_between(date1: "ColumnOrName", date2: "ColumnOrName", roundOff: bool >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect() [Row(months=3.9495967741935485)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function( "months_between", _to_java_column(date1), _to_java_column(date2), roundOff ) @@ -8459,6 +8550,8 @@ def to_date(col: "ColumnOrName", format: Optional[str] = None) -> Column: >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect() [Row(date=datetime.date(1997, 2, 28))] """ + from pyspark.sql.classic.column import _to_java_column + if format is None: return _invoke_function_over_columns("to_date", col) else: @@ -8595,6 +8688,8 @@ def to_timestamp(col: "ColumnOrName", format: Optional[str] = None) -> Column: |1997-02-28 
10:30:00| +-------------------+ """ + from pyspark.sql.classic.column import _to_java_column + if format is None: return _invoke_function_over_columns("to_timestamp", col) else: @@ -8852,6 +8947,8 @@ def trunc(date: "ColumnOrName", format: str) -> Column: >>> df.select(trunc(df.d, 'mon').alias('month')).collect() [Row(month=datetime.date(1997, 2, 1))] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("trunc", _to_java_column(date), format) @@ -8889,6 +8986,8 @@ def date_trunc(format: str, timestamp: "ColumnOrName") -> Column: >>> df.select(date_trunc('mon', df.t).alias('month')).collect() [Row(month=datetime.datetime(1997, 2, 1, 0, 0))] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("date_trunc", format, _to_java_column(timestamp)) @@ -8922,6 +9021,8 @@ def next_day(date: "ColumnOrName", dayOfWeek: str) -> Column: >>> df.select(next_day(df.d, 'Sun').alias('date')).collect() [Row(date=datetime.date(2015, 8, 2))] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("next_day", _to_java_column(date), dayOfWeek) @@ -8951,6 +9052,8 @@ def last_day(date: "ColumnOrName") -> Column: >>> df.select(last_day(df.d).alias('date')).collect() [Row(date=datetime.date(1997, 2, 28))] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("last_day", _to_java_column(date)) @@ -8986,6 +9089,8 @@ def from_unixtime(timestamp: "ColumnOrName", format: str = "yyyy-MM-dd HH:mm:ss" [Row(ts='2015-04-08 00:00:00')] >>> spark.conf.unset("spark.sql.session.timeZone") """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("from_unixtime", _to_java_column(timestamp), format) @@ -9066,6 +9171,8 @@ def unix_timestamp( >>> spark.conf.unset("spark.sql.session.timeZone") """ + from pyspark.sql.classic.column import _to_java_column + if timestamp is None: return _invoke_function("unix_timestamp") return _invoke_function("unix_timestamp", _to_java_column(timestamp), format) @@ -9120,6 +9227,8 @@ def from_utc_timestamp(timestamp: "ColumnOrName", tz: "ColumnOrName") -> Column: >>> df.select(from_utc_timestamp(df.ts, df.tz).alias('local_time')).collect() [Row(local_time=datetime.datetime(1997, 2, 28, 19, 30))] """ + from pyspark.sql.classic.column import _to_java_column + if isinstance(tz, Column): tz = _to_java_column(tz) return _invoke_function("from_utc_timestamp", _to_java_column(timestamp), tz) @@ -9174,6 +9283,8 @@ def to_utc_timestamp(timestamp: "ColumnOrName", tz: "ColumnOrName") -> Column: >>> df.select(to_utc_timestamp(df.ts, df.tz).alias('utc_time')).collect() [Row(utc_time=datetime.datetime(1997, 2, 28, 1, 30))] """ + from pyspark.sql.classic.column import _to_java_column + if isinstance(tz, Column): tz = _to_java_column(tz) return _invoke_function("to_utc_timestamp", _to_java_column(timestamp), tz) @@ -9290,6 +9401,129 @@ def timestamp_micros(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("timestamp_micros", col) +@_try_remote_functions +def timestamp_diff(unit: str, start: "ColumnOrName", end: "ColumnOrName") -> Column: + """ + Gets the difference between the timestamps in the specified units by truncating + the fraction part. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + unit : str + This indicates the units of the difference between the given timestamps. + Supported options are (case insensitive): "YEAR", "QUARTER", "MONTH", "WEEK", + "DAY", "HOUR", "MINUTE", "SECOND", "MILLISECOND" and "MICROSECOND". 
+ start : :class:`~pyspark.sql.Column` or str + A timestamp which the expression subtracts from `endTimestamp`. + end : :class:`~pyspark.sql.Column` or str + A timestamp from which the expression subtracts `startTimestamp`. + + Returns + ------- + :class:`~pyspark.sql.Column` + the difference between the timestamps. + + Examples + -------- + >>> import datetime + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), datetime.datetime(2024, 4, 2, 9, 0, 7))], + ... ).toDF("start", "end") + >>> df.select(sf.timestamp_diff("year", "start", "end")).show() + +-------------------------------+ + |timestampdiff(year, start, end)| + +-------------------------------+ + | 8| + +-------------------------------+ + >>> df.select(sf.timestamp_diff("WEEK", "start", "end")).show() + +-------------------------------+ + |timestampdiff(WEEK, start, end)| + +-------------------------------+ + | 420| + +-------------------------------+ + >>> df.select(sf.timestamp_diff("day", "end", "start")).show() + +------------------------------+ + |timestampdiff(day, end, start)| + +------------------------------+ + | -2944| + +------------------------------+ + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function( + "timestamp_diff", + unit, + _to_java_column(start), + _to_java_column(end), + ) + + +@_try_remote_functions +def timestamp_add(unit: str, quantity: "ColumnOrName", ts: "ColumnOrName") -> Column: + """ + Adds the specified number of units of time + to the given timestamp. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + unit : str + This indicates the unit of time that is added to the given timestamp. + Supported options are (case insensitive): "YEAR", "QUARTER", "MONTH", "WEEK", + "DAY", "HOUR", "MINUTE", "SECOND", "MILLISECOND" and "MICROSECOND". + quantity : :class:`~pyspark.sql.Column` or str + The number of units of time that you want to add. + ts : :class:`~pyspark.sql.Column` or str + A timestamp to which the units are added. + + Returns + ------- + :class:`~pyspark.sql.Column` + the timestamp with the specified number of units added. + + Examples + -------- + >>> import datetime + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 2), + ...
(datetime.datetime(2024, 4, 2, 9, 0, 7), 3)], ["ts", "quantity"]) + >>> df.select(sf.timestamp_add("year", "quantity", "ts")).show() + +--------------------------------+ + |timestampadd(year, quantity, ts)| + +--------------------------------+ + | 2018-03-11 09:00:07| + | 2027-04-02 09:00:07| + +--------------------------------+ + >>> df.select(sf.timestamp_add("WEEK", sf.lit(5), "ts")).show() + +-------------------------+ + |timestampadd(WEEK, 5, ts)| + +-------------------------+ + | 2016-04-15 09:00:07| + | 2024-05-07 09:00:07| + +-------------------------+ + >>> df.select(sf.timestamp_add("day", sf.lit(-5), "ts")).show() + +-------------------------+ + |timestampadd(day, -5, ts)| + +-------------------------+ + | 2016-03-06 09:00:07| + | 2024-03-28 09:00:07| + +-------------------------+ + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function( + "timestamp_add", + unit, + _to_java_column(quantity), + _to_java_column(ts), + ) + + @_try_remote_functions def window( timeColumn: "ColumnOrName", @@ -9367,6 +9601,7 @@ def window( |2016-03-11 09:00:05|2016-03-11 09:00:10| 1| +-------------------+-------------------+---+ """ + from pyspark.sql.classic.column import _to_java_column def check_string_field(field, fieldName): # type: ignore[no-untyped-def] if not field or type(field) is not str: @@ -9438,6 +9673,8 @@ def window_time( ... ).collect() [Row(end='2016-03-11 09:00:10', window_time='2016-03-11 09:00:09.999999', sum=1)] """ + from pyspark.sql.classic.column import _to_java_column + window_col = _to_java_column(windowColumn) return _invoke_function("window_time", window_col) @@ -9493,6 +9730,7 @@ def session_window(timeColumn: "ColumnOrName", gapDuration: Union[Column, str]) ... w.session_window.end.cast("string").alias("end"), "sum").collect() [Row(start='2016-03-11 09:00:07', end='2016-03-11 09:00:12', sum=1)] """ + from pyspark.sql.classic.column import _to_java_column def check_field(field: Union[Column, str], fieldName: str) -> None: if field is None or not isinstance(field, (str, Column)): @@ -9863,6 +10101,8 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column: |Bob |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961| +-----+----------------------------------------------------------------+ """ + from pyspark.sql.classic.column import _to_java_column + if numBits not in [0, 224, 256, 384, 512]: raise PySparkValueError( error_class="VALUE_NOT_ALLOWED", @@ -10000,6 +10240,8 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None java.lang.RuntimeException: My error msg ... """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if errMsg is None: return _invoke_function_over_columns("assert_true", col) if not isinstance(errMsg, (str, Column)): @@ -10042,6 +10284,8 @@ def raise_error(errMsg: Union[Column, str]) -> Column: java.lang.RuntimeException: My error message ... 
""" + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if not isinstance(errMsg, (str, Column)): raise PySparkTypeError( error_class="NOT_COLUMN_OR_STR", @@ -10368,6 +10612,8 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column: >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect() [Row(s='abcd-123')] """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function("concat_ws", sep, _to_seq(sc, cols, _to_java_column)) @@ -10405,6 +10651,8 @@ def decode(col: "ColumnOrName", charset: str) -> Column: | abcd| +----------------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("decode", _to_java_column(col), charset) @@ -10441,6 +10689,8 @@ def encode(col: "ColumnOrName", charset: str) -> Column: | [61 62 63 64]| +----------------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("encode", _to_java_column(col), charset) @@ -10470,6 +10720,8 @@ def format_number(col: "ColumnOrName", d: int) -> Column: >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() [Row(v='5.0000')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("format_number", _to_java_column(col), d) @@ -10501,6 +10753,8 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column: >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() [Row(v='5 hello')] """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function("format_string", format, _to_seq(sc, cols, _to_java_column)) @@ -10539,6 +10793,8 @@ def instr(str: "ColumnOrName", substr: str) -> Column: >>> df.select(instr(df.s, 'b').alias('s')).collect() [Row(s=2)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("instr", _to_java_column(str), substr) @@ -10585,6 +10841,8 @@ def overlay( >>> df.select(overlay("x", "y", 7, 2).alias("overlayed")).collect() [Row(overlayed='SPARK_COREL')] """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if not isinstance(pos, (int, str, Column)): raise PySparkTypeError( error_class="NOT_COLUMN_OR_INT_OR_STR", @@ -10657,7 +10915,9 @@ def sentences( @_try_remote_functions -def substring(str: "ColumnOrName", pos: int, len: int) -> Column: +def substring( + str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int] +) -> Column: """ Substring starts at `pos` and is of length `len` when str is String type or returns the slice of byte array that starts at `pos` in byte and is of length `len` @@ -10676,11 +10936,18 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: ---------- str : :class:`~pyspark.sql.Column` or str target column to work on. - pos : int + pos : :class:`~pyspark.sql.Column` or str or int starting position in str. - len : int + + .. versionchanged:: 4.0.0 + `pos` now accepts column and column name. + + len : :class:`~pyspark.sql.Column` or str or int length of chars. + .. versionchanged:: 4.0.0 + `len` now accepts column and column name. 
+ Returns ------- :class:`~pyspark.sql.Column` @@ -10691,8 +10958,17 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() [Row(s='ab')] + >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l']) + >>> df.select(substring(df.s, 2, df.l).alias('s')).collect() + [Row(s='par')] + >>> df.select(substring(df.s, df.p, 3).alias('s')).collect() + [Row(s='par')] + >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect() + [Row(s='par')] """ - return _invoke_function("substring", _to_java_column(str), pos, len) + pos = lit(pos) if isinstance(pos, int) else pos + len = lit(len) if isinstance(len, int) else len + return _invoke_function_over_columns("substring", str, pos, len) @_try_remote_functions @@ -10730,6 +11006,8 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() [Row(s='b.c.d')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("substring_index", _to_java_column(str), delim, count) @@ -10770,6 +11048,8 @@ def levenshtein( >>> df0.select(levenshtein('l', 'r', 2).alias('d')).collect() [Row(d=-1)] """ + from pyspark.sql.classic.column import _to_java_column + if threshold is None: return _invoke_function_over_columns("levenshtein", left, right) else: @@ -10813,6 +11093,8 @@ def locate(substr: str, str: "ColumnOrName", pos: int = 1) -> Column: >>> df.select(locate('b', df.s, 1).alias('s')).collect() [Row(s=2)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("locate", substr, _to_java_column(str), pos) @@ -10846,6 +11128,8 @@ def lpad(col: "ColumnOrName", len: int, pad: str) -> Column: >>> df.select(lpad(df.s, 6, '#').alias('s')).collect() [Row(s='##abcd')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("lpad", _to_java_column(col), len, pad) @@ -10879,6 +11163,8 @@ def rpad(col: "ColumnOrName", len: int, pad: str) -> Column: >>> df.select(rpad(df.s, 6, '#').alias('s')).collect() [Row(s='abcd##')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("rpad", _to_java_column(col), len, pad) @@ -10944,7 +11230,11 @@ def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: @_try_remote_functions -def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column: +def split( + str: "ColumnOrName", + pattern: Union[Column, str], + limit: Union["ColumnOrName", int] = -1, +) -> Column: """ Splits str around matches of the given pattern. @@ -10957,10 +11247,10 @@ def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column: ---------- str : :class:`~pyspark.sql.Column` or str a string expression to split - pattern : str + pattern : :class:`~pyspark.sql.Column` or str a string representing a regular expression. The regex string should be a Java regular expression. - limit : int, optional + limit : :class:`~pyspark.sql.Column` or str or int an integer which controls the number of times `pattern` is applied. * ``limit > 0``: The resulting array's length will not be more than `limit`, and the @@ -10972,6 +11262,11 @@ def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column: .. versionchanged:: 3.0 `split` now takes an optional `limit` field. If not provided, default limit value is -1. + .. versionchanged:: 4.0.0 + `pattern` now accepts column. 
Does not accept column name since string type remain + accepted as a regular expression representation, for backwards compatibility. + In addition to int, `limit` now accepts column and column name. + Returns ------- :class:`~pyspark.sql.Column` @@ -10979,13 +11274,53 @@ def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column: Examples -------- + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) - >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect() - [Row(s=['one', 'twoBthreeC'])] - >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect() - [Row(s=['one', 'two', 'three', ''])] + >>> df.select(sf.split(df.s, '[ABC]', 2).alias('s')).show() + +-----------------+ + | s| + +-----------------+ + |[one, twoBthreeC]| + +-----------------+ + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) + >>> df.select(sf.split(df.s, '[ABC]', -1).alias('s')).show() + +-------------------+ + | s| + +-------------------+ + |[one, two, three, ]| + +-------------------+ + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame( + ... [('oneAtwoBthreeC', '[ABC]'), ('1A2B3C', '[1-9]+'), ('aa2bb3cc4', '[1-9]+')], + ... ['s', 'pattern'] + ... ) + >>> df.select(sf.split(df.s, df.pattern).alias('s')).show() + +-------------------+ + | s| + +-------------------+ + |[one, two, three, ]| + | [, A, B, C]| + | [aa, bb, cc, ]| + +-------------------+ + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame( + ... [('oneAtwoBthreeC', '[ABC]', 2), ('1A2B3C', '[1-9]+', -1)], + ... ['s', 'pattern', 'expected_parts'] + ... ) + >>> df.select(sf.split(df.s, df.pattern, df.expected_parts).alias('s')).show() + +-----------------+ + | s| + +-----------------+ + |[one, twoBthreeC]| + | [, A, B, C]| + +-----------------+ """ - return _invoke_function("split", _to_java_column(str), pattern, limit) + limit = lit(limit) if isinstance(limit, int) else limit + return _invoke_function_over_columns("split", str, lit(pattern), limit) @_try_remote_functions @@ -11193,6 +11528,8 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column: >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() [Row(d='')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("regexp_extract", _to_java_column(str), pattern, idx) @@ -11271,6 +11608,8 @@ def regexp_replace( >>> df.select(regexp_replace("str", col("pattern"), col("replacement")).alias('d')).collect() [Row(d='-----')] """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if isinstance(pattern, str): pattern_col = _create_column_from_literal(pattern) else: @@ -11619,6 +11958,8 @@ def translate(srcCol: "ColumnOrName", matching: str, replace: str) -> Column: ... 
.alias('r')).collect() [Row(r='1a2s3ae')] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("translate", _to_java_column(srcCol), matching, replace) @@ -12033,6 +12374,8 @@ def printf(format: "ColumnOrName", *cols: "ColumnOrName") -> Column: | aa123cc| +---------------+ """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function("printf", _to_java_column(format), _to_seq(sc, cols, _to_java_column)) @@ -12585,6 +12928,8 @@ def elt(*inputs: "ColumnOrName") -> Column: >>> df.select(elt(df.a, df.b, df.c).alias('r')).collect() [Row(r='scala')] """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function("elt", _to_seq(sc, inputs, _to_java_column)) @@ -12886,6 +13231,8 @@ def collate(col: "ColumnOrName", collation: str) -> Column: :class:`~pyspark.sql.Column` A new column of string type, where each value has the specified collation. """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("collate", _to_java_column(col), collation) @@ -13273,8 +13620,7 @@ def array_contains(col: "ColumnOrName", value: Any) -> Column: | true| +----------+ """ - value = value._jc if isinstance(value, Column) else value - return _invoke_function("array_contains", _to_java_column(col), value) + return _invoke_function_over_columns("array_contains", col, lit(value)) @_try_remote_functions @@ -13520,6 +13866,8 @@ def array_join( | NULL,NULL| +-------------------------+ """ + from pyspark.sql.classic.column import _to_java_column + _get_active_spark_context() if null_replacement is None: return _invoke_function("array_join", _to_java_column(col), delimiter) @@ -13634,7 +13982,10 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: col : :class:`~pyspark.sql.Column` or str target column to work on. value : Any - value to look for. + value or a :class:`~pyspark.sql.Column` expression to look for. + + .. versionchanged:: 4.0.0 + `value` now also accepts a Column type. Returns ------- @@ -13699,8 +14050,20 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: +-----------------------+ | 3| +-----------------------+ + + Example 6: Finding the position of a column's value in an array of integers + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([10, 20, 30], 20)], ['data', 'col']) + >>> df.select(sf.array_position(df.data, df.col)).show() + +-------------------------+ + |array_position(data, col)| + +-------------------------+ + | 2| + +-------------------------+ + """ - return _invoke_function("array_position", _to_java_column(col), value) + return _invoke_function_over_columns("array_position", col, lit(value)) @_try_remote_functions @@ -13735,10 +14098,13 @@ def element_at(col: "ColumnOrName", extraction: Any) -> Column: Notes ----- The position is not zero based, but 1 based index. + If extraction is a string, :meth:`element_at` treats it as a literal string, + while :meth:`try_element_at` treats it as a column name. 
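Editor's sketch (not part of the patch): the note above about how a plain string extraction is interpreted is easiest to see side by side. This assumes an active SparkSession named `spark` on a build that includes this change.

```python
from pyspark.sql import functions as sf

df = spark.createDataFrame([({"a": 1.0, "b": 2.0}, "a")], ["data", "b"])
df.select(
    sf.element_at(df.data, "b").alias("literal_key"),       # 'b' is a literal map key -> 2.0
    sf.try_element_at(df.data, "b").alias("column_name"),    # 'b' names the column holding 'a' -> 1.0
).show()
```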
See Also -------- :meth:`get` + :meth:`try_element_at` Examples -------- @@ -13785,6 +14151,17 @@ def element_at(col: "ColumnOrName", extraction: Any) -> Column: +-------------------+ | NULL| +-------------------+ + + Example 5: Getting a value from a map using a literal string as the key + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0}, "a")], ['data', 'b']) + >>> df.select(sf.element_at(df.data, 'b')).show() + +-------------------+ + |element_at(data, b)| + +-------------------+ + | 2.0| + +-------------------+ """ return _invoke_function_over_columns("element_at", col, lit(extraction)) @@ -13809,6 +14186,17 @@ def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: extraction : index to check for in array or key to check for in map + Notes + ----- + The position is not zero based, but 1 based index. + If extraction is a string, :meth:`try_element_at` treats it as a column name, + while :meth:`element_at` treats it as a literal string. + + See Also + -------- + :meth:`get` + :meth:`element_at` + Examples -------- Example 1: Getting the first element of an array @@ -13865,6 +14253,17 @@ def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: +-----------------------+ | NULL| +-----------------------+ + + Example 6: Getting a value from a map using a column name as the key + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0}, "a")], ['data', 'b']) + >>> df.select(sf.try_element_at(df.data, 'b')).show() + +-----------------------+ + |try_element_at(data, b)| + +-----------------------+ + | 1.0| + +-----------------------+ """ return _invoke_function_over_columns("try_element_at", col, extraction) @@ -14065,7 +14464,10 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: col : :class:`~pyspark.sql.Column` or str name of column containing array element : - element to be removed from the array + element or a :class:`~pyspark.sql.Column` expression to be removed from the array + + .. versionchanged:: 4.0.0 + `element` now also accepts a Column type. Returns ------- @@ -14133,8 +14535,19 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: +---------------------+ | []| +---------------------+ + + Example 6: Removing a column's value from a simple array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3, 1, 1], 1)], ['data', 'col']) + >>> df.select(sf.array_remove(df.data, df.col)).show() + +-----------------------+ + |array_remove(data, col)| + +-----------------------+ + | [2, 3]| + +-----------------------+ """ - return _invoke_function("array_remove", _to_java_column(col), element) + return _invoke_function_over_columns("array_remove", col, lit(element)) @_try_remote_functions @@ -15277,6 +15690,8 @@ def get_json_object(col: "ColumnOrName", path: str) -> Column: ... 
get_json_object(df.jstring, '$.f2').alias("c1") ).collect() [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("get_json_object", _to_java_column(col), path) @@ -15308,6 +15723,8 @@ def json_tuple(col: "ColumnOrName", *fields: str) -> Column: >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect() [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + if len(fields) == 0: raise PySparkValueError( error_class="CANNOT_BE_EMPTY", @@ -15415,6 +15832,7 @@ def from_json( |[1, 2, 3]| +---------+ """ + from pyspark.sql.classic.column import _to_java_column if isinstance(schema, DataType): schema = schema.json() @@ -15423,12 +15841,44 @@ def from_json( return _invoke_function("from_json", _to_java_column(col), schema, _options_to_str(options)) +@_try_remote_functions +def try_parse_json( + col: "ColumnOrName", +) -> Column: + """ + Parses a column containing a JSON string into a :class:`VariantType`. Returns None if a string + contains an invalid JSON value. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + a column or column name JSON formatted strings + + Returns + ------- + :class:`~pyspark.sql.Column` + a new column of VariantType. + + Examples + -------- + >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''}, {'json': '''{a : 1}'''} ]) + >>> df.select(to_json(try_parse_json(df.json))).collect() + [Row(to_json(try_parse_json(json))='{"a":1}'), Row(to_json(try_parse_json(json))=None)] + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function("try_parse_json", _to_java_column(col)) + + @_try_remote_functions def parse_json( col: "ColumnOrName", ) -> Column: """ - Parses a column containing a JSON string into a :class:`VariantType`. + Parses a column containing a JSON string into a :class:`VariantType`. Throws exception if a + string represents an invalid JSON value. .. versionadded:: 4.0.0 @@ -15448,10 +15898,170 @@ def parse_json( >>> df.select(to_json(parse_json(df.json))).collect() [Row(to_json(parse_json(json))='{"a":1}')] """ + from pyspark.sql.classic.column import _to_java_column return _invoke_function("parse_json", _to_java_column(col)) +@_try_remote_functions +def is_variant_null(v: "ColumnOrName") -> Column: + """ + Check if a variant value is a variant null. Returns true if and only if the input is a variant + null and false otherwise (including in the case of SQL NULL). + + .. versionadded:: 4.0.0 + + Parameters + ---------- + v : :class:`~pyspark.sql.Column` or str + a variant column or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + a boolean column indicating whether the variant value is a variant null + + Examples + -------- + >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) + >>> df.select(is_variant_null(parse_json(df.json)).alias("r")).collect() + [Row(r=False)] + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function("is_variant_null", _to_java_column(v)) + + +@_try_remote_functions +def variant_get(v: "ColumnOrName", path: str, targetType: str) -> Column: + """ + Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to + `targetType`. Returns null if the path does not exist. Throws an exception if the cast fails. + + .. 
versionadded:: 4.0.0 + + Parameters + ---------- + v : :class:`~pyspark.sql.Column` or str + a variant column or column name + path : str + the extraction path. A valid path should start with `$` and is followed by zero or more + segments like `[123]`, `.name`, `['name']`, or `["name"]`. + targetType : str + the target data type to cast into, in a DDL-formatted string + + Returns + ------- + :class:`~pyspark.sql.Column` + a column of `targetType` representing the extracted result + + Examples + -------- + >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) + >>> df.select(variant_get(parse_json(df.json), "$.a", "int").alias("r")).collect() + [Row(r=1)] + >>> df.select(variant_get(parse_json(df.json), "$.b", "int").alias("r")).collect() + [Row(r=None)] + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function("variant_get", _to_java_column(v), path, targetType) + + +@_try_remote_functions +def try_variant_get(v: "ColumnOrName", path: str, targetType: str) -> Column: + """ + Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to + `targetType`. Returns null if the path does not exist or the cast fails. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + v : :class:`~pyspark.sql.Column` or str + a variant column or column name + path : str + the extraction path. A valid path should start with `$` and is followed by zero or more + segments like `[123]`, `.name`, `['name']`, or `["name"]`. + targetType : str + the target data type to cast into, in a DDL-formatted string + + Returns + ------- + :class:`~pyspark.sql.Column` + a column of `targetType` representing the extracted result + + Examples + -------- + >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) + >>> df.select(try_variant_get(parse_json(df.json), "$.a", "int").alias("r")).collect() + [Row(r=1)] + >>> df.select(try_variant_get(parse_json(df.json), "$.b", "int").alias("r")).collect() + [Row(r=None)] + >>> df.select(try_variant_get(parse_json(df.json), "$.a", "binary").alias("r")).collect() + [Row(r=None)] + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function("try_variant_get", _to_java_column(v), path, targetType) + + +@_try_remote_functions +def schema_of_variant(v: "ColumnOrName") -> Column: + """ + Returns schema in the SQL format of a variant. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + v : :class:`~pyspark.sql.Column` or str + a variant column or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + a string column representing the variant schema + + Examples + -------- + >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) + >>> df.select(schema_of_variant(parse_json(df.json)).alias("r")).collect() + [Row(r='STRUCT')] + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function("schema_of_variant", _to_java_column(v)) + + +@_try_remote_functions +def schema_of_variant_agg(v: "ColumnOrName") -> Column: + """ + Returns the merged schema in the SQL format of a variant column. + + .. 
versionadded:: 4.0.0 + + Parameters + ---------- + v : :class:`~pyspark.sql.Column` or str + a variant column or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + a string column representing the variant schema + + Examples + -------- + >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) + >>> df.select(schema_of_variant_agg(parse_json(df.json)).alias("r")).collect() + [Row(r='STRUCT')] + """ + from pyspark.sql.classic.column import _to_java_column + + return _invoke_function("schema_of_variant_agg", _to_java_column(v)) + + @_try_remote_functions def to_json(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column: """ @@ -15542,6 +16152,7 @@ def to_json(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Co |["Alice","Bob"]| +---------------+ """ + from pyspark.sql.classic.column import _to_java_column return _invoke_function("to_json", _to_java_column(col), _options_to_str(options)) @@ -15584,6 +16195,8 @@ def schema_of_json(json: Union[Column, str], options: Optional[Dict[str, str]] = >>> df.select(schema.alias("json")).collect() [Row(json='STRUCT')] """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if isinstance(json, str): col = _create_column_from_literal(json) elif isinstance(json, Column): @@ -15721,6 +16334,7 @@ def from_xml( >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect() [Row(xml=Row(a=[1, 2]))] """ + from pyspark.sql.classic.column import _to_java_column if isinstance(schema, StructType): schema = schema.json() @@ -15799,6 +16413,8 @@ def schema_of_xml(xml: Union[Column, str], options: Optional[Dict[str, str]] = N ... ).collect() [Row(xml='STRUCT>>')] """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if isinstance(xml, str): col = _create_column_from_literal(xml) elif isinstance(xml, Column): @@ -15844,6 +16460,7 @@ def to_xml(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col >>> df.select(to_xml(df.value, {'rowTag':'person'}).alias("xml")).collect() [Row(xml='\\n 2\\n Alice\\n')] """ + from pyspark.sql.classic.column import _to_java_column return _invoke_function("to_xml", _to_java_column(col), _options_to_str(options)) @@ -15920,6 +16537,8 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N |STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>| +-------------------------------------------+ """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column + if isinstance(csv, str): col = _create_column_from_literal(csv) elif isinstance(csv, Column): @@ -16020,6 +16639,7 @@ def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col | 2,Alice,true| +-------------+ """ + from pyspark.sql.classic.column import _to_java_column return _invoke_function("to_csv", _to_java_column(col), _options_to_str(options)) @@ -16427,6 +17047,8 @@ def sort_array(col: "ColumnOrName", asc: bool = True) -> Column: | [NULL, NULL, NULL]| +----------------------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("sort_array", _to_java_column(col), asc) @@ -16689,7 +17311,10 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: col : :class:`~pyspark.sql.Column` or str The name of the column or an expression that represents the map. value : - A literal value. + A literal value, or a :class:`~pyspark.sql.Column` expression. + + .. versionchanged:: 4.0.0 + `value` now also accepts a Column type. 
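Editor's sketch (not part of the patch): the new variant helpers compose naturally. This assumes an active SparkSession `spark` on a build containing these functions; the column values are made up for illustration.

```python
from pyspark.sql import functions as sf

df = spark.createDataFrame([{"json": '{"a": 1, "b": "hello"}'}])
v = sf.parse_json(df.json)

df.select(
    sf.is_variant_null(v).alias("is_null"),               # False: the value is a real object
    sf.schema_of_variant(v).alias("schema"),               # the inferred schema as a string
    sf.variant_get(v, "$.a", "int").alias("a"),            # 1
    sf.try_variant_get(v, "$.a", "binary").alias("bad"),   # None: the cast fails, so null
).show(truncate=False)
```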
Returns ------- @@ -16719,8 +17344,19 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: +--------------------------+ | false| +--------------------------+ + + Example 3: Check for key using a column + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key") + >>> df.select(sf.map_contains_key("data", sf.col("key"))).show() + +---------------------------+ + |map_contains_key(data, key)| + +---------------------------+ + | true| + +---------------------------+ """ - return _invoke_function("map_contains_key", _to_java_column(col), value) + return _invoke_function_over_columns("map_contains_key", col, lit(value)) @_try_remote_functions @@ -17450,6 +18086,7 @@ def from_csv( | {1, 2, 3}| +---------------+ """ + from pyspark.sql.classic.column import _create_column_from_literal, _to_java_column _get_active_spark_context() if isinstance(schema, str): @@ -17478,6 +18115,7 @@ def _unresolved_named_lambda_variable(*name_parts: Any) -> Column: name_parts : str """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_seq sc = _get_active_spark_context() name_parts_seq = _to_seq(sc, name_parts) @@ -17528,6 +18166,7 @@ def _create_lambda(f: Callable) -> Callable: - (Column, Column, Column) -> Column: ... """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_seq parameters = _get_lambda_parameters(f) @@ -17573,6 +18212,7 @@ def _invoke_higher_order_function( :return: a Column """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_java_column sc = _get_active_spark_context() expressions = cast(JVMView, sc._jvm).org.apache.spark.sql.catalyst.expressions @@ -17835,7 +18475,7 @@ def aggregate( initial value. Name of column or expression merge : function a binary function ``(acc: Column, x: Column) -> Column...`` returning expression - of the same type as ``zero`` + of the same type as ``initialValue`` finish : function, optional an optional unary function ``(x: Column) -> Column: ...`` used to convert accumulated value. @@ -19211,6 +19851,8 @@ def call_udf(udfName: str, *cols: "ColumnOrName") -> Column: | cc| +-----------+ """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function("call_udf", udfName, _to_seq(sc, cols, _to_java_column)) @@ -19280,6 +19922,8 @@ def call_function(funcName: str, *cols: "ColumnOrName") -> Column: | 102.0| +------------------------------------+ """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + sc = _get_active_spark_context() return _invoke_function("call_function", funcName, _to_seq(sc, cols, _to_java_column)) @@ -19295,11 +19939,16 @@ def unwrap_udt(col: "ColumnOrName") -> Column: ----- Supports Spark Connect. """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("unwrap_udt", _to_java_column(col)) @_try_remote_functions -def hll_sketch_agg(col: "ColumnOrName", lgConfigK: Optional[Union[int, Column]] = None) -> Column: +def hll_sketch_agg( + col: "ColumnOrName", + lgConfigK: Optional[Union[int, Column]] = None, +) -> Column: """ Aggregate function: returns the updatable binary representation of the Datasketches HllSketch configured with lgConfigK arg. 
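Editor's sketch (not part of the patch): the corrected `aggregate` wording above, that `merge` must return the same type as `initialValue`, is clearest in a small example. Assumes an active SparkSession `spark`.

```python
from pyspark.sql import functions as sf

df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values"))
df.select(
    # initialValue is a DOUBLE literal, so the merge lambda must also return a DOUBLE
    sf.aggregate("values", sf.lit(0.0), lambda acc, x: acc + x).alias("sum")
).show()
```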
@@ -19308,8 +19957,8 @@ def hll_sketch_agg(col: "ColumnOrName", lgConfigK: Optional[Union[int, Column]] Parameters ---------- - col : :class:`~pyspark.sql.Column` or str or int - lgConfigK : int, optional + col : :class:`~pyspark.sql.Column` or str + lgConfigK : :class:`~pyspark.sql.Column` or int, optional The log-base-2 of K, where K is the number of buckets or slots for the HllSketch Returns @@ -19348,13 +19997,13 @@ def hll_sketch_agg(col: "ColumnOrName", lgConfigK: Optional[Union[int, Column]] if lgConfigK is None: return _invoke_function_over_columns("hll_sketch_agg", col) else: - _lgConfigK = lit(lgConfigK) if isinstance(lgConfigK, int) else lgConfigK - return _invoke_function_over_columns("hll_sketch_agg", col, _lgConfigK) + return _invoke_function_over_columns("hll_sketch_agg", col, lit(lgConfigK)) @_try_remote_functions def hll_union_agg( - col: "ColumnOrName", allowDifferentLgConfigK: Optional[Union[bool, Column]] = None + col: "ColumnOrName", + allowDifferentLgConfigK: Optional[Union[bool, Column]] = None, ) -> Column: """ Aggregate function: returns the updatable binary representation of the Datasketches @@ -19366,8 +20015,8 @@ def hll_union_agg( Parameters ---------- - col : :class:`~pyspark.sql.Column` or str or bool - allowDifferentLgConfigK : bool, optional + col : :class:`~pyspark.sql.Column` or str + allowDifferentLgConfigK : :class:`~pyspark.sql.Column` or bool, optional Allow sketches with different lgConfigK values to be merged (defaults to false). Returns @@ -19412,12 +20061,7 @@ def hll_union_agg( if allowDifferentLgConfigK is None: return _invoke_function_over_columns("hll_union_agg", col) else: - _allowDifferentLgConfigK = ( - lit(allowDifferentLgConfigK) - if isinstance(allowDifferentLgConfigK, bool) - else allowDifferentLgConfigK - ) - return _invoke_function_over_columns("hll_union_agg", col, _allowDifferentLgConfigK) + return _invoke_function_over_columns("hll_union_agg", col, lit(allowDifferentLgConfigK)) @_try_remote_functions @@ -19448,6 +20092,8 @@ def hll_sketch_estimate(col: "ColumnOrName") -> Column: | 3| +------------+ """ + from pyspark.sql.classic.column import _to_java_column + return _invoke_function("hll_sketch_estimate", _to_java_column(col)) @@ -19486,6 +20132,8 @@ def hll_union( | 6| +------------+ """ + from pyspark.sql.classic.column import _to_java_column + if allowDifferentLgConfigK is not None: return _invoke_function( "hll_union", _to_java_column(col1), _to_java_column(col2), allowDifferentLgConfigK diff --git a/python/pyspark/sql/functions/partitioning.py b/python/pyspark/sql/functions/partitioning.py index 59c293577b08a..e89901cdbe540 100644 --- a/python/pyspark/sql/functions/partitioning.py +++ b/python/pyspark/sql/functions/partitioning.py @@ -26,7 +26,7 @@ ) from pyspark.errors import PySparkTypeError -from pyspark.sql.column import Column, _to_java_column, _create_column_from_literal +from pyspark.sql.column import Column from pyspark.sql.functions.builtin import _invoke_function_over_columns, _invoke_function from pyspark.sql.utils import ( try_partitioning_remote_functions as _try_partitioning_remote_functions, @@ -204,6 +204,8 @@ def bucket(numBuckets: Union[Column, int], col: "ColumnOrName") -> Column: method of the `DataFrameWriterV2`. 
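Editor's sketch (not part of the patch): an end-to-end use of the Datasketches HLL aggregates touched above, assuming an active SparkSession `spark`. The sample data is made up.

```python
from pyspark.sql import functions as sf

df = spark.createDataFrame([(1,), (2,), (2,), (3,)], ["value"])

# Build a sketch (lgConfigK may be an int or a Column) and estimate the distinct count.
df.agg(
    sf.hll_sketch_estimate(sf.hll_sketch_agg("value", 12)).alias("distinct_approx")
).show()
```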
""" + from pyspark.sql.classic.column import _to_java_column, _create_column_from_literal + if not isinstance(numBuckets, (int, Column)): raise PySparkTypeError( error_class="NOT_COLUMN_OR_INT", diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 15934c24b9d4a..ac4ac02a36b16 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -19,7 +19,7 @@ from typing import Callable, List, Optional, TYPE_CHECKING, overload, Dict, Union, cast, Tuple -from pyspark.sql.column import Column, _to_seq +from pyspark.sql.column import Column from pyspark.sql.session import SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.pandas.group_ops import PandasGroupedOpsMixin @@ -44,6 +44,8 @@ def _api(self: "GroupedData") -> DataFrame: def df_varargs_api(f: Callable[..., DataFrame]) -> Callable[..., DataFrame]: def _api(self: "GroupedData", *cols: str) -> DataFrame: + from pyspark.sql.classic.column import _to_seq + name = f.__name__ jdf = getattr(self._jgd, name)(_to_seq(self.session._sc, cols)) return DataFrame(jdf, self.session) @@ -175,6 +177,8 @@ def agg(self, *exprs: Union[Column, Dict[str, str]]) -> DataFrame: | Bob| 5| +-----+------------+ """ + from pyspark.sql.classic.column import _to_seq + assert exprs, "exprs should not be empty" if len(exprs) == 1 and isinstance(exprs[0], dict): jdf = self._jgd.agg(exprs[0]) @@ -489,7 +493,8 @@ def pivot(self, pivot_col: str, values: Optional[List["LiteralType"]] = None) -> Compute the sum of earnings for each year by course with each course as a separate column - >>> df1.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").show() + >>> df1.groupBy("year").pivot( + ... "course", ["dotNET", "Java"]).sum("earnings").sort("year").show() +----+------+-----+ |year|dotNET| Java| +----+------+-----+ @@ -499,14 +504,15 @@ def pivot(self, pivot_col: str, values: Optional[List["LiteralType"]] = None) -> Or without specifying column values (less efficient) - >>> df1.groupBy("year").pivot("course").sum("earnings").show() + >>> df1.groupBy("year").pivot("course").sum("earnings").sort("year").show() +----+-----+------+ |year| Java|dotNET| +----+-----+------+ |2012|20000| 15000| |2013|30000| 48000| +----+-----+------+ - >>> df2.groupBy("sales.year").pivot("sales.course").sum("sales.earnings").show() + >>> df2.groupBy( + ... "sales.year").pivot("sales.course").sum("sales.earnings").sort("year").show() ... # doctest: +SKIP +----+-----+------+ |year| Java|dotNET| diff --git a/python/pyspark/sql/metrics.py b/python/pyspark/sql/metrics.py new file mode 100644 index 0000000000000..6664582952014 --- /dev/null +++ b/python/pyspark/sql/metrics.py @@ -0,0 +1,287 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import abc +import dataclasses +from typing import Optional, List, Tuple, Dict, Any, Union, TYPE_CHECKING, Sequence + +from pyspark.errors import PySparkValueError + +if TYPE_CHECKING: + from pyspark.testing.connectutils import have_graphviz + + if have_graphviz: + import graphviz # type: ignore + + +class ObservedMetrics(abc.ABC): + @property + @abc.abstractmethod + def name(self) -> str: + ... + + @property + @abc.abstractmethod + def pairs(self) -> Dict[str, Any]: + ... + + @property + @abc.abstractmethod + def keys(self) -> List[str]: + ... + + +class MetricValue: + """The metric values is the Python representation of a plan metric value from the JVM. + However, it does not have any reference to the original value.""" + + def __init__(self, name: str, value: Union[int, float], type: str): + self._name = name + self._type = type + self._value = value + + def __repr__(self) -> str: + return f"<{self._name}={self._value} ({self._type})>" + + @property + def name(self) -> str: + return self._name + + @property + def value(self) -> Union[int, float]: + return self._value + + @property + def metric_type(self) -> str: + return self._type + + +class PlanMetrics: + """Represents a particular plan node and the associated metrics of this node.""" + + def __init__(self, name: str, id: int, parent: int, metrics: List[MetricValue]): + self._name = name + self._id = id + self._parent_id = parent + self._metrics = metrics + + def __repr__(self) -> str: + return f"Plan({self._name}: {self._id}->{self._parent_id})={self._metrics}" + + @property + def name(self) -> str: + return self._name + + @property + def plan_id(self) -> int: + return self._id + + @property + def parent_plan_id(self) -> int: + return self._parent_id + + @property + def metrics(self) -> List[MetricValue]: + return self._metrics + + +class CollectedMetrics: + @dataclasses.dataclass + class Node: + id: int + name: str = dataclasses.field(default="") + metrics: List[MetricValue] = dataclasses.field(default_factory=list) + children: List[int] = dataclasses.field(default_factory=list) + + def text(self, current: "Node", graph: Dict[int, "Node"], prefix: str = "") -> str: + """ + Converts the current node and its children into a textual representation. This is used + to provide a usable output for the command line or other text-based interfaces. However, + it is recommended to use the Graphviz representation for a more visual representation. + + Parameters + ---------- + current: Node + Current node in the graph. + graph: dict + A dictionary representing the full graph mapping from node ID (int) to the node itself. + The node is an instance of :class:`CollectedMetrics:Node`. + prefix: str + String prefix used for generating the output buffer. + + Returns + ------- + The full string representation of the current node as root. 
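Editor's sketch (not part of the patch): how the classes in this new module fit together, using hand-made node names and metric values purely for illustration.

```python
from pyspark.sql.metrics import MetricValue, PlanMetrics, CollectedMetrics

# Two hand-made plan nodes: a scan (id 2) feeding a project (id 1).
scan = PlanMetrics("Scan parquet", 2, 1, [MetricValue("numOutputRows", 100, "sum")])
project = PlanMetrics("Project", 1, 0, [MetricValue("numOutputRows", 100, "sum")])

# Renders the node graph as an indented text tree.
print(CollectedMetrics([scan, project]).toText())
```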
+ """ + base_metrics = set(["numPartitions", "peakMemory", "numOutputRows", "spillSize"]) + + # Format the metrics of this node: + metric_buffer = [] + for m in current.metrics: + if m.name in base_metrics: + metric_buffer.append(f"{m.name}: {m.value} ({m.metric_type})") + + buffer = f"{prefix}+- {current.name}({','.join(metric_buffer)})\n" + for i, child in enumerate(current.children): + c = graph[child] + new_prefix = prefix + " " if i == len(c.children) - 1 else prefix + if current.id != c.id: + buffer += self.text(c, graph, new_prefix) + return buffer + + def __init__(self, metrics: List[PlanMetrics]): + # Sort the input list + self._metrics = sorted(metrics, key=lambda x: x._parent_id, reverse=False) + + def extract_graph(self) -> Tuple[int, Dict[int, "CollectedMetrics.Node"]]: + """ + Builds the graph of the query plan. The graph is represented as a dictionary where the key + is the node ID and the value is the node itself. The root node is the node that has no + parent. + + Returns + ------- + The root node ID and the graph of all nodes. + """ + all_nodes: Dict[int, CollectedMetrics.Node] = {} + + for m in self._metrics: + # Add yourself to the list if you have to. + if m.plan_id not in all_nodes: + all_nodes[m.plan_id] = CollectedMetrics.Node(m.plan_id, m.name, m.metrics) + else: + all_nodes[m.plan_id].name = m.name + all_nodes[m.plan_id].metrics = m.metrics + + # Now check for the parent of this node if it's in + if m.parent_plan_id not in all_nodes: + all_nodes[m.parent_plan_id] = CollectedMetrics.Node(m.parent_plan_id) + + all_nodes[m.parent_plan_id].children.append(m.plan_id) + + # Next step is to find all the root nodes. Root nodes are never used in children. + # So we start with all node ids as candidates. + candidates = set(all_nodes.keys()) + for k, v in all_nodes.items(): + for c in v.children: + if c in candidates and c != k: + candidates.remove(c) + + assert len(candidates) == 1, f"Expected 1 root node, found {len(candidates)}" + return candidates.pop(), all_nodes + + def toText(self) -> str: + """ + Converts the execution graph from a graph into a textual representation + that can be read at the command line for example. + + Returns + ------- + A string representation of the collected metrics. + """ + root, graph = self.extract_graph() + return self.text(graph[root], graph) + + def toDot(self, filename: Optional[str] = None, out_format: str = "png") -> "graphviz.Digraph": + """ + Converts the collected metrics into a dot representation. Since the graphviz Digraph + implementation provides the ability to render the result graph directory in a + notebook, we return the graph object directly. + + If the graphviz package is not available, a PACKAGE_NOT_INSTALLED error is raised. + + Parameters + ---------- + filename : str, optional + The filename to save the graph to given an output format. The path can be + relative or absolute. + + out_format : str + The output format of the graph. The default is 'png'. + + Returns + ------- + An instance of the graphviz.Digraph object. + """ + try: + import graphviz + + dot = graphviz.Digraph( + comment="Query Plan", + node_attr={ + "shape": "box", + "font-size": "10pt", + }, + ) + + root, graph = self.extract_graph() + for k, v in graph.items(): + # Build table rows for the metrics + rows = "\n".join( + [ + ( + f'{x.name}' + f'{x.value} ({x.metric_type})' + ) + for x in v.metrics + ] + ) + + dot.node( + str(k), + """< + + + + + {} +
      + {} +
      Metrics
      >""".format( + v.name, rows + ), + ) + for c in v.children: + dot.edge(str(k), str(c)) + + if filename: + dot.render(filename, format=out_format, cleanup=True) + return dot + + except ImportError: + raise PySparkValueError( + error_class="PACKAGE_NOT_INSTALLED", + message_parameters={"package_name": "graphviz", "minimum_version": "0.20"}, + ) + + +class ExecutionInfo: + """The query execution class allows users to inspect the query execution of this particular + data frame. This value is only set in the data frame if it was executed.""" + + def __init__( + self, metrics: Optional[list[PlanMetrics]], obs: Optional[Sequence[ObservedMetrics]] + ): + self._metrics = CollectedMetrics(metrics) if metrics else None + self._observations = obs if obs else [] + + @property + def metrics(self) -> Optional[CollectedMetrics]: + return self._metrics + + @property + def flows(self) -> List[Tuple[str, Dict[str, Any]]]: + return [(f.name, f.pairs) for f in self._observations] diff --git a/python/pyspark/sql/observation.py b/python/pyspark/sql/observation.py index 1dae5086e3dd7..4ef4c78ba3c33 100644 --- a/python/pyspark/sql/observation.py +++ b/python/pyspark/sql/observation.py @@ -18,7 +18,6 @@ from typing import Any, Dict, Optional, TYPE_CHECKING from pyspark.errors import PySparkTypeError, PySparkValueError, PySparkAssertionError -from pyspark.sql import column from pyspark.sql.column import Column from pyspark.sql.dataframe import DataFrame from pyspark.sql.utils import is_remote @@ -116,6 +115,8 @@ def _on(self, df: DataFrame, *exprs: Column) -> DataFrame: :class:`DataFrame` the observed :class:`DataFrame`. """ + from pyspark.sql.classic.column import _to_seq + if self._jo is not None: raise PySparkAssertionError(error_class="REUSE_OBSERVATION", message_parameters={}) @@ -124,7 +125,9 @@ def _on(self, df: DataFrame, *exprs: Column) -> DataFrame: cls = self._jvm.org.apache.spark.sql.Observation self._jo = cls(self._name) if self._name is not None else cls() observed_df = self._jo.on( - df._jdf, exprs[0]._jc, column._to_seq(df._sc, [c._jc for c in exprs[1:]]) + df._jdf, + exprs[0]._jc, + _to_seq(df._sc, [c._jc for c in exprs[1:]]), ) return DataFrame(observed_df, df.sparkSession) diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 4f137c7004c1c..9da15caac8025 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -56,35 +56,11 @@ class PandasConversionMixin: """ - Mix-in for the conversion from Spark to pandas. Currently, only :class:`DataFrame` - can use this class. + Mix-in for the conversion from Spark to pandas and PyArrow. Currently, only + :class:`DataFrame` can use this class. """ def toPandas(self) -> "PandasDataFrameLike": - """ - Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. - - This is only available if Pandas is installed and available. - - .. versionadded:: 1.3.0 - - .. versionchanged:: 3.4.0 - Supports Spark Connect. - - Notes - ----- - This method should only be used if the resulting Pandas ``pandas.DataFrame`` is - expected to be small, as all the data is loaded into the driver's memory. - - Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is experimental. 
- - Examples - -------- - >>> df.toPandas() # doctest: +SKIP - age name - 0 2 Alice - 1 5 Bob - """ from pyspark.sql.dataframe import DataFrame assert isinstance(self, DataFrame) @@ -249,15 +225,48 @@ def toPandas(self) -> "PandasDataFrameLike": else: return pdf - def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch"]: + def toArrow(self) -> "pa.Table": + from pyspark.sql.dataframe import DataFrame + + assert isinstance(self, DataFrame) + + jconf = self.sparkSession._jconf + + from pyspark.sql.pandas.types import to_arrow_schema + from pyspark.sql.pandas.utils import require_minimum_pyarrow_version + + require_minimum_pyarrow_version() + schema = to_arrow_schema(self.schema, error_on_duplicated_field_names_in_struct=True) + + import pyarrow as pa + + self_destruct = jconf.arrowPySparkSelfDestructEnabled() + batches = self._collect_as_arrow( + split_batches=self_destruct, empty_list_if_zero_records=False + ) + table = pa.Table.from_batches(batches).cast(schema) + # Ensure only the table has a reference to the batches, so that + # self_destruct (if enabled) is effective + del batches + return table + + def _collect_as_arrow( + self, + split_batches: bool = False, + empty_list_if_zero_records: bool = True, + ) -> List["pa.RecordBatch"]: """ - Returns all records as a list of ArrowRecordBatches, pyarrow must be installed + Returns all records as a list of Arrow RecordBatches. PyArrow must be installed and available on driver and worker Python environments. This is an experimental feature. :param split_batches: split batches such that each column is in its own allocation, so that the selfDestruct optimization is effective; default False. + :param empty_list_if_zero_records: If True (the default), returns an empty list if the + result has 0 records. Otherwise, returns a list of length 1 containing an empty + Arrow RecordBatch which includes the schema. + .. note:: Experimental. """ from pyspark.sql.dataframe import DataFrame @@ -306,14 +315,22 @@ def _collect_as_arrow(self, split_batches: bool = False) -> List["pa.RecordBatch batches = results[:-1] batch_order = results[-1] - # Re-order the batch list using the correct order - return [batches[i] for i in batch_order] + if len(batches) or empty_list_if_zero_records: + # Re-order the batch list using the correct order + return [batches[i] for i in batch_order] + else: + from pyspark.sql.pandas.types import to_arrow_schema + import pyarrow as pa + + schema = to_arrow_schema(self.schema) + empty_arrays = [pa.array([], type=field.type) for field in schema] + return [pa.RecordBatch.from_arrays(empty_arrays, schema=schema)] class SparkConversionMixin: """ - Min-in for the conversion from pandas to Spark. Currently, only :class:`SparkSession` - can use this class. + Min-in for the conversion from pandas and PyArrow to Spark. Currently, only + :class:`SparkSession` can use this class. """ _jsparkSession: "JavaObject" @@ -324,6 +341,12 @@ def createDataFrame( ) -> "DataFrame": ... + @overload + def createDataFrame( + self, data: "pa.Table", samplingRatio: Optional[float] = ... + ) -> "DataFrame": + ... + @overload def createDataFrame( self, @@ -333,9 +356,18 @@ def createDataFrame( ) -> "DataFrame": ... + @overload + def createDataFrame( + self, + data: "pa.Table", + schema: Union[StructType, str], + verifySchema: bool = ..., + ) -> "DataFrame": + ... 
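Editor's sketch (not part of the patch): a round trip through the new PyArrow overloads, `createDataFrame` accepting a `pyarrow.Table` and `DataFrame.toArrow`. Assumes an active SparkSession `spark` and an installed pyarrow.

```python
import pyarrow as pa

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

df = spark.createDataFrame(table)         # schema is taken from the Arrow schema
result = df.filter(df.id > 1).toArrow()   # collect back to the driver as a pyarrow.Table

print(result.column("name").to_pylist())  # ['b', 'c']
```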
+ def createDataFrame( # type: ignore[misc] self, - data: "PandasDataFrameLike", + data: Union["PandasDataFrameLike", "pa.Table"], schema: Optional[Union[StructType, List[str]]] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True, @@ -344,12 +376,29 @@ def createDataFrame( # type: ignore[misc] assert isinstance(self, SparkSession) + timezone = self._jconf.sessionLocalTimeZone() + + if type(data).__name__ == "Table": + # `data` is a PyArrow Table + from pyspark.sql.pandas.utils import require_minimum_pyarrow_version + + require_minimum_pyarrow_version() + + import pyarrow as pa + + assert isinstance(data, pa.Table) + + # If no schema supplied by user then get the names of columns only + if schema is None: + schema = data.schema.names + + return self._create_from_arrow_table(data, schema, timezone) + + # `data` is a PandasDataFrameLike object from pyspark.sql.pandas.utils import require_minimum_pandas_version require_minimum_pandas_version() - timezone = self._jconf.sessionLocalTimeZone() - # If no schema supplied by user then get the names of columns only if schema is None: schema = [str(x) if not isinstance(x, str) else x for x in data.columns] @@ -695,6 +744,75 @@ def create_iter_server(): df._schema = schema return df + def _create_from_arrow_table( + self, table: "pa.Table", schema: Union[StructType, List[str]], timezone: str + ) -> "DataFrame": + """ + Create a DataFrame from a given pyarrow.Table by slicing it into partitions then + sending to the JVM to parallelize. + """ + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + assert isinstance(self, SparkSession) + + from pyspark.sql.pandas.serializers import ArrowStreamSerializer + from pyspark.sql.pandas.types import ( + from_arrow_type, + from_arrow_schema, + to_arrow_schema, + _check_arrow_table_timestamps_localize, + ) + from pyspark.sql.pandas.utils import require_minimum_pyarrow_version + + require_minimum_pyarrow_version() + + prefer_timestamp_ntz = is_timestamp_ntz_preferred() + + # Create the Spark schema from list of names passed in with Arrow types + if isinstance(schema, (list, tuple)): + table = table.rename_columns(schema) + arrow_schema = table.schema + struct = StructType() + for name, field in zip(schema, arrow_schema): + struct.add( + name, + from_arrow_type(field.type, prefer_timestamp_ntz), + nullable=field.nullable, + ) + schema = struct + + if not isinstance(schema, StructType): + schema = from_arrow_schema(table.schema, prefer_timestamp_ntz=prefer_timestamp_ntz) + + table = _check_arrow_table_timestamps_localize(table, schema, True, timezone).cast( + to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True) + ) + + # Chunk the Arrow Table into RecordBatches + chunk_size = self._jconf.arrowMaxRecordsPerBatch() + arrow_data = table.to_batches(max_chunksize=chunk_size) + + jsparkSession = self._jsparkSession + + ser = ArrowStreamSerializer() + + @no_type_check + def reader_func(temp_filename): + return self._jvm.PythonSQLUtils.readArrowStreamFromFile(temp_filename) + + @no_type_check + def create_iter_server(): + return self._jvm.ArrowIteratorServer() + + # Create Spark DataFrame from Arrow stream file, using one batch per partition + jiter = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_iter_server) + assert self._jvm is not None + jdf = self._jvm.PythonSQLUtils.toDataFrame(jiter, schema.json(), jsparkSession) + df = DataFrame(jdf, self) + df._schema = schema + return df + def _test() -> None: import doctest diff --git 
a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 62d365a3b2a1d..020105bb064ae 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -431,7 +431,8 @@ def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: return _create_pandas_udf(f=f, returnType=return_type, evalType=eval_type) -def _create_pandas_udf(f, returnType, evalType): +# validate the pandas udf and return the adjusted eval type +def _validate_pandas_udf(f, evalType) -> int: argspec = getfullargspec(f) # pandas UDF by type hints. @@ -528,6 +529,12 @@ def _create_pandas_udf(f, returnType, evalType): }, ) + return evalType + + +def _create_pandas_udf(f, returnType, evalType): + evalType = _validate_pandas_udf(f, evalType) + if is_remote(): from pyspark.sql.connect.udf import _create_udf as _create_connect_udf diff --git a/python/pyspark/sql/pandas/group_ops.py b/python/pyspark/sql/pandas/group_ops.py index d5b214e2f7d5b..3d1c50d949028 100644 --- a/python/pyspark/sql/pandas/group_ops.py +++ b/python/pyspark/sql/pandas/group_ops.py @@ -18,7 +18,7 @@ from typing import List, Union, TYPE_CHECKING, cast import warnings -from pyspark.errors import PySparkValueError +from pyspark.errors import PySparkTypeError from pyspark.util import PythonEvalType from pyspark.sql.column import Column from pyspark.sql.dataframe import DataFrame @@ -100,11 +100,9 @@ def apply(self, udf: "GroupedMapPandasUserDefinedFunction") -> DataFrame: != PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF ) ): - raise PySparkValueError( - error_class="INVALID_PANDAS_UDF", - message_parameters={ - "detail": "the udf argument must be a pandas_udf of type GROUPED_MAP." - }, + raise PySparkTypeError( + error_class="INVALID_UDF_EVAL_TYPE", + message_parameters={"eval_type": "SQL_GROUPED_MAP_PANDAS_UDF"}, ) warnings.warn( diff --git a/python/pyspark/sql/pandas/map_ops.py b/python/pyspark/sql/pandas/map_ops.py index 8c2795a8fbe42..b02fe018b688e 100644 --- a/python/pyspark/sql/pandas/map_ops.py +++ b/python/pyspark/sql/pandas/map_ops.py @@ -30,7 +30,7 @@ class PandasMapOpsMixin: """ - Min-in for pandas map operations. Currently, only :class:`DataFrame` + Mix-in for pandas map operations. Currently, only :class:`DataFrame` can use this class. """ @@ -41,109 +41,6 @@ def mapInPandas( barrier: bool = False, profile: Optional[ResourceProfile] = None, ) -> "DataFrame": - """ - Maps an iterator of batches in the current :class:`DataFrame` using a Python native - function that takes and outputs a pandas DataFrame, and returns the result as a - :class:`DataFrame`. - - The function should take an iterator of `pandas.DataFrame`\\s and return - another iterator of `pandas.DataFrame`\\s. All columns are passed - together as an iterator of `pandas.DataFrame`\\s to the function and the - returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`. - Each `pandas.DataFrame` size can be controlled by - `spark.sql.execution.arrow.maxRecordsPerBatch`. The size of the function's input and - output can be different. - - .. versionadded:: 3.0.0 - - .. versionchanged:: 3.4.0 - Supports Spark Connect. - - Parameters - ---------- - func : function - a Python native function that takes an iterator of `pandas.DataFrame`\\s, and - outputs an iterator of `pandas.DataFrame`\\s. - schema : :class:`pyspark.sql.types.DataType` or str - the return type of the `func` in PySpark. The value can be either a - :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. 
- barrier : bool, optional, default False - Use barrier mode execution. - - .. versionadded: 3.5.0 - - profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile - to be used for mapInPandas. - - .. versionadded: 4.0.0 - - - Examples - -------- - >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) - - Filter rows with id equal to 1: - - >>> def filter_func(iterator): - ... for pdf in iterator: - ... yield pdf[pdf.id == 1] - ... - >>> df.mapInPandas(filter_func, df.schema).show() # doctest: +SKIP - +---+---+ - | id|age| - +---+---+ - | 1| 21| - +---+---+ - - Compute the mean age for each id: - - >>> def mean_age(iterator): - ... for pdf in iterator: - ... yield pdf.groupby("id").mean().reset_index() - ... - >>> df.mapInPandas(mean_age, "id: bigint, age: double").show() # doctest: +SKIP - +---+----+ - | id| age| - +---+----+ - | 1|21.0| - | 2|30.0| - +---+----+ - - Add a new column with the double of the age: - - >>> def double_age(iterator): - ... for pdf in iterator: - ... pdf["double_age"] = pdf["age"] * 2 - ... yield pdf - ... - >>> df.mapInPandas( - ... double_age, "id: bigint, age: bigint, double_age: bigint").show() # doctest: +SKIP - +---+---+----------+ - | id|age|double_age| - +---+---+----------+ - | 1| 21| 42| - | 2| 30| 60| - +---+---+----------+ - - Set ``barrier`` to ``True`` to force the ``mapInPandas`` stage running in the - barrier mode, it ensures all Python workers in the stage will be - launched concurrently. - - >>> df.mapInPandas(filter_func, df.schema, barrier=True).show() # doctest: +SKIP - +---+---+ - | id|age| - +---+---+ - | 1| 21| - +---+---+ - - Notes - ----- - This API is experimental - - See Also - -------- - pyspark.sql.functions.pandas_udf - """ from pyspark.sql import DataFrame from pyspark.sql.pandas.functions import pandas_udf @@ -166,74 +63,6 @@ def mapInArrow( barrier: bool = False, profile: Optional[ResourceProfile] = None, ) -> "DataFrame": - """ - Maps an iterator of batches in the current :class:`DataFrame` using a Python native - function that takes and outputs a PyArrow's `RecordBatch`, and returns the result as a - :class:`DataFrame`. - - The function should take an iterator of `pyarrow.RecordBatch`\\s and return - another iterator of `pyarrow.RecordBatch`\\s. All columns are passed - together as an iterator of `pyarrow.RecordBatch`\\s to the function and the - returned iterator of `pyarrow.RecordBatch`\\s are combined as a :class:`DataFrame`. - Each `pyarrow.RecordBatch` size can be controlled by - `spark.sql.execution.arrow.maxRecordsPerBatch`. The size of the function's input and - output can be different. - - .. versionadded:: 3.3.0 - - Parameters - ---------- - func : function - a Python native function that takes an iterator of `pyarrow.RecordBatch`\\s, and - outputs an iterator of `pyarrow.RecordBatch`\\s. - schema : :class:`pyspark.sql.types.DataType` or str - the return type of the `func` in PySpark. The value can be either a - :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - barrier : bool, optional, default False - Use barrier mode execution. - - .. versionadded: 3.5.0 - - profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile - to be used for mapInArrow. - - .. versionadded: 4.0.0 - - Examples - -------- - >>> import pyarrow # doctest: +SKIP - >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) - >>> def filter_func(iterator): - ... for batch in iterator: - ... pdf = batch.to_pandas() - ... 
yield pyarrow.RecordBatch.from_pandas(pdf[pdf.id == 1]) - >>> df.mapInArrow(filter_func, df.schema).show() # doctest: +SKIP - +---+---+ - | id|age| - +---+---+ - | 1| 21| - +---+---+ - - Set ``barrier`` to ``True`` to force the ``mapInArrow`` stage running in the - barrier mode, it ensures all Python workers in the stage will be - launched concurrently. - - >>> df.mapInArrow(filter_func, df.schema, barrier=True).show() # doctest: +SKIP - +---+---+ - | id|age| - +---+---+ - | 1| 21| - +---+---+ - - Notes - ----- - This API is unstable, and for developers. - - See Also - -------- - pyspark.sql.functions.pandas_udf - pyspark.sql.DataFrame.mapInPandas - """ from pyspark.sql import DataFrame from pyspark.sql.pandas.functions import pandas_udf diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py index 559512bd00c1c..27c77c9d2d7f1 100644 --- a/python/pyspark/sql/pandas/types.py +++ b/python/pyspark/sql/pandas/types.py @@ -52,6 +52,7 @@ _create_row, ) from pyspark.errors import PySparkTypeError, UnsupportedOperationException, PySparkValueError +from pyspark.loose_version import LooseVersion if TYPE_CHECKING: import pandas as pd @@ -60,8 +61,32 @@ from pyspark.sql.pandas._typing import SeriesLike as PandasSeriesLike -def to_arrow_type(dt: DataType) -> "pa.DataType": - """Convert Spark data type to pyarrow type""" +def to_arrow_type( + dt: DataType, + error_on_duplicated_field_names_in_struct: bool = False, + timestamp_utc: bool = True, +) -> "pa.DataType": + """ + Convert Spark data type to PyArrow type + + Parameters + ---------- + dt : :class:`DataType` + The Spark data type. + error_on_duplicated_field_names_in_struct: bool, default False + Whether to raise an exception when there are duplicated field names in a + :class:`pyspark.sql.types.StructType`. (default ``False``) + timestamp_utc : bool, default True + If ``True`` (the default), :class:`TimestampType` is converted to a timezone-aware + :class:`pyarrow.TimestampType` with UTC as the timezone. If ``False``, + :class:`TimestampType` is converted to a timezone-naive :class:`pyarrow.TimestampType`. + The JVM expects timezone-aware timestamps to be in UTC. Always keep this set to ``True`` + except in special cases, such as when this function is used in a test. 
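Editor's sketch (not part of the patch): the two new keyword arguments on `to_arrow_type`, run on the driver only (no session needed). The struct fields are made up.

```python
from pyspark.sql.pandas.types import to_arrow_type
from pyspark.sql.types import StructType, StructField, TimestampType
from pyspark.errors import UnsupportedOperationException

st = StructType([StructField("ts", TimestampType())])
print(to_arrow_type(st))                       # struct<ts: timestamp[us, tz=UTC]>
print(to_arrow_type(st, timestamp_utc=False))  # struct<ts: timestamp[us]>, timezone-naive

dup = StructType([StructField("a", TimestampType()), StructField("a", TimestampType())])
try:
    to_arrow_type(dup, error_on_duplicated_field_names_in_struct=True)
except UnsupportedOperationException as e:
    print("duplicated struct field names rejected:", e)
```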
+ + Returns + ------- + :class:`pyarrow.DataType` + """ import pyarrow as pa if type(dt) == BooleanType: @@ -86,30 +111,58 @@ def to_arrow_type(dt: DataType) -> "pa.DataType": arrow_type = pa.binary() elif type(dt) == DateType: arrow_type = pa.date32() - elif type(dt) == TimestampType: + elif type(dt) == TimestampType and timestamp_utc: # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read arrow_type = pa.timestamp("us", tz="UTC") + elif type(dt) == TimestampType: + arrow_type = pa.timestamp("us", tz=None) elif type(dt) == TimestampNTZType: arrow_type = pa.timestamp("us", tz=None) elif type(dt) == DayTimeIntervalType: arrow_type = pa.duration("us") elif type(dt) == ArrayType: - field = pa.field("element", to_arrow_type(dt.elementType), nullable=dt.containsNull) + field = pa.field( + "element", + to_arrow_type(dt.elementType, error_on_duplicated_field_names_in_struct, timestamp_utc), + nullable=dt.containsNull, + ) arrow_type = pa.list_(field) elif type(dt) == MapType: - key_field = pa.field("key", to_arrow_type(dt.keyType), nullable=False) - value_field = pa.field("value", to_arrow_type(dt.valueType), nullable=dt.valueContainsNull) + key_field = pa.field( + "key", + to_arrow_type(dt.keyType, error_on_duplicated_field_names_in_struct, timestamp_utc), + nullable=False, + ) + value_field = pa.field( + "value", + to_arrow_type(dt.valueType, error_on_duplicated_field_names_in_struct, timestamp_utc), + nullable=dt.valueContainsNull, + ) arrow_type = pa.map_(key_field, value_field) elif type(dt) == StructType: + field_names = dt.names + if error_on_duplicated_field_names_in_struct and len(set(field_names)) != len(field_names): + raise UnsupportedOperationException( + error_class="DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT", + message_parameters={"field_names": str(field_names)}, + ) fields = [ - pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable) + pa.field( + field.name, + to_arrow_type( + field.dataType, error_on_duplicated_field_names_in_struct, timestamp_utc + ), + nullable=field.nullable, + ) for field in dt ] arrow_type = pa.struct(fields) elif type(dt) == NullType: arrow_type = pa.null() elif isinstance(dt, UserDefinedType): - arrow_type = to_arrow_type(dt.sqlType()) + arrow_type = to_arrow_type( + dt.sqlType(), error_on_duplicated_field_names_in_struct, timestamp_utc + ) elif type(dt) == VariantType: fields = [ pa.field("value", pa.binary(), nullable=False), @@ -124,12 +177,40 @@ def to_arrow_type(dt: DataType) -> "pa.DataType": return arrow_type -def to_arrow_schema(schema: StructType) -> "pa.Schema": - """Convert a schema from Spark to Arrow""" +def to_arrow_schema( + schema: StructType, + error_on_duplicated_field_names_in_struct: bool = False, + timestamp_utc: bool = True, +) -> "pa.Schema": + """ + Convert a schema from Spark to Arrow + + Parameters + ---------- + schema : :class:`StructType` + The Spark schema. + error_on_duplicated_field_names_in_struct: bool, default False + Whether to raise an exception when there are duplicated field names in an inner + :class:`pyspark.sql.types.StructType`. (default ``False``) + timestamp_utc : bool, default True + If ``True`` (the default), :class:`TimestampType` is converted to a timezone-aware + :class:`pyarrow.TimestampType` with UTC as the timezone. If ``False``, + :class:`TimestampType` is converted to a timezone-naive :class:`pyarrow.TimestampType`. + The JVM expects timezone-aware timestamps to be in UTC. 
Always keep this set to ``True`` + except in special cases, such as when this function is used in a test + + Returns + ------- + :class:`pyarrow.Schema` + """ import pyarrow as pa fields = [ - pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable) + pa.field( + field.name, + to_arrow_type(field.dataType, error_on_duplicated_field_names_in_struct, timestamp_utc), + nullable=field.nullable, + ) for field in schema ] return pa.schema(fields) @@ -162,6 +243,8 @@ def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> Da spark_type = StringType() elif types.is_binary(at): spark_type = BinaryType() + elif types.is_fixed_size_binary(at): + spark_type = BinaryType() elif types.is_large_binary(at): spark_type = BinaryType() elif types.is_date32(at): @@ -174,6 +257,18 @@ def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> Da spark_type = DayTimeIntervalType() elif types.is_list(at): spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz)) + elif types.is_fixed_size_list(at): + import pyarrow as pa + + if LooseVersion(pa.__version__) < LooseVersion("14.0.0"): + # PyArrow versions before 14.0.0 do not support casting FixedSizeListArray to ListArray + raise PySparkTypeError( + error_class="UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION", + message_parameters={"data_type": str(at)}, + ) + spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz)) + elif types.is_large_list(at): + spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz)) elif types.is_map(at): spark_type = MapType( from_arrow_type(at.key_type, prefer_timestamp_ntz), @@ -232,6 +327,162 @@ def _get_local_timezone() -> str: return os.environ.get("TZ", "dateutil/:") +def _check_arrow_array_timestamps_localize( + a: Union["pa.Array", "pa.ChunkedArray"], + dt: DataType, + truncate: bool = True, + timezone: Optional[str] = None, +) -> Union["pa.Array", "pa.ChunkedArray"]: + """ + Convert Arrow timestamps to timezone-naive in the specified timezone if the specified Spark + data type is TimestampType, and optionally truncate nanosecond timestamps to microseconds. + + This function works on Arrow Arrays and ChunkedArrays, and it recurses to convert nested + timestamps. + + Parameters + ---------- + a : :class:`pyarrow.Array` or :class:`pyarrow.ChunkedArray` + dt : :class:`DataType` + The Spark data type corresponding to the Arrow Array to be converted. + truncate : bool, default True + Whether to truncate nanosecond timestamps to microseconds. (default ``True``) + timezone : str, optional + The timezone to convert from. If there is a timestamp type, it's required. + + Returns + ------- + :class:`pyarrow.Array` or :class:`pyarrow.ChunkedArray` + """ + import pyarrow.types as types + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(a, pa.ChunkedArray) and (types.is_nested(a.type) or types.is_dictionary(a.type)): + return pa.chunked_array( + [ + _check_arrow_array_timestamps_localize(chunk, dt, truncate, timezone) + for chunk in a.iterchunks() + ] + ) + + if types.is_timestamp(a.type) and truncate and a.type.unit == "ns": + a = pc.floor_temporal(a, unit="microsecond") + + if types.is_timestamp(a.type) and a.type.tz is None and type(dt) == TimestampType: + assert timezone is not None + + # Only localize timestamps that will become Spark TimestampType columns. + # Do not localize timestamps that will become Spark TimestampNTZType columns. 
+ return pc.assume_timezone(a, timezone) + if types.is_list(a.type): + # Return the ListArray as-is if it contains no nested fields or timestamps + if not types.is_nested(a.type.value_type) and not types.is_timestamp(a.type.value_type): + return a + + at: ArrayType = cast(ArrayType, dt) + return pa.ListArray.from_arrays( + a.offsets, + _check_arrow_array_timestamps_localize(a.values, at.elementType, truncate, timezone), + mask=a.is_null() if a.null_count else None, + ) + if types.is_map(a.type): + # Return the MapArray as-is if it contains no nested fields or timestamps + if ( + not types.is_nested(a.type.key_type) + and not types.is_nested(a.type.item_type) + and not types.is_timestamp(a.type.key_type) + and not types.is_timestamp(a.type.item_type) + ): + return a + + mt: MapType = cast(MapType, dt) + + params = { + "offsets": a.offsets, + "keys": _check_arrow_array_timestamps_localize(a.keys, mt.keyType, truncate, timezone), + "items": _check_arrow_array_timestamps_localize( + a.items, mt.valueType, truncate, timezone + ), + } + # SPARK-48302: PyArrow added support for mask argument to pa.MapArray.from_arrays in + # version 17.0.0 + if a.null_count and LooseVersion(pa.__version__) >= LooseVersion("17.0.0"): + params["mask"] = a.is_null() + + return pa.MapArray.from_arrays(**params) + if types.is_struct(a.type): + # Return the StructArray as-is if it contains no nested fields or timestamps + if all( + [ + not types.is_nested(a.type.field(i).type) + and not types.is_timestamp(a.type.field(i).type) + for i in range(a.type.num_fields) + ] + ): + return a + + st: StructType = cast(StructType, dt) + assert len(a.type) == len(st.fields) + + return pa.StructArray.from_arrays( + [ + _check_arrow_array_timestamps_localize( + a.field(i), st.fields[i].dataType, truncate, timezone + ) + for i in range(len(a.type)) + ], + [a.type[i].name for i in range(len(a.type))], + mask=a.is_null() if a.null_count else None, + ) + if types.is_dictionary(a.type): + return pa.DictionaryArray.from_arrays( + a.indices, + _check_arrow_array_timestamps_localize(a.dictionary, dt, truncate, timezone), + ) + return a + + +def _check_arrow_table_timestamps_localize( + table: "pa.Table", schema: StructType, truncate: bool = True, timezone: Optional[str] = None +) -> "pa.Table": + """ + Convert timestamps in a PyArrow Table to timezone-naive in the specified timezone if the + corresponding Spark data type is TimestampType in the specified Spark schema is TimestampType, + and optionally truncate nanosecond timestamps to microseconds. + + Parameters + ---------- + table : :class:`pyarrow.Table` + schema : :class:`StructType` + The Spark schema corresponding to the schema of the Arrow Table. + truncate : bool, default True + Whether to truncate nanosecond timestamps to microseconds. (default ``True``) + timezone : str, optional + The timezone to convert from. If there is a timestamp type, it's required. 
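
The localization helper above leans on two pyarrow compute kernels: `floor_temporal` to drop sub-microsecond precision and `assume_timezone` to attach a timezone to naive timestamps without shifting the stored instant. A small standalone sketch, assuming pyarrow is installed:

    import pyarrow as pa
    import pyarrow.compute as pc

    # Timezone-naive nanosecond timestamps (integers are nanoseconds since the epoch).
    a = pa.array([1_000_000_123, 2_000_000_456], type=pa.timestamp("ns"))

    # Truncate to whole microseconds; the unit stays "ns", only the values are floored.
    a = pc.floor_temporal(a, unit="microsecond")

    # Attach a timezone to the naive values; the underlying instants are unchanged.
    localized = pc.assume_timezone(a, "America/Los_Angeles")
    print(localized.type)  # timestamp[ns, tz=America/Los_Angeles]
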
+ + Returns + ------- + :class:`pyarrow.Table` + """ + import pyarrow.types as types + import pyarrow as pa + + # Return the table as-is if it contains no nested fields or timestamps + if all([not types.is_nested(at) and not types.is_timestamp(at) for at in table.schema.types]): + return table + + assert len(table.schema) == len(schema.fields) + + return pa.Table.from_arrays( + [ + _check_arrow_array_timestamps_localize(a, f.dataType, truncate, timezone) + for a, f in zip(table.columns, schema.fields) + ], + schema=table.schema, + ) + + def _check_series_localize_timestamps(s: "PandasSeriesLike", timezone: str) -> "PandasSeriesLike": """ Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone. diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index 654b73e3b93ca..fafc3186410c3 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -22,7 +22,7 @@ def require_minimum_pandas_version() -> None: """Raise ImportError if minimum version of Pandas is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. - minimum_pandas_version = "1.4.4" + minimum_pandas_version = "2.0.0" try: import pandas diff --git a/python/pyspark/sql/protobuf/functions.py b/python/pyspark/sql/protobuf/functions.py index 63871e4375718..1e75874e75f9a 100644 --- a/python/pyspark/sql/protobuf/functions.py +++ b/python/pyspark/sql/protobuf/functions.py @@ -22,7 +22,7 @@ from typing import Dict, Optional, TYPE_CHECKING, cast -from pyspark.sql.column import Column, _to_java_column +from pyspark.sql.column import Column from pyspark.sql.utils import get_active_spark_context, try_remote_protobuf_functions from pyspark.util import _print_missing_jar @@ -139,6 +139,7 @@ def from_protobuf( +------------------+ """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_java_column sc = get_active_spark_context() try: @@ -260,6 +261,7 @@ def to_protobuf( +----------------------------+ """ from py4j.java_gateway import JVMView + from pyspark.sql.classic.column import _to_java_column sc = get_active_spark_context() try: diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 26fe8c5e6fa2f..0ee235c8bf026 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -18,7 +18,7 @@ from typing import cast, overload, Dict, Iterable, List, Optional, Tuple, TYPE_CHECKING, Union from pyspark.util import is_remote_only -from pyspark.sql.column import _to_seq, _to_java_column, Column +from pyspark.sql.column import Column from pyspark.sql.types import StructType from pyspark.sql import utils from pyspark.sql.utils import to_str @@ -619,6 +619,8 @@ def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame | Tom| 20| NULL| +-----+----+------+ """ + from pyspark.sql.classic.column import _to_seq + mergeSchema = options.get("mergeSchema", None) pathGlobFilter = options.get("pathGlobFilter", None) modifiedBefore = options.get("modifiedBefore", None) @@ -1042,6 +1044,8 @@ def orc( |100|Hyukjin Kwon| +---+------------+ """ + from pyspark.sql.classic.column import _to_seq + self._set_opts( mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, @@ -1440,6 +1444,8 @@ def partitionBy(self, *cols: Union[str, List[str]]) -> "DataFrameWriter": |100| +---+ """ + from pyspark.sql.classic.column import _to_seq + if len(cols) == 1 and isinstance(cols[0], (list, tuple)): cols = cols[0] # type: ignore[assignment] 
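
The recurring change in `functions.py` and `readwriter.py` above is moving the `_to_java_column` / `_to_seq` imports from module level into the functions that need them, so `pyspark.sql.classic.column` (and py4j) is only pulled in when a classic, JVM-backed code path actually runs. A sketch of the pattern with a hypothetical wrapper name:

    from pyspark.sql.column import Column  # safe to import at module level

    def partition_columns_to_java(sc, cols):
        # Hypothetical helper, for illustration only. The classic-only import is
        # deferred so that merely importing this module does not require py4j
        # in a Connect-only environment.
        from pyspark.sql.classic.column import _to_seq, _to_java_column

        return _to_seq(sc, [_to_java_column(c) for c in cols])
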
self._jwrite = self._jwrite.partitionBy( @@ -1503,6 +1509,8 @@ def bucketBy( +---+------------+ >>> _ = spark.sql("DROP TABLE bucketed_table") """ + from pyspark.sql.classic.column import _to_seq + if not isinstance(numBuckets, int): raise PySparkTypeError( error_class="NOT_INT", @@ -1594,6 +1602,8 @@ def sortBy( +---+------------+ >>> _ = spark.sql("DROP TABLE sorted_bucketed_table") """ + from pyspark.sql.classic.column import _to_seq + if isinstance(col, (list, tuple)): if cols: raise PySparkValueError( @@ -2380,6 +2390,8 @@ def partitionedBy(self, col: Column, *cols: Column) -> "DataFrameWriterV2": .. versionadded: 3.1.0 """ + from pyspark.sql.classic.column import _to_seq, _to_java_column + col = _to_java_column(col) cols = _to_seq(self._spark._sc, [_to_java_column(c) for c in cols]) self._jwriter.partitionedBy(col, cols) @@ -2435,6 +2447,8 @@ def overwrite(self, condition: Column) -> None: .. versionadded: 3.1.0 """ + from pyspark.sql.classic.column import _to_java_column + condition = _to_java_column(condition) self._jwriter.overwrite(condition) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 1098c41a3f4c8..d6fb4b60d90a9 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -40,7 +40,6 @@ from pyspark.conf import SparkConf from pyspark.util import is_remote_only -from pyspark.sql.column import _to_java_column from pyspark.sql.conf import RuntimeConfig from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import lit @@ -68,6 +67,7 @@ if TYPE_CHECKING: from py4j.java_gateway import JavaObject + import pyarrow as pa from pyspark.core.context import SparkContext from pyspark.core.rdd import RDD from pyspark.sql._typing import AtomicValue, RowLike, OptionalPrimitiveType @@ -143,6 +143,9 @@ def toDF(self, schema=None, sampleRatio=None): # # @classmethod + @property is also affected by a bug in Python's docstring which was backported # to Python 3.9.6 (https://github.com/python/cpython/pull/28838) +# +# Python 3.9 with MyPy complains about @classmethod + @property combination. We should fix +# it together with MyPy. class classproperty(property): """Same as Python's @property decorator, but for class attributes. 
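
For context on the comment above about combining `@classmethod` with `@property`: a `classproperty` descriptor sidesteps that combination entirely. This is a minimal standalone sketch of such a descriptor, not the pyspark implementation:

    class classproperty:
        """Property-like descriptor evaluated against the class, not an instance."""

        def __init__(self, fget):
            self.fget = fget

        def __get__(self, instance, owner):
            return self.fget(owner)

    class Example:
        @classproperty
        def label(cls):
            return f"label for {cls.__name__}"

    print(Example.label)  # "label for Example" -- no instance required
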
@@ -1040,6 +1043,7 @@ def _inferSchemaFromList( ) infer_dict_as_struct = self._jconf.inferDictAsStruct() infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement() + infer_map_from_first_pair = self._jconf.legacyInferMapStructTypeFromFirstItem() prefer_timestamp_ntz = is_timestamp_ntz_preferred() schema = reduce( _merge_type, @@ -1049,6 +1053,7 @@ def _inferSchemaFromList( names, infer_dict_as_struct=infer_dict_as_struct, infer_array_from_first_element=infer_array_from_first_element, + infer_map_from_first_pair=infer_map_from_first_pair, prefer_timestamp_ntz=prefer_timestamp_ntz, ) for row in data @@ -1091,6 +1096,7 @@ def _inferSchema( infer_dict_as_struct = self._jconf.inferDictAsStruct() infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement() + infer_map_from_first_pair = self._jconf.legacyInferMapStructTypeFromFirstItem() prefer_timestamp_ntz = is_timestamp_ntz_preferred() if samplingRatio is None: schema = _infer_schema( @@ -1108,6 +1114,7 @@ def _inferSchema( names=names, infer_dict_as_struct=infer_dict_as_struct, infer_array_from_first_element=infer_array_from_first_element, + infer_map_from_first_pair=infer_map_from_first_pair, prefer_timestamp_ntz=prefer_timestamp_ntz, ), ) @@ -1127,6 +1134,7 @@ def _inferSchema( names, infer_dict_as_struct=infer_dict_as_struct, infer_array_from_first_element=infer_array_from_first_element, + infer_map_from_first_pair=infer_map_from_first_pair, prefer_timestamp_ntz=prefer_timestamp_ntz, ) ).reduce(_merge_type) @@ -1249,7 +1257,7 @@ def _getActiveSessionOrCreate(**static_conf: Any) -> "SparkSession": spark = builder.getOrCreate() return spark - @overload + @overload # type: ignore[override] def createDataFrame( self, data: Iterable["RowLike"], @@ -1311,6 +1319,10 @@ def createDataFrame( ) -> DataFrame: ... + @overload + def createDataFrame(self, data: "pa.Table", samplingRatio: Optional[float] = ...) -> DataFrame: + ... + @overload def createDataFrame( self, @@ -1320,28 +1332,40 @@ def createDataFrame( ) -> DataFrame: ... + @overload + def createDataFrame( + self, + data: "pa.Table", + schema: Union[StructType, str], + verifySchema: bool = ..., + ) -> DataFrame: + ... + def createDataFrame( # type: ignore[misc] self, - data: Union["RDD[Any]", Iterable[Any], "PandasDataFrameLike", "ArrayLike"], + data: Union["RDD[Any]", Iterable[Any], "PandasDataFrameLike", "ArrayLike", "pa.Table"], schema: Optional[Union[AtomicType, StructType, str]] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True, ) -> DataFrame: """ - Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame` - or a :class:`numpy.ndarray`. + Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`, + a :class:`numpy.ndarray`, or a :class:`pyarrow.Table`. .. versionadded:: 2.0.0 .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + Supports :class:`pyarrow.Table`. + Parameters ---------- data : :class:`RDD` or iterable an RDD of any kind of SQL data representation (:class:`Row`, :class:`tuple`, ``int``, ``boolean``, ``dict``, etc.), or :class:`list`, - :class:`pandas.DataFrame` or :class:`numpy.ndarray`. + :class:`pandas.DataFrame`, :class:`numpy.ndarray`, or :class:`pyarrow.Table`. schema : :class:`pyspark.sql.types.DataType`, str or list, optional a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is None. 
The data type string format equals to @@ -1363,12 +1387,14 @@ def createDataFrame( # type: ignore[misc] later. samplingRatio : float, optional the sample ratio of rows used for inferring. The first few rows will be used - if ``samplingRatio`` is ``None``. + if ``samplingRatio`` is ``None``. This option is effective only when the input is + :class:`RDD`. verifySchema : bool, optional verify data types of every row against schema. Enabled by default. - When the input is :class:`pandas.DataFrame` and - `spark.sql.execution.arrow.pyspark.enabled` is enabled, this option is not - effective. It follows Arrow type coercion. + When the input is :class:`pyarrow.Table` or when the input class is + :class:`pandas.DataFrame` and `spark.sql.execution.arrow.pyspark.enabled` is enabled, + this option is not effective. It follows Arrow type coercion. This option is not + supported with Spark Connect. .. versionadded:: 2.1.0 @@ -1468,6 +1494,22 @@ def createDataFrame( # type: ignore[misc] +---+---+ | 1| 2| +---+---+ + + Create a DataFrame from a PyArrow Table. + + >>> spark.createDataFrame(df.toArrow()).show() # doctest: +SKIP + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + +-----+---+ + >>> table = pyarrow.table({'0': [1], '1': [2]}) # doctest: +SKIP + >>> spark.createDataFrame(table).collect() # doctest: +SKIP + +---+---+ + | 0| 1| + +---+---+ + | 1| 2| + +---+---+ """ SparkSession._activeSession = self assert self._jvm is not None @@ -1498,6 +1540,13 @@ def createDataFrame( # type: ignore[misc] except Exception: has_numpy = False + try: + import pyarrow as pa + + has_pyarrow = True + except Exception: + has_pyarrow = False + if has_numpy and isinstance(data, np.ndarray): # `data` of numpy.ndarray type will be converted to a pandas DataFrame, # so pandas is required. @@ -1531,6 +1580,11 @@ def createDataFrame( # type: ignore[misc] return super(SparkSession, self).createDataFrame( # type: ignore[call-overload] data, schema, samplingRatio, verifySchema ) + if has_pyarrow and isinstance(data, pa.Table): + # Create a DataFrame from PyArrow Table. + return super(SparkSession, self).createDataFrame( # type: ignore[call-overload] + data, schema, samplingRatio, verifySchema + ) return self._create_dataframe( data, schema, samplingRatio, verifySchema # type: ignore[arg-type] ) @@ -1633,7 +1687,7 @@ def sql( Notes ----- In Spark Classic, a temporary view referenced in `spark.sql` is resolved immediately, - while in Spark Connect it is lazily evaluated. + while in Spark Connect it is lazily analyzed. So in Spark Connect if a view is dropped, modified or replaced after `spark.sql`, the execution may fail or generate different results. @@ -1724,6 +1778,7 @@ def sql( | 3| 6| 1| +---+---+---+ """ + from pyspark.sql.classic.column import _to_java_column formatter = SQLStringFormatter(self) if len(kwargs) > 0: @@ -1766,7 +1821,7 @@ def table(self, tableName: str) -> DataFrame: Notes ----- In Spark Classic, a temporary view referenced in `spark.table` is resolved immediately, - while in Spark Connect it is lazily evaluated. + while in Spark Connect it is lazily analyzed. So in Spark Connect if a view is dropped, modified or replaced after `spark.table`, the execution may fail or generate different results. 
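
On the "lazily analyzed" wording above: the practical difference shows up when a temporary view is dropped between building and executing a query. A hedged illustration, assuming a running `spark` session:

    spark.range(3).createOrReplaceTempView("v")
    df = spark.sql("SELECT * FROM v")  # Classic analyzes `v` here; Connect defers it

    spark.catalog.dropTempView("v")

    # In Spark Classic this still works because `v` was resolved eagerly.
    # Under Spark Connect, analysis happens at execution time, so the call below
    # may now fail with a table-or-view-not-found style error.
    # df.collect()
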
diff --git a/python/pyspark/sql/streaming/listener.py b/python/pyspark/sql/streaming/listener.py index c1c9dce047319..2aa63cdb91ab6 100644 --- a/python/pyspark/sql/streaming/listener.py +++ b/python/pyspark/sql/streaming/listener.py @@ -64,16 +64,19 @@ class StreamingQueryListener(ABC): """ def _set_spark_session( - self, spark: "SparkSession" # type: ignore[name-defined] # noqa: F821 + self, session: "SparkSession" # type: ignore[name-defined] # noqa: F821 ) -> None: - self._sparkSession = spark + if self.spark is None: + self.spark = session @property def spark(self) -> Optional["SparkSession"]: # type: ignore[name-defined] # noqa: F821 - if hasattr(self, "_sparkSession"): - return self._sparkSession - else: - return None + return getattr(self, "_sparkSession", None) + + @spark.setter + def spark(self, session: "SparkSession") -> None: # type: ignore[name-defined] # noqa: F821 + # For backward compatibility + self._sparkSession = session def _init_listener_id(self) -> None: self._id = str(uuid.uuid4()) diff --git a/python/pyspark/sql/streaming/python_streaming_source_runner.py b/python/pyspark/sql/streaming/python_streaming_source_runner.py index 76f9048e3edbe..754ecff61b973 100644 --- a/python/pyspark/sql/streaming/python_streaming_source_runner.py +++ b/python/pyspark/sql/streaming/python_streaming_source_runner.py @@ -18,7 +18,7 @@ import os import sys import json -from typing import IO +from typing import IO, Iterator, Tuple from pyspark.accumulators import _accumulatorRegistry from pyspark.errors import IllegalArgumentException, PySparkAssertionError, PySparkRuntimeError @@ -29,10 +29,13 @@ SpecialLengths, ) from pyspark.sql.datasource import DataSource, DataSourceStreamReader +from pyspark.sql.datasource_internal import _SimpleStreamReaderWrapper, _streamReader +from pyspark.sql.pandas.serializers import ArrowStreamSerializer from pyspark.sql.types import ( _parse_datatype_json_string, StructType, ) +from pyspark.sql.worker.plan_data_source_read import records_to_arrow_batches from pyspark.util import handle_worker_exception, local_connect_and_auth from pyspark.worker_util import ( check_python_version, @@ -49,6 +52,10 @@ PARTITIONS_FUNC_ID = 886 COMMIT_FUNC_ID = 887 +PREFETCHED_RECORDS_NOT_FOUND = 0 +NON_EMPTY_PYARROW_RECORD_BATCHES = 1 +EMPTY_PYARROW_RECORD_BATCHES = 2 + def initial_offset_func(reader: DataSourceStreamReader, outfile: IO) -> None: offset = reader.initialOffset() @@ -60,7 +67,14 @@ def latest_offset_func(reader: DataSourceStreamReader, outfile: IO) -> None: write_with_length(json.dumps(offset).encode("utf-8"), outfile) -def partitions_func(reader: DataSourceStreamReader, infile: IO, outfile: IO) -> None: +def partitions_func( + reader: DataSourceStreamReader, + data_source: DataSource, + schema: StructType, + max_arrow_batch_size: int, + infile: IO, + outfile: IO, +) -> None: start_offset = json.loads(utf8_deserializer.loads(infile)) end_offset = json.loads(utf8_deserializer.loads(infile)) partitions = reader.partitions(start_offset, end_offset) @@ -68,6 +82,14 @@ def partitions_func(reader: DataSourceStreamReader, infile: IO, outfile: IO) -> write_int(len(partitions), outfile) for partition in partitions: pickleSer._write_with_length(partition, outfile) + if isinstance(reader, _SimpleStreamReaderWrapper): + it = reader.getCache(start_offset, end_offset) + if it is None: + write_int(PREFETCHED_RECORDS_NOT_FOUND, outfile) + else: + send_batch_func(it, outfile, schema, max_arrow_batch_size, data_source) + else: + write_int(PREFETCHED_RECORDS_NOT_FOUND, outfile) def 
commit_func(reader: DataSourceStreamReader, infile: IO, outfile: IO) -> None: @@ -76,6 +98,23 @@ def commit_func(reader: DataSourceStreamReader, infile: IO, outfile: IO) -> None write_int(0, outfile) +def send_batch_func( + rows: Iterator[Tuple], + outfile: IO, + schema: StructType, + max_arrow_batch_size: int, + data_source: DataSource, +) -> None: + batches = list(records_to_arrow_batches(rows, max_arrow_batch_size, schema, data_source)) + if len(batches) != 0: + write_int(NON_EMPTY_PYARROW_RECORD_BATCHES, outfile) + write_int(SpecialLengths.START_ARROW_STREAM, outfile) + serializer = ArrowStreamSerializer() + serializer.dump_stream(batches, outfile) + else: + write_int(EMPTY_PYARROW_RECORD_BATCHES, outfile) + + def main(infile: IO, outfile: IO) -> None: try: check_python_version(infile) @@ -91,7 +130,7 @@ def main(infile: IO, outfile: IO) -> None: if not isinstance(data_source, DataSource): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "a Python data source instance of type 'DataSource'", "actual": f"'{type(data_source).__name__}'", @@ -103,16 +142,22 @@ def main(infile: IO, outfile: IO) -> None: schema = _parse_datatype_json_string(schema_json) if not isinstance(schema, StructType): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "an output schema of type 'StructType'", "actual": f"'{type(schema).__name__}'", }, ) + max_arrow_batch_size = read_int(infile) + assert max_arrow_batch_size > 0, ( + "The maximum arrow batch size should be greater than 0, but got " + f"'{max_arrow_batch_size}'" + ) + # Instantiate data source reader. try: - reader = data_source.streamReader(schema=schema) + reader = _streamReader(data_source, schema) # Initialization succeed. write_int(0, outfile) outfile.flush() @@ -125,7 +170,9 @@ def main(infile: IO, outfile: IO) -> None: elif func_id == LATEST_OFFSET_FUNC_ID: latest_offset_func(reader, outfile) elif func_id == PARTITIONS_FUNC_ID: - partitions_func(reader, infile, outfile) + partitions_func( + reader, data_source, schema, max_arrow_batch_size, infile, outfile + ) elif func_id == COMMIT_FUNC_ID: commit_func(reader, infile, outfile) else: @@ -162,5 +209,9 @@ def main(infile: IO, outfile: IO) -> None: # Read information about how to connect back to the JVM from the environment. java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] - (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + (sock_file, sock) = local_connect_and_auth(java_port, auth_secret) + # Prevent the socket from timeout error when query trigger interval is large. + sock.settimeout(None) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index bcab8a104f1d9..d3d58da3562b6 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -114,7 +114,7 @@ def runId(self) -> str: @property def name(self) -> str: """ - Returns the user-specified name of the query, or null if not specified. + Returns the user-specified name of the query, or None if not specified. This name can be specified in the `org.apache.spark.sql.streaming.DataStreamWriter` as `dataframe.writeStream.queryName("query").start()`. This name, if set, must be unique across all active queries. 
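
The three new constants in `python_streaming_source_runner.py` form a small reply convention: after the pickled partitions, the worker writes one integer telling the JVM whether prefetched Arrow data follows. A condensed sketch of that convention, with stand-ins for pyspark's internal framing helpers (the real code uses `write_int`, `SpecialLengths.START_ARROW_STREAM`, and `ArrowStreamSerializer`):

    PREFETCHED_RECORDS_NOT_FOUND = 0      # no cache for this offset range; JVM plans a normal read
    NON_EMPTY_PYARROW_RECORD_BATCHES = 1  # flag, then stream-start marker, then an Arrow IPC stream
    EMPTY_PYARROW_RECORD_BATCHES = 2      # cache hit, but the range contained no rows

    def reply_prefetched(batches, outfile, write_int, dump_arrow_stream):
        # `write_int` and `dump_arrow_stream` are injected stand-ins for the
        # internal helpers used in the actual worker code above.
        if batches:
            write_int(NON_EMPTY_PYARROW_RECORD_BATCHES, outfile)
            dump_arrow_stream(batches, outfile)
        else:
            write_int(EMPTY_PYARROW_RECORD_BATCHES, outfile)
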
@@ -127,14 +127,14 @@ def name(self) -> str: Returns ------- str - The user-specified name of the query, or null if not specified. + The user-specified name of the query, or None if not specified. Examples -------- >>> sdf = spark.readStream.format("rate").load() >>> sq = sdf.writeStream.format('memory').queryName('this_query').start() - Get the user-specified name of the query, or null if not specified. + Get the user-specified name of the query, or None if not specified. >>> sq.name 'this_query' diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index 58901f34cfc9b..b202a499e8b08 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -19,7 +19,6 @@ from collections.abc import Iterator from typing import cast, overload, Any, Callable, List, Optional, TYPE_CHECKING, Union -from pyspark.sql.column import _to_seq from pyspark.sql.readwriter import OptionUtils, to_str from pyspark.sql.streaming.query import StreamingQuery from pyspark.sql.types import Row, StructType @@ -554,8 +553,8 @@ def text( Parameters ---------- - path : str or list - string, or list of strings, for input path(s). + path : str + string for input path. Other Parameters ---------------- @@ -642,8 +641,8 @@ def csv( Parameters ---------- - path : str or list - string, or list of strings, for input path(s). + path : str + string for input path. schema : :class:`pyspark.sql.types.StructType` or str, optional an optional :class:`pyspark.sql.types.StructType` for the input schema or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). @@ -1117,6 +1116,8 @@ def partitionBy(self, *cols: str) -> "DataStreamWriter": # type: ignore[misc] +...---------+-----+ ... """ + from pyspark.sql.classic.column import _to_seq + if len(cols) == 1 and isinstance(cols[0], (list, tuple)): cols = cols[0] self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols)) diff --git a/python/pyspark/sql/tests/connect/client/test_artifact.py b/python/pyspark/sql/tests/connect/client/test_artifact.py index f4f49ab251266..c886ff36d776f 100644 --- a/python/pyspark/sql/tests/connect/client/test_artifact.py +++ b/python/pyspark/sql/tests/connect/client/test_artifact.py @@ -25,7 +25,7 @@ from pyspark.sql import SparkSession from pyspark.testing.connectutils import ReusedConnectTestCase, should_test_connect from pyspark.testing.utils import SPARK_HOME -from pyspark.sql.functions import udf +from pyspark.sql.functions import udf, assert_true, lit if should_test_connect: from pyspark.sql.connect.client.artifact import ArtifactManager @@ -46,7 +46,7 @@ def func(x): return my_pyfile.my_func() spark_session.addArtifacts(pyfile_path, pyfile=True) - self.assertEqual(spark_session.range(1).select(func("id")).first()[0], 10) + spark_session.range(1).select(assert_true(func("id") == lit(10))).show() def test_add_pyfile(self): self.check_add_pyfile(self.spark) @@ -94,7 +94,7 @@ def func(x): return my_zipfile.my_func() spark_session.addArtifacts(f"{package_path}.zip", pyfile=True) - self.assertEqual(spark_session.range(1).select(func("id")).first()[0], 5) + spark_session.range(1).select(assert_true(func("id") == lit(5))).show() def test_add_zipped_package(self): self.check_add_zipped_package(self.spark) @@ -130,7 +130,7 @@ def func(x): ) as my_file: return my_file.read().strip() - self.assertEqual(spark_session.range(1).select(func("id")).first()[0], "hello world!") + spark_session.range(1).select(assert_true(func("id") == lit("hello world!"))).show() 
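
The artifact test changes above replace `first()[0]` equality checks with `assert_true`, which moves the verification onto the executors: the job itself fails if the UDF result is wrong, so nothing has to be collected back to the driver. A small sketch of the pattern, assuming a running `spark` session:

    from pyspark.sql.functions import udf, assert_true, lit

    @udf("int")
    def plus_nine(x):
        return x + 9

    # Fails the job on the executor side if any row violates the predicate;
    # show() merely forces execution.
    spark.range(1).select(assert_true(plus_nine("id") == lit(9))).show()
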
def test_add_archive(self): self.check_add_archive(self.spark) @@ -160,7 +160,7 @@ def func(x): with open(os.path.join(root, "my_file.txt"), "r") as my_file: return my_file.read().strip() - self.assertEqual(spark_session.range(1).select(func("id")).first()[0], "Hello world!!") + spark_session.range(1).select(assert_true(func("id") == lit("Hello world!!"))).show() def test_add_file(self): self.check_add_file(self.spark) @@ -425,33 +425,3 @@ def test_add_not_existing_artifact(self): self.artifact_manager.add_artifacts( os.path.join(d, "not_existing"), file=True, pyfile=False, archive=False ) - - -@unittest.skipIf(is_remote_only(), "Requires local cluster to run") -class LocalClusterArtifactTests(ReusedConnectTestCase, ArtifactTestsMixin): - @classmethod - def conf(cls): - return ( - super().conf().set("spark.driver.memory", "512M").set("spark.executor.memory", "512M") - ) - - @classmethod - def root(cls): - # In local cluster, we can mimic the production usage. - return "." - - @classmethod - def master(cls): - return "local-cluster[2,2,512]" - - -if __name__ == "__main__": - from pyspark.sql.tests.connect.client.test_artifact import * # noqa: F401 - - try: - import xmlrunner # type: ignore - - testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) - except ImportError: - testRunner = None - unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/connect/client/test_artifact_localcluster.py b/python/pyspark/sql/tests/connect/client/test_artifact_localcluster.py new file mode 100644 index 0000000000000..83584b83333ee --- /dev/null +++ b/python/pyspark/sql/tests/connect/client/test_artifact_localcluster.py @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest +import os + +from pyspark.sql.tests.connect.client.test_artifact import ArtifactTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase + + +class LocalClusterArtifactTests(ReusedConnectTestCase, ArtifactTestsMixin): + @classmethod + def conf(cls): + return ( + super().conf().set("spark.driver.memory", "512M").set("spark.executor.memory", "512M") + ) + + @classmethod + def root(cls): + # In local cluster, we can mimic the production usage. + return "." 
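
The new `test_artifact_localcluster.py` runs the same artifact tests against a `local-cluster` master, where executors are separate JVMs and artifacts genuinely have to be shipped. To the best of my knowledge the master string format is `local-cluster[numWorkers,coresPerWorker,memoryPerWorkerMB]`; a hedged, illustrative sketch of building such a session (memory values mirror the test configuration):

    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .master("local-cluster[2,2,512]")
        .config("spark.driver.memory", "512M")
        .config("spark.executor.memory", "512M")
        .getOrCreate()
    )
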
+ + @classmethod + def master(cls): + return os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local-cluster[2,2,512]") + + +if __name__ == "__main__": + from pyspark.sql.tests.connect.client.test_artifact_localcluster import * # noqa: F401 + + try: + import xmlrunner # type: ignore + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/connect/client/test_client.py b/python/pyspark/sql/tests/connect/client/test_client.py index b96fc44d50a7e..196c9eb5d81d8 100644 --- a/python/pyspark/sql/tests/connect/client/test_client.py +++ b/python/pyspark/sql/tests/connect/client/test_client.py @@ -18,13 +18,14 @@ import unittest import uuid from collections.abc import Generator -from typing import Optional, Any +from typing import Optional, Any, Union from pyspark.testing.connectutils import should_test_connect, connect_requirement_message from pyspark.testing.utils import eventually if should_test_connect: import grpc + from google.rpc import status_pb2 import pandas as pd import pyarrow as pa from pyspark.sql.connect.client import SparkConnectClient, DefaultChannelBuilder @@ -33,7 +34,7 @@ DefaultPolicy, ) from pyspark.sql.connect.client.reattach import ExecutePlanResponseReattachableIterator - from pyspark.errors import RetriesExceeded + from pyspark.errors import PySparkRuntimeError, RetriesExceeded import pyspark.sql.connect.proto as proto class TestPolicy(DefaultPolicy): @@ -50,9 +51,17 @@ def __init__(self): class TestException(grpc.RpcError, grpc.Call): """Exception mock to test retryable exceptions.""" - def __init__(self, msg, code=grpc.StatusCode.INTERNAL): + def __init__( + self, + msg, + code=grpc.StatusCode.INTERNAL, + trailing_status: Union[status_pb2.Status, None] = None, + ): self.msg = msg self._code = code + self._trailer: dict[str, Any] = {} + if trailing_status is not None: + self._trailer["grpc-status-details-bin"] = trailing_status.SerializeToString() def code(self): return self._code @@ -60,8 +69,11 @@ def code(self): def __str__(self): return self.msg + def details(self): + return self.msg + def trailing_metadata(self): - return () + return None if not self._trailer else self._trailer.items() class ResponseGenerator(Generator): """This class is used to generate values that are returned by the streaming @@ -340,6 +352,71 @@ def check(): eventually(timeout=1, catch_assertions=True)(check)() + def test_not_found_recovers(self): + """SPARK-48056: Assert that the client recovers from session or operation not + found error if no partial responses were previously received. 
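
The reworked `TestException` mock mirrors how gRPC carries rich error details: a serialized `google.rpc.Status` message stored under the `grpc-status-details-bin` trailing-metadata key, which the Connect client inspects to recognize errors such as `INVALID_HANDLE.SESSION_NOT_FOUND`. A standalone sketch of that encoding:

    from google.rpc import status_pb2

    status = status_pb2.Status(code=14, message="INVALID_HANDLE.SESSION_NOT_FOUND")
    trailer = {"grpc-status-details-bin": status.SerializeToString()}

    # A client recovers the structured status by deserializing the trailer value.
    decoded = status_pb2.Status.FromString(trailer["grpc-status-details-bin"])
    print(decoded.code, decoded.message)
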
+ """ + + def not_found_recovers(error_code: str): + def not_found(): + raise TestException( + error_code, + grpc.StatusCode.UNAVAILABLE, + trailing_status=status_pb2.Status(code=14, message=error_code, details=""), + ) + + stub = self._stub_with([not_found, self.finished]) + ite = ExecutePlanResponseReattachableIterator(self.request, stub, self.retrying, []) + + for _ in ite: + pass + + def checks(): + self.assertEqual(2, stub.execute_calls) + self.assertEqual(0, stub.attach_calls) + self.assertEqual(0, stub.release_calls) + self.assertEqual(0, stub.release_until_calls) + + eventually(timeout=1, catch_assertions=True)(checks)() + + parameters = ["INVALID_HANDLE.SESSION_NOT_FOUND", "INVALID_HANDLE.OPERATION_NOT_FOUND"] + for b in parameters: + not_found_recovers(b) + + def test_not_found_fails(self): + """SPARK-48056: Assert that the client fails from session or operation not found error + if a partial response was previously received. + """ + + def not_found_fails(error_code: str): + def not_found(): + raise TestException( + error_code, + grpc.StatusCode.UNAVAILABLE, + trailing_status=status_pb2.Status(code=14, message=error_code, details=""), + ) + + stub = self._stub_with([self.response], [not_found]) + + with self.assertRaises(PySparkRuntimeError) as e: + ite = ExecutePlanResponseReattachableIterator(self.request, stub, self.retrying, []) + for _ in ite: + pass + + self.assertTrue("RESPONSE_ALREADY_RECEIVED" in e.exception.getMessage()) + + def checks(): + self.assertEqual(1, stub.execute_calls) + self.assertEqual(1, stub.attach_calls) + self.assertEqual(0, stub.release_calls) + self.assertEqual(0, stub.release_until_calls) + + eventually(timeout=1, catch_assertions=True)(checks)() + + parameters = ["INVALID_HANDLE.SESSION_NOT_FOUND", "INVALID_HANDLE.OPERATION_NOT_FOUND"] + for b in parameters: + not_found_fails(b) + if __name__ == "__main__": from pyspark.sql.tests.connect.client.test_client import * # noqa: F401 diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py b/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py index 4598cbbdca4e1..d79bfef2426a4 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py @@ -20,6 +20,7 @@ from pyspark.sql.tests.streaming.test_streaming_foreach_batch import StreamingTestsForeachBatchMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.errors import PySparkPicklingError +from pyspark.errors.exceptions.connect import SparkConnectGrpcException class StreamingForeachBatchParityTests(StreamingTestsForeachBatchMixin, ReusedConnectTestCase): @@ -66,6 +67,41 @@ def func(df, _): q = df.writeStream.foreachBatch(func).start() q.processAllAvailable() + def test_worker_initialization_error(self): + class SerializableButNotDeserializable: + @staticmethod + def _reduce_function(): + raise ValueError("Cannot unpickle this object") + + def __reduce__(self): + # Return a static method that cannot be called during unpickling + return self._reduce_function, () + + # Create an instance of the class + obj = SerializableButNotDeserializable() + + df = ( + self.spark.readStream.format("rate") + .option("rowsPerSecond", "10") + .option("numPartitions", "1") + .load() + ) + + obj = SerializableButNotDeserializable() + + def fcn(df, _): + print(obj) + + # Assert that an exception occurs during the initialization + with self.assertRaises(SparkConnectGrpcException) as error: + 
df.select("value").writeStream.foreachBatch(fcn).start() + + # Assert that the error message contains the expected string + self.assertIn( + "Streaming Runner initialization failed", + str(error.exception), + ) + def test_accessing_spark_session(self): spark = self.spark diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py b/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py index be8c30c28ce0e..14edfa4003b23 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_listener.py @@ -23,50 +23,49 @@ from pyspark.sql.streaming.listener import StreamingQueryListener from pyspark.sql.functions import count, lit from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.utils import eventually # Listeners that has spark commands in callback handler functions -# V1: Initial interface of StreamingQueryListener containing methods `onQueryStarted`, -# `onQueryProgress`, `onQueryTerminated`. It is prior to Spark 3.5. -class TestListenerSparkV1(StreamingQueryListener): +class TestListenerSpark(StreamingQueryListener): def onQueryStarted(self, event): e = pyspark.cloudpickle.dumps(event) df = self.spark.createDataFrame(data=[(e,)]) - df.write.mode("append").saveAsTable("listener_start_events_v1") + df.write.mode("append").saveAsTable("listener_start_events") def onQueryProgress(self, event): e = pyspark.cloudpickle.dumps(event) df = self.spark.createDataFrame(data=[(e,)]) - df.write.mode("append").saveAsTable("listener_progress_events_v1") + df.write.mode("append").saveAsTable("listener_progress_events") + + def onQueryIdle(self, event): + pass def onQueryTerminated(self, event): e = pyspark.cloudpickle.dumps(event) df = self.spark.createDataFrame(data=[(e,)]) - df.write.mode("append").saveAsTable("listener_terminated_events_v1") + df.write.mode("append").saveAsTable("listener_terminated_events") -# V2: The interface after the method `onQueryIdle` is added. It is Spark 3.5+. -class TestListenerSparkV2(StreamingQueryListener): +# V1: Initial interface of StreamingQueryListener containing methods `onQueryStarted`, +# `onQueryProgress`, `onQueryTerminated`. It is prior to Spark 3.5. 
+class TestListenerLocalV1(StreamingQueryListener): + def __init__(self): + self.start = [] + self.progress = [] + self.terminated = [] + def onQueryStarted(self, event): - e = pyspark.cloudpickle.dumps(event) - df = self.spark.createDataFrame(data=[(e,)]) - df.write.mode("append").saveAsTable("listener_start_events_v2") + self.start.append(event) def onQueryProgress(self, event): - e = pyspark.cloudpickle.dumps(event) - df = self.spark.createDataFrame(data=[(e,)]) - df.write.mode("append").saveAsTable("listener_progress_events_v2") - - def onQueryIdle(self, event): - pass + self.progress.append(event) def onQueryTerminated(self, event): - e = pyspark.cloudpickle.dumps(event) - df = self.spark.createDataFrame(data=[(e,)]) - df.write.mode("append").saveAsTable("listener_terminated_events_v2") + self.terminated.append(event) -class TestListenerLocal(StreamingQueryListener): +class TestListenerLocalV2(StreamingQueryListener): def __init__(self): self.start = [] self.progress = [] @@ -87,19 +86,29 @@ def onQueryTerminated(self, event): class StreamingListenerParityTests(StreamingListenerTestsMixin, ReusedConnectTestCase): def test_listener_management(self): - listener1 = TestListenerLocal() - listener2 = TestListenerLocal() + listener1 = TestListenerLocalV1() + listener2 = TestListenerLocalV2() try: self.spark.streams.addListener(listener1) self.spark.streams.addListener(listener2) - q = self.spark.readStream.format("rate").load().writeStream.format("noop").start() + q = ( + self.spark.readStream.format("rate") + .load() + .writeStream.format("noop") + .queryName("test_local") + .start() + ) # Both listeners should have listener events already because onQueryStarted # is always called before DataStreamWriter.start() returns self.assertEqual(len(listener1.start), 1) self.assertEqual(len(listener2.start), 1) + self.check_start_event(listener1.start[0]) + self.check_start_event(listener2.start[0]) + while q.lastProgress is None: + q.awaitTermination(0.5) # removeListener is a blocking call, resources are cleaned up by the time it returns self.spark.streams.removeListener(listener1) self.spark.streams.removeListener(listener2) @@ -109,12 +118,13 @@ def test_listener_management(self): q.stop() # need to wait a while before QueryTerminatedEvent reaches client - time.sleep(15) + while len(listener1.terminated) == 0: + time.sleep(1) + self.assertEqual(len(listener1.terminated), 1) - self.check_start_event(listener1.start[0]) for event in listener1.progress: - self.check_progress_event(event) + self.check_progress_event(event, is_stateful=False) self.check_terminated_event(listener1.terminated[0]) finally: @@ -125,7 +135,7 @@ def test_listener_management(self): def test_slow_query(self): try: - listener = TestListenerLocal() + listener = TestListenerLocalV2() self.spark.streams.addListener(listener) slow_query = ( @@ -151,8 +161,12 @@ def test_slow_query(self): self.assertTrue(slow_query.id in [str(e.progress.id) for e in listener.progress]) self.assertTrue(fast_query.id in [str(e.progress.id) for e in listener.progress]) - self.assertTrue(slow_query.id in [str(e.id) for e in listener.terminated]) - self.assertTrue(fast_query.id in [str(e.id) for e in listener.terminated]) + eventually(timeout=20, catch_assertions=True)( + lambda: self.assertTrue(slow_query.id in [str(e.id) for e in listener.terminated]) + )() + eventually(timeout=20, catch_assertions=True)( + lambda: self.assertTrue(fast_query.id in [str(e.id) for e in listener.terminated]) + )() finally: for listener in 
self.spark.streams._sqlb._listener_bus: @@ -177,7 +191,7 @@ def onQueryTerminated(self, e): raise Exception("I'm so sorry!") try: - listener_good = TestListenerLocal() + listener_good = TestListenerLocalV2() listener_bad = UselessListener() self.spark.streams.addListener(listener_good) self.spark.streams.addListener(listener_bad) @@ -200,8 +214,14 @@ def onQueryTerminated(self, e): q.stop() def test_listener_events_spark_command(self): - def verify(test_listener, table_postfix): - try: + test_listener = TestListenerSpark() + + try: + with self.table( + "listener_start_events", + "listener_progress_events", + "listener_terminated_events", + ): self.spark.streams.addListener(test_listener) # This ensures the read socket on the server won't crash (i.e. because of timeout) @@ -214,56 +234,42 @@ def verify(test_listener, table_postfix): q = ( df_stateful.writeStream.format("noop") .queryName("test") - .outputMode("complete") + .outputMode("update") + .trigger(processingTime="5 seconds") .start() ) self.assertTrue(q.isActive) # ensure at least one batch is ran while q.lastProgress is None or q.lastProgress["batchId"] == 0: - q.awaitTermination(5) + q.awaitTermination(0.5) q.stop() self.assertFalse(q.isActive) - # Sleep to make sure listener_terminated_events is written successfully - time.sleep(60) - - start_table_name = "listener_start_events" + table_postfix - progress_tbl_name = "listener_progress_events" + table_postfix - terminated_tbl_name = "listener_terminated_events" + table_postfix + time.sleep( + 60 + ) # Sleep to make sure listener_terminated_events is written successfully start_event = pyspark.cloudpickle.loads( - self.spark.read.table(start_table_name).collect()[0][0] + self.spark.read.table("listener_start_events").collect()[0][0] ) progress_event = pyspark.cloudpickle.loads( - self.spark.read.table(progress_tbl_name).collect()[0][0] + self.spark.read.table("listener_progress_events").collect()[0][0] ) terminated_event = pyspark.cloudpickle.loads( - self.spark.read.table(terminated_tbl_name).collect()[0][0] + self.spark.read.table("listener_terminated_events").collect()[0][0] ) self.check_start_event(start_event) - self.check_progress_event(progress_event) + self.check_progress_event(progress_event, is_stateful=True) self.check_terminated_event(terminated_event) - finally: - self.spark.streams.removeListener(test_listener) - - # Remove again to verify this won't throw any error - self.spark.streams.removeListener(test_listener) - - with self.table( - "listener_start_events_v1", - "listener_progress_events_v1", - "listener_terminated_events_v1", - "listener_start_events_v2", - "listener_progress_events_v2", - "listener_terminated_events_v2", - ): - verify(TestListenerSparkV1(), "_v1") - verify(TestListenerSparkV2(), "_v2") + finally: + self.spark.streams.removeListener(test_listener) + # Remove again to verify this won't throw any error + self.spark.streams.removeListener(test_listener) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index 1c2c04f2da54f..9795e96bfa690 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -21,10 +21,7 @@ class StreamingParityTests(StreamingTestsMixin, ReusedConnectTestCase): def _assert_exception_tree_contains_msg(self, exception, msg): - self.assertTrue( - msg in exception._message, - "Exception tree doesn't contain 
the expected message: %s" % msg, - ) + self.assertIn(msg, exception._message) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 0d84764f53602..598c76a5b25fe 100755 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -16,9 +16,12 @@ # import os +import gc import unittest import shutil import tempfile +import io +from contextlib import redirect_stdout from pyspark.util import is_remote_only from pyspark.errors import PySparkTypeError, PySparkValueError @@ -33,6 +36,7 @@ ArrayType, Row, ) +from pyspark.testing.utils import eventually from pyspark.testing.sqlutils import SQLTestUtils from pyspark.testing.connectutils import ( should_test_connect, @@ -350,6 +354,24 @@ def test_simple_explain_string(self): result = df._explain_string() self.assertGreater(len(result), 0) + def _check_print_schema(self, query: str): + with io.StringIO() as buf, redirect_stdout(buf): + self.spark.sql(query).printSchema() + print1 = buf.getvalue() + with io.StringIO() as buf, redirect_stdout(buf): + self.connect.sql(query).printSchema() + print2 = buf.getvalue() + self.assertEqual(print1, print2, query) + + for level in [-1, 0, 1, 2, 3, 4]: + with io.StringIO() as buf, redirect_stdout(buf): + self.spark.sql(query).printSchema(level) + print1 = buf.getvalue() + with io.StringIO() as buf, redirect_stdout(buf): + self.connect.sql(query).printSchema(level) + print2 = buf.getvalue() + self.assertEqual(print1, print2, query) + def test_schema(self): schema = self.connect.read.table(self.tbl_name).schema self.assertEqual( @@ -371,6 +393,7 @@ def test_schema(self): self.spark.sql(query).schema, self.connect.sql(query).schema, ) + self._check_print_schema(query) # test TimestampType, DateType query = """ @@ -384,6 +407,7 @@ def test_schema(self): self.spark.sql(query).schema, self.connect.sql(query).schema, ) + self._check_print_schema(query) # test DayTimeIntervalType query = """ SELECT INTERVAL '100 10:30' DAY TO MINUTE AS interval """ @@ -391,6 +415,7 @@ def test_schema(self): self.spark.sql(query).schema, self.connect.sql(query).schema, ) + self._check_print_schema(query) # test MapType query = """ @@ -404,6 +429,7 @@ def test_schema(self): self.spark.sql(query).schema, self.connect.sql(query).schema, ) + self._check_print_schema(query) # test ArrayType query = """ @@ -417,6 +443,7 @@ def test_schema(self): self.spark.sql(query).schema, self.connect.sql(query).schema, ) + self._check_print_schema(query) # test StructType query = """ @@ -430,6 +457,7 @@ def test_schema(self): self.spark.sql(query).schema, self.connect.sql(query).schema, ) + self._check_print_schema(query) def test_to(self): # SPARK-41464: test DataFrame.to() @@ -540,7 +568,7 @@ def test_toDF(self): def test_print_schema(self): # SPARK-41216: Test print schema - tree_str = self.connect.sql("SELECT 1 AS X, 2 AS Y")._tree_string() + tree_str = self.connect.sql("SELECT 1 AS X, 2 AS Y").schema.treeString() # root # |-- X: integer (nullable = false) # |-- Y: integer (nullable = false) @@ -629,6 +657,18 @@ def test_deduplicate(self): self.assert_eq( df.dropDuplicates(["name"]).toPandas(), df2.dropDuplicates(["name"]).toPandas() ) + self.assert_eq( + df.drop_duplicates(["name"]).toPandas(), df2.drop_duplicates(["name"]).toPandas() + ) + self.assert_eq( + df.dropDuplicates(["name", "id"]).toPandas(), + df2.dropDuplicates(["name", "id"]).toPandas(), + ) + self.assert_eq( + 
df.drop_duplicates(["name", "id"]).toPandas(), + df2.drop_duplicates(["name", "id"]).toPandas(), + ) + self.assert_eq(df.dropDuplicates("name").toPandas(), df2.dropDuplicates("name").toPandas()) def test_drop(self): # SPARK-41169: test drop @@ -1359,6 +1399,92 @@ def test_verify_col_name(self): self.assertTrue(verify_col_name("`m```.`s.s`.`v`", cdf.schema)) +class SparkConnectGCTests(SparkConnectSQLTestCase): + @classmethod + def setUpClass(cls): + cls.origin = os.getenv("USER", None) + os.environ["USER"] = "SparkConnectGCTests" + super(SparkConnectGCTests, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + super(SparkConnectGCTests, cls).tearDownClass() + if cls.origin is not None: + os.environ["USER"] = cls.origin + else: + del os.environ["USER"] + + def test_garbage_collection_checkpoint(self): + # SPARK-48258: Make sure garbage-collecting DataFrame remove the paired state + # in Spark Connect server + df = self.connect.range(10).localCheckpoint() + self.assertIsNotNone(df._plan._relation_id) + cached_remote_relation_id = df._plan._relation_id + + jvm = self.spark._jvm + session_holder = getattr( + getattr( + jvm.org.apache.spark.sql.connect.service, + "SparkConnectService$", + ), + "MODULE$", + ).getOrCreateIsolatedSession(self.connect.client._user_id, self.connect.client._session_id) + + # Check the state exists. + self.assertIsNotNone( + session_holder.dataFrameCache().getOrDefault(cached_remote_relation_id, None) + ) + + del df + gc.collect() + + def condition(): + # Check the state was removed up on garbage-collection. + self.assertIsNone( + session_holder.dataFrameCache().getOrDefault(cached_remote_relation_id, None) + ) + + eventually(catch_assertions=True)(condition)() + + def test_garbage_collection_derived_checkpoint(self): + # SPARK-48258: Should keep the cached remote relation when derived DataFrames exist + df = self.connect.range(10).localCheckpoint() + self.assertIsNotNone(df._plan._relation_id) + derived = df.repartition(10) + cached_remote_relation_id = df._plan._relation_id + + jvm = self.spark._jvm + session_holder = getattr( + getattr( + jvm.org.apache.spark.sql.connect.service, + "SparkConnectService$", + ), + "MODULE$", + ).getOrCreateIsolatedSession(self.connect.client._user_id, self.connect.client._session_id) + + # Check the state exists. 
+ self.assertIsNotNone( + session_holder.dataFrameCache().getOrDefault(cached_remote_relation_id, None) + ) + + del df + gc.collect() + + def condition(): + self.assertIsNone( + session_holder.dataFrameCache().getOrDefault(cached_remote_relation_id, None) + ) + + # Should not remove the cache + with self.assertRaises(AssertionError): + eventually(catch_assertions=True, timeout=5)(condition)() + + del derived + gc.collect() + + eventually(catch_assertions=True)(condition)() + + if __name__ == "__main__": from pyspark.sql.tests.connect.test_connect_basic import * # noqa: F401 diff --git a/python/pyspark/sql/tests/connect/test_connect_column.py b/python/pyspark/sql/tests/connect/test_connect_column.py index 5a1cccc6e1720..fbfb4486446ff 100644 --- a/python/pyspark/sql/tests/connect/test_connect_column.py +++ b/python/pyspark/sql/tests/connect/test_connect_column.py @@ -51,7 +51,7 @@ from pyspark.sql.connect import functions as CF from pyspark.sql.connect.column import Column from pyspark.sql.connect.expressions import DistributedSequenceID, LiteralExpression - from pyspark.sql.connect.types import ( + from pyspark.util import ( JVM_BYTE_MIN, JVM_BYTE_MAX, JVM_SHORT_MIN, @@ -65,7 +65,7 @@ class SparkConnectColumnTests(SparkConnectSQLTestCase): def compare_by_show(self, df1, df2, n: int = 20, truncate: int = 20): - from pyspark.sql.dataframe import DataFrame as SDF + from pyspark.sql.classic.dataframe import DataFrame as SDF from pyspark.sql.connect.dataframe import DataFrame as CDF assert isinstance(df1, (SDF, CDF)) @@ -772,8 +772,8 @@ def test_column_accessor(self): sdf.select(sdf.z[0], sdf.z[1], sdf["z"][2]).toPandas(), ) self.assert_eq( - cdf.select(CF.col("z")[0], cdf.z[10], CF.col("z")[-10]).toPandas(), - sdf.select(SF.col("z")[0], sdf.z[10], SF.col("z")[-10]).toPandas(), + cdf.select(CF.col("z")[0], CF.get(cdf.z, 10), CF.get(CF.col("z"), -10)).toPandas(), + sdf.select(SF.col("z")[0], SF.get(sdf.z, 10), SF.get(SF.col("z"), -10)).toPandas(), ) self.assert_eq( cdf.select(cdf.z.getItem(0), cdf.z.getItem(1), cdf["z"].getField(2)).toPandas(), @@ -824,8 +824,12 @@ def test_column_arithmetic_ops(self): ) self.assert_eq( - cdf.select(cdf.a % cdf["b"], cdf["a"] % 2, 12 % cdf.c).toPandas(), - sdf.select(sdf.a % sdf["b"], sdf["a"] % 2, 12 % sdf.c).toPandas(), + cdf.select( + cdf.a % cdf["b"], cdf["a"] % 2, CF.try_remainder(CF.lit(12), cdf.c) + ).toPandas(), + sdf.select( + sdf.a % sdf["b"], sdf["a"] % 2, SF.try_remainder(SF.lit(12), sdf.c) + ).toPandas(), ) self.assert_eq( @@ -1020,15 +1024,33 @@ def test_distributed_sequence_id(self): expected.collect(), ) + def test_lambda_str_representation(self): + from pyspark.sql.connect.expressions import UnresolvedNamedLambdaVariable + + # forcely clear the internal increasing id, + # otherwise the string representation varies with this id + UnresolvedNamedLambdaVariable._nextVarNameId = 0 + + c = CF.array_sort( + "data", + lambda x, y: CF.when(x.isNull() | y.isNull(), CF.lit(0)).otherwise( + CF.length(y) - CF.length(x) + ), + ) + + self.assertEqual( + str(c), + ( + """Column<'array_sort(data, LambdaFunction(CASE WHEN or(isNull(x_0), """ + """isNull(y_1)) THEN 0 ELSE -(length(y_1), length(x_0)) END, x_0, y_1))'>""" + ), + ) + if __name__ == "__main__": - import os import unittest from pyspark.sql.tests.connect.test_connect_column import * # noqa: F401 - # TODO(SPARK-41794): Enable ANSI mode in this file. 
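
The column-test changes above swap `z[10]` / `z[-10]` and `12 % c` for `get` and `try_remainder`. As I understand it, the reason is that with ANSI SQL mode enabled the bracket and `%` forms can raise on out-of-range indices or division by zero, while the `get` / `try_*` variants return NULL instead, keeping Classic and Connect results comparable. A hedged sketch, assuming a running `spark` session on a Spark version that ships `try_remainder`:

    from pyspark.sql import functions as F

    df = spark.createDataFrame([([1, 2, 3], 0)], ["z", "c"])
    df.select(
        F.get("z", 10).alias("oob"),                 # NULL: index past the end of the array
        F.try_remainder(F.lit(12), "c").alias("r"),  # NULL: remainder by zero, no error
    ).show()
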
- os.environ["SPARK_ANSI_SQL_MODE"] = "false" - try: import xmlrunner diff --git a/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py b/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py new file mode 100644 index 0000000000000..c712e5d6efcb6 --- /dev/null +++ b/python/pyspark/sql/tests/connect/test_connect_dataframe_property.py @@ -0,0 +1,442 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType +from pyspark.sql.utils import is_remote + +from pyspark.sql import functions as SF +from pyspark.sql.connect import functions as CF + +from pyspark.sql.tests.connect.test_connect_basic import SparkConnectSQLTestCase +from pyspark.testing.sqlutils import ( + have_pandas, + have_pyarrow, + pandas_requirement_message, + pyarrow_requirement_message, +) + +if have_pyarrow: + import pyarrow as pa + import pyarrow.compute as pc + +if have_pandas: + import pandas as pd + + +class SparkConnectDataFramePropertyTests(SparkConnectSQLTestCase): + def test_cached_property_is_copied(self): + schema = StructType( + [ + StructField("id", IntegerType(), True), + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("city", StringType(), True), + ] + ) + # Create some dummy data + data = [ + (1, "Alice", 30, "New York"), + (2, "Bob", 25, "San Francisco"), + (3, "Cathy", 29, "Los Angeles"), + (4, "David", 35, "Chicago"), + ] + df = self.spark.createDataFrame(data, schema) + df_columns = df.columns + assert len(df.columns) == 4 + for col in ["id", "name"]: + df_columns.remove(col) + assert len(df.columns) == 4 + + def test_cached_schema_to(self): + cdf = self.connect.read.table(self.tbl_name) + sdf = self.spark.read.table(self.tbl_name) + + schema = StructType( + [ + StructField("id", IntegerType(), True), + StructField("name", StringType(), True), + ] + ) + + cdf1 = cdf.to(schema) + self.assertEqual(cdf1._cached_schema, schema) + + sdf1 = sdf.to(schema) + + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertEqual(cdf1.collect(), sdf1.collect()) + + @unittest.skipIf( + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message, + ) + def test_cached_schema_map_in_pandas(self): + data = [(1, "foo"), (2, None), (3, "bar"), (4, "bar")] + cdf = self.connect.createDataFrame(data, "a int, b string") + sdf = self.spark.createDataFrame(data, "a int, b string") + + def func(iterator): + for pdf in iterator: + assert isinstance(pdf, pd.DataFrame) + assert [d.name for d in list(pdf.dtypes)] == ["int32", "object"] + yield pdf + + schema = StructType( + [ + StructField("a", IntegerType(), True), + StructField("b", StringType(), True), + ] + ) + + with 
self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf1 = cdf.mapInPandas(func, schema) + self.assertEqual(cdf1._cached_schema, schema) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): + # 'mapInPandas' depends on the method 'pandas_udf', which is dispatched + # based on 'is_remote'. However, in SparkConnectSQLTestCase, the remote + # mode is always on, so 'sdf.mapInPandas' fails with incorrect dispatch. + # Using this temp env to properly invoke mapInPandas in PySpark Classic. + self.assertFalse(is_remote()) + sdf1 = sdf.mapInPandas(func, schema) + + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertEqual(cdf1.collect(), sdf1.collect()) + + @unittest.skipIf( + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message, + ) + def test_cached_schema_map_in_arrow(self): + data = [(1, "foo"), (2, None), (3, "bar"), (4, "bar")] + cdf = self.connect.createDataFrame(data, "a int, b string") + sdf = self.spark.createDataFrame(data, "a int, b string") + + def func(iterator): + for batch in iterator: + assert isinstance(batch, pa.RecordBatch) + assert batch.schema.types == [pa.int32(), pa.string()] + yield batch + + schema = StructType( + [ + StructField("a", IntegerType(), True), + StructField("b", StringType(), True), + ] + ) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf1 = cdf.mapInArrow(func, schema) + self.assertEqual(cdf1._cached_schema, schema) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): + self.assertFalse(is_remote()) + sdf1 = sdf.mapInArrow(func, schema) + + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertEqual(cdf1.collect(), sdf1.collect()) + + @unittest.skipIf( + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message, + ) + def test_cached_schema_group_apply_in_pandas(self): + data = [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)] + cdf = self.connect.createDataFrame(data, ("id", "v")) + sdf = self.spark.createDataFrame(data, ("id", "v")) + + def normalize(pdf): + v = pdf.v + return pdf.assign(v=(v - v.mean()) / v.std()) + + schema = StructType( + [ + StructField("id", LongType(), True), + StructField("v", DoubleType(), True), + ] + ) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf1 = cdf.groupby("id").applyInPandas(normalize, schema) + self.assertEqual(cdf1._cached_schema, schema) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): + self.assertFalse(is_remote()) + sdf1 = sdf.groupby("id").applyInPandas(normalize, schema) + + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertEqual(cdf1.collect(), sdf1.collect()) + + @unittest.skipIf( + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message, + ) + def test_cached_schema_group_apply_in_arrow(self): + data = [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)] + cdf = self.connect.createDataFrame(data, ("id", "v")) + sdf = self.spark.createDataFrame(data, ("id", "v")) + + def normalize(table): + v = table.column("v") + norm = pc.divide(pc.subtract(v, pc.mean(v)), pc.stddev(v, ddof=1)) + return table.set_column(1, "v", norm) + + schema = StructType( + [ + StructField("id", LongType(), True), + StructField("v", DoubleType(), True), + ] + ) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf1 = cdf.groupby("id").applyInArrow(normalize, schema) + 
self.assertEqual(cdf1._cached_schema, schema) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): + self.assertFalse(is_remote()) + sdf1 = sdf.groupby("id").applyInArrow(normalize, schema) + + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertEqual(cdf1.collect(), sdf1.collect()) + + @unittest.skipIf( + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message, + ) + def test_cached_schema_cogroup_apply_in_pandas(self): + data1 = [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)] + data2 = [(20000101, 1, "x"), (20000101, 2, "y")] + + cdf1 = self.connect.createDataFrame(data1, ("time", "id", "v1")) + sdf1 = self.spark.createDataFrame(data1, ("time", "id", "v1")) + cdf2 = self.connect.createDataFrame(data2, ("time", "id", "v2")) + sdf2 = self.spark.createDataFrame(data2, ("time", "id", "v2")) + + def asof_join(left, right): + return pd.merge_asof(left, right, on="time", by="id") + + schema = StructType( + [ + StructField("time", IntegerType(), True), + StructField("id", IntegerType(), True), + StructField("v1", DoubleType(), True), + StructField("v2", StringType(), True), + ] + ) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf3 = cdf1.groupby("id").cogroup(cdf2.groupby("id")).applyInPandas(asof_join, schema) + self.assertEqual(cdf3._cached_schema, schema) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): + self.assertFalse(is_remote()) + sdf3 = sdf1.groupby("id").cogroup(sdf2.groupby("id")).applyInPandas(asof_join, schema) + + self.assertEqual(cdf3.schema, sdf3.schema) + self.assertEqual(cdf3.collect(), sdf3.collect()) + + @unittest.skipIf( + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message, + ) + def test_cached_schema_cogroup_apply_in_arrow(self): + data1 = [(1, 1.0), (2, 2.0), (1, 3.0), (2, 4.0)] + data2 = [(1, "x"), (2, "y")] + + cdf1 = self.connect.createDataFrame(data1, ("id", "v1")) + sdf1 = self.spark.createDataFrame(data1, ("id", "v1")) + cdf2 = self.connect.createDataFrame(data2, ("id", "v2")) + sdf2 = self.spark.createDataFrame(data2, ("id", "v2")) + + def summarize(left, right): + return pa.Table.from_pydict( + { + "left": [left.num_rows], + "right": [right.num_rows], + } + ) + + schema = StructType( + [ + StructField("left", LongType(), True), + StructField("right", LongType(), True), + ] + ) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": "1"}): + self.assertTrue(is_remote()) + cdf3 = cdf1.groupby("id").cogroup(cdf2.groupby("id")).applyInArrow(summarize, schema) + self.assertEqual(cdf3._cached_schema, schema) + + with self.temp_env({"SPARK_CONNECT_MODE_ENABLED": None}): + self.assertFalse(is_remote()) + sdf3 = sdf1.groupby("id").cogroup(sdf2.groupby("id")).applyInArrow(summarize, schema) + + self.assertEqual(cdf3.schema, sdf3.schema) + self.assertEqual(cdf3.collect(), sdf3.collect()) + + def test_cached_schema_set_op(self): + data1 = [(1, 2, 3)] + data2 = [(6, 2, 5)] + data3 = [(6, 2, 5.0)] + + cdf1 = self.connect.createDataFrame(data1, ["a", "b", "c"]) + sdf1 = self.spark.createDataFrame(data1, ["a", "b", "c"]) + cdf2 = self.connect.createDataFrame(data2, ["a", "b", "c"]) + sdf2 = self.spark.createDataFrame(data2, ["a", "b", "c"]) + cdf3 = self.connect.createDataFrame(data3, ["a", "b", "c"]) + sdf3 = self.spark.createDataFrame(data3, ["a", "b", "c"]) + + # schema not yet cached + self.assertTrue(cdf1._cached_schema is None) + self.assertTrue(cdf2._cached_schema is None) + 
self.assertTrue(cdf3._cached_schema is None) + + # no cached schema in result dataframe + self.assertTrue(cdf1.union(cdf1)._cached_schema is None) + self.assertTrue(cdf1.union(cdf2)._cached_schema is None) + self.assertTrue(cdf1.union(cdf3)._cached_schema is None) + + self.assertTrue(cdf1.unionAll(cdf1)._cached_schema is None) + self.assertTrue(cdf1.unionAll(cdf2)._cached_schema is None) + self.assertTrue(cdf1.unionAll(cdf3)._cached_schema is None) + + self.assertTrue(cdf1.unionByName(cdf1)._cached_schema is None) + self.assertTrue(cdf1.unionByName(cdf2)._cached_schema is None) + self.assertTrue(cdf1.unionByName(cdf3)._cached_schema is None) + + self.assertTrue(cdf1.subtract(cdf1)._cached_schema is None) + self.assertTrue(cdf1.subtract(cdf2)._cached_schema is None) + self.assertTrue(cdf1.subtract(cdf3)._cached_schema is None) + + self.assertTrue(cdf1.exceptAll(cdf1)._cached_schema is None) + self.assertTrue(cdf1.exceptAll(cdf2)._cached_schema is None) + self.assertTrue(cdf1.exceptAll(cdf3)._cached_schema is None) + + self.assertTrue(cdf1.intersect(cdf1)._cached_schema is None) + self.assertTrue(cdf1.intersect(cdf2)._cached_schema is None) + self.assertTrue(cdf1.intersect(cdf3)._cached_schema is None) + + self.assertTrue(cdf1.intersectAll(cdf1)._cached_schema is None) + self.assertTrue(cdf1.intersectAll(cdf2)._cached_schema is None) + self.assertTrue(cdf1.intersectAll(cdf3)._cached_schema is None) + + # trigger analysis of cdf1.schema + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertTrue(cdf1._cached_schema is not None) + + self.assertEqual(cdf1.union(cdf1)._cached_schema, cdf1._cached_schema) + # cannot infer when cdf2 doesn't cache schema + self.assertTrue(cdf1.union(cdf2)._cached_schema is None) + # cannot infer when cdf3 doesn't cache schema + self.assertTrue(cdf1.union(cdf3)._cached_schema is None) + + # trigger analysis of cdf2.schema, cdf3.schema + self.assertEqual(cdf2.schema, sdf2.schema) + self.assertEqual(cdf3.schema, sdf3.schema) + + # now all the schemas are cached + self.assertTrue(cdf1._cached_schema is not None) + self.assertTrue(cdf2._cached_schema is not None) + self.assertTrue(cdf3._cached_schema is not None) + + self.assertEqual(cdf1.union(cdf1)._cached_schema, cdf1._cached_schema) + self.assertEqual(cdf1.union(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.union(cdf3)._cached_schema is None) + + self.assertEqual(cdf1.unionAll(cdf1)._cached_schema, cdf1._cached_schema) + self.assertEqual(cdf1.unionAll(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.unionAll(cdf3)._cached_schema is None) + + self.assertEqual(cdf1.unionByName(cdf1)._cached_schema, cdf1._cached_schema) + self.assertEqual(cdf1.unionByName(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.unionByName(cdf3)._cached_schema is None) + + self.assertEqual(cdf1.subtract(cdf1)._cached_schema, cdf1._cached_schema) + self.assertEqual(cdf1.subtract(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.subtract(cdf3)._cached_schema is None) + + self.assertEqual(cdf1.exceptAll(cdf1)._cached_schema, cdf1._cached_schema) + self.assertEqual(cdf1.exceptAll(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.exceptAll(cdf3)._cached_schema is None) + + self.assertEqual(cdf1.intersect(cdf1)._cached_schema, cdf1._cached_schema) + 
self.assertEqual(cdf1.intersect(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.intersect(cdf3)._cached_schema is None) + + self.assertEqual(cdf1.intersectAll(cdf1)._cached_schema, cdf1._cached_schema) + self.assertEqual(cdf1.intersectAll(cdf2)._cached_schema, cdf1._cached_schema) + # cannot infer when schemas mismatch + self.assertTrue(cdf1.intersectAll(cdf3)._cached_schema is None) + + def test_cached_schema_in_chain_op(self): + data = [(1, 1.0), (2, 2.0), (1, 3.0), (2, 4.0)] + + cdf = self.connect.createDataFrame(data, ("id", "v1")) + sdf = self.spark.createDataFrame(data, ("id", "v1")) + + cdf1 = cdf.withColumn("v2", CF.lit(1)) + sdf1 = sdf.withColumn("v2", SF.lit(1)) + + self.assertTrue(cdf1._cached_schema is None) + # trigger analysis of cdf1.schema + self.assertEqual(cdf1.schema, sdf1.schema) + self.assertTrue(cdf1._cached_schema is not None) + + cdf2 = cdf1.where(cdf1.v2 > 0) + sdf2 = sdf1.where(sdf1.v2 > 0) + self.assertEqual(cdf1._cached_schema, cdf2._cached_schema) + + cdf3 = cdf2.repartition(10) + sdf3 = sdf2.repartition(10) + self.assertEqual(cdf1._cached_schema, cdf3._cached_schema) + + cdf4 = cdf3.distinct() + sdf4 = sdf3.distinct() + self.assertEqual(cdf1._cached_schema, cdf4._cached_schema) + + cdf5 = cdf4.sample(fraction=0.5) + sdf5 = sdf4.sample(fraction=0.5) + self.assertEqual(cdf1._cached_schema, cdf5._cached_schema) + + self.assertEqual(cdf5.schema, sdf5.schema) + + +if __name__ == "__main__": + from pyspark.sql.tests.connect.test_connect_dataframe_property import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/connect/test_connect_error.py b/python/pyspark/sql/tests/connect/test_connect_error.py index 1297e62bb96f7..d5d9f9a221847 100644 --- a/python/pyspark/sql/tests/connect/test_connect_error.py +++ b/python/pyspark/sql/tests/connect/test_connect_error.py @@ -21,6 +21,7 @@ from pyspark.errors.exceptions.base import SessionNotSameException from pyspark.sql.types import Row from pyspark.testing.connectutils import should_test_connect +from pyspark.errors import PySparkTypeError from pyspark.errors.exceptions.connect import AnalysisException from pyspark.sql.tests.connect.test_connect_basic import SparkConnectSQLTestCase @@ -158,12 +159,10 @@ def test_different_spark_session_join_or_union(self): def test_unsupported_functions(self): # SPARK-41225: Disable unsupported functions. df = self.connect.read.table(self.tbl_name) - for f in ( - "checkpoint", - "localCheckpoint", - ): - with self.assertRaises(NotImplementedError): - getattr(df, f)() + with self.assertRaises(NotImplementedError): + df.toJSON() + with self.assertRaises(NotImplementedError): + df.rdd def test_unsupported_jvm_attribute(self): # Unsupported jvm attributes for Spark session. 
@@ -216,6 +215,21 @@ def test_column_cannot_be_constructed_from_string(self): with self.assertRaises(TypeError): Column("col") + def test_select_none(self): + with self.assertRaises(PySparkTypeError) as e1: + self.connect.range(1).select(None) + + self.check_error( + exception=e1.exception, + error_class="NOT_LIST_OF_COLUMN_OR_STR", + message_parameters={"arg_name": "columns"}, + ) + + def test_ym_interval_in_collect(self): + # YearMonthIntervalType is not supported in python side arrow conversion + with self.assertRaises(PySparkTypeError): + self.connect.sql("SELECT INTERVAL '10-8' YEAR TO MONTH AS interval").first() + if __name__ == "__main__": from pyspark.sql.tests.connect.test_connect_error import * # noqa: F401 diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py index 581fde3e62933..0f0abfd4b8567 100644 --- a/python/pyspark/sql/tests/connect/test_connect_function.py +++ b/python/pyspark/sql/tests/connect/test_connect_function.py @@ -21,7 +21,14 @@ from pyspark.util import is_remote_only from pyspark.errors import PySparkTypeError, PySparkValueError from pyspark.sql import SparkSession as PySparkSession -from pyspark.sql.types import StringType, StructType, StructField, ArrayType, IntegerType +from pyspark.sql.types import ( + _drop_metadata, + StringType, + StructType, + StructField, + ArrayType, + IntegerType, +) from pyspark.testing import assertDataFrameEqual from pyspark.testing.pandasutils import PandasOnSparkTestUtils from pyspark.testing.connectutils import ReusedConnectTestCase, should_test_connect @@ -32,10 +39,8 @@ from pyspark.sql.connect.column import Column from pyspark.sql import functions as SF from pyspark.sql.window import Window as SW - from pyspark.sql.dataframe import DataFrame as SDF from pyspark.sql.connect import functions as CF from pyspark.sql.connect.window import Window as CW - from pyspark.sql.connect.dataframe import DataFrame as CDF @unittest.skipIf(is_remote_only(), "Requires JVM access") @@ -60,6 +65,9 @@ def tearDownClass(cls): del os.environ["PYSPARK_NO_NAMESPACE_SHARE"] def compare_by_show(self, df1, df2, n: int = 20, truncate: int = 20): + from pyspark.sql.classic.dataframe import DataFrame as SDF + from pyspark.sql.connect.dataframe import DataFrame as CDF + assert isinstance(df1, (SDF, CDF)) if isinstance(df1, SDF): str1 = df1._jdf.showString(n, truncate, False) @@ -1667,7 +1675,7 @@ def test_nested_lambda_function(self): ) # TODO: 'cdf.schema' has an extra metadata '{'__autoGeneratedAlias': 'true'}' - # self.assertEqual(cdf.schema, sdf.schema) + self.assertEqual(_drop_metadata(cdf.schema), _drop_metadata(sdf.schema)) self.assertEqual(cdf.collect(), sdf.collect()) def test_csv_functions(self): @@ -2029,7 +2037,6 @@ def test_string_functions_one_arg(self): (CF.sentences, SF.sentences), (CF.initcap, SF.initcap), (CF.soundex, SF.soundex), - (CF.bin, SF.bin), (CF.hex, SF.hex), (CF.unhex, SF.unhex), (CF.length, SF.length), @@ -2042,6 +2049,19 @@ def test_string_functions_one_arg(self): sdf.select(sfunc("a"), sfunc(sdf.b)).toPandas(), ) + query = """ + SELECT * FROM VALUES + (' 1 ', '2 ', NULL), (' 3', NULL, '4') + AS tab(a, b, c) + """ + cdf = self.connect.sql(query) + sdf = self.spark.sql(query) + + self.assert_eq( + cdf.select(CF.bin(cdf.a), CF.bin(cdf.b)).toPandas(), + sdf.select(SF.bin(sdf.a), SF.bin(sdf.b)).toPandas(), + ) + def test_string_functions_multi_args(self): query = """ SELECT * FROM VALUES @@ -2148,15 +2168,15 @@ def test_string_functions_multi_args(self): 
def test_date_ts_functions(self): query = """ SELECT * FROM VALUES - ('1997/02/28 10:30:00', '2023/03/01 06:00:00', 'JST', 1428476400, 2020, 12, 6), - ('2000/01/01 04:30:05', '2020/05/01 12:15:00', 'PST', 1403892395, 2022, 12, 6) + ('1997-02-28 10:30:00', '2023-03-01 06:00:00', 'JST', 1428476400, 2020, 12, 6), + ('2000-01-01 04:30:05', '2020-05-01 12:15:00', 'PST', 1403892395, 2022, 12, 6) AS tab(ts1, ts2, tz, seconds, Y, M, D) """ # +-------------------+-------------------+---+----------+----+---+---+ # | ts1| ts2| tz| seconds| Y| M| D| # +-------------------+-------------------+---+----------+----+---+---+ - # |1997/02/28 10:30:00|2023/03/01 06:00:00|JST|1428476400|2020| 12| 6| - # |2000/01/01 04:30:05|2020/05/01 12:15:00|PST|1403892395|2022| 12| 6| + # |1997-02-28 10:30:00|2023-03-01 06:00:00|JST|1428476400|2020| 12| 6| + # |2000-01-01 04:30:05|2020-05-01 12:15:00|PST|1403892395|2022| 12| 6| # +-------------------+-------------------+---+----------+----+---+---+ cdf = self.connect.sql(query) @@ -2212,14 +2232,14 @@ def test_date_ts_functions(self): (CF.to_date, SF.to_date), ]: self.assert_eq( - cdf.select(cfunc(cdf.ts1, format="yyyy-MM-dd")).toPandas(), - sdf.select(sfunc(sdf.ts1, format="yyyy-MM-dd")).toPandas(), + cdf.select(cfunc(cdf.ts1, format="yyyy-MM-dd HH:mm:ss")).toPandas(), + sdf.select(sfunc(sdf.ts1, format="yyyy-MM-dd HH:mm:ss")).toPandas(), ) self.compare_by_show( # [left]: datetime64[ns, America/Los_Angeles] # [right]: datetime64[ns] - cdf.select(CF.to_timestamp(cdf.ts1, format="yyyy-MM-dd")), - sdf.select(SF.to_timestamp(sdf.ts1, format="yyyy-MM-dd")), + cdf.select(CF.to_timestamp(cdf.ts1, format="yyyy-MM-dd HH:mm:ss")), + sdf.select(SF.to_timestamp(sdf.ts1, format="yyyy-MM-dd HH:mm:ss")), ) # With tz parameter @@ -2589,9 +2609,6 @@ def test_non_deterministic_with_seed(self): import os from pyspark.sql.tests.connect.test_connect_function import * # noqa: F401 - # TODO(SPARK-41547): Enable ANSI mode in this file. 
- os.environ["SPARK_ANSI_SQL_MODE"] = "false" - try: import xmlrunner # type: ignore diff --git a/python/pyspark/sql/tests/connect/test_connect_plan.py b/python/pyspark/sql/tests/connect/test_connect_plan.py index 3a221cacedb27..47e3fb5a96023 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan.py @@ -333,6 +333,11 @@ def test_observe(self): from pyspark.sql.connect.observation import Observation class MockDF(DataFrame): + def __new__(cls, df: DataFrame) -> "DataFrame": + self = object.__new__(cls) + self.__init__(df) # type: ignore[misc] + return self + def __init__(self, df: DataFrame): super().__init__(df._plan, df._session) @@ -438,7 +443,7 @@ def test_sample(self): self.assertEqual(plan.root.sample.lower_bound, 0.0) self.assertEqual(plan.root.sample.upper_bound, 0.3) self.assertEqual(plan.root.sample.with_replacement, False) - self.assertEqual(plan.root.sample.HasField("seed"), False) + self.assertEqual(plan.root.sample.HasField("seed"), True) self.assertEqual(plan.root.sample.deterministic_order, False) plan = ( @@ -548,13 +553,25 @@ def test_deduplicate(self): self.assertEqual(deduplicate_on_all_columns_plan.root.deduplicate.all_columns_as_keys, True) self.assertEqual(len(deduplicate_on_all_columns_plan.root.deduplicate.column_names), 0) - deduplicate_on_subset_columns_plan = df.dropDuplicates(["name", "height"])._plan.to_proto( - self.connect + deduplicate_on_subset_columns_plan_list_arg = df.dropDuplicates( + ["name", "height"] + )._plan.to_proto(self.connect) + self.assertEqual( + deduplicate_on_subset_columns_plan_list_arg.root.deduplicate.all_columns_as_keys, False + ) + self.assertEqual( + len(deduplicate_on_subset_columns_plan_list_arg.root.deduplicate.column_names), 2 + ) + + deduplicate_on_subset_columns_plan_var_arg = df.dropDuplicates( + "name", "height" + )._plan.to_proto(self.connect) + self.assertEqual( + deduplicate_on_subset_columns_plan_var_arg.root.deduplicate.all_columns_as_keys, False ) self.assertEqual( - deduplicate_on_subset_columns_plan.root.deduplicate.all_columns_as_keys, False + len(deduplicate_on_subset_columns_plan_var_arg.root.deduplicate.column_names), 2 ) - self.assertEqual(len(deduplicate_on_subset_columns_plan.root.deduplicate.column_names), 2) def test_relation_alias(self): df = self.connect.readTable(table_name=self.tbl_name) diff --git a/python/pyspark/sql/tests/connect/test_connect_session.py b/python/pyspark/sql/tests/connect/test_connect_session.py index 1caf3525cfbbc..1dd5cde0dff50 100644 --- a/python/pyspark/sql/tests/connect/test_connect_session.py +++ b/python/pyspark/sql/tests/connect/test_connect_session.py @@ -242,6 +242,34 @@ def toChannel(self): session = RemoteSparkSession.builder.channelBuilder(CustomChannelBuilder()).create() session.sql("select 1 + 1") + def test_reset_when_server_and_client_sessionids_mismatch(self): + session = RemoteSparkSession.builder.remote("sc://localhost").getOrCreate() + # run a simple query so the session id is synchronized. + session.range(3).collect() + + # trigger a mismatch between client session id and server session id. 
+        session._client._session_id = str(uuid.uuid4())
+        with self.assertRaises(SparkConnectException):
+            session.range(3).collect()
+
+        # assert that getOrCreate() generates a new session
+        session = RemoteSparkSession.builder.remote("sc://localhost").getOrCreate()
+        session.range(3).collect()
+
+    def test_reset_when_server_session_id_mismatch(self):
+        session = RemoteSparkSession.builder.remote("sc://localhost").getOrCreate()
+        # run a simple query so the session id is synchronized.
+        session.range(3).collect()
+
+        # trigger a mismatch
+        session._client._server_session_id = str(uuid.uuid4())
+        with self.assertRaises(SparkConnectException):
+            session.range(3).collect()
+
+        # assert that getOrCreate() generates a new session
+        session = RemoteSparkSession.builder.remote("sc://localhost").getOrCreate()
+        session.range(3).collect()
+

 class SparkConnectSessionWithOptionsTest(unittest.TestCase):
     def setUp(self) -> None:
diff --git a/python/pyspark/sql/tests/connect/test_df_debug.py b/python/pyspark/sql/tests/connect/test_df_debug.py
new file mode 100644
index 0000000000000..8a4ec68fda844
--- /dev/null
+++ b/python/pyspark/sql/tests/connect/test_df_debug.py
@@ -0,0 +1,86 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+from pyspark.testing.connectutils import (
+    should_test_connect,
+    have_graphviz,
+    graphviz_requirement_message,
+)
+from pyspark.sql.tests.connect.test_connect_basic import SparkConnectSQLTestCase
+
+if should_test_connect:
+    from pyspark.sql.connect.dataframe import DataFrame
+
+
+class SparkConnectDataFrameDebug(SparkConnectSQLTestCase):
+    def test_df_debug_basics(self):
+        df: DataFrame = self.connect.range(100).repartition(10).groupBy("id").count()
+        x = df.collect()  # noqa: F841
+        ei = df.executionInfo
+
+        root, graph = ei.metrics.extract_graph()
+        self.assertIn(root, graph, "The root must be rooted in the graph")
+
+    def test_df_query_execution_empty_before_execution(self):
+        df: DataFrame = self.connect.range(100).repartition(10).groupBy("id").count()
+        ei = df.executionInfo
+        self.assertIsNone(ei, "The query execution must be None before the action is executed")
+
+    def test_df_query_execution_with_writes(self):
+        df: DataFrame = self.connect.range(100).repartition(10).groupBy("id").count()
+        df.write.save("/tmp/test_df_query_execution_with_writes", format="json", mode="overwrite")
+        ei = df.executionInfo
+        self.assertIsNotNone(
+            ei, "The query execution must not be None after the write action is executed"
+        )
+
+    def test_query_execution_text_format(self):
+        df: DataFrame = self.connect.range(100).repartition(10).groupBy("id").count()
+        df.collect()
+        self.assertIn("HashAggregate", df.executionInfo.metrics.toText())
+
+        # Different execution mode.
+ df: DataFrame = self.connect.range(100).repartition(10).groupBy("id").count() + df.toPandas() + self.assertIn("HashAggregate", df.executionInfo.metrics.toText()) + + @unittest.skipIf(not have_graphviz, graphviz_requirement_message) + def test_df_query_execution_metrics_to_dot(self): + df: DataFrame = self.connect.range(100).repartition(10).groupBy("id").count() + x = df.collect() # noqa: F841 + ei = df.executionInfo + + dot = ei.metrics.toDot() + source = dot.source + self.assertIsNotNone(dot, "The dot representation must not be None") + self.assertGreater(len(source), 0, "The dot representation must not be empty") + self.assertIn("digraph", source, "The dot representation must contain the digraph keyword") + self.assertIn("Metrics", source, "The dot representation must contain the Metrics keyword") + + +if __name__ == "__main__": + from pyspark.sql.tests.connect.test_df_debug import * # noqa: F401 + + try: + import xmlrunner # type: ignore + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow.py b/python/pyspark/sql/tests/connect/test_parity_arrow.py index 93d0b6cf0f5f5..885b3001b1db1 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow.py +++ b/python/pyspark/sql/tests/connect/test_parity_arrow.py @@ -16,7 +16,6 @@ # import unittest -import sys from pyspark.sql.tests.test_arrow import ArrowTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase @@ -24,10 +23,6 @@ class ArrowParityTests(ArrowTestsMixin, ReusedConnectTestCase, PandasOnSparkTestUtils): - @unittest.skip("Spark Connect does not support Spark Context but the test depends on that.") - def test_createDataFrame_empty_partition(self): - super().test_createDataFrame_empty_partition() - @unittest.skip("Spark Connect does not support fallback.") def test_createDataFrame_fallback_disabled(self): super().test_createDataFrame_fallback_disabled() @@ -36,8 +31,11 @@ def test_createDataFrame_fallback_disabled(self): def test_createDataFrame_fallback_enabled(self): super().test_createDataFrame_fallback_enabled() - def test_createDataFrame_with_map_type(self): - self.check_createDataFrame_with_map_type(True) + def test_createDataFrame_pandas_with_map_type(self): + self.check_createDataFrame_pandas_with_map_type(True) + + def test_createDataFrame_pandas_with_struct_type(self): + self.check_createDataFrame_pandas_with_struct_type(True) def test_createDataFrame_with_ndarray(self): self.check_createDataFrame_with_ndarray(True) @@ -74,6 +72,9 @@ def test_create_data_frame_to_pandas_timestamp_ntz(self): def test_create_data_frame_to_pandas_day_time_internal(self): self.check_create_data_frame_to_pandas_day_time_internal(True) + def test_createDataFrame_pandas_respect_session_timezone(self): + self.check_createDataFrame_pandas_respect_session_timezone(True) + def test_toPandas_respect_session_timezone(self): self.check_toPandas_respect_session_timezone(True) @@ -94,11 +95,11 @@ def test_toPandas_with_map_type(self): def test_toPandas_with_map_type_nulls(self): self.check_toPandas_with_map_type_nulls(True) - def test_createDataFrame_with_array_type(self): - self.check_createDataFrame_with_array_type(True) + def test_createDataFrame_pandas_with_array_type(self): + self.check_createDataFrame_pandas_with_array_type(True) - def test_createDataFrame_with_int_col_names(self): - self.check_createDataFrame_with_int_col_names(True) + def 
test_createDataFrame_pandas_with_int_col_names(self): + self.check_createDataFrame_pandas_with_int_col_names(True) def test_timestamp_nat(self): self.check_timestamp_nat(True) @@ -109,19 +110,21 @@ def test_toPandas_error(self): def test_toPandas_duplicate_field_names(self): self.check_toPandas_duplicate_field_names(True) - def test_createDataFrame_duplicate_field_names(self): - self.check_createDataFrame_duplicate_field_names(True) + def test_createDataFrame_pandas_duplicate_field_names(self): + self.check_createDataFrame_pandas_duplicate_field_names(True) + + def test_toPandas_empty_rows(self): + self.check_toPandas_empty_rows(True) def test_toPandas_empty_columns(self): self.check_toPandas_empty_columns(True) - def test_createDataFrame_nested_timestamp(self): - self.check_createDataFrame_nested_timestamp(True) + def test_createDataFrame_pandas_nested_timestamp(self): + self.check_createDataFrame_pandas_nested_timestamp(True) def test_toPandas_nested_timestamp(self): self.check_toPandas_nested_timestamp(True) - @unittest.skipIf(sys.version_info < (3, 9), "zoneinfo is available from Python 3.9+") def test_toPandas_timestmap_tzinfo(self): self.check_toPandas_timestmap_tzinfo(True) diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py b/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py index f5bd99fa22cfb..732008eb05a35 100644 --- a/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py +++ b/python/pyspark/sql/tests/connect/test_parity_arrow_python_udf.py @@ -15,10 +15,6 @@ # limitations under the License. # -import unittest - -from pyspark.errors import AnalysisException, PythonException -from pyspark.sql.functions import udf from pyspark.sql.tests.connect.test_parity_udf import UDFParityTests from pyspark.sql.tests.test_arrow_python_udf import PythonUDFArrowTestsMixin @@ -36,32 +32,6 @@ def tearDownClass(cls): finally: super(ArrowPythonUDFParityTests, cls).tearDownClass() - def test_named_arguments_negative(self): - @udf("int") - def test_udf(a, b): - return a + b - - self.spark.udf.register("test_udf", test_udf) - - with self.assertRaisesRegex( - AnalysisException, - "DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT.DOUBLE_NAMED_ARGUMENT_REFERENCE", - ): - self.spark.sql("SELECT test_udf(a => id, a => id * 10) FROM range(2)").show() - - with self.assertRaisesRegex(AnalysisException, "UNEXPECTED_POSITIONAL_ARGUMENT"): - self.spark.sql("SELECT test_udf(a => id, id * 10) FROM range(2)").show() - - with self.assertRaises(PythonException): - self.spark.sql("SELECT test_udf(c => 'x') FROM range(2)").show() - - with self.assertRaises(PythonException): - self.spark.sql("SELECT test_udf(id, a => id * 10) FROM range(2)").show() - - @unittest.skip("Spark Connect does not validate return type in client.") - def test_err_return_type(self): - super.test_err_return_type() - if __name__ == "__main__": import unittest diff --git a/python/pyspark/sql/tests/connect/test_parity_column.py b/python/pyspark/sql/tests/connect/test_parity_column.py index d02fb289b7d8d..a109d2ba3b58f 100644 --- a/python/pyspark/sql/tests/connect/test_parity_column.py +++ b/python/pyspark/sql/tests/connect/test_parity_column.py @@ -17,16 +17,6 @@ import unittest -from pyspark.testing.connectutils import should_test_connect - -if should_test_connect: - from pyspark import sql - from pyspark.sql.connect.column import Column - - # This is a hack to make the Column instance comparison works in `ColumnTestsMixin`. - # e.g., `isinstance(col, pyspark.sql.Column)`. 
- sql.Column = Column - from pyspark.sql.tests.test_column import ColumnTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase diff --git a/python/pyspark/sql/tests/connect/test_parity_dataframe.py b/python/pyspark/sql/tests/connect/test_parity_dataframe.py index 6210d4ec72fec..343f485553a98 100644 --- a/python/pyspark/sql/tests/connect/test_parity_dataframe.py +++ b/python/pyspark/sql/tests/connect/test_parity_dataframe.py @@ -30,10 +30,6 @@ def test_help_command(self): def test_toDF_with_schema_string(self): super().test_toDF_with_schema_string() - @unittest.skip("Spark Connect does not support DataFrameQueryContext currently.") - def test_dataframe_error_context(self): - super().test_dataframe_error_context() - if __name__ == "__main__": import unittest diff --git a/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py b/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py new file mode 100644 index 0000000000000..59107363571ee --- /dev/null +++ b/python/pyspark/sql/tests/connect/test_parity_dataframe_query_context.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import unittest + +from pyspark.sql.tests.test_dataframe_query_context import DataFrameQueryContextTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase + + +class DataFrameQueryContextParityTests(DataFrameQueryContextTestsMixin, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + import unittest + from pyspark.sql.tests.connect.test_parity_dataframe_query_context import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/connect/test_parity_functions.py b/python/pyspark/sql/tests/connect/test_parity_functions.py index 4fa1cf31b3b68..0a77c5531082a 100644 --- a/python/pyspark/sql/tests/connect/test_parity_functions.py +++ b/python/pyspark/sql/tests/connect/test_parity_functions.py @@ -18,17 +18,10 @@ import unittest from pyspark.sql.tests.test_functions import FunctionsTestsMixin -from pyspark.testing.connectutils import should_test_connect, ReusedConnectTestCase - -if should_test_connect: - from pyspark.errors.exceptions.connect import SparkConnectException - from pyspark.sql.connect.column import Column +from pyspark.testing.connectutils import ReusedConnectTestCase class FunctionsParityTests(FunctionsTestsMixin, ReusedConnectTestCase): - def test_assert_true(self): - self.check_assert_true(SparkConnectException) - @unittest.skip("Spark Connect does not support Spark Context but the test depends on that.") def test_basic_functions(self): super().test_basic_functions() @@ -41,15 +34,8 @@ def test_function_parity(self): def test_input_file_name_reset_for_rdd(self): super().test_input_file_name_reset_for_rdd() - def test_raise_error(self): - self.check_raise_error(SparkConnectException) - - def test_sorting_functions_with_column(self): - self.check_sorting_functions_with_column(Column) - if __name__ == "__main__": - import unittest from pyspark.sql.tests.connect.test_parity_functions import * # noqa: F401 try: diff --git a/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py b/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py index 513e49a144e50..c6ef9810c6840 100644 --- a/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py +++ b/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py @@ -27,6 +27,20 @@ def setUp(self) -> None: super().setUp() self.spark._profiler_collector._value = None + +class MemoryProfilerWithoutPlanCacheParityTests(MemoryProfilerParityTests): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.spark.conf.set("spark.connect.session.planCache.enabled", False) + + @classmethod + def tearDownClass(cls): + try: + cls.spark.conf.unset("spark.connect.session.planCache.enabled") + finally: + super().tearDownClass() + def test_memory_profiler_udf_multiple_actions(self): def action(df): df.collect() @@ -35,6 +49,7 @@ def action(df): with self.sql_conf({"spark.sql.pyspark.udf.profiler": "memory"}): _do_computation(self.spark, action=action) + # Without the plan cache, UDF ID will be different for each action self.assertEqual(6, len(self.profile_results), str(list(self.profile_results))) for id in self.profile_results: diff --git a/python/pyspark/sql/tests/connect/test_parity_observation.py b/python/pyspark/sql/tests/connect/test_parity_observation.py index a7b0009357b60..e16053d5a082a 100644 --- a/python/pyspark/sql/tests/connect/test_parity_observation.py 
+++ b/python/pyspark/sql/tests/connect/test_parity_observation.py @@ -25,10 +25,7 @@ class DataFrameObservationParityTests( DataFrameObservationTestsMixin, ReusedConnectTestCase, ): - # TODO(SPARK-41625): Support Structured Streaming - @unittest.skip("Fails in Spark Connect, should enable.") - def test_observe_str(self): - super().test_observe_str() + pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py b/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py index 41e756546318d..00d71bda2d938 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/connect/test_parity_pandas_cogrouped_map.py @@ -20,18 +20,11 @@ from pyspark.testing.connectutils import ReusedConnectTestCase -class CogroupedApplyInPandasTests(CogroupedApplyInPandasTestsMixin, ReusedConnectTestCase): - @unittest.skip("Fails in Spark Connect, should enable.") - def test_wrong_args(self): - self.check_wrong_args() - - @unittest.skip("Fails in Spark Connect, should enable.") - def test_apply_in_pandas_returning_incompatible_type(self): - super().test_apply_in_pandas_returning_incompatible_type() - - @unittest.skip("Fails in Spark Connect, should enable.") - def test_wrong_return_type(self): - super().test_wrong_return_type() +class CogroupedApplyInPandasTests( + CogroupedApplyInPandasTestsMixin, + ReusedConnectTestCase, +): + pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py b/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py index f0e7eeb606cab..8c76313c5c96b 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map.py @@ -26,26 +26,6 @@ class GroupedApplyInPandasTests(GroupedApplyInPandasTestsMixin, ReusedConnectTes def test_supported_types(self): super().test_supported_types() - @unittest.skip("Fails in Spark Connect, should enable.") - def test_wrong_return_type(self): - super().test_wrong_return_type() - - @unittest.skip("Fails in Spark Connect, should enable.") - def test_wrong_args(self): - super().test_wrong_args() - - @unittest.skip("Fails in Spark Connect, should enable.") - def test_unsupported_types(self): - super().test_unsupported_types() - - @unittest.skip("Fails in Spark Connect, should enable.") - def test_apply_in_pandas_returning_incompatible_type(self): - super().test_apply_in_pandas_returning_incompatible_type() - - @unittest.skip("Spark Connect doesn't support RDD but the test depends on it.") - def test_grouped_with_empty_partition(self): - super().test_grouped_with_empty_partition() - if __name__ == "__main__": from pyspark.sql.tests.connect.test_parity_pandas_grouped_map import * # noqa: F401 diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py b/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py index dc3bdf28f81c8..67d42a7c86138 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py +++ b/python/pyspark/sql/tests/connect/test_parity_pandas_grouped_map_with_state.py @@ -25,29 +25,7 @@ class GroupedApplyInPandasWithStateTests( GroupedApplyInPandasWithStateTestsMixin, ReusedConnectTestCase ): - @unittest.skip("foreachBatch will be supported in SPARK-42944.") - def test_apply_in_pandas_with_state_basic(self): - super().test_apply_in_pandas_with_state_basic() - - @unittest.skip("foreachBatch will be supported 
in SPARK-42944.") - def test_apply_in_pandas_with_state_basic_no_state(self): - super().test_apply_in_pandas_with_state_basic() - - @unittest.skip("foreachBatch will be supported in SPARK-42944.") - def test_apply_in_pandas_with_state_basic_no_state_no_data(self): - super().test_apply_in_pandas_with_state_basic() - - @unittest.skip("foreachBatch will be supported in SPARK-42944.") - def test_apply_in_pandas_with_state_basic_more_data(self): - super().test_apply_in_pandas_with_state_basic() - - @unittest.skip("foreachBatch will be supported in SPARK-42944.") - def test_apply_in_pandas_with_state_basic_fewer_data(self): - super().test_apply_in_pandas_with_state_basic() - - @unittest.skip("foreachBatch will be supported in SPARK-42944.") - def test_apply_in_pandas_with_state_basic_with_null(self): - super().test_apply_in_pandas_with_state_basic() + pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py b/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py index b5433b38dee5f..364e41716474b 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py +++ b/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py @@ -14,49 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from pyspark.sql.functions import pandas_udf, PandasUDFType -from pyspark.sql.tests.pandas.test_pandas_udf import PandasUDFTestsMixin -from pyspark.testing.connectutils import should_test_connect, ReusedConnectTestCase -if should_test_connect: - from pyspark.sql.connect.types import UnparsedDataType +from pyspark.sql.tests.pandas.test_pandas_udf import PandasUDFTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase class PandasUDFParityTests(PandasUDFTestsMixin, ReusedConnectTestCase): - def test_udf_wrong_arg(self): - self.check_udf_wrong_arg() - - def test_pandas_udf_decorator_with_return_type_string(self): - @pandas_udf("v double", PandasUDFType.GROUPED_MAP) - def foo(x): - return x - - self.assertEqual(foo.returnType, UnparsedDataType("v double")) - self.assertEqual(foo.evalType, PandasUDFType.GROUPED_MAP) - - @pandas_udf(returnType="double", functionType=PandasUDFType.SCALAR) - def foo(x): - return x - - self.assertEqual(foo.returnType, UnparsedDataType("double")) - self.assertEqual(foo.evalType, PandasUDFType.SCALAR) - - def test_pandas_udf_basic_with_return_type_string(self): - udf = pandas_udf(lambda x: x, "double", PandasUDFType.SCALAR) - self.assertEqual(udf.returnType, UnparsedDataType("double")) - self.assertEqual(udf.evalType, PandasUDFType.SCALAR) - - udf = pandas_udf(lambda x: x, "v double", PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, UnparsedDataType("v double")) - self.assertEqual(udf.evalType, PandasUDFType.GROUPED_MAP) - - udf = pandas_udf(lambda x: x, "v double", functionType=PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, UnparsedDataType("v double")) - self.assertEqual(udf.evalType, PandasUDFType.GROUPED_MAP) - - udf = pandas_udf(lambda x: x, returnType="v double", functionType=PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, UnparsedDataType("v double")) - self.assertEqual(udf.evalType, PandasUDFType.GROUPED_MAP) + pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py index 6a3f8ab2569b7..fdb81bffbce12 100644 --- 
a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/connect/test_parity_pandas_udf_grouped_agg.py @@ -20,19 +20,11 @@ from pyspark.testing.connectutils import ReusedConnectTestCase -class PandasUDFGroupedAggParityTests(GroupedAggPandasUDFTestsMixin, ReusedConnectTestCase): - # TODO(SPARK-43727): Parity returnType check in Spark Connect - @unittest.skip("Fails in Spark Connect, should enable.") - def test_unsupported_types(self): - super().test_unsupported_types() - - @unittest.skip("Spark Connect doesn't support RDD but the test depends on it.") - def test_grouped_with_empty_partition(self): - super().test_grouped_with_empty_partition() - - @unittest.skip("Spark Connect does not support convert UNPARSED to catalyst types.") - def test_manual(self): - super().test_manual() +class PandasUDFGroupedAggParityTests( + GroupedAggPandasUDFTestsMixin, + ReusedConnectTestCase, +): + pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py b/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py index b42bfaf0f58db..451f0f68d6ee5 100644 --- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py @@ -15,30 +15,12 @@ # limitations under the License. # import unittest -from pyspark.sql.connect.column import Column from pyspark.sql.tests.pandas.test_pandas_udf_scalar import ScalarPandasUDFTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase class PandasUDFScalarParityTests(ScalarPandasUDFTestsMixin, ReusedConnectTestCase): - def test_nondeterministic_vectorized_udf_in_aggregate(self): - self.check_nondeterministic_analysis_exception() - - @unittest.skip("Spark Connect doesn't support RDD but the test depends on it.") - def test_vectorized_udf_empty_partition(self): - super().test_vectorized_udf_empty_partition() - - @unittest.skip("Spark Connect doesn't support RDD but the test depends on it.") - def test_vectorized_udf_struct_with_empty_partition(self): - super().test_vectorized_udf_struct_with_empty_partition() - - # TODO(SPARK-43727): Parity returnType check in Spark Connect - @unittest.skip("Fails in Spark Connect, should enable.") - def test_vectorized_udf_wrong_return_type(self): - super().test_vectorized_udf_wrong_return_type() - - def test_mixed_udf_and_sql(self): - self._test_mixed_udf_and_sql(Column) + pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_types.py b/python/pyspark/sql/tests/connect/test_parity_types.py index 82a677574b455..6d06611def6af 100644 --- a/python/pyspark/sql/tests/connect/test_parity_types.py +++ b/python/pyspark/sql/tests/connect/test_parity_types.py @@ -32,24 +32,28 @@ def test_apply_schema_to_dict_and_rows(self): @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") def test_apply_schema_to_row(self): - super().test_apply_schema_to_dict_and_rows() + super().test_apply_schema_to_row() @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") def test_create_dataframe_schema_mismatch(self): super().test_create_dataframe_schema_mismatch() @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") - def test_infer_array_element_type_empty(self): - super().test_infer_array_element_type_empty() - - @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") - def test_infer_array_element_type_with_struct(self): - 
super().test_infer_array_element_type_with_struct() + def test_infer_array_element_type_empty_rdd(self): + super().test_infer_array_element_type_empty_rdd() @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") def test_infer_array_merge_element_types_with_rdd(self): super().test_infer_array_merge_element_types_with_rdd() + @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") + def test_infer_map_pair_type_empty_rdd(self): + super().test_infer_map_pair_type_empty_rdd() + + @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") + def test_infer_map_merge_pair_types_with_rdd(self): + super().test_infer_map_merge_pair_types_with_rdd() + @unittest.skip("Spark Connect does not support RDD but the tests depend on them.") def test_infer_binary_type(self): super().test_infer_binary_type() @@ -86,9 +90,17 @@ def test_rdd_with_udt(self): def test_udt(self): super().test_udt() - @unittest.skip("Does not test anything related to Spark Connect") - def test_parse_datatype_string(self): - super().test_parse_datatype_string() + @unittest.skip("Requires JVM access.") + def test_schema_with_collations_json_ser_de(self): + super().test_schema_with_collations_json_ser_de() + + @unittest.skip("This test is dedicated for PySpark Classic.") + def test_ym_interval_in_collect(self): + super().test_ym_interval_in_collect() + + @unittest.skip("This test is dedicated for PySpark Classic.") + def test_cal_interval_in_collect(self): + super().test_cal_interval_in_collect() if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_parity_udf.py b/python/pyspark/sql/tests/connect/test_parity_udf.py index 17d7ae0eb9fc7..5507f8e9f289b 100644 --- a/python/pyspark/sql/tests/connect/test_parity_udf.py +++ b/python/pyspark/sql/tests/connect/test_parity_udf.py @@ -44,10 +44,6 @@ def test_udf_with_input_file_name_for_hadooprdd(self): def test_same_accumulator_in_udfs(self): super().test_same_accumulator_in_udfs() - @unittest.skip("Spark Connect does not support spark.conf but the test depends on it.") - def test_udf_timestamp_ntz(self): - super().test_udf_timestamp_ntz() - @unittest.skip("Spark Connect does not support broadcast but the test depends on it.") def test_broadcast_in_udf(self): super().test_broadcast_in_udf() diff --git a/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py b/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py index dfa56ff0bb888..a1789a50896db 100644 --- a/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py +++ b/python/pyspark/sql/tests/connect/test_parity_udf_profiler.py @@ -27,6 +27,20 @@ def setUp(self) -> None: super().setUp() self.spark._profiler_collector._value = None + +class UDFProfilerWithoutPlanCacheParityTests(UDFProfilerParityTests): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.spark.conf.set("spark.connect.session.planCache.enabled", False) + + @classmethod + def tearDownClass(cls): + try: + cls.spark.conf.unset("spark.connect.session.planCache.enabled") + finally: + super().tearDownClass() + def test_perf_profiler_udf_multiple_actions(self): def action(df): df.collect() @@ -35,6 +49,7 @@ def action(df): with self.sql_conf({"spark.sql.pyspark.udf.profiler": "perf"}): _do_computation(self.spark, action=action) + # Without the plan cache, UDF ID will be different for each action self.assertEqual(6, len(self.profile_results), str(list(self.profile_results))) for id in self.profile_results: diff --git 
a/python/pyspark/sql/tests/connect/test_parity_udtf.py b/python/pyspark/sql/tests/connect/test_parity_udtf.py index 5071b69060a1d..2ea6ef8cc389d 100644 --- a/python/pyspark/sql/tests/connect/test_parity_udtf.py +++ b/python/pyspark/sql/tests/connect/test_parity_udtf.py @@ -25,7 +25,6 @@ sql.udtf.UserDefinedTableFunction = UserDefinedTableFunction from pyspark.sql.connect.functions import lit, udtf -from pyspark.util import is_remote_only from pyspark.sql.tests.test_udtf import BaseUDTFTestsMixin, UDTFArrowTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.errors.exceptions.connect import SparkConnectGrpcException, PythonException @@ -68,13 +67,11 @@ def test_udtf_with_analyze_using_broadcast(self): def test_udtf_with_analyze_using_accumulator(self): super().test_udtf_with_analyze_using_accumulator() - @unittest.skipIf(is_remote_only(), "pyspark-connect does not have SparkFiles") def test_udtf_with_analyze_using_archive(self): - super().test_udtf_with_analyze_using_archive() + super().check_udtf_with_analyze_using_archive(".") - @unittest.skipIf(is_remote_only(), "pyspark-connect does not have SparkFiles") def test_udtf_with_analyze_using_file(self): - super().test_udtf_with_analyze_using_file() + super().check_udtf_with_analyze_using_file(".") @unittest.skip("pyspark-connect can serialize SparkSession, but fails on executor") def test_udtf_access_spark_session(self): diff --git a/python/pyspark/sql/tests/connect/test_resources.py b/python/pyspark/sql/tests/connect/test_resources.py index 931acd9298043..94d71b54ff057 100644 --- a/python/pyspark/sql/tests/connect/test_resources.py +++ b/python/pyspark/sql/tests/connect/test_resources.py @@ -15,19 +15,16 @@ # limitations under the License. # import unittest +import os -from pyspark.util import is_remote_only from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.sql.tests.test_resources import ResourceProfileTestsMixin -# TODO(SPARK-47757): Reeanble ResourceProfileTests for pyspark-connect -if not is_remote_only(): - from pyspark.sql.tests.test_resources import ResourceProfileTestsMixin - - class ResourceProfileTests(ResourceProfileTestsMixin, ReusedConnectTestCase): - @classmethod - def master(cls): - return "local-cluster[1, 4, 1024]" +class ResourceProfileTests(ResourceProfileTestsMixin, ReusedConnectTestCase): + @classmethod + def master(cls): + return os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local-cluster[1, 4, 1024]") if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/connect/test_session.py b/python/pyspark/sql/tests/connect/test_session.py index 5184b9f061712..6f0e4aaad3f89 100644 --- a/python/pyspark/sql/tests/connect/test_session.py +++ b/python/pyspark/sql/tests/connect/test_session.py @@ -77,6 +77,34 @@ def test_session_create_sets_active_session(self): self.assertIs(session, session2) session.stop() + def test_active_session_expires_when_client_closes(self): + s1 = RemoteSparkSession.builder.remote("sc://other").getOrCreate() + s2 = RemoteSparkSession.getActiveSession() + + self.assertIs(s1, s2) + + # We don't call close() to avoid executing ExecutePlanResponseReattachableIterator + s1._client._closed = True + + self.assertIsNone(RemoteSparkSession.getActiveSession()) + s3 = RemoteSparkSession.builder.remote("sc://other").getOrCreate() + + self.assertIsNot(s1, s3) + + def test_default_session_expires_when_client_closes(self): + s1 = RemoteSparkSession.builder.remote("sc://other").getOrCreate() + s2 = RemoteSparkSession.getDefaultSession() + + 
self.assertIs(s1, s2) + + # We don't call close() to avoid executing ExecutePlanResponseReattachableIterator + s1._client._closed = True + + self.assertIsNone(RemoteSparkSession.getDefaultSession()) + s3 = RemoteSparkSession.builder.remote("sc://other").getOrCreate() + + self.assertIsNot(s1, s3) + class JobCancellationTests(ReusedConnectTestCase): def test_tags(self): @@ -91,6 +119,34 @@ def test_tags(self): self.assertEqual(self.spark.getTags(), set()) self.spark.clearTags() + def test_tags_multithread(self): + output1 = None + output2 = None + + def tag1(): + nonlocal output1 + + self.spark.addTag("tag1") + output1 = self.spark.getTags() + + def tag2(): + nonlocal output2 + + self.spark.addTag("tag2") + output2 = self.spark.getTags() + + t1 = threading.Thread(target=tag1) + t1.start() + t1.join() + t2 = threading.Thread(target=tag2) + t2.start() + t2.join() + + self.assertIsNotNone(output1) + self.assertEquals(output1, {"tag1"}) + self.assertIsNotNone(output2) + self.assertEquals(output2, {"tag2"}) + def test_interrupt_tag(self): thread_ids = range(4) self.check_job_cancellation( diff --git a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py index 0e7d0e7ef7df8..b1060ef48156a 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py @@ -165,7 +165,7 @@ def check_apply_in_pandas_not_returning_pandas_dataframe(self): fn=lambda lft, rgt: lft.size + rgt.size, error_class=PythonException, error_message_regex="Return type of the user-defined function " - "should be pandas.DataFrame, but is int.", + "should be pandas.DataFrame, but is int", ) def test_apply_in_pandas_returning_column_names(self): diff --git a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py index 0396006e2b362..a26d6d02a2bcd 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py @@ -52,7 +52,7 @@ MapType, YearMonthIntervalType, ) -from pyspark.errors import PythonException, PySparkTypeError +from pyspark.errors import PythonException, PySparkTypeError, PySparkValueError from pyspark.testing.sqlutils import ( ReusedSQLTestCase, have_pandas, @@ -421,22 +421,43 @@ def test_wrong_args(self): def check_wrong_args(self): df = self.data - with self.assertRaisesRegex(ValueError, "Invalid function"): + with self.assertRaisesRegex(PySparkTypeError, "INVALID_UDF_EVAL_TYPE"): df.groupby("id").apply(lambda x: x) - with self.assertRaisesRegex(ValueError, "Invalid function"): + with self.assertRaisesRegex(PySparkTypeError, "INVALID_UDF_EVAL_TYPE"): df.groupby("id").apply(udf(lambda x: x, DoubleType())) - with self.assertRaisesRegex(ValueError, "Invalid function"): + with self.assertRaisesRegex(PySparkTypeError, "INVALID_UDF_EVAL_TYPE"): df.groupby("id").apply(sum(df.v)) - with self.assertRaisesRegex(ValueError, "Invalid function"): + with self.assertRaisesRegex(PySparkTypeError, "INVALID_UDF_EVAL_TYPE"): df.groupby("id").apply(df.v + 1) - with self.assertRaisesRegex(ValueError, "Invalid function"): + with self.assertRaisesRegex(PySparkTypeError, "INVALID_UDF_EVAL_TYPE"): + df.groupby("id").apply(pandas_udf(lambda x, y: x, DoubleType())) + with self.assertRaisesRegex(PySparkTypeError, "INVALID_UDF_EVAL_TYPE"): + df.groupby("id").apply(pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) + + with 
self.assertRaisesRegex(PySparkValueError, "INVALID_PANDAS_UDF"): df.groupby("id").apply( pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())])) ) - with self.assertRaisesRegex(ValueError, "Invalid function"): - df.groupby("id").apply(pandas_udf(lambda x, y: x, DoubleType())) - with self.assertRaisesRegex(ValueError, "Invalid function.*GROUPED_MAP"): - df.groupby("id").apply(pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) + + def test_wrong_args_in_apply_func(self): + df1 = self.spark.range(11) + df2 = self.spark.range(22) + + with self.assertRaisesRegex(PySparkValueError, "INVALID_PANDAS_UDF"): + df1.groupby("id").applyInPandas(lambda: 1, StructType([StructField("d", DoubleType())])) + + with self.assertRaisesRegex(PySparkValueError, "INVALID_PANDAS_UDF"): + df1.groupby("id").applyInArrow(lambda: 1, StructType([StructField("d", DoubleType())])) + + with self.assertRaisesRegex(PySparkValueError, "INVALID_PANDAS_UDF"): + df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas( + lambda: 1, StructType([StructField("d", DoubleType())]) + ) + + with self.assertRaisesRegex(PySparkValueError, "INVALID_PANDAS_UDF"): + df1.groupby("id").cogroup(df2.groupby("id")).applyInArrow( + lambda: 1, StructType([StructField("d", DoubleType())]) + ) def test_unsupported_types(self): with self.quiet(): @@ -679,13 +700,13 @@ def test_grouped_with_empty_partition(self): data = [Row(id=1, x=2), Row(id=1, x=3), Row(id=2, x=4)] expected = [Row(id=1, x=5), Row(id=1, x=5), Row(id=2, x=4)] num_parts = len(data) + 1 - df = self.spark.createDataFrame(self.sc.parallelize(data, numSlices=num_parts)) + df = self.spark.createDataFrame(data).repartition(num_parts) f = pandas_udf( lambda pdf: pdf.assign(x=pdf["x"].sum()), "id long, x int", PandasUDFType.GROUPED_MAP ) - result = df.groupBy("id").apply(f).collect() + result = df.groupBy("id").apply(f).sort("id").collect() self.assertEqual(result, expected) def test_grouped_over_window(self): diff --git a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py index 12ee9319d2cbe..47f7d672cc8c2 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py @@ -95,6 +95,7 @@ def prepare_test_resource(): self.assertEqual(q.name, "this_query") self.assertTrue(q.isActive) q.processAllAvailable() + self.assertTrue(q.exception() is None) def test_apply_in_pandas_with_state_basic(self): def func(key, pdf_iter, state): @@ -109,10 +110,10 @@ def func(key, pdf_iter, state): yield pd.DataFrame({"key": [key[0]], "countAsString": [str(total_len)]}) def check_results(batch_df, _): - self.assertEqual( - set(batch_df.sort("key").collect()), - {Row(key="hello", countAsString="1"), Row(key="this", countAsString="1")}, - ) + assert set(batch_df.sort("key").collect()) == { + Row(key="hello", countAsString="1"), + Row(key="this", countAsString="1"), + } self._test_apply_in_pandas_with_state_basic(func, check_results) @@ -123,14 +124,11 @@ def func(key, pdf_iter, state): yield pd.DataFrame({"key": [key[0], "foo"], "countAsString": ["100", "222"]}) def check_results(batch_df, _): - self.assertEqual( - set(batch_df.sort("key").collect()), - { - Row(key="hello", countAsString="100"), - Row(key="this", countAsString="100"), - Row(key="foo", countAsString="222"), - }, - ) + assert set(batch_df.sort("key").collect()) == { + Row(key="hello", countAsString="100"), + Row(key="this", 
countAsString="100"), + Row(key="foo", countAsString="222"), + } self._test_apply_in_pandas_with_state_basic(func, check_results) @@ -141,7 +139,7 @@ def func(key, pdf_iter, state): yield pd.DataFrame({"key": [], "countAsString": []}) def check_results(batch_df, _): - self.assertTrue(len(set(batch_df.sort("key").collect())) == 0) + assert len(set(batch_df.sort("key").collect())) == 0 self._test_apply_in_pandas_with_state_basic(func, check_results) @@ -156,16 +154,13 @@ def func(key, pdf_iter, state): ) def check_results(batch_df, _): - self.assertEqual( - set(batch_df.sort("key").collect()), - { - Row(key="hello", countAsString="1"), - Row(key="foo", countAsString="666"), - Row(key="hello_2", countAsString="2"), - Row(key="this", countAsString="1"), - Row(key="this_2", countAsString="2"), - }, - ) + assert set(batch_df.sort("key").collect()) == { + Row(key="hello", countAsString="1"), + Row(key="foo", countAsString="666"), + Row(key="hello_2", countAsString="2"), + Row(key="this", countAsString="1"), + Row(key="this_2", countAsString="2"), + } self._test_apply_in_pandas_with_state_basic(func, check_results) @@ -177,7 +172,7 @@ def func(key, pdf_iter, state): yield pd.DataFrame({"key": [], "countAsString": []}) def check_results(batch_df, _): - self.assertTrue(len(set(batch_df.sort("key").collect())) == 0) + assert len(set(batch_df.sort("key").collect())) == 0 self._test_apply_in_pandas_with_state_basic(func, check_results) @@ -194,10 +189,7 @@ def func(key, pdf_iter, state): yield pd.DataFrame({"key": [None], "countAsString": [str(total_len)]}) def check_results(batch_df, _): - self.assertEqual( - set(batch_df.sort("key").collect()), - {Row(key=None, countAsString="1")}, - ) + assert set(batch_df.sort("key").collect()) == {Row(key=None, countAsString="1")} self._test_apply_in_pandas_with_state_basic(func, check_results) diff --git a/python/pyspark/sql/tests/pandas/test_pandas_map.py b/python/pyspark/sql/tests/pandas/test_pandas_map.py index 37e52d4344fb8..692f9705411e0 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_map.py @@ -151,14 +151,14 @@ def bad_iter_elem(_): with self.assertRaisesRegex( PythonException, "Return type of the user-defined function should be iterator of pandas.DataFrame, " - "but is int.", + "but is int", ): (self.spark.range(10, numPartitions=3).mapInPandas(no_iter, "a int").count()) with self.assertRaisesRegex( PythonException, "Return type of the user-defined function should be iterator of pandas.DataFrame, " - "but is iterator of int.", + "but is iterator of int", ): (self.spark.range(10, numPartitions=3).mapInPandas(bad_iter_elem, "a int").count()) diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py index a7cf45e3bcbe0..70fa31fd515bb 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py @@ -538,11 +538,11 @@ def test_grouped_with_empty_partition(self): data = [Row(id=1, x=2), Row(id=1, x=3), Row(id=2, x=4)] expected = [Row(id=1, sum=5), Row(id=2, x=4)] num_parts = len(data) + 1 - df = self.spark.createDataFrame(self.sc.parallelize(data, numSlices=num_parts)) + df = self.spark.createDataFrame(data).repartition(num_parts) f = pandas_udf(lambda x: x.sum(), "int", PandasUDFType.GROUPED_AGG) - result = df.groupBy("id").agg(f(df["x"]).alias("sum")).collect() + result = 
df.groupBy("id").agg(f(df["x"]).alias("sum")).sort("id").collect() self.assertEqual(result, expected) def test_grouped_without_group_by_clause(self): diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py index ec413d048d8ec..38bc633cd1ed1 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py @@ -764,15 +764,17 @@ def iter_identity(x): self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_empty_partition(self): - df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)) + df = self.spark.createDataFrame([Row(id=1)]).repartition(2) for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: f = pandas_udf(lambda x: x, LongType(), udf_type) res = df.select(f(col("id"))) self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_struct_with_empty_partition(self): - df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)).withColumn( - "name", lit("John Doe") + df = ( + self.spark.createDataFrame([Row(id=1)]) + .repartition(2) + .withColumn("name", lit("John Doe")) ) @pandas_udf("first string, last string") @@ -1334,7 +1336,7 @@ def f1(x): return x + 1 def f2(x): - assert type(x) == col_type + assert isinstance(x, col_type) return x + 10 @pandas_udf("int") diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py index bfb874ffe5340..7a0fccc225725 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import sys import unittest from inspect import signature from typing import Union, Iterator, Tuple, cast, get_type_hints @@ -114,7 +113,6 @@ def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]]) -> Iterator infer_eval_type(signature(func), get_type_hints(func)), PandasUDFType.SCALAR_ITER ) - @unittest.skipIf(sys.version_info < (3, 9), "Type hinting generics require Python 3.9.") def test_type_annotation_tuple_generics(self): def func(iter: Iterator[tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]: pass diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py b/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py index 9b6751564c40e..442e1c61a0ba8 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py @@ -16,7 +16,6 @@ # from __future__ import annotations -import sys import unittest from inspect import signature from typing import Union, Iterator, Tuple, cast, get_type_hints @@ -308,10 +307,6 @@ def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]: expected = df.selectExpr("id + 1 as id") assert_frame_equal(expected.toPandas(), actual.toPandas()) - @unittest.skipIf( - sys.version_info < (3, 9), - "string annotations with future annotations do not work under Python<3.9", - ) def test_string_type_annotation(self): def func(col: "pd.Series") -> "pd.Series": pass diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index abfacdbbf059b..e284d052d9ae2 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -22,8 +22,9 @@ from pyspark.sql import Row from pyspark.sql.functions import lit -from pyspark.sql.types import StructType, StructField, IntegerType, StringType +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType from pyspark.testing.sqlutils import ReusedSQLTestCase +from pyspark.errors import PySparkValueError class StreamingTestsMixin: @@ -58,6 +59,26 @@ def test_streaming_query_functions_basic(self): finally: query.stop() + def test_streaming_query_name_edge_case(self): + # Query name should be None when not specified + q1 = self.spark.readStream.format("rate").load().writeStream.format("noop").start() + self.assertEqual(q1.name, None) + + # Cannot set query name to be an empty string + error_thrown = False + try: + ( + self.spark.readStream.format("rate") + .load() + .writeStream.format("noop") + .queryName("") + .start() + ) + except PySparkValueError: + error_thrown = True + + self.assertTrue(error_thrown) + def test_stream_trigger(self): df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") @@ -263,36 +284,37 @@ def test_stream_await_termination(self): shutil.rmtree(tmpPath) def test_stream_exception(self): - sdf = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - sq = sdf.writeStream.format("memory").queryName("query_explain").start() - try: - sq.processAllAvailable() - self.assertEqual(sq.exception(), None) - finally: - sq.stop() - - from pyspark.sql.functions import col, udf - from pyspark.errors import StreamingQueryException - - bad_udf = udf(lambda x: 1 / 0) - sq = ( - sdf.select(bad_udf(col("value"))) - .writeStream.format("memory") - .queryName("this_query") - .start() - ) - try: - # 
Process some data to fail the query - sq.processAllAvailable() - self.fail("bad udf should fail the query") - except StreamingQueryException as e: - # This is expected - self._assert_exception_tree_contains_msg(e, "ZeroDivisionError") - finally: - exception = sq.exception() - sq.stop() - self.assertIsInstance(exception, StreamingQueryException) - self._assert_exception_tree_contains_msg(exception, "ZeroDivisionError") + with self.sql_conf({"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": True}): + sdf = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + sq = sdf.writeStream.format("memory").queryName("query_explain").start() + try: + sq.processAllAvailable() + self.assertEqual(sq.exception(), None) + finally: + sq.stop() + + from pyspark.sql.functions import col, udf + from pyspark.errors import StreamingQueryException + + bad_udf = udf(lambda x: 1 / 0) + sq = ( + sdf.select(bad_udf(col("value"))) + .writeStream.format("memory") + .queryName("this_query") + .start() + ) + try: + # Process some data to fail the query + sq.processAllAvailable() + self.fail("bad udf should fail the query") + except StreamingQueryException as e: + # This is expected + self._assert_exception_tree_contains_msg(e, "ZeroDivisionError") + finally: + exception = sq.exception() + sq.stop() + self.assertIsInstance(exception, StreamingQueryException) + self._assert_exception_tree_contains_msg(exception, "ZeroDivisionError") def test_query_manager_no_recreation(self): # SPARK-46873: There should not be a new StreamingQueryManager created every time @@ -391,6 +413,30 @@ def test_streaming_with_temporary_view(self): set([Row(value="view_a"), Row(value="view_b"), Row(value="view_c")]), set(result) ) + def test_streaming_drop_duplicate_within_watermark(self): + """ + This verifies dropDuplicatesWithinWatermark works with a streaming dataframe. 
+ """ + user_schema = StructType().add("time", TimestampType()).add("id", "integer") + df = ( + self.spark.readStream.option("sep", ";") + .schema(user_schema) + .csv("python/test_support/sql/streaming/time") + ) + q1 = ( + df.withWatermark("time", "2 seconds") + .dropDuplicatesWithinWatermark(["id"]) + .writeStream.outputMode("update") + .format("memory") + .queryName("test_streaming_drop_duplicates_within_wm") + .start() + ) + self.assertTrue(q1.isActive) + q1.processAllAvailable() + q1.stop() + result = self.spark.sql("SELECT * FROM test_streaming_drop_duplicates_within_wm").collect() + self.assertTrue(len(result) >= 6 and len(result) <= 9) + class StreamingTests(StreamingTestsMixin, ReusedSQLTestCase): def _assert_exception_tree_contains_msg(self, exception, msg): diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py index ef286115a303f..de8f30baebca5 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py @@ -29,17 +29,18 @@ def test_streaming_foreach_batch(self): q = None def collectBatch(batch_df, batch_id): - batch_df.createOrReplaceGlobalTempView("test_view") + batch_df.write.format("parquet").saveAsTable("test_table") try: df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") q = df.writeStream.foreachBatch(collectBatch).start() q.processAllAvailable() - collected = self.spark.sql("select * from global_temp.test_view").collect() + collected = self.spark.sql("select * from test_table").collect() self.assertTrue(len(collected), 2) finally: if q: q.stop() + self.spark.sql("DROP TABLE IF EXISTS test_table") def test_streaming_foreach_batch_tempview(self): q = None @@ -50,18 +51,19 @@ def collectBatch(batch_df, batch_id): # clone the session which is no longer same with the session used to start the # streaming query assert len(batch_df.sparkSession.sql("SELECT * FROM updates").collect()) == 2 - # Write to a global view verify on the repl/client side. - batch_df.createOrReplaceGlobalTempView("temp_view") + # Write a table to verify on the repl/client side. 
+ batch_df.write.format("parquet").saveAsTable("test_table") try: df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") q = df.writeStream.foreachBatch(collectBatch).start() q.processAllAvailable() - collected = self.spark.sql("SELECT * FROM global_temp.temp_view").collect() + collected = self.spark.sql("SELECT * FROM test_table").collect() self.assertTrue(len(collected[0]), 2) finally: if q: q.stop() + self.spark.sql("DROP TABLE IF EXISTS test_table") def test_streaming_foreach_batch_propagates_python_errors(self): from pyspark.errors import StreamingQueryException diff --git a/python/pyspark/sql/tests/streaming/test_streaming_listener.py b/python/pyspark/sql/tests/streaming/test_streaming_listener.py index 243ad2dca0747..762fc335b56ad 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_listener.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_listener.py @@ -30,6 +30,7 @@ StateOperatorProgress, StreamingQueryProgress, ) +from pyspark.sql.functions import count, col, lit from pyspark.testing.sqlutils import ReusedSQLTestCase @@ -39,16 +40,16 @@ def check_start_event(self, event): self.assertTrue(isinstance(event, QueryStartedEvent)) self.assertTrue(isinstance(event.id, uuid.UUID)) self.assertTrue(isinstance(event.runId, uuid.UUID)) - self.assertTrue(event.name is None or event.name == "test") + self.assertTrue(event.name is None or event.name.startswith("test")) try: datetime.strptime(event.timestamp, "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: self.fail("'%s' is not in ISO 8601 format.") - def check_progress_event(self, event): + def check_progress_event(self, event, is_stateful): """Check QueryProgressEvent""" self.assertTrue(isinstance(event, QueryProgressEvent)) - self.check_streaming_query_progress(event.progress) + self.check_streaming_query_progress(event.progress, is_stateful) def check_terminated_event(self, event, exception=None, error_class=None): """Check QueryTerminatedEvent""" @@ -65,12 +66,12 @@ def check_terminated_event(self, event, exception=None, error_class=None): else: self.assertEqual(event.errorClassOnException, None) - def check_streaming_query_progress(self, progress): + def check_streaming_query_progress(self, progress, is_stateful): """Check StreamingQueryProgress""" self.assertTrue(isinstance(progress, StreamingQueryProgress)) self.assertTrue(isinstance(progress.id, uuid.UUID)) self.assertTrue(isinstance(progress.runId, uuid.UUID)) - self.assertEqual(progress.name, "test") + self.assertTrue(progress.name.startswith("test")) try: json.loads(progress.json) except Exception: @@ -108,9 +109,10 @@ def check_streaming_query_progress(self, progress): self.assertTrue(all(map(lambda v: isinstance(v, str), progress.eventTime.values()))) self.assertTrue(isinstance(progress.stateOperators, list)) - self.assertTrue(len(progress.stateOperators) >= 1) - for so in progress.stateOperators: - self.check_state_operator_progress(so) + if is_stateful: + self.assertTrue(len(progress.stateOperators) >= 1) + for so in progress.stateOperators: + self.check_state_operator_progress(so) self.assertTrue(isinstance(progress.sources, list)) self.assertTrue(len(progress.sources) >= 1) @@ -192,6 +194,53 @@ def check_sink_progress(self, progress): self.assertTrue(isinstance(progress.numOutputRows, int)) self.assertTrue(isinstance(progress.metrics, dict)) + # This is a generic test that works for both classic Spark and Spark Connect + def test_listener_observed_metrics(self): + class MyErrorListener(StreamingQueryListener): + def __init__(self): + 
self.num_rows = -1 + self.num_error_rows = -1 + + def onQueryStarted(self, event): + pass + + def onQueryProgress(self, event): + row = event.progress.observedMetrics.get("my_event") + # Save observed metrics for later verification + self.num_rows = row["rc"] + self.num_error_rows = row["erc"] + + def onQueryIdle(self, event): + pass + + def onQueryTerminated(self, event): + pass + + try: + error_listener = MyErrorListener() + self.spark.streams.addListener(error_listener) + + sdf = self.spark.readStream.format("rate").load().withColumn("error", col("value")) + + # Observe row count (rc) and error row count (erc) in the streaming Dataset + observed_ds = sdf.observe( + "my_event", count(lit(1)).alias("rc"), count(col("error")).alias("erc") + ) + + q = observed_ds.writeStream.format("console").start() + + while q.lastProgress is None or q.lastProgress["batchId"] == 0: + q.awaitTermination(0.5) + + time.sleep(5) + + self.assertTrue(error_listener.num_rows > 0) + self.assertTrue(error_listener.num_error_rows > 0) + + finally: + q.stop() + self.spark.streams.removeListener(error_listener) + class StreamingListenerTests(StreamingListenerTestsMixin, ReusedSQLTestCase): def test_number_of_public_methods(self): @@ -313,7 +362,7 @@ def verify(test_listener): self.spark.sparkContext._jsc.sc().listenerBus().waitUntilEmpty() self.check_start_event(start_event) - self.check_progress_event(progress_event) + self.check_progress_event(progress_event, True) self.check_terminated_event(terminated_event) # Check query terminated with exception @@ -470,7 +519,7 @@ def test_streaming_query_progress_fromJson(self): """ progress = StreamingQueryProgress.fromJson(json.loads(progress_json)) - self.check_streaming_query_progress(progress) + self.check_streaming_query_progress(progress, True) # checks for progress self.assertEqual(progress.id, uuid.UUID("00000000-0000-0001-0000-000000000001")) @@ -543,6 +592,23 @@ def test_streaming_query_progress_fromJson(self): self.assertEqual(sink.numOutputRows, -1) self.assertEqual(sink.metrics, {}) + def test_spark_property_in_listener(self): + # SPARK-48560: Make StreamingQueryListener.spark settable + class TestListener(StreamingQueryListener): + def __init__(self, session): + self.spark = session + + def onQueryStarted(self, event): + pass + + def onQueryProgress(self, event): + pass + + def onQueryTerminated(self, event): + pass + + self.assertEqual(TestListener(self.spark).spark, self.spark) + if __name__ == "__main__": import unittest diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 5235e021bae9a..c1a69c404086b 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -23,7 +23,6 @@ import unittest from typing import cast from collections import namedtuple -import sys from pyspark import SparkConf from pyspark.sql import Row, SparkSession @@ -56,6 +55,9 @@ ExamplePointUDT, ) from pyspark.errors import ArithmeticException, PySparkTypeError, UnsupportedOperationException +from pyspark.loose_version import LooseVersion +from pyspark.util import is_remote_only +from pyspark.loose_version import LooseVersion if have_pandas: import pandas as pd @@ -160,6 +162,45 @@ def setUpClass(cls): ] cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null] + cls.schema_nested_timestamp = ( + StructType() + .add("ts", TimestampType()) + .add("ts_ntz", TimestampNTZType()) + .add( + "struct", StructType().add("ts", TimestampType()).add("ts_ntz", TimestampNTZType()) + ) + .add("array", 
ArrayType(TimestampType())) + .add("array_ntz", ArrayType(TimestampNTZType())) + .add("map", MapType(StringType(), TimestampType())) + .add("map_ntz", MapType(StringType(), TimestampNTZType())) + ) + cls.data_nested_timestamp = [ + Row( + datetime(2023, 1, 1, 0, 0, 0), + datetime(2023, 1, 1, 0, 0, 0), + Row( + datetime(2023, 1, 1, 0, 0, 0), + datetime(2023, 1, 1, 0, 0, 0), + ), + [datetime(2023, 1, 1, 0, 0, 0)], + [datetime(2023, 1, 1, 0, 0, 0)], + dict(ts=datetime(2023, 1, 1, 0, 0, 0)), + dict(ts_ntz=datetime(2023, 1, 1, 0, 0, 0)), + ) + ] + cls.data_nested_timestamp_expected_ny = Row( + ts=datetime(2022, 12, 31, 21, 0, 0), + ts_ntz=datetime(2023, 1, 1, 0, 0, 0), + struct=Row( + ts=datetime(2022, 12, 31, 21, 0, 0), + ts_ntz=datetime(2023, 1, 1, 0, 0, 0), + ), + array=[datetime(2022, 12, 31, 21, 0, 0)], + array_ntz=[datetime(2023, 1, 1, 0, 0, 0)], + map=dict(ts=datetime(2022, 12, 31, 21, 0, 0)), + map_ntz=dict(ts_ntz=datetime(2023, 1, 1, 0, 0, 0)), + ) + @classmethod def tearDownClass(cls): del os.environ["TZ"] @@ -179,6 +220,27 @@ def create_pandas_data_frame(self): data_dict["4_float_t"] = np.float32(data_dict["4_float_t"]) return pd.DataFrame(data=data_dict) + def create_arrow_table(self): + import pyarrow as pa + + data_dict = {} + for j, name in enumerate(self.schema.names): + data_dict[name] = [self.data[i][j] for i in range(len(self.data))] + t = pa.Table.from_pydict(data_dict) + # convert these to Arrow types + new_schema = t.schema.set( + t.schema.get_field_index("2_int_t"), pa.field("2_int_t", pa.int32()) + ) + new_schema = new_schema.set( + new_schema.get_field_index("4_float_t"), pa.field("4_float_t", pa.float32()) + ) + new_schema = new_schema.set( + new_schema.get_field_index("6_decimal_t"), + pa.field("6_decimal_t", pa.decimal128(38, 18)), + ) + t = t.cast(new_schema) + return t + @property def create_np_arrs(self): import numpy as np @@ -286,6 +348,17 @@ def check_create_data_frame_to_pandas_timestamp_ntz(self, arrow_enabled): pdf = df.toPandas() assert_frame_equal(origin, pdf) + def test_create_data_frame_to_arrow_timestamp_ntz(self): + with self.sql_conf({"spark.sql.session.timeZone": "America/Los_Angeles"}): + origin = pa.table({"a": [datetime.datetime(2012, 2, 2, 2, 2, 2)]}) + df = self.spark.createDataFrame( + origin, schema=StructType([StructField("a", TimestampNTZType(), True)]) + ) + df.selectExpr("assert_true('2012-02-02 02:02:02' == CAST(a AS STRING))").collect() + + t = df.toArrow() + self.assertTrue(origin.equals(t)) + def test_create_data_frame_to_pandas_day_time_internal(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -303,6 +376,16 @@ def check_create_data_frame_to_pandas_day_time_internal(self, arrow_enabled): pdf = df.toPandas() assert_frame_equal(origin, pdf) + def test_create_data_frame_to_arrow_day_time_internal(self): + origin = pa.table({"a": [datetime.timedelta(microseconds=123)]}) + df = self.spark.createDataFrame(origin) + df.select( + assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.a.cast("string")) + ).collect() + + t = df.toArrow() + self.assertTrue(origin.equals(t)) + def test_toPandas_respect_session_timezone(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -333,12 +416,52 @@ def check_toPandas_respect_session_timezone(self, arrow_enabled): ) assert_frame_equal(pdf_ny, pdf_la_corrected) + def test_toArrow_keep_utc_timezone(self): + df = self.spark.createDataFrame(self.data, schema=self.schema) + + timezone = "America/Los_Angeles" + 
with self.sql_conf({"spark.sql.session.timeZone": timezone}): + t_la = df.toArrow() + + timezone = "America/New_York" + with self.sql_conf({"spark.sql.session.timeZone": timezone}): + t_ny = df.toArrow() + + self.assertTrue(t_ny.equals(t_la)) + self.assertEqual(t_la["8_timestamp_t"].type.tz, "UTC") + self.assertEqual(t_ny["8_timestamp_t"].type.tz, "UTC") + def test_pandas_round_trip(self): pdf = self.create_pandas_data_frame() df = self.spark.createDataFrame(self.data, schema=self.schema) pdf_arrow = df.toPandas() assert_frame_equal(pdf_arrow, pdf) + def test_arrow_round_trip(self): + import pyarrow.compute as pc + + t_in = self.create_arrow_table() + + # Convert timezone-naive local timestamp column in input table to UTC + # to enable comparison to UTC timestamp column in output table + timezone = self.spark.conf.get("spark.sql.session.timeZone") + t_in = t_in.set_column( + t_in.schema.get_field_index("8_timestamp_t"), + "8_timestamp_t", + pc.assume_timezone(t_in["8_timestamp_t"], timezone), + ) + t_in = t_in.cast( + t_in.schema.set( + t_in.schema.get_field_index("8_timestamp_t"), + pa.field("8_timestamp_t", pa.timestamp("us", tz="UTC")), + ) + ) + + df = self.spark.createDataFrame(self.data, schema=self.schema) + t_out = df.toArrow() + + self.assertTrue(t_out.equals(t_in)) + def test_pandas_self_destruct(self): import pyarrow as pa @@ -402,6 +525,13 @@ def raise_exception(): with self.assertRaisesRegex(Exception, "My error"): df.toPandas() + def test_createDataFrame_arrow_pandas(self): + table = self.create_arrow_table() + pdf = self.create_pandas_data_frame() + df_arrow = self.spark.createDataFrame(table) + df_pandas = self.spark.createDataFrame(pdf) + self.assertEqual(df_arrow.collect(), df_pandas.collect()) + def _createDataFrame_toggle(self, data, schema=None): with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): df_no_arrow = self.spark.createDataFrame(data, schema=schema) @@ -415,12 +545,12 @@ def test_createDataFrame_toggle(self): df_no_arrow, df_arrow = self._createDataFrame_toggle(pdf, schema=self.schema) self.assertEqual(df_no_arrow.collect(), df_arrow.collect()) - def test_createDataFrame_respect_session_timezone(self): + def test_createDataFrame_pandas_respect_session_timezone(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_respect_session_timezone(arrow_enabled) + self.check_createDataFrame_pandas_respect_session_timezone(arrow_enabled) - def check_createDataFrame_respect_session_timezone(self, arrow_enabled): + def check_createDataFrame_pandas_respect_session_timezone(self, arrow_enabled): from datetime import timedelta pdf = self.create_pandas_data_frame() @@ -450,18 +580,46 @@ def check_createDataFrame_respect_session_timezone(self, arrow_enabled): ] self.assertEqual(result_ny, result_la_corrected) - def test_createDataFrame_with_schema(self): + def test_createDataFrame_arrow_respect_session_timezone(self): + from datetime import timedelta + + t = self.create_arrow_table() + timezone = "America/Los_Angeles" + with self.sql_conf({"spark.sql.session.timeZone": timezone}): + df_la = self.spark.createDataFrame(t, schema=self.schema) + result_la = df_la.collect() + + timezone = "America/New_York" + with self.sql_conf({"spark.sql.session.timeZone": timezone}): + df_ny = self.spark.createDataFrame(t, schema=self.schema) + result_ny = df_ny.collect() + + self.assertNotEqual(result_ny, result_la) + + # Correct result_la by adjusting 3 hours difference between Los Angeles and New York + 
result_la_corrected = [ + Row( + **{ + k: v - timedelta(hours=3) if k == "8_timestamp_t" else v + for k, v in row.asDict().items() + } + ) + for row in result_la + ] + self.assertEqual(result_ny, result_la_corrected) + + def test_createDataFrame_pandas_with_schema(self): pdf = self.create_pandas_data_frame() df = self.spark.createDataFrame(pdf, schema=self.schema) self.assertEqual(self.schema, df.schema) pdf_arrow = df.toPandas() assert_frame_equal(pdf_arrow, pdf) - def test_createDataFrame_with_incorrect_schema(self): + def test_createDataFrame_pandas_with_incorrect_schema(self): with self.quiet(): - self.check_createDataFrame_with_incorrect_schema() + self.check_createDataFrame_pandas_with_incorrect_schema() - def check_createDataFrame_with_incorrect_schema(self): + def check_createDataFrame_pandas_with_incorrect_schema(self): pdf = self.create_pandas_data_frame() fields = list(self.schema) fields[5], fields[6] = fields[6], fields[5] # swap decimal with date @@ -485,7 +643,15 @@ def check_createDataFrame_with_incorrect_schema(self): self.assertEqual(len(exception.args), 1) self.assertRegex(exception.args[0], "[D|d]ecimal.*got.*date") - def test_createDataFrame_with_names(self): + def test_createDataFrame_arrow_with_incorrect_schema(self): + t = self.create_arrow_table() + fields = list(self.schema) + fields[5], fields[6] = fields[6], fields[5] # swap decimal with date + wrong_schema = StructType(fields) + with self.assertRaises(Exception): + self.spark.createDataFrame(t, schema=wrong_schema) + + def test_createDataFrame_pandas_with_names(self): pdf = self.create_pandas_data_frame() new_names = list(map(str, range(len(self.schema.fieldNames())))) # Test that schema as a list of column names gets applied @@ -495,7 +661,17 @@ def test_createDataFrame_with_names(self): df = self.spark.createDataFrame(pdf, schema=tuple(new_names)) self.assertEqual(df.schema.fieldNames(), new_names) - def test_createDataFrame_column_name_encoding(self): + def test_createDataFrame_arrow_with_names(self): + t = self.create_arrow_table() + new_names = list(map(str, range(len(self.schema.fieldNames())))) + # Test that schema as a list of column names gets applied + df = self.spark.createDataFrame(t, schema=list(new_names)) + self.assertEqual(df.schema.fieldNames(), new_names) + # Test that schema as tuple of column names gets applied + df = self.spark.createDataFrame(t, schema=tuple(new_names)) + self.assertEqual(df.schema.fieldNames(), new_names) + + def test_createDataFrame_pandas_column_name_encoding(self): pdf = pd.DataFrame({"a": [1]}) columns = self.spark.createDataFrame(pdf).columns self.assertTrue(isinstance(columns[0], str)) @@ -504,6 +680,15 @@ def test_createDataFrame_column_name_encoding(self): self.assertTrue(isinstance(columns[0], str)) self.assertEqual(columns[0], "b") + def test_createDataFrame_arrow_column_name_encoding(self): + t = pa.table({"a": [1]}) + columns = self.spark.createDataFrame(t).columns + self.assertTrue(isinstance(columns[0], str)) + self.assertEqual(columns[0], "a") + columns = self.spark.createDataFrame(t, ["b"]).columns + self.assertTrue(isinstance(columns[0], str)) + self.assertEqual(columns[0], "b") + def test_createDataFrame_with_single_data_type(self): with self.quiet(): self.check_createDataFrame_with_single_data_type() @@ -531,6 +716,17 @@ def test_createDataFrame_does_not_modify_input(self): self.spark.createDataFrame(pdf, schema=self.schema) self.assertTrue(pdf.equals(pdf_copy)) + def test_createDataFrame_arrow_truncate_timestamp(self): + t_in = pa.Table.from_arrays( + 
[pa.array([1234567890123456789], type=pa.timestamp("ns", tz="UTC"))], names=["ts"] + ) + df = self.spark.createDataFrame(t_in) + t_out = df.toArrow() + expected = pa.Table.from_arrays( + [pa.array([1234567890123456], type=pa.timestamp("us", tz="UTC"))], names=["ts"] + ) + self.assertTrue(t_out.equals(expected)) + def test_schema_conversion_roundtrip(self): from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema @@ -565,12 +761,12 @@ def check_createDataFrame_with_ndarray(self, arrow_enabled): ): self.spark.createDataFrame(np.array(0)) - def test_createDataFrame_with_array_type(self): + def test_createDataFrame_pandas_with_array_type(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_with_array_type(arrow_enabled) + self.check_createDataFrame_pandas_with_array_type(arrow_enabled) - def check_createDataFrame_with_array_type(self, arrow_enabled): + def check_createDataFrame_pandas_with_array_type(self, arrow_enabled): pdf = pd.DataFrame({"a": [[1, 2], [3, 4]], "b": [["x", "y"], ["y", "z"]]}) with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": arrow_enabled}): df = self.spark.createDataFrame(pdf) @@ -580,6 +776,18 @@ def check_createDataFrame_with_array_type(self, arrow_enabled): for e in range(len(expected[r])): self.assertTrue(expected[r][e] == result[r][e]) + def test_createDataFrame_arrow_with_array_type_nulls(self): + t = pa.table({"a": [[1, 2], None, [3, 4]], "b": [["x", "y"], ["y", "z"], None]}) + df = self.spark.createDataFrame(t) + result = df.collect() + expected = [ + tuple(list(e) if e is not None else None for e in rec) + for rec in t.to_pandas().to_records(index=False) + ] + for r in range(len(expected)): + for e in range(len(expected[r])): + self.assertTrue(expected[r][e] == result[r][e]) + def test_toPandas_with_array_type(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -597,13 +805,28 @@ def check_toPandas_with_array_type(self, arrow_enabled): for e in range(len(expected[r])): self.assertTrue(expected[r][e] == result[r][e]) - def test_createDataFrame_with_map_type(self): + def test_toArrow_with_array_type_nulls(self): + expected = [([1, 2], ["x", "y"]), (None, ["y", "z"]), ([3, 4], None)] + array_schema = StructType( + [StructField("a", ArrayType(IntegerType())), StructField("b", ArrayType(StringType()))] + ) + df = self.spark.createDataFrame(expected, schema=array_schema) + t = df.toArrow() + result = [ + tuple(None if e is None else list(e) for e in rec) + for rec in t.to_pandas().to_records(index=False) + ] + for r in range(len(expected)): + for e in range(len(expected[r])): + self.assertTrue(expected[r][e] == result[r][e]) + + def test_createDataFrame_pandas_with_map_type(self): with self.quiet(): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_with_map_type(arrow_enabled) + self.check_createDataFrame_pandas_with_map_type(arrow_enabled) - def check_createDataFrame_with_map_type(self, arrow_enabled): + def check_createDataFrame_pandas_with_map_type(self, arrow_enabled): map_data = [{"a": 1}, {"b": 2, "c": 3}, {}, None, {"d": None}] pdf = pd.DataFrame({"id": [0, 1, 2, 3, 4], "m": map_data}) @@ -621,12 +844,52 @@ def check_createDataFrame_with_map_type(self, arrow_enabled): i, m = row self.assertEqual(m, map_data[i]) - def test_createDataFrame_with_struct_type(self): + def test_createDataFrame_arrow_with_map_type(self): + map_data = [{"a": 1}, {"b": 2, "c": 3}, {}, {}, 
{"d": None}] + + t = pa.table( + {"id": [0, 1, 2, 3, 4], "m": map_data}, + schema=pa.schema([("id", pa.int64()), ("m", pa.map_(pa.string(), pa.int64()))]), + ) + for schema in ( + "id long, m map", + StructType().add("id", LongType()).add("m", MapType(StringType(), LongType())), + ): + with self.subTest(schema=schema): + df = self.spark.createDataFrame(t, schema=schema) + + result = df.collect() + + for row in result: + i, m = row + self.assertEqual(m, map_data[i]) + + def test_createDataFrame_arrow_with_map_type_nulls(self): + map_data = [{"a": 1}, {"b": 2, "c": 3}, {}, None, {"d": None}] + + t = pa.table( + {"id": [0, 1, 2, 3, 4], "m": map_data}, + schema=pa.schema([("id", pa.int64()), ("m", pa.map_(pa.string(), pa.int64()))]), + ) + for schema in ( + "id long, m map", + StructType().add("id", LongType()).add("m", MapType(StringType(), LongType())), + ): + with self.subTest(schema=schema): + df = self.spark.createDataFrame(t, schema=schema) + + result = df.collect() + + for row in result: + i, m = row + self.assertEqual(m, map_data[i]) + + def test_createDataFrame_pandas_with_struct_type(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_with_struct_type(arrow_enabled) + self.check_createDataFrame_pandas_with_struct_type(arrow_enabled) - def check_createDataFrame_with_struct_type(self, arrow_enabled): + def check_createDataFrame_pandas_with_struct_type(self, arrow_enabled): pdf = pd.DataFrame( {"a": [Row(1, "a"), Row(2, "b")], "b": [{"s": 3, "t": "x"}, {"s": 4, "t": "y"}]} ) @@ -647,6 +910,42 @@ def check_createDataFrame_with_struct_type(self, arrow_enabled): expected[r][e] == result[r][e], f"{expected[r][e]} == {result[r][e]}" ) + def test_createDataFrame_pandas_with_struct_type(self): + for arrow_enabled in [True, False]: + with self.subTest(arrow_enabled=arrow_enabled): + self.check_createDataFrame_pandas_with_struct_type(arrow_enabled) + + def test_createDataFrame_arrow_with_struct_type_nulls(self): + t = pa.table( + { + "a": [{"x": 1, "y": "a"}, None, {"x": None, "y": "b"}], + "b": [{"s": 3, "t": None}, {"s": 4, "t": "y"}, None], + }, + ) + for schema in ( + "a struct, b struct", + StructType() + .add("a", StructType().add("x", LongType()).add("y", StringType())) + .add("b", StructType().add("s", LongType()).add("t", StringType())), + ): + with self.subTest(schema=schema): + df = self.spark.createDataFrame(t, schema) + result = df.collect() + expected = [ + ( + Row( + a=None if rec[0] is None else (Row(**rec[0])), + b=None if rec[1] is None else Row(**rec[1]), + ) + ) + for rec in t.to_pandas().to_records(index=False) + ] + for r in range(len(expected)): + for e in range(len(expected[r])): + self.assertTrue( + expected[r][e] == result[r][e], f"{expected[r][e]} == {result[r][e]}" + ) + def test_createDataFrame_with_string_dtype(self): # SPARK-34521: spark.createDataFrame does not support Pandas StringDtype extension type with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": True}): @@ -690,6 +989,22 @@ def check_toPandas_with_map_type(self, arrow_enabled): pdf = df.toPandas() assert_frame_equal(origin, pdf) + def test_toArrow_with_map_type(self): + origin = pa.table( + {"id": [0, 1, 2, 3], "m": [{}, {"a": 1}, {"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}]}, + schema=pa.schema( + [pa.field("id", pa.int64()), pa.field("m", pa.map_(pa.string(), pa.int64()), True)] + ), + ) + for schema in [ + "id long, m map", + StructType().add("id", LongType()).add("m", MapType(StringType(), LongType())), + ]: + df = 
self.spark.createDataFrame(origin, schema=schema) + + t = df.toArrow() + self.assertTrue(origin.equals(t)) + def test_toPandas_with_map_type_nulls(self): with self.quiet(): for arrow_enabled in [True, False]: @@ -712,12 +1027,29 @@ def check_toPandas_with_map_type_nulls(self, arrow_enabled): pdf = df.toPandas() assert_frame_equal(origin, pdf) - def test_createDataFrame_with_int_col_names(self): + def test_toArrow_with_map_type_nulls(self): + map_data = [{"a": 1}, {"b": 2, "c": 3}, {}, None, {"d": None}] + + origin = pa.table( + {"id": [0, 1, 2, 3, 4], "m": map_data}, + schema=pa.schema( + [pa.field("id", pa.int64()), pa.field("m", pa.map_(pa.string(), pa.int64()), True)] + ), + ) + for schema in [ + "id long, m map", + StructType().add("id", LongType()).add("m", MapType(StringType(), LongType())), + ]: + df = self.spark.createDataFrame(origin, schema=schema) + pdf = df.toArrow().to_pandas() + assert_frame_equal(origin.to_pandas(), pdf) + + def test_createDataFrame_pandas_with_int_col_names(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_with_int_col_names(arrow_enabled) + self.check_createDataFrame_pandas_with_int_col_names(arrow_enabled) - def check_createDataFrame_with_int_col_names(self, arrow_enabled): + def check_createDataFrame_pandas_with_int_col_names(self, arrow_enabled): import numpy as np pdf = pd.DataFrame(np.random.rand(4, 2)) @@ -726,6 +1058,13 @@ def check_createDataFrame_with_int_col_names(self, arrow_enabled): pdf_col_names = [str(c) for c in pdf.columns] self.assertEqual(pdf_col_names, df.columns) + def test_createDataFrame_arrow_with_int_col_names(self): + import numpy as np + + t = pa.table(pd.DataFrame(np.random.rand(4, 2))) + df = self.spark.createDataFrame(t) + self.assertEqual(t.schema.names, df.columns) + # Regression test for SPARK-23314 def test_timestamp_dst(self): # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am @@ -810,6 +1149,23 @@ def test_createDataFrame_with_category_type(self): self.assertIsInstance(arrow_first_category_element, str) self.assertIsInstance(spark_first_category_element, str) + def test_createDataFrame_with_dictionary_type_nulls(self): + import pyarrow.compute as pc + + t = pa.table({"A": ["a", "b", "c", None, "a"]}) + t = t.add_column(1, "B", pc.dictionary_encode(t["A"])) + category_first_element = sorted(t["B"].combine_chunks().dictionary.to_pylist())[0] + + df = self.spark.createDataFrame(t) + type = df.dtypes[1][1] + result = df.toArrow() + result_first_category_element = result["B"][0].as_py() + + # ensure original category elements are string + self.assertIsInstance(category_first_element, str) + self.assertEqual(type, "string") + self.assertIsInstance(result_first_category_element, str) + def test_createDataFrame_with_float_index(self): # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame self.assertEqual( @@ -830,7 +1186,8 @@ def test_createDataFrame_empty_partition(self): pdf = pd.DataFrame({"c1": [1], "c2": ["string"]}) df = self.spark.createDataFrame(pdf) self.assertEqual([Row(c1=1, c2="string")], df.collect()) - self.assertGreater(self.spark.sparkContext.defaultParallelism, len(pdf)) + if not is_remote_only(): + self.assertGreater(self._legacy_sc.defaultParallelism, len(pdf)) def test_toPandas_error(self): for arrow_enabled in [True, False]: @@ -847,6 +1204,15 @@ def check_toPandas_error(self, arrow_enabled): with self.assertRaises(ArithmeticException): self.spark.sql("select 1/0").toPandas() + def 
test_toArrow_error(self): + with self.sql_conf( + { + "spark.sql.ansi.enabled": True, + } + ): + with self.assertRaises(ArithmeticException): + self.spark.sql("select 1/0").toArrow() + def test_toPandas_duplicate_field_names(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -901,12 +1267,45 @@ def check_toPandas_duplicate_field_names(self, arrow_enabled): expected = pd.DataFrame.from_records(data, columns=schema.names) assert_frame_equal(df.toPandas(), expected) - def test_createDataFrame_duplicate_field_names(self): + def test_toArrow_duplicate_field_names(self): + data = [[1, 1], [2, 2]] + names = ["a", "a"] + df = self.spark.createDataFrame(data, names) + + expected = pa.table( + [[1, 2], [1, 2]], + schema=pa.schema([pa.field("a", pa.int64()), pa.field("a", pa.int64())]), + ) + + self.assertTrue(df.toArrow().equals(expected)) + + data = [Row(Row("a", 1), Row(2, 3, "b", 4, "c")), Row(Row("x", 6), Row(7, 8, "y", 9, "z"))] + schema = ( + StructType() + .add("struct", StructType().add("x", StringType()).add("x", IntegerType())) + .add( + "struct", + StructType() + .add("a", IntegerType()) + .add("x", IntegerType()) + .add("x", StringType()) + .add("y", IntegerType()) + .add("y", StringType()), + ) + ) + df = self.spark.createDataFrame(data, schema=schema) + + with self.assertRaisesRegex( + UnsupportedOperationException, "DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT" + ): + df.toArrow() + + def test_createDataFrame_pandas_duplicate_field_names(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_duplicate_field_names(arrow_enabled) + self.check_createDataFrame_pandas_duplicate_field_names(arrow_enabled) - def check_createDataFrame_duplicate_field_names(self, arrow_enabled): + def check_createDataFrame_pandas_duplicate_field_names(self, arrow_enabled): schema = ( StructType() .add("struct", StructType().add("x", StringType()).add("x", IntegerType())) @@ -929,6 +1328,66 @@ def check_createDataFrame_duplicate_field_names(self, arrow_enabled): self.assertEqual(df.collect(), data) + def test_createDataFrame_arrow_duplicate_field_names(self): + t = pa.table( + [[1, 2], [1, 2]], + schema=pa.schema([pa.field("a", pa.int64()), pa.field("a", pa.int64())]), + ) + schema = StructType().add("a", LongType()).add("a", LongType()) + + df = self.spark.createDataFrame(t) + + self.assertTrue(df.toArrow().equals(t)) + + df = self.spark.createDataFrame(t, schema=schema) + + self.assertTrue(df.toArrow().equals(t)) + + t = pa.table( + [ + pa.StructArray.from_arrays( + [ + pa.array(["a", "x"], type=pa.string()), + pa.array([1, 6], type=pa.int32()), + ], + names=["x", "x"], + ), + pa.StructArray.from_arrays( + [ + pa.array([2, 7], type=pa.int32()), + pa.array([3, 8], type=pa.int32()), + pa.array(["b", "y"], type=pa.string()), + pa.array([4, 9], type=pa.int32()), + pa.array(["c", "z"], type=pa.string()), + ], + names=["a", "x", "x", "y", "y"], + ), + ], + names=["struct", "struct"], + ) + schema = ( + StructType() + .add("struct", StructType().add("x", StringType()).add("x", IntegerType())) + .add( + "struct", + StructType() + .add("a", IntegerType()) + .add("x", IntegerType()) + .add("x", StringType()) + .add("y", IntegerType()) + .add("y", StringType()), + ) + ) + with self.assertRaisesRegex( + UnsupportedOperationException, "DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT" + ): + self.spark.createDataFrame(t) + + with self.assertRaisesRegex( + UnsupportedOperationException, "DUPLICATED_FIELD_NAME_IN_ARROW_STRUCT" + ): + 
self.spark.createDataFrame(t, schema) + def test_toPandas_empty_columns(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -940,38 +1399,39 @@ def check_toPandas_empty_columns(self, arrow_enabled): with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": arrow_enabled}): assert_frame_equal(df.toPandas(), pd.DataFrame(columns=[], index=range(2))) - def test_createDataFrame_nested_timestamp(self): + def test_toArrow_empty_columns(self): + df = self.spark.range(2).select([]) + + self.assertTrue(df.toArrow().equals(pa.table([]))) + + def test_toPandas_empty_rows(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): - self.check_createDataFrame_nested_timestamp(arrow_enabled) + self.check_toPandas_empty_rows(arrow_enabled) - def check_createDataFrame_nested_timestamp(self, arrow_enabled): - schema = ( - StructType() - .add("ts", TimestampType()) - .add("ts_ntz", TimestampNTZType()) - .add( - "struct", StructType().add("ts", TimestampType()).add("ts_ntz", TimestampNTZType()) + def check_toPandas_empty_rows(self, arrow_enabled): + df = self.spark.range(2).limit(0) + + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": arrow_enabled}): + assert_frame_equal(df.toPandas(), pd.DataFrame({"id": pd.Series([], dtype="int64")})) + + def test_toArrow_empty_rows(self): + df = self.spark.range(2).limit(0) + + self.assertTrue( + df.toArrow().equals( + pa.Table.from_arrays([[]], schema=pa.schema([pa.field("id", pa.int64(), False)])) ) - .add("array", ArrayType(TimestampType())) - .add("array_ntz", ArrayType(TimestampNTZType())) - .add("map", MapType(StringType(), TimestampType())) - .add("map_ntz", MapType(StringType(), TimestampNTZType())) ) - data = [ - Row( - datetime.datetime(2023, 1, 1, 0, 0, 0), - datetime.datetime(2023, 1, 1, 0, 0, 0), - Row( - datetime.datetime(2023, 1, 1, 0, 0, 0), - datetime.datetime(2023, 1, 1, 0, 0, 0), - ), - [datetime.datetime(2023, 1, 1, 0, 0, 0)], - [datetime.datetime(2023, 1, 1, 0, 0, 0)], - dict(ts=datetime.datetime(2023, 1, 1, 0, 0, 0)), - dict(ts_ntz=datetime.datetime(2023, 1, 1, 0, 0, 0)), - ) - ] + + def test_createDataFrame_pandas_nested_timestamp(self): + for arrow_enabled in [True, False]: + with self.subTest(arrow_enabled=arrow_enabled): + self.check_createDataFrame_pandas_nested_timestamp(arrow_enabled) + + def check_createDataFrame_pandas_nested_timestamp(self, arrow_enabled): + schema = self.schema_nested_timestamp + data = self.data_nested_timestamp pdf = pd.DataFrame.from_records(data, columns=schema.names) with self.sql_conf( @@ -982,22 +1442,26 @@ def check_createDataFrame_nested_timestamp(self, arrow_enabled): ): df = self.spark.createDataFrame(pdf, schema) - expected = Row( - ts=datetime.datetime(2022, 12, 31, 21, 0, 0), - ts_ntz=datetime.datetime(2023, 1, 1, 0, 0, 0), - struct=Row( - ts=datetime.datetime(2022, 12, 31, 21, 0, 0), - ts_ntz=datetime.datetime(2023, 1, 1, 0, 0, 0), - ), - array=[datetime.datetime(2022, 12, 31, 21, 0, 0)], - array_ntz=[datetime.datetime(2023, 1, 1, 0, 0, 0)], - map=dict(ts=datetime.datetime(2022, 12, 31, 21, 0, 0)), - map_ntz=dict(ts_ntz=datetime.datetime(2023, 1, 1, 0, 0, 0)), - ) + expected = self.data_nested_timestamp_expected_ny + + self.assertEqual(df.first(), expected) + + def test_createDataFrame_arrow_nested_timestamp(self): + from pyspark.sql.pandas.types import to_arrow_schema + + schema = self.schema_nested_timestamp + data = self.data_nested_timestamp + pdf = pd.DataFrame.from_records(data, columns=schema.names) + 
arrow_schema = to_arrow_schema(schema, timestamp_utc=False) + t = pa.Table.from_pandas(pdf, arrow_schema) + + with self.sql_conf({"spark.sql.session.timeZone": "America/New_York"}): + df = self.spark.createDataFrame(t, schema) + + expected = self.data_nested_timestamp_expected_ny self.assertEqual(df.first(), expected) - @unittest.skipIf(sys.version_info < (3, 9), "zoneinfo is available from Python 3.9+") def test_toPandas_timestmap_tzinfo(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -1031,32 +1495,8 @@ def test_toPandas_nested_timestamp(self): self.check_toPandas_nested_timestamp(arrow_enabled) def check_toPandas_nested_timestamp(self, arrow_enabled): - schema = ( - StructType() - .add("ts", TimestampType()) - .add("ts_ntz", TimestampNTZType()) - .add( - "struct", StructType().add("ts", TimestampType()).add("ts_ntz", TimestampNTZType()) - ) - .add("array", ArrayType(TimestampType())) - .add("array_ntz", ArrayType(TimestampNTZType())) - .add("map", MapType(StringType(), TimestampType())) - .add("map_ntz", MapType(StringType(), TimestampNTZType())) - ) - data = [ - Row( - datetime.datetime(2023, 1, 1, 0, 0, 0), - datetime.datetime(2023, 1, 1, 0, 0, 0), - Row( - datetime.datetime(2023, 1, 1, 0, 0, 0), - datetime.datetime(2023, 1, 1, 0, 0, 0), - ), - [datetime.datetime(2023, 1, 1, 0, 0, 0)], - [datetime.datetime(2023, 1, 1, 0, 0, 0)], - dict(ts=datetime.datetime(2023, 1, 1, 0, 0, 0)), - dict(ts_ntz=datetime.datetime(2023, 1, 1, 0, 0, 0)), - ) - ] + schema = self.schema_nested_timestamp + data = self.data_nested_timestamp df = self.spark.createDataFrame(data, schema) with self.sql_conf( @@ -1087,6 +1527,57 @@ def check_toPandas_nested_timestamp(self, arrow_enabled): assert_frame_equal(pdf, expected) + def test_toArrow_nested_timestamp(self): + schema = self.schema_nested_timestamp + data = self.data_nested_timestamp + df = self.spark.createDataFrame(data, schema) + + t = df.toArrow() + + from pyspark.sql.pandas.types import to_arrow_schema + + arrow_schema = to_arrow_schema(schema) + expected = pa.Table.from_pydict( + { + "ts": [datetime.datetime(2023, 1, 1, 8, 0, 0)], + "ts_ntz": [datetime.datetime(2023, 1, 1, 0, 0, 0)], + "struct": [ + Row( + datetime.datetime(2023, 1, 1, 8, 0, 0), + datetime.datetime(2023, 1, 1, 0, 0, 0), + ) + ], + "array": [[datetime.datetime(2023, 1, 1, 8, 0, 0)]], + "array_ntz": [[datetime.datetime(2023, 1, 1, 0, 0, 0)]], + "map": [dict(ts=datetime.datetime(2023, 1, 1, 8, 0, 0))], + "map_ntz": [dict(ts_ntz=datetime.datetime(2023, 1, 1, 0, 0, 0))], + }, + schema=arrow_schema, + ) + + self.assertTrue(t.equals(expected)) + + def test_arrow_map_timestamp_nulls_round_trip(self): + origin_schema = pa.schema([("map", pa.map_(pa.string(), pa.timestamp("us", tz="UTC")))]) + origin = pa.table( + [[dict(ts=datetime.datetime(2023, 1, 1, 8, 0, 0)), None]], + schema=origin_schema, + ) + df = self.spark.createDataFrame(origin) + t = df.toArrow() + + # SPARK-48302: PyArrow versions before 17.0.0 replaced nulls with empty lists when + # reconstructing MapArray columns to localize timestamps + if LooseVersion(pa.__version__) >= LooseVersion("17.0.0"): + expected = origin + else: + expected = pa.table( + [[dict(ts=datetime.datetime(2023, 1, 1, 8, 0, 0)), []]], + schema=origin_schema, + ) + + self.assertTrue(t.equals(expected)) + def test_createDataFrame_udt(self): for arrow_enabled in [True, False]: with self.subTest(arrow_enabled=arrow_enabled): @@ -1183,6 +1674,50 @@ def test_negative_and_zero_batch_size(self): pdf = pd.DataFrame({"a": 
[123]}) assert_frame_equal(pdf, self.spark.createDataFrame(pdf).toPandas()) + def test_createDataFrame_arrow_large_string(self): + a = pa.array(["a"] * 5, type=pa.large_string()) + t = pa.table([a], ["ls"]) + df = self.spark.createDataFrame(t) + self.assertIsInstance(df.schema["ls"].dataType, StringType) + + def test_createDataFrame_arrow_large_binary(self): + a = pa.array(["a"] * 5, type=pa.large_binary()) + t = pa.table([a], ["lb"]) + df = self.spark.createDataFrame(t) + self.assertIsInstance(df.schema["lb"].dataType, BinaryType) + + def test_createDataFrame_arrow_large_list(self): + a = pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int32())) + t = pa.table([a], ["ll"]) + df = self.spark.createDataFrame(t) + self.assertIsInstance(df.schema["ll"].dataType, ArrayType) + + def test_createDataFrame_arrow_large_list_int64_offset(self): + # Check for expected failure if the large list contains an index >= 2^31 + a = pa.LargeListArray.from_arrays( + [0, 2**31], pa.NullArray.from_buffers(pa.null(), 2**31, [None]) + ) + t = pa.table([a], ["ll"]) + with self.assertRaises(Exception): + self.spark.createDataFrame(t) + + def test_createDataFrame_arrow_fixed_size_binary(self): + a = pa.array(["a"] * 5, type=pa.binary(1)) + t = pa.table([a], ["fsb"]) + df = self.spark.createDataFrame(t) + self.assertIsInstance(df.schema["fsb"].dataType, BinaryType) + + def test_createDataFrame_arrow_fixed_size_list(self): + a = pa.array([[-1, 3]] * 5, type=pa.list_(pa.int32(), 2)) + t = pa.table([a], ["fsl"]) + if LooseVersion(pa.__version__) < LooseVersion("14.0.0"): + # PyArrow versions before 14.0.0 do not support casting FixedSizeListArray to ListArray + with self.assertRaises(PySparkTypeError): + df = self.spark.createDataFrame(t) + else: + df = self.spark.createDataFrame(t) + self.assertIsInstance(df.schema["fsl"].dataType, ArrayType) + @unittest.skipIf( not have_pandas or not have_pyarrow, diff --git a/python/pyspark/sql/tests/test_arrow_map.py b/python/pyspark/sql/tests/test_arrow_map.py index f5fc2ea29ebad..2e82869230db4 100644 --- a/python/pyspark/sql/tests/test_arrow_map.py +++ b/python/pyspark/sql/tests/test_arrow_map.py @@ -103,14 +103,14 @@ def bad_iter_elem(_): with self.assertRaisesRegex( PythonException, "Return type of the user-defined function should be iterator " - "of pyarrow.RecordBatch, but is int.", + "of pyarrow.RecordBatch, but is int", ): (self.spark.range(10, numPartitions=3).mapInArrow(not_iter, "a int").count()) with self.assertRaisesRegex( PythonException, "Return type of the user-defined function should be iterator " - "of pyarrow.RecordBatch, but is iterator of int.", + "of pyarrow.RecordBatch, but is iterator of int", ): (self.spark.range(10, numPartitions=3).mapInArrow(bad_iter_elem, "a int").count()) diff --git a/python/pyspark/sql/tests/test_arrow_python_udf.py b/python/pyspark/sql/tests/test_arrow_python_udf.py index 23f302ec3c8d3..5a66d61cb66a2 100644 --- a/python/pyspark/sql/tests/test_arrow_python_udf.py +++ b/python/pyspark/sql/tests/test_arrow_python_udf.py @@ -17,7 +17,7 @@ import unittest -from pyspark.errors import PythonException, PySparkNotImplementedError +from pyspark.errors import AnalysisException, PythonException, PySparkNotImplementedError from pyspark.sql import Row from pyspark.sql.functions import udf from pyspark.sql.tests.test_udf import BaseUDFTestsMixin @@ -197,6 +197,28 @@ def test_warn_no_args(self): " without arguments.", ) + def test_named_arguments_negative(self): + @udf("int") + def test_udf(a, b): + return a + b + + self.spark.udf.register("test_udf", 
test_udf) + + with self.assertRaisesRegex( + AnalysisException, + "DUPLICATE_ROUTINE_PARAMETER_ASSIGNMENT.DOUBLE_NAMED_ARGUMENT_REFERENCE", + ): + self.spark.sql("SELECT test_udf(a => id, a => id * 10) FROM range(2)").show() + + with self.assertRaisesRegex(AnalysisException, "UNEXPECTED_POSITIONAL_ARGUMENT"): + self.spark.sql("SELECT test_udf(a => id, id * 10) FROM range(2)").show() + + with self.assertRaises(PythonException): + self.spark.sql("SELECT test_udf(c => 'x') FROM range(2)").show() + + with self.assertRaises(PythonException): + self.spark.sql("SELECT test_udf(id, a => id * 10) FROM range(2)").show() + class PythonUDFArrowTests(PythonUDFArrowTestsMixin, ReusedSQLTestCase): @classmethod diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index e51ae69814bdd..ea17febc00e38 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -19,7 +19,7 @@ from itertools import chain from pyspark.sql import Column, Row from pyspark.sql import functions as sf -from pyspark.sql.types import StructType, StructField, LongType +from pyspark.sql.types import StructType, StructField, IntegerType, LongType from pyspark.errors import AnalysisException, PySparkTypeError, PySparkValueError from pyspark.testing.sqlutils import ReusedSQLTestCase @@ -42,7 +42,7 @@ def test_and_in_expression(self): def test_validate_column_types(self): from pyspark.sql.functions import udf, to_json - from pyspark.sql.column import _to_java_column + from pyspark.sql.classic.column import _to_java_column self.assertTrue("Column" in _to_java_column("a").getClass().toString()) self.assertTrue("Column" in _to_java_column("a").getClass().toString()) @@ -228,6 +228,17 @@ def test_alias_negative(self): message_parameters={"arg_name": "metadata"}, ) + def test_cast_str_representation(self): + self.assertEqual(str(sf.col("a").cast("int")), "Column<'CAST(a AS INT)'>") + self.assertEqual(str(sf.col("a").cast("INT")), "Column<'CAST(a AS INT)'>") + self.assertEqual(str(sf.col("a").cast(IntegerType())), "Column<'CAST(a AS INT)'>") + self.assertEqual(str(sf.col("a").cast(LongType())), "Column<'CAST(a AS BIGINT)'>") + + self.assertEqual(str(sf.col("a").try_cast("int")), "Column<'TRY_CAST(a AS INT)'>") + self.assertEqual(str(sf.col("a").try_cast("INT")), "Column<'TRY_CAST(a AS INT)'>") + self.assertEqual(str(sf.col("a").try_cast(IntegerType())), "Column<'TRY_CAST(a AS INT)'>") + self.assertEqual(str(sf.col("a").try_cast(LongType())), "Column<'TRY_CAST(a AS BIGINT)'>") + def test_cast_negative(self): with self.assertRaises(PySparkTypeError) as pe: self.spark.range(1).id.cast(123) @@ -248,6 +259,18 @@ def test_over_negative(self): message_parameters={"arg_name": "window", "arg_type": "int"}, ) + def test_eqnullsafe_classmethod_usage(self): + df = self.spark.range(1) + self.assertEqual(df.select(Column.eqNullSafe(df.id, df.id)).first()[0], True) + + def test_isinstance_dataframe(self): + self.assertIsInstance(self.spark.range(1).id, Column) + + def test_expr_str_representation(self): + expression = sf.expr("foo") + when_cond = sf.when(expression, sf.lit(None)) + self.assertEqual(str(when_cond), "Column<'CASE WHEN foo THEN NULL END'>") + class ColumnTests(ColumnTestsMixin, ReusedSQLTestCase): pass diff --git a/python/pyspark/sql/tests/test_context.py b/python/pyspark/sql/tests/test_context.py index b381833314861..f363b8748c0b9 100644 --- a/python/pyspark/sql/tests/test_context.py +++ b/python/pyspark/sql/tests/test_context.py @@ -26,13 +26,13 @@ from pyspark 
import SparkContext, SQLContext from pyspark.sql import Row, SparkSession from pyspark.sql.types import StructType, StringType, StructField -from pyspark.testing.utils import ReusedPySparkTestCase +from pyspark.testing.sqlutils import ReusedSQLTestCase -class HiveContextSQLTests(ReusedPySparkTestCase): +class HiveContextSQLTests(ReusedSQLTestCase): @classmethod def setUpClass(cls): - ReusedPySparkTestCase.setUpClass() + ReusedSQLTestCase.setUpClass() cls.tempdir = tempfile.NamedTemporaryFile(delete=False) cls.hive_available = True cls.spark = None @@ -58,7 +58,7 @@ def setUp(self): @classmethod def tearDownClass(cls): - ReusedPySparkTestCase.tearDownClass() + ReusedSQLTestCase.tearDownClass() shutil.rmtree(cls.tempdir.name, ignore_errors=True) if cls.spark is not None: cls.spark.stop() @@ -100,23 +100,20 @@ def test_save_and_load_table(self): self.spark.sql("DROP TABLE savedJsonTable") self.spark.sql("DROP TABLE externalJsonTable") - defaultDataSourceName = self.spark.conf.get( - "spark.sql.sources.default", "org.apache.spark.sql.parquet" - ) - self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") - df.write.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite") - actual = self.spark.catalog.createTable("externalJsonTable", path=tmpPath) - self.assertEqual( - sorted(df.collect()), sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect()) - ) - self.assertEqual( - sorted(df.collect()), - sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect()), - ) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - self.spark.sql("DROP TABLE savedJsonTable") - self.spark.sql("DROP TABLE externalJsonTable") - self.spark.sql("SET spark.sql.sources.default=" + defaultDataSourceName) + with self.sql_conf({"spark.sql.sources.default": "org.apache.spark.sql.json"}): + df.write.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite") + actual = self.spark.catalog.createTable("externalJsonTable", path=tmpPath) + self.assertEqual( + sorted(df.collect()), + sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect()), + ) + self.assertEqual( + sorted(df.collect()), + sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect()), + ) + self.assertEqual(sorted(df.collect()), sorted(actual.collect())) + self.spark.sql("DROP TABLE savedJsonTable") + self.spark.sql("DROP TABLE externalJsonTable") shutil.rmtree(tmpPath) diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 3f6a8eece5b09..c7cf43a334541 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -23,7 +23,7 @@ import io from contextlib import redirect_stdout -from pyspark.sql import Row, functions +from pyspark.sql import Row, functions, DataFrame from pyspark.sql.functions import col, lit, count, struct from pyspark.sql.types import ( StringType, @@ -37,9 +37,7 @@ AnalysisException, IllegalArgumentException, PySparkTypeError, - ArithmeticException, - QueryContextType, - NumberFormatException, + PySparkValueError, ) from pyspark.testing.sqlutils import ( ReusedSQLTestCase, @@ -255,24 +253,22 @@ def test_ordering_of_with_columns_renamed(self): self.assertEqual(df2.columns, ["a"]) def test_drop_duplicates(self): - # SPARK-36034 test that drop duplicates throws a type error when in correct type provided df = self.spark.createDataFrame([("Alice", 50), ("Alice", 60)], ["name", "age"]) # shouldn't drop a non-null row self.assertEqual(df.dropDuplicates().count(), 2) 
self.assertEqual(df.dropDuplicates(["name"]).count(), 1) - self.assertEqual(df.dropDuplicates(["name", "age"]).count(), 2) - with self.assertRaises(PySparkTypeError) as pe: - df.dropDuplicates("name") + self.assertEqual(df.drop_duplicates(["name"]).count(), 1) + self.assertEqual(df.drop_duplicates(["name", "age"]).count(), 2) - self.check_error( - exception=pe.exception, - error_class="NOT_LIST_OR_TUPLE", - message_parameters={"arg_name": "subset", "arg_type": "str"}, - ) + # SPARK-48482 dropDuplicates should take varargs + self.assertEqual(df.dropDuplicates("name").count(), 1) + self.assertEqual(df.dropDuplicates("name", "age").count(), 2) + self.assertEqual(df.drop_duplicates("name").count(), 1) + self.assertEqual(df.drop_duplicates("name", "age").count(), 2) def test_drop_duplicates_with_ambiguous_reference(self): df1 = self.spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) @@ -433,6 +429,11 @@ def test_sample(self): IllegalArgumentException, lambda: self.spark.range(1).sample(-1.0).count() ) + def test_sample_with_random_seed(self): + df = self.spark.range(10000).sample(0.1) + cnts = [df.count() for i in range(10)] + self.assertEqual(1, len(set(cnts))) + def test_toDF_with_string(self): df = self.spark.createDataFrame([("John", 30), ("Alice", 25), ("Bob", 28)]) data = [("John", 30), ("Alice", 25), ("Bob", 28)] @@ -529,7 +530,7 @@ def test_join_without_on(self): def test_invalid_join_method(self): df1 = self.spark.createDataFrame([("Alice", 5), ("Bob", 8)], ["name", "age"]) df2 = self.spark.createDataFrame([("Alice", 80), ("Bob", 90)], ["name", "height"]) - self.assertRaises(IllegalArgumentException, lambda: df1.join(df2, how="invalid-join-type")) + self.assertRaises(AnalysisException, lambda: df1.join(df2, how="invalid-join-type")) # Cartesian products require cross join syntax def test_require_cross(self): @@ -835,491 +836,29 @@ def test_duplicate_field_names(self): self.assertEqual(df.schema, schema) self.assertEqual(df.collect(), data) - def test_dataframe_error_context(self): - # SPARK-47274: Add more useful contexts for PySpark DataFrame API errors. 
- with self.sql_conf({"spark.sql.ansi.enabled": True}): - df = self.spark.range(10) - - # DataFrameQueryContext with pysparkLoggingInfo - divide - with self.assertRaises(ArithmeticException) as pe: - df.withColumn("div_zero", df.id / 0).collect() - self.check_error( - exception=pe.exception, - error_class="DIVIDE_BY_ZERO", - message_parameters={"config": '"spark.sql.ansi.enabled"'}, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="divide", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - plus - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("plus_invalid_type", df.id + "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="plus", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - minus - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("minus_invalid_type", df.id - "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="minus", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - multiply - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("multiply_invalid_type", df.id * "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="multiply", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - mod - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("mod_invalid_type", df.id % "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="mod", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - equalTo - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("equalTo_invalid_type", df.id == "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="equalTo", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - lt - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("lt_invalid_type", df.id < "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="lt", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - leq - with 
self.assertRaises(NumberFormatException) as pe: - df.withColumn("leq_invalid_type", df.id <= "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="leq", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - geq - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("geq_invalid_type", df.id >= "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="geq", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - gt - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("gt_invalid_type", df.id > "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="gt", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - eqNullSafe - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("eqNullSafe_invalid_type", df.id.eqNullSafe("string")).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="eqNullSafe", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - and - with self.assertRaises(AnalysisException) as pe: - df.withColumn("and_invalid_type", df.id & "string").collect() - self.check_error( - exception=pe.exception, - error_class="DATATYPE_MISMATCH.BINARY_OP_WRONG_TYPE", - message_parameters={ - "inputType": '"BOOLEAN"', - "actualDataType": '"BIGINT"', - "sqlExpr": '"(id AND string)"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="and", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - or - with self.assertRaises(AnalysisException) as pe: - df.withColumn("or_invalid_type", df.id | "string").collect() - self.check_error( - exception=pe.exception, - error_class="DATATYPE_MISMATCH.BINARY_OP_WRONG_TYPE", - message_parameters={ - "inputType": '"BOOLEAN"', - "actualDataType": '"BIGINT"', - "sqlExpr": '"(id OR string)"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="or", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - bitwiseOR - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("bitwiseOR_invalid_type", df.id.bitwiseOR("string")).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="bitwiseOR", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - bitwiseAND - with self.assertRaises(NumberFormatException) as pe: - 
df.withColumn("bitwiseAND_invalid_type", df.id.bitwiseAND("string")).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="bitwiseAND", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - bitwiseXOR - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("bitwiseXOR_invalid_type", df.id.bitwiseXOR("string")).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="bitwiseXOR", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - chained (`divide` is problematic) - with self.assertRaises(ArithmeticException) as pe: - df.withColumn("multiply_ten", df.id * 10).withColumn( - "divide_zero", df.id / 0 - ).withColumn("plus_ten", df.id + 10).withColumn("minus_ten", df.id - 10).collect() - self.check_error( - exception=pe.exception, - error_class="DIVIDE_BY_ZERO", - message_parameters={"config": '"spark.sql.ansi.enabled"'}, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="divide", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - chained (`plus` is problematic) - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("multiply_ten", df.id * 10).withColumn( - "divide_ten", df.id / 10 - ).withColumn("plus_string", df.id + "string").withColumn( - "minus_ten", df.id - 10 - ).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="plus", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - chained (`minus` is problematic) - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("multiply_ten", df.id * 10).withColumn( - "divide_ten", df.id / 10 - ).withColumn("plus_ten", df.id + 10).withColumn( - "minus_string", df.id - "string" - ).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="minus", - ) - - # DataFrameQueryContext with pysparkLoggingInfo - chained (`multiply` is problematic) - with self.assertRaises(NumberFormatException) as pe: - df.withColumn("multiply_string", df.id * "string").withColumn( - "divide_ten", df.id / 10 - ).withColumn("plus_ten", df.id + 10).withColumn("minus_ten", df.id - 10).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="multiply", - ) + def test_union_classmethod_usage(self): + df = self.spark.range(1) + self.assertEqual(DataFrame.union(df, df).collect(), [Row(id=0), Row(id=0)]) - # Multiple 
expressions in df.select (`divide` is problematic) - with self.assertRaises(ArithmeticException) as pe: - df.select(df.id - 10, df.id + 4, df.id / 0, df.id * 5).collect() - self.check_error( - exception=pe.exception, - error_class="DIVIDE_BY_ZERO", - message_parameters={"config": '"spark.sql.ansi.enabled"'}, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="divide", - ) + def test_isinstance_dataframe(self): + self.assertIsInstance(self.spark.range(1), DataFrame) - # Multiple expressions in df.select (`plus` is problematic) - with self.assertRaises(NumberFormatException) as pe: - df.select(df.id - 10, df.id + "string", df.id / 10, df.id * 5).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="plus", - ) - - # Multiple expressions in df.select (`minus` is problematic) - with self.assertRaises(NumberFormatException) as pe: - df.select(df.id - "string", df.id + 4, df.id / 10, df.id * 5).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="minus", - ) - - # Multiple expressions in df.select (`multiply` is problematic) - with self.assertRaises(NumberFormatException) as pe: - df.select(df.id - 10, df.id + 4, df.id / 10, df.id * "string").collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="multiply", - ) - - # Multiple expressions with pre-declared expressions (`divide` is problematic) - a = df.id / 10 - b = df.id / 0 - with self.assertRaises(ArithmeticException) as pe: - df.select(a, df.id + 4, b, df.id * 5).collect() - self.check_error( - exception=pe.exception, - error_class="DIVIDE_BY_ZERO", - message_parameters={"config": '"spark.sql.ansi.enabled"'}, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="divide", - ) - - # Multiple expressions with pre-declared expressions (`plus` is problematic) - a = df.id + "string" - b = df.id + 4 - with self.assertRaises(NumberFormatException) as pe: - df.select(df.id / 10, a, b, df.id * 5).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="plus", - ) - - # Multiple expressions with pre-declared expressions (`minus` is problematic) - a = df.id - "string" - b = df.id - 5 - with self.assertRaises(NumberFormatException) as pe: - df.select(a, df.id / 10, b, df.id * 5).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - 
pyspark_fragment="minus", - ) - - # Multiple expressions with pre-declared expressions (`multiply` is problematic) - a = df.id * "string" - b = df.id * 10 - with self.assertRaises(NumberFormatException) as pe: - df.select(a, df.id / 10, b, df.id + 5).collect() - self.check_error( - exception=pe.exception, - error_class="CAST_INVALID_INPUT", - message_parameters={ - "expression": "'string'", - "sourceType": '"STRING"', - "targetType": '"BIGINT"', - "ansiConfig": '"spark.sql.ansi.enabled"', - }, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="multiply", - ) - - # DataFrameQueryContext without pysparkLoggingInfo - with self.assertRaises(AnalysisException) as pe: - df.select("non-existing-column") - self.check_error( - exception=pe.exception, - error_class="UNRESOLVED_COLUMN.WITH_SUGGESTION", - message_parameters={"objectName": "`non-existing-column`", "proposal": "`id`"}, - query_context_type=QueryContextType.DataFrame, - pyspark_fragment="", - ) - - # SQLQueryContext - with self.assertRaises(ArithmeticException) as pe: - self.spark.sql("select 10/0").collect() - self.check_error( - exception=pe.exception, - error_class="DIVIDE_BY_ZERO", - message_parameters={"config": '"spark.sql.ansi.enabled"'}, - query_context_type=QueryContextType.SQL, - ) - - # No QueryContext - with self.assertRaises(AnalysisException) as pe: - self.spark.sql("select * from non-existing-table") - self.check_error( - exception=pe.exception, - error_class="INVALID_IDENTIFIER", - message_parameters={"ident": "non-existing-table"}, - query_context_type=None, - ) + def test_checkpoint_dataframe(self): + with io.StringIO() as buf, redirect_stdout(buf): + self.spark.range(1).localCheckpoint().explain() + self.assertIn("ExistingRDD", buf.getvalue()) class DataFrameTests(DataFrameTestsMixin, ReusedSQLTestCase): - pass + def test_query_execution_unsupported_in_classic(self): + with self.assertRaises(PySparkValueError) as pe: + self.spark.range(1).executionInfo + + self.check_error( + exception=pe.exception, + error_class="CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF", + message_parameters={"member": "queryExecution"}, + ) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_dataframe_query_context.py b/python/pyspark/sql/tests/test_dataframe_query_context.py new file mode 100644 index 0000000000000..e1a3e33df8593 --- /dev/null +++ b/python/pyspark/sql/tests/test_dataframe_query_context.py @@ -0,0 +1,488 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import unittest +from pyspark.errors import ( + AnalysisException, + ArithmeticException, + QueryContextType, + NumberFormatException, +) +from pyspark.testing.sqlutils import ( + ReusedSQLTestCase, +) + + +class DataFrameQueryContextTestsMixin: + def test_dataframe_query_context(self): + # SPARK-47274: Add more useful contexts for PySpark DataFrame API errors. + with self.sql_conf({"spark.sql.ansi.enabled": True}): + df = self.spark.range(10) + + # DataFrameQueryContext with pysparkLoggingInfo - divide + with self.assertRaises(ArithmeticException) as pe: + df.withColumn("div_zero", df.id / 0).collect() + self.check_error( + exception=pe.exception, + error_class="DIVIDE_BY_ZERO", + message_parameters={"config": '"spark.sql.ansi.enabled"'}, + query_context_type=QueryContextType.DataFrame, + fragment="__truediv__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - plus + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("plus_invalid_type", df.id + "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__add__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - minus + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("minus_invalid_type", df.id - "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__sub__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - multiply + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("multiply_invalid_type", df.id * "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__mul__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - mod + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("mod_invalid_type", df.id % "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__mod__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - equalTo + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("equalTo_invalid_type", df.id == "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__eq__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - lt + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("lt_invalid_type", df.id < "string").collect() + self.check_error( + exception=pe.exception, + 
error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__lt__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - leq + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("leq_invalid_type", df.id <= "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__le__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - geq + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("geq_invalid_type", df.id >= "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__ge__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - gt + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("gt_invalid_type", df.id > "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__gt__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - eqNullSafe + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("eqNullSafe_invalid_type", df.id.eqNullSafe("string")).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="eqNullSafe", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - bitwiseOR + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("bitwiseOR_invalid_type", df.id.bitwiseOR("string")).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="bitwiseOR", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - bitwiseAND + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("bitwiseAND_invalid_type", df.id.bitwiseAND("string")).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="bitwiseAND", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - bitwiseXOR + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("bitwiseXOR_invalid_type", df.id.bitwiseXOR("string")).collect() + self.check_error( + exception=pe.exception, + 
error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="bitwiseXOR", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - chained (`divide` is problematic) + with self.assertRaises(ArithmeticException) as pe: + df.withColumn("multiply_ten", df.id * 10).withColumn( + "divide_zero", df.id / 0 + ).withColumn("plus_ten", df.id + 10).withColumn("minus_ten", df.id - 10).collect() + self.check_error( + exception=pe.exception, + error_class="DIVIDE_BY_ZERO", + message_parameters={"config": '"spark.sql.ansi.enabled"'}, + query_context_type=QueryContextType.DataFrame, + fragment="__truediv__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - chained (`plus` is problematic) + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("multiply_ten", df.id * 10).withColumn( + "divide_ten", df.id / 10 + ).withColumn("plus_string", df.id + "string").withColumn( + "minus_ten", df.id - 10 + ).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__add__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - chained (`minus` is problematic) + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("multiply_ten", df.id * 10).withColumn( + "divide_ten", df.id / 10 + ).withColumn("plus_ten", df.id + 10).withColumn( + "minus_string", df.id - "string" + ).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__sub__", + ) + + # DataFrameQueryContext with pysparkLoggingInfo - chained (`multiply` is problematic) + with self.assertRaises(NumberFormatException) as pe: + df.withColumn("multiply_string", df.id * "string").withColumn( + "divide_ten", df.id / 10 + ).withColumn("plus_ten", df.id + 10).withColumn("minus_ten", df.id - 10).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__mul__", + ) + + # Multiple expressions in df.select (`divide` is problematic) + with self.assertRaises(ArithmeticException) as pe: + df.select(df.id - 10, df.id + 4, df.id / 0, df.id * 5).collect() + self.check_error( + exception=pe.exception, + error_class="DIVIDE_BY_ZERO", + message_parameters={"config": '"spark.sql.ansi.enabled"'}, + query_context_type=QueryContextType.DataFrame, + fragment="__truediv__", + ) + + # Multiple expressions in df.select (`plus` is problematic) + with self.assertRaises(NumberFormatException) as pe: + df.select(df.id - 10, df.id + "string", df.id / 10, df.id * 5).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + 
query_context_type=QueryContextType.DataFrame, + fragment="__add__", + ) + + # Multiple expressions in df.select (`minus` is problematic) + with self.assertRaises(NumberFormatException) as pe: + df.select(df.id - "string", df.id + 4, df.id / 10, df.id * 5).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__sub__", + ) + + # Multiple expressions in df.select (`multiply` is problematic) + with self.assertRaises(NumberFormatException) as pe: + df.select(df.id - 10, df.id + 4, df.id / 10, df.id * "string").collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__mul__", + ) + + # Multiple expressions with pre-declared expressions (`divide` is problematic) + a = df.id / 10 + b = df.id / 0 + with self.assertRaises(ArithmeticException) as pe: + df.select(a, df.id + 4, b, df.id * 5).collect() + self.check_error( + exception=pe.exception, + error_class="DIVIDE_BY_ZERO", + message_parameters={"config": '"spark.sql.ansi.enabled"'}, + query_context_type=QueryContextType.DataFrame, + fragment="__truediv__", + ) + + # Multiple expressions with pre-declared expressions (`plus` is problematic) + a = df.id + "string" + b = df.id + 4 + with self.assertRaises(NumberFormatException) as pe: + df.select(df.id / 10, a, b, df.id * 5).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__add__", + ) + + # Multiple expressions with pre-declared expressions (`minus` is problematic) + a = df.id - "string" + b = df.id - 5 + with self.assertRaises(NumberFormatException) as pe: + df.select(a, df.id / 10, b, df.id * 5).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__sub__", + ) + + # Multiple expressions with pre-declared expressions (`multiply` is problematic) + a = df.id * "string" + b = df.id * 10 + with self.assertRaises(NumberFormatException) as pe: + df.select(a, df.id / 10, b, df.id + 5).collect() + self.check_error( + exception=pe.exception, + error_class="CAST_INVALID_INPUT", + message_parameters={ + "expression": "'string'", + "sourceType": '"STRING"', + "targetType": '"BIGINT"', + "ansiConfig": '"spark.sql.ansi.enabled"', + }, + query_context_type=QueryContextType.DataFrame, + fragment="__mul__", + ) + + def test_sql_query_context(self): + with self.sql_conf({"spark.sql.ansi.enabled": True}): + # SQLQueryContext + with self.assertRaises(ArithmeticException) as pe: + self.spark.sql("select 10/0").collect() + self.check_error( + exception=pe.exception, + error_class="DIVIDE_BY_ZERO", + message_parameters={"config": '"spark.sql.ansi.enabled"'}, + query_context_type=QueryContextType.SQL, + ) + + # No 
QueryContext + with self.assertRaises(AnalysisException) as pe: + self.spark.sql("select * from non-existing-table") + self.check_error( + exception=pe.exception, + error_class="INVALID_IDENTIFIER", + message_parameters={"ident": "non-existing-table"}, + query_context_type=None, + ) + + +class DataFrameQueryContextTests(DataFrameQueryContextTestsMixin, ReusedSQLTestCase): + pass + + +if __name__ == "__main__": + from pyspark.sql.tests.test_dataframe_query_context import * # noqa: F401 + + try: + import xmlrunner # type: ignore + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 23f7f9e00216c..4e9b61f7d0d96 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -26,6 +26,7 @@ from pyspark.errors import PySparkTypeError, PySparkValueError, SparkRuntimeException from pyspark.sql import Row, Window, functions as F, types +from pyspark.sql.avro.functions import from_avro, to_avro from pyspark.sql.column import Column from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils from pyspark.testing.utils import have_numpy @@ -1315,6 +1316,35 @@ def test_parse_json(self): self.assertEqual("""{"a":1}""", actual["var"]) self.assertEqual("""{"b":[{"c":"str2"}]}""", actual["var_lit"]) + def test_variant_expressions(self): + df = self.spark.createDataFrame([Row(json="""{ "a" : 1 }"""), Row(json="""{ "b" : 2 }""")]) + v = F.parse_json(df.json) + + def check(resultDf, expected): + self.assertEqual([r[0] for r in resultDf.collect()], expected) + + check(df.select(F.is_variant_null(v)), [False, False]) + check(df.select(F.schema_of_variant(v)), ["STRUCT", "STRUCT"]) + check(df.select(F.schema_of_variant_agg(v)), ["STRUCT"]) + + check(df.select(F.variant_get(v, "$.a", "int")), [1, None]) + check(df.select(F.variant_get(v, "$.b", "int")), [None, 2]) + check(df.select(F.variant_get(v, "$.a", "double")), [1.0, None]) + + with self.assertRaises(SparkRuntimeException) as ex: + df.select(F.variant_get(v, "$.a", "binary")).collect() + + self.check_error( + exception=ex.exception, + error_class="INVALID_VARIANT_CAST", + message_parameters={"value": "1", "dataType": '"BINARY"'}, + ) + + check(df.select(F.try_variant_get(v, "$.a", "int")), [1, None]) + check(df.select(F.try_variant_get(v, "$.b", "int")), [None, 2]) + check(df.select(F.try_variant_get(v, "$.a", "double")), [1.0, None]) + check(df.select(F.try_variant_get(v, "$.a", "binary")), [None, None]) + def test_schema_of_json(self): with self.assertRaises(PySparkTypeError) as pe: F.schema_of_json(1) @@ -1325,6 +1355,14 @@ def test_schema_of_json(self): message_parameters={"arg_name": "json", "arg_type": "int"}, ) + def test_try_parse_json(self): + df = self.spark.createDataFrame([{"json": """{ "a" : 1 }"""}, {"json": """{ a : 1 }"""}]) + actual = df.select( + F.to_json(F.try_parse_json(df.json)).alias("var"), + ).collect() + self.assertEqual("""{"a":1}""", actual[0]["var"]) + self.assertEqual(None, actual[1]["var"]) + def test_schema_of_csv(self): with self.assertRaises(PySparkTypeError) as pe: F.schema_of_csv(1) @@ -1491,6 +1529,40 @@ def test_json_tuple_empty_fields(self): lambda: df.select(F.json_tuple(df.jstring)), ) + def test_avro_type_check(self): + parameters = ["data", "jsonFormatSchema", "options"] + expected_type = ["pyspark.sql.Column or str", "str", "dict, 
optional"] + dummyDF = self.spark.createDataFrame([Row(a=i, b=i) for i in range(5)]) + + # test from_avro type checks for each parameter + wrong_type_value = 1 + with self.assertRaises(PySparkTypeError) as pe1: + dummyDF.select(from_avro(wrong_type_value, "jsonSchema", None)) + with self.assertRaises(PySparkTypeError) as pe2: + dummyDF.select(from_avro("value", wrong_type_value, None)) + with self.assertRaises(PySparkTypeError) as pe3: + dummyDF.select(from_avro("value", "jsonSchema", wrong_type_value)) + from_avro_pes = [pe1, pe2, pe3] + for i in range(3): + self.check_error( + exception=from_avro_pes[i].exception, + error_class="INVALID_TYPE", + message_parameters={"arg_name": parameters[i], "arg_type": expected_type[i]}, + ) + + # test to_avro type checks for each parameter + with self.assertRaises(PySparkTypeError) as pe4: + dummyDF.select(to_avro(wrong_type_value, "jsonSchema")) + with self.assertRaises(PySparkTypeError) as pe5: + dummyDF.select(to_avro("value", wrong_type_value)) + to_avro_pes = [pe4, pe5] + for i in range(2): + self.check_error( + exception=to_avro_pes[i].exception, + error_class="INVALID_TYPE", + message_parameters={"arg_name": parameters[i], "arg_type": expected_type[i]}, + ) + class FunctionsTests(ReusedSQLTestCase, FunctionsTestsMixin): pass diff --git a/python/pyspark/sql/tests/test_group.py b/python/pyspark/sql/tests/test_group.py index 958fc4e65dac2..8e3d2d8d00033 100644 --- a/python/pyspark/sql/tests/test_group.py +++ b/python/pyspark/sql/tests/test_group.py @@ -26,7 +26,7 @@ pandas_requirement_message, pyarrow_requirement_message, ) -from pyspark.testing import assertDataFrameEqual, assertSchemaEqual +from pyspark.testing import assertDataFrameEqual class GroupTestsMixin: @@ -92,30 +92,25 @@ def test_group_by_ordinal(self): # basic case df1 = spark.sql("select a, sum(b) from v group by 1;") df2 = df.groupBy(1).agg(sf.sum("b")) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) # constant case df1 = spark.sql("select 1, 2, sum(b) from v group by 1, 2;") df2 = df.select(sf.lit(1), sf.lit(2), "b").groupBy(1, 2).agg(sf.sum("b")) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) # duplicate group by column df1 = spark.sql("select a, 1, sum(b) from v group by a, 1;") df2 = df.select("a", sf.lit(1), "b").groupBy("a", 2).agg(sf.sum("b")) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) df1 = spark.sql("select a, 1, sum(b) from v group by 1, 2;") df2 = df.select("a", sf.lit(1), "b").groupBy(1, 2).agg(sf.sum("b")) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) # group by a non-aggregate expression's ordinal df1 = spark.sql("select a, b + 2, count(2) from v group by a, 2;") df2 = df.select("a", df.b + 2).groupBy(1, 2).agg(sf.count(sf.lit(2))) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) # negative cases: ordinal out of range @@ -152,28 +147,23 @@ def test_order_by_ordinal(self): df1 = spark.sql("select * from v order by 1 desc;") df2 = df.orderBy(-1) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) df1 = spark.sql("select * from v order by 1 desc, b desc;") df2 = df.orderBy(-1, df.b.desc()) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) df1 = spark.sql("select * from v order by 1 desc, 2 desc;") df2 = df.orderBy(-1, -2) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) # groupby ordinal with orderby ordinal df1 = spark.sql("select a, 1, sum(b) from v group by 
1, 2 order by 1;") df2 = df.select("a", sf.lit(1), "b").groupBy(1, 2).agg(sf.sum("b")).sort(1) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) df1 = spark.sql("select a, 1, sum(b) from v group by 1, 2 order by 3, 1;") df2 = df.select("a", sf.lit(1), "b").groupBy(1, 2).agg(sf.sum("b")).sort(3, 1) - assertSchemaEqual(df1.schema, df2.schema) assertDataFrameEqual(df1, df2) # negative cases: ordinal out of range diff --git a/python/pyspark/sql/tests/test_python_datasource.py b/python/pyspark/sql/tests/test_python_datasource.py index d028a210b0071..8431e9b3e35d4 100644 --- a/python/pyspark/sql/tests/test_python_datasource.py +++ b/python/pyspark/sql/tests/test_python_datasource.py @@ -19,7 +19,7 @@ import unittest from typing import Callable, Union -from pyspark.errors import PythonException +from pyspark.errors import PythonException, AnalysisException from pyspark.sql.datasource import ( DataSource, DataSourceReader, @@ -154,7 +154,8 @@ def test_data_source_read_output_named_row_with_wrong_schema(self): read_func=lambda schema, partition: iter([Row(i=1, j=2), Row(j=3, k=4)]) ) with self.assertRaisesRegex( - PythonException, "PYTHON_DATA_SOURCE_READ_RETURN_SCHEMA_MISMATCH" + PythonException, + r"\[DATA_SOURCE_RETURN_SCHEMA_MISMATCH\] Return schema mismatch in the result", ): self.spark.read.format("test").load().show() @@ -373,6 +374,47 @@ def test_case_insensitive_dict(self): self.assertEqual(d2["BaR"], 3) self.assertEqual(d2["baz"], 3) + def test_data_source_type_mismatch(self): + class TestDataSource(DataSource): + @classmethod + def name(cls): + return "test" + + def schema(self): + return "id int" + + def reader(self, schema): + return TestReader() + + def writer(self, schema, overwrite): + return TestWriter() + + class TestReader: + def partitions(self): + return [] + + def read(self, partition): + yield (0,) + + class TestWriter: + def write(self, iterator): + return WriterCommitMessage() + + self.spark.dataSource.register(TestDataSource) + + with self.assertRaisesRegex( + AnalysisException, + r"\[DATA_SOURCE_TYPE_MISMATCH\] Expected an instance of DataSourceReader", + ): + self.spark.read.format("test").load().show() + + df = self.spark.range(10) + with self.assertRaisesRegex( + AnalysisException, + r"\[DATA_SOURCE_TYPE_MISMATCH\] Expected an instance of DataSourceWriter", + ): + df.write.format("test").mode("append").saveAsTable("test_table") + class PythonDataSourceTests(BasePythonDataSourceTestsMixin, ReusedSQLTestCase): ... 
diff --git a/python/pyspark/sql/tests/test_python_streaming_datasource.py b/python/pyspark/sql/tests/test_python_streaming_datasource.py index f7247599be839..183b0ad80d9d4 100644 --- a/python/pyspark/sql/tests/test_python_streaming_datasource.py +++ b/python/pyspark/sql/tests/test_python_streaming_datasource.py @@ -24,8 +24,10 @@ DataSourceStreamReader, InputPartition, DataSourceStreamWriter, + SimpleDataSourceStreamReader, WriterCommitMessage, ) +from pyspark.sql.streaming import StreamingQueryException from pyspark.sql.types import Row from pyspark.testing.sqlutils import ( have_pyarrow, @@ -147,53 +149,95 @@ def check_batch(df, batch_id): while len(q.recentProgress) < 10: time.sleep(0.2) q.stop() - q.awaitTermination + q.awaitTermination() self.assertIsNone(q.exception(), "No exception has to be propagated.") - def test_stream_writer(self): - input_dir = tempfile.TemporaryDirectory(prefix="test_data_stream_write_input") - output_dir = tempfile.TemporaryDirectory(prefix="test_data_stream_write_output") - checkpoint_dir = tempfile.TemporaryDirectory(prefix="test_data_stream_write_checkpoint") + def test_simple_stream_reader(self): + class SimpleStreamReader(SimpleDataSourceStreamReader): + def initialOffset(self): + return {"offset": 0} - self.spark.range(0, 30).repartition(2).write.format("json").mode("append").save( - input_dir.name - ) - self.spark.dataSource.register(self._get_test_data_source()) - df = self.spark.readStream.schema("id int").json(input_dir.name) - q = ( - df.writeStream.format("TestDataSource") - .option("checkpointLocation", checkpoint_dir.name) - .start(output_dir.name) - ) - while not q.recentProgress: - time.sleep(0.2) + def read(self, start: dict): + start_idx = start["offset"] + it = iter([(i,) for i in range(start_idx, start_idx + 2)]) + return (it, {"offset": start_idx + 2}) + + def commit(self, end): + pass - # Test stream writer write and commit. - # The first microbatch contain 30 rows and 2 partitions. - # Number of rows and partitions is writen by StreamWriter.commit(). - assertDataFrameEqual(self.spark.read.json(output_dir.name), [Row(2, 30)]) + def readBetweenOffsets(self, start: dict, end: dict): + start_idx = start["offset"] + end_idx = end["offset"] + return iter([(i,) for i in range(start_idx, end_idx)]) + + class SimpleDataSource(DataSource): + def schema(self): + return "id INT" - self.spark.range(50, 80).repartition(2).write.format("json").mode("append").save( - input_dir.name - ) + def simpleStreamReader(self, schema): + return SimpleStreamReader() - # Test StreamWriter write and abort. - # When row id > 50, write tasks throw exception and fail. - # 1.txt is written by StreamWriter.abort() to record the failure. 
- while q.exception() is None: + self.spark.dataSource.register(SimpleDataSource) + df = self.spark.readStream.format("SimpleDataSource").load() + + def check_batch(df, batch_id): + assertDataFrameEqual(df, [Row(batch_id * 2), Row(batch_id * 2 + 1)]) + + q = df.writeStream.foreachBatch(check_batch).start() + while len(q.recentProgress) < 10: time.sleep(0.2) - assertDataFrameEqual( - self.spark.read.text(os.path.join(output_dir.name, "1.txt")), [Row("failed in batch 1")] - ) - q.awaitTermination + q.stop() + q.awaitTermination() + self.assertIsNone(q.exception(), "No exception has to be propagated.") + + def test_stream_writer(self): + input_dir = tempfile.TemporaryDirectory(prefix="test_data_stream_write_input") + output_dir = tempfile.TemporaryDirectory(prefix="test_data_stream_write_output") + checkpoint_dir = tempfile.TemporaryDirectory(prefix="test_data_stream_write_checkpoint") - input_dir.cleanup() - output_dir.cleanup() - checkpoint_dir.cleanup() + try: + self.spark.range(0, 30).repartition(2).write.format("json").mode("append").save( + input_dir.name + ) + self.spark.dataSource.register(self._get_test_data_source()) + df = self.spark.readStream.schema("id int").json(input_dir.name) + q = ( + df.writeStream.format("TestDataSource") + .option("checkpointLocation", checkpoint_dir.name) + .start(output_dir.name) + ) + while not q.recentProgress: + time.sleep(0.2) + + # Test stream writer write and commit. + # The first microbatch contains 30 rows and 2 partitions. + # The number of rows and partitions is written by StreamWriter.commit(). + assertDataFrameEqual(self.spark.read.json(output_dir.name), [Row(2, 30)]) + + self.spark.range(50, 80).repartition(2).write.format("json").mode("append").save( + input_dir.name + ) + + # Test StreamWriter write and abort. + # When row id > 50, write tasks throw an exception and fail. + # 1.txt is written by StreamWriter.abort() to record the failure. + while q.exception() is None: + time.sleep(0.2) + assertDataFrameEqual( + self.spark.read.text(os.path.join(output_dir.name, "1.txt")), + [Row("failed in batch 1")], + ) + q.awaitTermination() + except StreamingQueryException as e: + self.assertIn("invalid value", str(e)) + finally: + input_dir.cleanup() + output_dir.cleanup() + checkpoint_dir.cleanup() class PythonStreamingDataSourceTests(BasePythonStreamingDataSourceTestsMixin, ReusedSQLTestCase): - ...
+ pass if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_readwriter.py b/python/pyspark/sql/tests/test_readwriter.py index 5784d2c729739..8060a9ae8bc76 100644 --- a/python/pyspark/sql/tests/test_readwriter.py +++ b/python/pyspark/sql/tests/test_readwriter.py @@ -55,12 +55,9 @@ def test_save_and_load(self): ) self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - try: - self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") + with self.sql_conf({"spark.sql.sources.default": "org.apache.spark.sql.json"}): actual = self.spark.read.load(path=tmpPath) self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - finally: - self.spark.sql("RESET spark.sql.sources.default") csvpath = os.path.join(tempfile.mkdtemp(), "data") df.write.option("quote", None).format("csv").save(csvpath) @@ -94,12 +91,9 @@ def test_save_and_load_builder(self): ) self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - try: - self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") + with self.sql_conf({"spark.sql.sources.default": "org.apache.spark.sql.json"}): actual = self.spark.read.load(path=tmpPath) self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - finally: - self.spark.sql("RESET spark.sql.sources.default") finally: shutil.rmtree(tmpPath) @@ -247,10 +241,9 @@ def test_create(self): def test_create_without_provider(self): df = self.df - with self.assertRaisesRegex( - AnalysisException, "NOT_SUPPORTED_COMMAND_WITHOUT_HIVE_SUPPORT" - ): + with self.table("test_table"): df.writeTo("test_table").create() + self.assertEqual(100, self.spark.sql("select * from test_table").count()) def test_table_overwrite(self): df = self.df diff --git a/python/pyspark/sql/tests/test_resources.py b/python/pyspark/sql/tests/test_resources.py index 9dfb14d9c37f7..4ce61e9f763d6 100644 --- a/python/pyspark/sql/tests/test_resources.py +++ b/python/pyspark/sql/tests/test_resources.py @@ -16,7 +16,7 @@ # import unittest -from pyspark import SparkContext, TaskContext +from pyspark import TaskContext from pyspark.resource import TaskResourceRequests, ResourceProfileBuilder from pyspark.sql import SparkSession from pyspark.testing.sqlutils import ( @@ -41,7 +41,7 @@ def func(iterator): yield batch df = self.spark.range(10) - df.mapInArrow(func, "id long").collect() + df.mapInArrow(func, "id long").show(n=10) def test_map_in_arrow_with_profile(self): def func(iterator): @@ -54,7 +54,7 @@ def func(iterator): treqs = TaskResourceRequests().cpus(3) rp = ResourceProfileBuilder().require(treqs).build - df.mapInArrow(func, "id long", False, rp).collect() + df.mapInArrow(func, "id long", False, rp).show(n=10) def test_map_in_pandas_without_profile(self): def func(iterator): @@ -64,7 +64,7 @@ def func(iterator): yield batch df = self.spark.range(10) - df.mapInPandas(func, "id long").collect() + df.mapInPandas(func, "id long").show(n=10) def test_map_in_pandas_with_profile(self): def func(iterator): @@ -77,12 +77,14 @@ def func(iterator): treqs = TaskResourceRequests().cpus(3) rp = ResourceProfileBuilder().require(treqs).build - df.mapInPandas(func, "id long", False, rp).collect() + df.mapInPandas(func, "id long", False, rp).show(n=10) class ResourceProfileTests(ResourceProfileTestsMixin, ReusedPySparkTestCase): @classmethod def setUpClass(cls): + from pyspark.core.context import SparkContext + cls.sc = SparkContext("local-cluster[1, 4, 1024]", cls.__name__, conf=cls.conf()) cls.spark = SparkSession(cls.sc) diff --git a/python/pyspark/sql/tests/test_serde.py 
b/python/pyspark/sql/tests/test_serde.py index ef8bbd2c370f4..01cf3c51d7de0 100644 --- a/python/pyspark/sql/tests/test_serde.py +++ b/python/pyspark/sql/tests/test_serde.py @@ -95,6 +95,14 @@ def test_time_with_timezone(self): self.assertEqual(now, now1) self.assertEqual(now, utcnow1) + def test_ntz_from_internal(self): + for ts in [1, 22, 333, 44444444, 5555555555]: + t1 = datetime.datetime.utcfromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000) + t2 = datetime.datetime.fromtimestamp(ts // 1000000, datetime.timezone.utc).replace( + microsecond=ts % 1000000, tzinfo=None + ) + self.assertEqual(t1, t2) + # regression test for SPARK-19561 def test_datetime_at_epoch(self): epoch = datetime.datetime.fromtimestamp(0) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index af13adbc21bb2..4810cf40e2315 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -32,6 +32,7 @@ PySparkTypeError, PySparkValueError, PySparkRuntimeError, + PySparkNotImplementedError, ) from pyspark.sql.types import ( DataType, @@ -41,6 +42,7 @@ FloatType, DateType, TimestampType, + TimestampNTZType, DayTimeIntervalType, YearMonthIntervalType, CalendarIntervalType, @@ -191,6 +193,7 @@ def __init__(self): Row(a=1), Row("a")(1), A(), + Row(b=Row(c=datetime.datetime(1970, 1, 1, 0, 0))), ] df = self.spark.createDataFrame([data]) @@ -213,6 +216,7 @@ def __init__(self): "struct", "struct", "struct", + "struct>", ] self.assertEqual(actual, expected) @@ -235,14 +239,25 @@ def __init__(self): Row(a=1), Row(a=1), Row(a=1), + Row(b=Row(c=datetime.datetime(1970, 1, 1, 0, 0))), ] self.assertEqual(actual, expected) with self.sql_conf({"spark.sql.timestampType": "TIMESTAMP_NTZ"}): with self.sql_conf({"spark.sql.session.timeZone": "America/Sao_Paulo"}): - df = self.spark.createDataFrame([(datetime.datetime(1970, 1, 1, 0, 0),)]) + data = [ + ( + datetime.datetime(1970, 1, 1, 0, 0), + Row(a=Row(a=datetime.datetime(1970, 1, 1, 0, 0))), + ) + ] + df = self.spark.createDataFrame(data) self.assertEqual(list(df.schema)[0].dataType.simpleString(), "timestamp_ntz") self.assertEqual(df.first()[0], datetime.datetime(1970, 1, 1, 0, 0)) + self.assertEqual( + list(df.schema)[1].dataType.simpleString(), "struct>" + ) + self.assertEqual(df.first()[1], Row(a=Row(a=datetime.datetime(1970, 1, 1, 0, 0)))) df = self.spark.createDataFrame( [ @@ -366,7 +381,7 @@ def test_infer_array_merge_element_types_with_rdd(self): df = self.spark.createDataFrame(rdd) self.assertEqual(Row(f1=[1, None], f2=[None, 2]), df.first()) - def test_infer_array_element_type_empty(self): + def test_infer_array_element_type_empty_rdd(self): # SPARK-39168: Test inferring array element type from all rows ArrayRow = Row("f1") @@ -379,6 +394,12 @@ def test_infer_array_element_type_empty(self): self.assertEqual(Row(f1=[None]), rows[1]) self.assertEqual(Row(f1=[1]), rows[2]) + def test_infer_array_element_type_empty(self): + # SPARK-39168: Test inferring array element type from all rows + ArrayRow = Row("f1") + + data = [ArrayRow([]), ArrayRow([None]), ArrayRow([1])] + df = self.spark.createDataFrame(data) rows = df.collect() self.assertEqual(Row(f1=[]), rows[0]) @@ -392,12 +413,6 @@ def test_infer_array_element_type_with_struct(self): with self.sql_conf({"spark.sql.pyspark.inferNestedDictAsStruct.enabled": True}): data = [NestedRow([{"payment": 200.5}, {"name": "A"}])] - nestedRdd = self.sc.parallelize(data) - df = self.spark.createDataFrame(nestedRdd) - self.assertEqual( - 
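test_ntz_from_internal above pins down the replacement for the deprecated datetime.utcfromtimestamp used on the fromInternal path for timestamps. A standalone sketch of the same equivalence, pure stdlib, no Spark needed:

import datetime

ts = 5555555555  # microseconds since the epoch, as stored internally
old = datetime.datetime.utcfromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
new = datetime.datetime.fromtimestamp(ts // 1000000, datetime.timezone.utc).replace(
    microsecond=ts % 1000000, tzinfo=None
)
assert old == new  # both produce the same naive UTC datetime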
Row(f1=[Row(payment=200.5, name=None), Row(payment=None, name="A")]), df.first() - ) - df = self.spark.createDataFrame(data) self.assertEqual( Row(f1=[Row(payment=200.5, name=None), Row(payment=None, name="A")]), df.first() @@ -410,6 +425,57 @@ def test_infer_array_element_type_with_struct(self): df = self.spark.createDataFrame(data) self.assertEqual(Row(f1=[Row(payment=200.5), Row(payment=None)]), df.first()) + def test_infer_map_merge_pair_types_with_rdd(self): + # SPARK-48247: Test inferring map pair type from all values in array + MapRow = Row("f1", "f2") + + data = [MapRow({"a": 1, "b": None}, {"a": None, "b": 1})] + + rdd = self.sc.parallelize(data) + df = self.spark.createDataFrame(rdd) + self.assertEqual(Row(f1={"a": 1, "b": None}, f2={"a": None, "b": 1}), df.first()) + + def test_infer_map_pair_type_empty_rdd(self): + # SPARK-48247: Test inferring map pair type from all rows + MapRow = Row("f1") + + data = [MapRow({}), MapRow({"a": None}), MapRow({"a": 1})] + + rdd = self.sc.parallelize(data) + df = self.spark.createDataFrame(rdd) + rows = df.collect() + self.assertEqual(Row(f1={}), rows[0]) + self.assertEqual(Row(f1={"a": None}), rows[1]) + self.assertEqual(Row(f1={"a": 1}), rows[2]) + + def test_infer_map_pair_type_empty(self): + # SPARK-48247: Test inferring map pair type from all rows + MapRow = Row("f1") + + data = [MapRow({}), MapRow({"a": None}), MapRow({"a": 1})] + + df = self.spark.createDataFrame(data) + rows = df.collect() + self.assertEqual(Row(f1={}), rows[0]) + self.assertEqual(Row(f1={"a": None}), rows[1]) + self.assertEqual(Row(f1={"a": 1}), rows[2]) + + def test_infer_map_pair_type_with_nested_maps(self): + # SPARK-48247: Test inferring nested map + NestedRow = Row("f1", "f2") + + data = [ + NestedRow({"payment": 200.5, "name": "A"}, {"outer": {"payment": 200.5, "name": "A"}}) + ] + df = self.spark.createDataFrame(data) + self.assertEqual( + Row( + f1={"payment": "200.5", "name": "A"}, + f2={"outer": {"payment": "200.5", "name": "A"}}, + ), + df.first(), + ) + def test_create_dataframe_from_dict_respects_schema(self): df = self.spark.createDataFrame([{"a": 1}], ["b"]) self.assertEqual(df.columns, ["b"]) @@ -426,14 +492,11 @@ class User: self.assertEqual(asdict(user), r.asDict()) def test_negative_decimal(self): - try: - self.spark.sql("set spark.sql.legacy.allowNegativeScaleOfDecimal=true") + with self.sql_conf({"spark.sql.legacy.allowNegativeScaleOfDecimal": True}): df = self.spark.createDataFrame([(1,), (11,)], ["value"]) ret = df.select(F.col("value").cast(DecimalType(1, -1))).collect() actual = list(map(lambda r: int(r.value), ret)) self.assertEqual(actual, [0, 10]) - finally: - self.spark.sql("set spark.sql.legacy.allowNegativeScaleOfDecimal=false") def test_create_dataframe_from_objects(self): data = [MyObject(1, "1"), MyObject(2, "2")] @@ -549,6 +612,234 @@ def test_convert_list_to_str(self): self.assertEqual(df.count(), 1) self.assertEqual(df.head(), Row(name="[123]", income=120)) + def test_schema_with_collations_json_ser_de(self): + from pyspark.sql.types import _parse_datatype_json_string + + unicode_collation = "UNICODE" + + simple_struct = StructType([StructField("c1", StringType(unicode_collation))]) + + nested_struct = StructType([StructField("nested", simple_struct)]) + + array_in_schema = StructType( + [StructField("array", ArrayType(StringType(unicode_collation)))] + ) + + map_in_schema = StructType( + [ + StructField( + "map", MapType(StringType(unicode_collation), StringType(unicode_collation)) + ) + ] + ) + + nested_map = StructType( + [ + 
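The SPARK-48247 tests above check that map key and value types are now inferred by merging every pair, not just the first one, so a None in the first pair no longer degrades the type. A small sketch mirroring the RDD test, assuming a running SparkSession:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

MapRow = Row("f1", "f2")
data = [MapRow({"a": 1, "b": None}, {"a": None, "b": 1})]

rdd = spark.sparkContext.parallelize(data)
df = spark.createDataFrame(rdd)
# Both columns infer as maps with string keys and long values, even though each
# map has a None on one side of the pair.
print(df.first())  # Row(f1={'a': 1, 'b': None}, f2={'a': None, 'b': 1})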
StructField( + "nested", + StructType( + [ + StructField( + "mapField", + MapType( + StringType(unicode_collation), StringType(unicode_collation) + ), + ) + ] + ), + ) + ] + ) + + array_in_map = StructType( + [ + StructField( + "arrInMap", + MapType( + StringType(unicode_collation), ArrayType(StringType(unicode_collation)) + ), + ) + ] + ) + + nested_array_in_map_value = StructType( + [ + StructField( + "nestedArrayInMap", + ArrayType( + MapType( + StringType(unicode_collation), + ArrayType(ArrayType(StringType(unicode_collation))), + ) + ), + ) + ] + ) + + schema_with_multiple_fields = StructType( + simple_struct.fields + + nested_struct.fields + + array_in_schema.fields + + map_in_schema.fields + + nested_map.fields + + array_in_map.fields + + nested_array_in_map_value.fields + ) + + schemas = [ + simple_struct, + nested_struct, + array_in_schema, + map_in_schema, + nested_map, + nested_array_in_map_value, + array_in_map, + schema_with_multiple_fields, + ] + + for schema in schemas: + scala_datatype = self.spark._jsparkSession.parseDataType(schema.json()) + python_datatype = _parse_datatype_json_string(scala_datatype.json()) + assert schema == python_datatype + assert schema == _parse_datatype_json_string(schema.json()) + + def test_schema_with_collations_on_non_string_types(self): + from pyspark.sql.types import _parse_datatype_json_string, _COLLATIONS_METADATA_KEY + + collations_on_int_col_json = f""" + {{ + "type": "struct", + "fields": [ + {{ + "name": "c1", + "type": "integer", + "nullable": true, + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + "c1": "icu.UNICODE" + }} + }} + }} + ] + }} + """ + + collations_in_array_element_json = f""" + {{ + "type": "struct", + "fields": [ + {{ + "name": "arrayField", + "type": {{ + "type": "array", + "elementType": "integer", + "containsNull": true + }}, + "nullable": true, + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + "arrayField.element": "icu.UNICODE" + }} + }} + }} + ] + }} + """ + + collations_on_array_json = f""" + {{ + "type": "struct", + "fields": [ + {{ + "name": "arrayField", + "type": {{ + "type": "array", + "elementType": "integer", + "containsNull": true + }}, + "nullable": true, + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + "arrayField": "icu.UNICODE" + }} + }} + }} + ] + }} + """ + + collations_in_nested_map_json = f""" + {{ + "type": "struct", + "fields": [ + {{ + "name": "nested", + "type": {{ + "type": "struct", + "fields": [ + {{ + "name": "mapField", + "type": {{ + "type": "map", + "keyType": "string", + "valueType": "integer", + "valueContainsNull": true + }}, + "nullable": true, + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + "mapField.value": "icu.UNICODE" + }} + }} + }} + ] + }}, + "nullable": true, + "metadata": {{}} + }} + ] + }} + """ + + self.assertRaises( + PySparkTypeError, lambda: _parse_datatype_json_string(collations_on_int_col_json) + ) + + self.assertRaises( + PySparkTypeError, lambda: _parse_datatype_json_string(collations_in_array_element_json) + ) + + self.assertRaises( + PySparkTypeError, lambda: _parse_datatype_json_string(collations_on_array_json) + ) + + self.assertRaises( + PySparkTypeError, lambda: _parse_datatype_json_string(collations_in_nested_map_json) + ) + + def test_schema_with_bad_collations_provider(self): + from pyspark.sql.types import _parse_datatype_json_string, _COLLATIONS_METADATA_KEY + + schema_json = f""" + {{ + "type": "struct", + "fields": [ + {{ + "name": "c1", + "type": "string", + "nullable": "true", + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + 
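Per the metadata-based wire format these tests rely on, a collated string field serializes as a plain "string" type plus an entry under the __COLLATIONS metadata key, and parsing the JSON restores the collated StringType. A minimal round-trip sketch, using the internal helper the tests import:

import json
from pyspark.sql.types import (
    StringType, StructField, StructType, _parse_datatype_json_string,
)

schema = StructType([StructField("c1", StringType("UNICODE"))])

field = json.loads(schema.json())["fields"][0]
print(field["type"])                         # 'string'
print(field["metadata"]["__COLLATIONS"])     # {'c1': 'icu.UNICODE'}

# Deserializing puts the collation back onto the StringType.
assert _parse_datatype_json_string(schema.json()) == schema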
"c1": "badProvider.UNICODE" + }} + }} + }} + ] + }} + """ + + self.assertRaises(PySparkValueError, lambda: _parse_datatype_json_string(schema_json)) + def test_udt(self): from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _make_type_verifier @@ -856,35 +1147,48 @@ def test_struct_type(self): self.assertRaises(IndexError, lambda: struct1[9]) self.assertRaises(TypeError, lambda: struct1[9.9]) + def test_parse_datatype_json_string(self): + from pyspark.sql.types import _parse_datatype_json_string + + for dataType in [ + StringType(), + CharType(5), + VarcharType(10), + BinaryType(), + BooleanType(), + DecimalType(), + DecimalType(10, 2), + FloatType(), + DoubleType(), + ByteType(), + ShortType(), + IntegerType(), + LongType(), + DateType(), + TimestampType(), + TimestampNTZType(), + NullType(), + VariantType(), + YearMonthIntervalType(), + YearMonthIntervalType(YearMonthIntervalType.YEAR), + YearMonthIntervalType(YearMonthIntervalType.YEAR, YearMonthIntervalType.MONTH), + DayTimeIntervalType(), + DayTimeIntervalType(DayTimeIntervalType.DAY), + DayTimeIntervalType(DayTimeIntervalType.HOUR, DayTimeIntervalType.SECOND), + CalendarIntervalType(), + ]: + json_str = dataType.json() + parsed = _parse_datatype_json_string(json_str) + self.assertEqual(dataType, parsed) + def test_parse_datatype_string(self): - from pyspark.sql.types import _all_atomic_types, _parse_datatype_string + from pyspark.sql.types import _all_mappable_types, _parse_datatype_string + + for k, t in _all_mappable_types.items(): + self.assertEqual(t(), _parse_datatype_string(k)) - for k, t in _all_atomic_types.items(): - if k != "varchar" and k != "char": - self.assertEqual(t(), _parse_datatype_string(k)) self.assertEqual(IntegerType(), _parse_datatype_string("int")) self.assertEqual(StringType(), _parse_datatype_string("string")) - self.assertEqual(StringType(), _parse_datatype_string("string collate UTF8_BINARY")) - self.assertEqual(StringType(), _parse_datatype_string("string COLLATE UTF8_BINARY")) - self.assertEqual( - StringType.fromCollationId(0), _parse_datatype_string("string COLLATE UTF8_BINARY") - ) - self.assertEqual( - StringType.fromCollationId(1), - _parse_datatype_string("string COLLATE UTF8_BINARY_LCASE"), - ) - self.assertEqual( - StringType.fromCollationId(2), _parse_datatype_string("string COLLATE UNICODE") - ) - self.assertEqual( - StringType.fromCollationId(2), _parse_datatype_string("string COLLATE `UNICODE`") - ) - self.assertEqual( - StringType.fromCollationId(3), _parse_datatype_string("string COLLATE UNICODE_CI") - ) - self.assertEqual( - StringType.fromCollationId(3), _parse_datatype_string("string COLLATE `UNICODE_CI`") - ) self.assertEqual(CharType(1), _parse_datatype_string("char(1)")) self.assertEqual(CharType(10), _parse_datatype_string("char( 10 )")) self.assertEqual(CharType(11), _parse_datatype_string("char( 11)")) @@ -912,6 +1216,313 @@ def test_parse_datatype_string(self): ) self.assertEqual(VariantType(), _parse_datatype_string("variant")) + def test_tree_string(self): + schema1 = DataType.fromDDL("c1 INT, c2 STRUCT>") + + self.assertEqual( + schema1.treeString().split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: struct (nullable = true)", + " | |-- c3: integer (nullable = true)", + " | |-- c4: struct (nullable = true)", + " | | |-- c5: integer (nullable = true)", + " | | |-- c6: integer (nullable = true)", + "", + ], + ) + self.assertEqual( + schema1.treeString(-1).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: 
struct (nullable = true)", + " | |-- c3: integer (nullable = true)", + " | |-- c4: struct (nullable = true)", + " | | |-- c5: integer (nullable = true)", + " | | |-- c6: integer (nullable = true)", + "", + ], + ) + self.assertEqual( + schema1.treeString(0).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: struct (nullable = true)", + " | |-- c3: integer (nullable = true)", + " | |-- c4: struct (nullable = true)", + " | | |-- c5: integer (nullable = true)", + " | | |-- c6: integer (nullable = true)", + "", + ], + ) + self.assertEqual( + schema1.treeString(1).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: struct (nullable = true)", + "", + ], + ) + self.assertEqual( + schema1.treeString(2).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: struct (nullable = true)", + " | |-- c3: integer (nullable = true)", + " | |-- c4: struct (nullable = true)", + "", + ], + ) + self.assertEqual( + schema1.treeString(3).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: struct (nullable = true)", + " | |-- c3: integer (nullable = true)", + " | |-- c4: struct (nullable = true)", + " | | |-- c5: integer (nullable = true)", + " | | |-- c6: integer (nullable = true)", + "", + ], + ) + self.assertEqual( + schema1.treeString(4).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: struct (nullable = true)", + " | |-- c3: integer (nullable = true)", + " | |-- c4: struct (nullable = true)", + " | | |-- c5: integer (nullable = true)", + " | | |-- c6: integer (nullable = true)", + "", + ], + ) + + schema2 = DataType.fromDDL( + "c1 INT, c2 ARRAY>, c4 STRUCT>>" + ) + self.assertEqual( + schema2.treeString(0).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: array (nullable = true)", + " | |-- element: struct (containsNull = true)", + " | | |-- c3: integer (nullable = true)", + " |-- c4: struct (nullable = true)", + " | |-- c5: integer (nullable = true)", + " | |-- c6: array (nullable = true)", + " | | |-- element: array (containsNull = true)", + " | | | |-- element: integer (containsNull = true)", + "", + ], + ) + self.assertEqual( + schema2.treeString(1).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: array (nullable = true)", + " |-- c4: struct (nullable = true)", + "", + ], + ) + self.assertEqual( + schema2.treeString(2).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: array (nullable = true)", + " | |-- element: struct (containsNull = true)", + " |-- c4: struct (nullable = true)", + " | |-- c5: integer (nullable = true)", + " | |-- c6: array (nullable = true)", + "", + ], + ) + self.assertEqual( + schema2.treeString(3).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: array (nullable = true)", + " | |-- element: struct (containsNull = true)", + " | | |-- c3: integer (nullable = true)", + " |-- c4: struct (nullable = true)", + " | |-- c5: integer (nullable = true)", + " | |-- c6: array (nullable = true)", + " | | |-- element: array (containsNull = true)", + "", + ], + ) + self.assertEqual( + schema2.treeString(4).split("\n"), + [ + "root", + " |-- c1: integer (nullable = true)", + " |-- c2: array (nullable = true)", + " | |-- element: struct (containsNull = true)", + " | | |-- c3: integer (nullable = true)", + " |-- c4: struct (nullable = true)", + " | |-- c5: integer (nullable = true)", + " | |-- c6: array (nullable = true)", + " | | |-- element: array 
(containsNull = true)", + " | | | |-- element: integer (containsNull = true)", + "", + ], + ) + + schema3 = DataType.fromDDL( + "c1 MAP>>, c3 STRUCT>>" + ) + self.assertEqual( + schema3.treeString(0).split("\n"), + [ + "root", + " |-- c1: map (nullable = true)", + " | |-- key: integer", + " | |-- value: struct (valueContainsNull = true)", + " | | |-- c2: map (nullable = true)", + " | | | |-- key: integer", + " | | | |-- value: integer (valueContainsNull = true)", + " |-- c3: struct (nullable = true)", + " | |-- c4: map (nullable = true)", + " | | |-- key: integer", + " | | |-- value: map (valueContainsNull = true)", + " | | | |-- key: integer", + " | | | |-- value: integer (valueContainsNull = true)", + "", + ], + ) + self.assertEqual( + schema3.treeString(1).split("\n"), + [ + "root", + " |-- c1: map (nullable = true)", + " |-- c3: struct (nullable = true)", + "", + ], + ) + self.assertEqual( + schema3.treeString(2).split("\n"), + [ + "root", + " |-- c1: map (nullable = true)", + " | |-- key: integer", + " | |-- value: struct (valueContainsNull = true)", + " |-- c3: struct (nullable = true)", + " | |-- c4: map (nullable = true)", + "", + ], + ) + self.assertEqual( + schema3.treeString(3).split("\n"), + [ + "root", + " |-- c1: map (nullable = true)", + " | |-- key: integer", + " | |-- value: struct (valueContainsNull = true)", + " | | |-- c2: map (nullable = true)", + " |-- c3: struct (nullable = true)", + " | |-- c4: map (nullable = true)", + " | | |-- key: integer", + " | | |-- value: map (valueContainsNull = true)", + "", + ], + ) + self.assertEqual( + schema3.treeString(4).split("\n"), + [ + "root", + " |-- c1: map (nullable = true)", + " | |-- key: integer", + " | |-- value: struct (valueContainsNull = true)", + " | | |-- c2: map (nullable = true)", + " | | | |-- key: integer", + " | | | |-- value: integer (valueContainsNull = true)", + " |-- c3: struct (nullable = true)", + " | |-- c4: map (nullable = true)", + " | | |-- key: integer", + " | | |-- value: map (valueContainsNull = true)", + " | | | |-- key: integer", + " | | | |-- value: integer (valueContainsNull = true)", + "", + ], + ) + + def test_tree_string_for_builtin_types(self): + schema = ( + StructType() + .add("n", NullType()) + .add("str", StringType()) + .add("c", CharType(10)) + .add("v", VarcharType(10)) + .add("bin", BinaryType()) + .add("bool", BooleanType()) + .add("date", DateType()) + .add("ts", TimestampType()) + .add("ts_ntz", TimestampNTZType()) + .add("dec", DecimalType(10, 2)) + .add("double", DoubleType()) + .add("float", FloatType()) + .add("long", LongType()) + .add("int", IntegerType()) + .add("short", ShortType()) + .add("byte", ByteType()) + .add("ym_interval_1", YearMonthIntervalType()) + .add("ym_interval_2", YearMonthIntervalType(YearMonthIntervalType.YEAR)) + .add( + "ym_interval_3", + YearMonthIntervalType(YearMonthIntervalType.YEAR, YearMonthIntervalType.MONTH), + ) + .add("dt_interval_1", DayTimeIntervalType()) + .add("dt_interval_2", DayTimeIntervalType(DayTimeIntervalType.DAY)) + .add( + "dt_interval_3", + DayTimeIntervalType(DayTimeIntervalType.HOUR, DayTimeIntervalType.SECOND), + ) + .add("cal_interval", CalendarIntervalType()) + .add("var", VariantType()) + ) + self.assertEqual( + schema.treeString().split("\n"), + [ + "root", + " |-- n: void (nullable = true)", + " |-- str: string (nullable = true)", + " |-- c: char(10) (nullable = true)", + " |-- v: varchar(10) (nullable = true)", + " |-- bin: binary (nullable = true)", + " |-- bool: boolean (nullable = true)", + " |-- date: date (nullable 
= true)", + " |-- ts: timestamp (nullable = true)", + " |-- ts_ntz: timestamp_ntz (nullable = true)", + " |-- dec: decimal(10,2) (nullable = true)", + " |-- double: double (nullable = true)", + " |-- float: float (nullable = true)", + " |-- long: long (nullable = true)", + " |-- int: integer (nullable = true)", + " |-- short: short (nullable = true)", + " |-- byte: byte (nullable = true)", + " |-- ym_interval_1: interval year to month (nullable = true)", + " |-- ym_interval_2: interval year (nullable = true)", + " |-- ym_interval_3: interval year to month (nullable = true)", + " |-- dt_interval_1: interval day to second (nullable = true)", + " |-- dt_interval_2: interval day (nullable = true)", + " |-- dt_interval_3: interval hour to second (nullable = true)", + " |-- cal_interval: interval (nullable = true)", + " |-- var: variant (nullable = true)", + "", + ], + ) + def test_metadata_null(self): schema = StructType( [ @@ -1230,7 +1841,7 @@ def test_repr(self): NullType(), StringType(), StringType("UTF8_BINARY"), - StringType("UTF8_BINARY_LCASE"), + StringType("UTF8_LCASE"), StringType("UNICODE"), StringType("UNICODE_CI"), CharType(10), @@ -1427,8 +2038,10 @@ def test_variant_type(self): ("-int4", "-69633", -69633), ("int8", "4295033089", 4295033089), ("-int8", "-4294967297", -4294967297), - ("float4", "1.23456789e-30", 1.23456789e-30), - ("-float4", "-4.56789e+29", -4.56789e29), + ("float4", "3.402e+38", 3.402e38), + ("-float4", "-3.402e+38", -3.402e38), + ("float8", "1.79769e+308", 1.79769e308), + ("-float8", "-1.79769e+308", -1.79769e308), ("dec4", "123.456", Decimal("123.456")), ("-dec4", "-321.654", Decimal("-321.654")), ("dec8", "429.4967297", Decimal("429.4967297")), @@ -1447,17 +2060,77 @@ def test_variant_type(self): F.struct([F.parse_json(F.lit('{"b": "2"}'))]).alias("s"), F.create_map([F.lit("k"), F.parse_json(F.lit('{"c": true}'))]).alias("m"), ).collect()[0] - variants = [row["v"], row["a"][0], row["s"]["col1"], row["m"]["k"]] + + # These data types are not supported by parse_json yet so they are being handled + # separately - Date, Timestamp, TimestampNTZ, Binary, Float (Single Precision) + date_columns = self.spark.sql( + "select cast(Date('2021-01-01')" + + " as variant) as d0, cast(Date('1800-12-31')" + + " as variant) as d1" + ).collect()[0] + float_columns = self.spark.sql( + "select cast(Float(5.5)" + " as variant) as f0, cast(Float(-5.5) as variant) as f1" + ).collect()[0] + binary_columns = self.spark.sql( + "select cast(binary(x'324FA69E')" + " as variant) as b" + ).collect()[0] + timetamp_ntz_columns = self.spark.sql( + "select cast(cast('1940-01-01 12:33:01.123'" + + " as timestamp_ntz) as variant) as tntz0, cast(cast('2522-12-31 05:57:13'" + + " as timestamp_ntz) as variant) as tntz1, cast(cast('0001-07-15 17:43:26+08:00'" + + " as timestamp_ntz) as variant) as tntz2" + ).collect()[0] + timetamp_columns = self.spark.sql( + "select cast(cast('1940-01-01 12:35:13.123+7:30'" + + " as timestamp) as variant) as t0, cast(cast('2522-12-31 00:00:00-5:23'" + + " as timestamp) as variant) as t1, cast(cast('0001-12-31 01:01:01+08:00'" + + " as timestamp) as variant) as t2" + ).collect()[0] + + variants = [ + row["v"], + row["a"][0], + row["s"]["col1"], + row["m"]["k"], + date_columns["d0"], + date_columns["d1"], + float_columns["f0"], + float_columns["f1"], + binary_columns["b"], + timetamp_ntz_columns["tntz0"], + timetamp_ntz_columns["tntz1"], + timetamp_ntz_columns["tntz2"], + timetamp_columns["t0"], + timetamp_columns["t1"], + timetamp_columns["t2"], + ] + for v in 
variants: self.assertEqual(type(v), VariantVal) - # check str + # check str (to_json) as_string = str(variants[0]) for key, expected, _ in expected_values: self.assertTrue('"%s":%s' % (key, expected) in as_string) self.assertEqual(str(variants[1]), '{"a":1}') self.assertEqual(str(variants[2]), '{"b":"2"}') self.assertEqual(str(variants[3]), '{"c":true}') + self.assertEqual(str(variants[4]), '"2021-01-01"') + self.assertEqual(str(variants[5]), '"1800-12-31"') + self.assertEqual(str(variants[6]), "5.5") + self.assertEqual(str(variants[7]), "-5.5") + self.assertEqual(str(variants[8]), '"Mk+mng=="') + self.assertEqual(str(variants[9]), '"1940-01-01 12:33:01.123000"') + self.assertEqual(str(variants[10]), '"2522-12-31 05:57:13"') + self.assertEqual(str(variants[11]), '"0001-07-15 17:43:26"') + self.assertEqual(str(variants[12]), '"1940-01-01 05:05:13.123000+00:00"') + self.assertEqual(str(variants[13]), '"2522-12-31 05:23:00+00:00"') + self.assertEqual(str(variants[14]), '"0001-12-30 17:01:01+00:00"') + + # Check to_json on timestamps with custom timezones + self.assertEqual( + variants[12].toJson("America/Los_Angeles"), '"1939-12-31 21:05:13.123000-08:00"' + ) # check toPython as_python = variants[0].toPython() @@ -1466,10 +2139,68 @@ def test_variant_type(self): self.assertEqual(variants[1].toPython(), {"a": 1}) self.assertEqual(variants[2].toPython(), {"b": "2"}) self.assertEqual(variants[3].toPython(), {"c": True}) + self.assertEqual(variants[4].toPython(), datetime.date(2021, 1, 1)) + self.assertEqual(variants[5].toPython(), datetime.date(1800, 12, 31)) + self.assertEqual(variants[6].toPython(), float(5.5)) + self.assertEqual(variants[7].toPython(), float(-5.5)) + self.assertEqual(variants[8].toPython(), bytearray(b"2O\xa6\x9e")) + self.assertEqual(variants[9].toPython(), datetime.datetime(1940, 1, 1, 12, 33, 1, 123000)) + self.assertEqual(variants[10].toPython(), datetime.datetime(2522, 12, 31, 5, 57, 13)) + self.assertEqual(variants[11].toPython(), datetime.datetime(1, 7, 15, 17, 43, 26)) + self.assertEqual( + variants[12].toPython(), + datetime.datetime( + 1940, + 1, + 1, + 12, + 35, + 13, + 123000, + tzinfo=datetime.timezone(datetime.timedelta(hours=7, minutes=30)), + ), + ) + self.assertEqual( + variants[13].toPython(), + datetime.datetime( + 2522, + 12, + 31, + 3, + 3, + 31, + tzinfo=datetime.timezone(datetime.timedelta(hours=-2, minutes=-20, seconds=31)), + ), + ) + self.assertEqual( + variants[14].toPython(), + datetime.datetime( + 1, + 12, + 31, + 16, + 3, + 23, + tzinfo=datetime.timezone(datetime.timedelta(hours=23, minutes=2, seconds=22)), + ), + ) # check repr self.assertEqual(str(variants[0]), str(eval(repr(variants[0])))) + metadata = bytes([1, 0, 0]) + self.assertEqual(str(VariantVal(bytes([32, 0, 1, 0, 0, 0]), metadata)), "1") + self.assertEqual(str(VariantVal(bytes([32, 1, 2, 0, 0, 0]), metadata)), "0.2") + self.assertEqual(str(VariantVal(bytes([32, 2, 3, 0, 0, 0]), metadata)), "0.03") + self.assertEqual(str(VariantVal(bytes([32, 0, 1, 0, 0, 0]), metadata)), "1") + self.assertEqual(str(VariantVal(bytes([32, 0, 255, 201, 154, 59]), metadata)), "999999999") + self.assertRaises( + PySparkValueError, lambda: str(VariantVal(bytes([32, 0, 0, 202, 154, 59]), metadata)) + ) + self.assertRaises( + PySparkValueError, lambda: str(VariantVal(bytes([32, 10, 1, 0, 0, 0]), metadata)) + ) + def test_from_ddl(self): self.assertEqual(DataType.fromDDL("long"), LongType()) self.assertEqual( @@ -1487,20 +2218,44 @@ def test_from_ddl(self): def test_collated_string(self): dfs = [ - 
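toJson is the timezone-aware counterpart of str(): the zone id only changes how timestamp-typed variants are rendered, not the instant they represent. A sketch reusing the cast from the test above, with the zone name as the only free parameter:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

v = spark.sql(
    "SELECT CAST(CAST('1940-01-01 12:35:13.123+7:30' AS TIMESTAMP) AS VARIANT) AS t"
).first()["t"]

print(v.toJson())                       # rendered in UTC, the default zone id
print(v.toJson("America/Los_Angeles"))  # same instant, printed as -08:00 wall time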
self.spark.sql("SELECT 'abc' collate UTF8_BINARY_LCASE"), + self.spark.sql("SELECT 'abc' collate UTF8_LCASE"), self.spark.createDataFrame( - [], StructType([StructField("id", StringType("UTF8_BINARY_LCASE"))]) + [], StructType([StructField("id", StringType("UTF8_LCASE"))]) ), ] for df in dfs: # performs both datatype -> proto & proto -> datatype conversions self.assertEqual( - df.to(StructType([StructField("new", StringType("UTF8_BINARY_LCASE"))])) + df.to(StructType([StructField("new", StringType("UTF8_LCASE"))])) .schema[0] .dataType, - StringType("UTF8_BINARY_LCASE"), + StringType("UTF8_LCASE"), + ) + + def test_infer_array_element_type_with_struct(self): + # SPARK-48248: Nested array to respect legacy conf of inferArrayTypeFromFirstElement + with self.sql_conf( + {"spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled": True} + ): + self.assertEqual( + ArrayType(ArrayType(LongType())), + self.spark.createDataFrame([[[[1, 1.0]]]]).schema.fields[0].dataType, ) + def test_ym_interval_in_collect(self): + with self.assertRaises(PySparkNotImplementedError): + self.spark.sql("SELECT INTERVAL '10-8' YEAR TO MONTH AS interval").first() + + with self.temp_env({"PYSPARK_YM_INTERVAL_LEGACY": "1"}): + self.assertEqual( + self.spark.sql("SELECT INTERVAL '10-8' YEAR TO MONTH AS interval").first(), + Row(interval=128), + ) + + def test_cal_interval_in_collect(self): + with self.assertRaises(PySparkNotImplementedError): + self.spark.sql("SELECT make_interval(100, 11, 1, 1, 12, 30, 01.001001)").first()[0] + class DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 @@ -1638,7 +2393,7 @@ def __init__(self, **kwargs): (1.0, StringType()), ([], StringType()), ({}, StringType()), - ("", StringType("UTF8_BINARY_LCASE")), + ("", StringType("UTF8_LCASE")), # Char ("", CharType(10)), (1, CharType(10)), @@ -1707,7 +2462,7 @@ def __init__(self, **kwargs): failure_spec = [ # String (match anything but None) (None, StringType(), ValueError), - (None, StringType("UTF8_BINARY_LCASE"), ValueError), + (None, StringType("UTF8_LCASE"), ValueError), # CharType (match anything but None) (None, CharType(10), ValueError), # VarcharType (match anything but None) diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py index 923fe4a2a8e8d..66f1a3090546f 100644 --- a/python/pyspark/sql/tests/test_udtf.py +++ b/python/pyspark/sql/tests/test_udtf.py @@ -1801,6 +1801,9 @@ def _add_archive(self, path): def test_udtf_with_analyze_using_archive(self): from pyspark.core.files import SparkFiles + self.check_udtf_with_analyze_using_archive(SparkFiles.getRootDirectory()) + + def check_udtf_with_analyze_using_archive(self, exec_root_dir): with tempfile.TemporaryDirectory(prefix="test_udtf_with_analyze_using_archive") as d: archive_path = os.path.join(d, "my_archive") os.mkdir(archive_path) @@ -1815,9 +1818,7 @@ class TestUDTF: @staticmethod def read_my_archive() -> str: with open( - os.path.join( - SparkFiles.getRootDirectory(), "my_files", "my_archive", "my_file.txt" - ), + os.path.join(exec_root_dir, "my_files", "my_archive", "my_file.txt"), "r", ) as my_file: return my_file.read().strip() @@ -1850,6 +1851,9 @@ def _add_file(self, path): def test_udtf_with_analyze_using_file(self): from pyspark.core.files import SparkFiles + self.check_udtf_with_analyze_using_file(SparkFiles.getRootDirectory()) + + def check_udtf_with_analyze_using_file(self, exec_root_dir): with tempfile.TemporaryDirectory(prefix="test_udtf_with_analyze_using_file") as d: file_path = os.path.join(d, 
"my_file.txt") with open(file_path, "w") as f: @@ -1860,9 +1864,7 @@ def test_udtf_with_analyze_using_file(self): class TestUDTF: @staticmethod def read_my_file() -> str: - with open( - os.path.join(SparkFiles.getRootDirectory(), "my_file.txt"), "r" - ) as my_file: + with open(os.path.join(exec_root_dir, "my_file.txt"), "r") as my_file: return my_file.read().strip() @staticmethod @@ -2557,16 +2559,18 @@ class TestUDTF: def eval(self): yield 1, - # We do not use `self.sql_conf` here to test the SQL SET command - # instead of using PySpark's `spark.conf.set`. old_value = self.spark.conf.get("spark.sql.execution.pythonUDTF.arrow.enabled") - self.spark.sql("SET spark.sql.execution.pythonUDTF.arrow.enabled=False") - self.assertEqual(udtf(TestUDTF, returnType="x: int").evalType, PythonEvalType.SQL_TABLE_UDF) - self.spark.sql("SET spark.sql.execution.pythonUDTF.arrow.enabled=True") - self.assertEqual( - udtf(TestUDTF, returnType="x: int").evalType, PythonEvalType.SQL_ARROW_TABLE_UDF - ) - self.spark.conf.set("spark.sql.execution.pythonUDTF.arrow.enabled", old_value) + try: + self.spark.conf.set("spark.sql.execution.pythonUDTF.arrow.enabled", False) + self.assertEqual( + udtf(TestUDTF, returnType="x: int").evalType, PythonEvalType.SQL_TABLE_UDF + ) + self.spark.conf.set("spark.sql.execution.pythonUDTF.arrow.enabled", True) + self.assertEqual( + udtf(TestUDTF, returnType="x: int").evalType, PythonEvalType.SQL_ARROW_TABLE_UDF + ) + finally: + self.spark.conf.set("spark.sql.execution.pythonUDTF.arrow.enabled", old_value) def test_udtf_eval_returning_non_tuple(self): @udtf(returnType="a: int") diff --git a/python/pyspark/sql/tests/typing/test_session.yml b/python/pyspark/sql/tests/typing/test_session.yml index 8f48edb7e579e..d6eee82a7678e 100644 --- a/python/pyspark/sql/tests/typing/test_session.yml +++ b/python/pyspark/sql/tests/typing/test_session.yml @@ -51,25 +51,6 @@ spark.createDataFrame(["foo", "bar"], "string") -- case: createDataFrameScalarsInvalid - main: | - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, StringType, IntegerType - - spark = SparkSession.builder.getOrCreate() - - schema = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True) - ]) - - # Invalid - scalars require schema - spark.createDataFrame(["foo", "bar"]) # E: Value of type variable "RowLike" of "createDataFrame" of "SparkSession" cannot be "str" [type-var] - - # Invalid - data has to match schema (either product -> struct or scalar -> atomic) - spark.createDataFrame([1, 2, 3], schema) # E: Value of type variable "RowLike" of "createDataFrame" of "SparkSession" cannot be "int" [type-var] - - - case: createDataFrameStructsInvalid main: | from pyspark.sql import SparkSession @@ -102,7 +83,9 @@ main:18: note: def [AtomicValue in (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: RDD[AtomicValue], schema: Union[AtomicType, str], verifySchema: bool = ...) -> DataFrame main:18: note: def [AtomicValue in (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: Iterable[AtomicValue], schema: Union[AtomicType, str], verifySchema: bool = ...) -> DataFrame main:18: note: def createDataFrame(self, data: DataFrame, samplingRatio: Optional[float] = ...) -> DataFrame + main:18: note: def createDataFrame(self, data: Any, samplingRatio: Optional[float] = ...) -> DataFrame main:18: note: def createDataFrame(self, data: DataFrame, schema: Union[StructType, str], verifySchema: bool = ...) 
-> DataFrame + main:18: note: def createDataFrame(self, data: Any, schema: Union[StructType, str], verifySchema: bool = ...) -> DataFrame - case: createDataFrameFromEmptyRdd main: | diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 3546fd8228141..d2adc53a3618f 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -15,6 +15,7 @@ # limitations under the License. # +import os import sys import decimal import time @@ -45,9 +46,14 @@ TYPE_CHECKING, ) -from pyspark.util import is_remote_only +from pyspark.util import is_remote_only, JVM_INT_MAX from pyspark.serializers import CloudPickleSerializer -from pyspark.sql.utils import has_numpy, get_active_spark_context +from pyspark.sql.utils import ( + has_numpy, + get_active_spark_context, + escape_meta_characters, + StringConcat, +) from pyspark.sql.variant_utils import VariantUtils from pyspark.errors import ( PySparkNotImplementedError, @@ -199,6 +205,35 @@ def fromDDL(cls, ddl: str) -> "DataType": assert len(schema) == 1 return schema[0].dataType + @classmethod + def _data_type_build_formatted_string( + cls, + dataType: "DataType", + prefix: str, + stringConcat: StringConcat, + maxDepth: int, + ) -> None: + if isinstance(dataType, (ArrayType, StructType, MapType)): + dataType._build_formatted_string(prefix, stringConcat, maxDepth - 1) + + # The method typeName() is not always the same as the Scala side. + # Add this helper method to make TreeString() compatible with Scala side. + @classmethod + def _get_jvm_type_name(cls, dataType: "DataType") -> str: + if isinstance( + dataType, + ( + DecimalType, + CharType, + VarcharType, + DayTimeIntervalType, + YearMonthIntervalType, + ), + ): + return dataType.simpleString() + else: + return dataType.typeName() + # This singleton pattern does not work with pickle, you will get # another object after pickle and unpickle @@ -254,38 +289,40 @@ class StringType(AtomicType): name of the collation, default is UTF8_BINARY. 
""" - collationNames = ["UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"] - - def __init__(self, collation: Optional[str] = None): - self.collationId = 0 if collation is None else self.collationNameToId(collation) + providerSpark = "spark" + providerICU = "icu" + providers = [providerSpark, providerICU] - @classmethod - def fromCollationId(self, collationId: int) -> "StringType": - return StringType(StringType.collationNames[collationId]) - - def collationIdToName(self) -> str: - if self.collationId == 0: - return "" - else: - return " collate %s" % StringType.collationNames[self.collationId] + def __init__(self, collation: str = "UTF8_BINARY"): + self.collation = collation @classmethod - def collationNameToId(cls, collationName: str) -> int: - return StringType.collationNames.index(collationName) + def collationProvider(cls, collationName: str) -> str: + # TODO: do this properly like on the scala side + if collationName.startswith("UTF8"): + return StringType.providerSpark + return StringType.providerICU def simpleString(self) -> str: - return "string" + self.collationIdToName() + if self.isUTF8BinaryCollation(): + return "string" + + return f"string collate ${self.collation}" + # For backwards compatibility and compatibility with other readers all string types + # are serialized in json as regular strings and the collation info is written to + # struct field metadata def jsonValue(self) -> str: - return "string" + self.collationIdToName() + return "string" def __repr__(self) -> str: return ( - "StringType('%s')" % StringType.collationNames[self.collationId] - if self.collationId != 0 - else "StringType()" + "StringType()" if self.isUTF8BinaryCollation() else "StringType('%s')" % self.collation ) + def isUTF8BinaryCollation(self) -> bool: + return self.collation == "UTF8_BINARY" + class CharType(AtomicType): """Char data type @@ -397,8 +434,8 @@ def toInternal(self, dt: datetime.datetime) -> int: def fromInternal(self, ts: int) -> datetime.datetime: if ts is not None: # using int to avoid precision loss in float - return datetime.datetime.utcfromtimestamp(ts // 1000000).replace( - microsecond=ts % 1000000 + return datetime.datetime.fromtimestamp(ts // 1000000, datetime.timezone.utc).replace( + microsecond=ts % 1000000, tzinfo=None ) @@ -550,7 +587,12 @@ def fromInternal(self, micros: int) -> Optional[datetime.timedelta]: class YearMonthIntervalType(AnsiIntervalType): - """YearMonthIntervalType, represents year-month intervals of the SQL standard""" + """YearMonthIntervalType, represents year-month intervals of the SQL standard + + Notes + ----- + This data type doesn't support collection: df.collect/take/head. + """ YEAR = 0 MONTH = 1 @@ -592,6 +634,24 @@ def _str_repr(self) -> str: jsonValue = _str_repr + def needConversion(self) -> bool: + # If PYSPARK_YM_INTERVAL_LEGACY is not set, needConversion is true, + # 'df.collect' fails with PySparkNotImplementedError; + # otherwise, no conversion is needed, and 'df.collect' returns the internal integers. 
+ return not os.environ.get("PYSPARK_YM_INTERVAL_LEGACY") == "1" + + def toInternal(self, obj: Any) -> Any: + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "YearMonthIntervalType.toInternal"}, + ) + + def fromInternal(self, obj: Any) -> Any: + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "YearMonthIntervalType.fromInternal"}, + ) + def __repr__(self) -> str: return "%s(%d, %d)" % (type(self).__name__, self.startField, self.endField) @@ -609,6 +669,21 @@ class CalendarIntervalType(DataType, metaclass=DataTypeSingleton): def typeName(cls) -> str: return "interval" + def needConversion(self) -> bool: + return True + + def toInternal(self, obj: Any) -> Any: + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "CalendarIntervalType.toInternal"}, + ) + + def fromInternal(self, obj: Any) -> Any: + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": "CalendarIntervalType.fromInternal"}, + ) + class ArrayType(DataType): """Array data type. @@ -693,8 +768,16 @@ def jsonValue(self) -> Dict[str, Any]: } @classmethod - def fromJson(cls, json: Dict[str, Any]) -> "ArrayType": - return ArrayType(_parse_datatype_json_value(json["elementType"]), json["containsNull"]) + def fromJson( + cls, + json: Dict[str, Any], + fieldPath: str, + collationsMap: Optional[Dict[str, str]], + ) -> "ArrayType": + elementType = _parse_datatype_json_value( + json["elementType"], fieldPath + ".element", collationsMap + ) + return ArrayType(elementType, json["containsNull"]) def needConversion(self) -> bool: return self.elementType.needConversion() @@ -709,6 +792,21 @@ def fromInternal(self, obj: List[Optional[T]]) -> List[Optional[T]]: return obj return obj and [self.elementType.fromInternal(v) for v in obj] + def _build_formatted_string( + self, + prefix: str, + stringConcat: StringConcat, + maxDepth: int = JVM_INT_MAX, + ) -> None: + if maxDepth > 0: + stringConcat.append( + f"{prefix}-- element: {DataType._get_jvm_type_name(self.elementType)} " + + f"(containsNull = {str(self.containsNull).lower()})\n" + ) + DataType._data_type_build_formatted_string( + self.elementType, f"{prefix} |", stringConcat, maxDepth + ) + class MapType(DataType): """Map data type. 
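For nested types, the collation map keys are field paths built with ".element", ".key" and ".value" suffixes, which is what the new fromJson overloads thread through. A sketch of the array case, matching the array_in_schema round-trip tested earlier:

import json
from pyspark.sql.types import (
    ArrayType, StringType, StructField, StructType, _parse_datatype_json_string,
)

schema = StructType([StructField("arr", ArrayType(StringType("UNICODE")))])

field = json.loads(schema.json())["fields"][0]
print(field["type"]["elementType"])        # 'string'
print(field["metadata"]["__COLLATIONS"])   # {'arr.element': 'icu.UNICODE'}

assert _parse_datatype_json_string(schema.json()) == schema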
@@ -810,10 +908,19 @@ def jsonValue(self) -> Dict[str, Any]: } @classmethod - def fromJson(cls, json: Dict[str, Any]) -> "MapType": + def fromJson( + cls, + json: Dict[str, Any], + fieldPath: str, + collationsMap: Optional[Dict[str, str]], + ) -> "MapType": + keyType = _parse_datatype_json_value(json["keyType"], fieldPath + ".key", collationsMap) + valueType = _parse_datatype_json_value( + json["valueType"], fieldPath + ".value", collationsMap + ) return MapType( - _parse_datatype_json_value(json["keyType"]), - _parse_datatype_json_value(json["valueType"]), + keyType, + valueType, json["valueContainsNull"], ) @@ -834,6 +941,25 @@ def fromInternal(self, obj: Dict[T, Optional[U]]) -> Dict[T, Optional[U]]: (self.keyType.fromInternal(k), self.valueType.fromInternal(v)) for k, v in obj.items() ) + def _build_formatted_string( + self, + prefix: str, + stringConcat: StringConcat, + maxDepth: int = JVM_INT_MAX, + ) -> None: + if maxDepth > 0: + stringConcat.append(f"{prefix}-- key: {DataType._get_jvm_type_name(self.keyType)}\n") + DataType._data_type_build_formatted_string( + self.keyType, f"{prefix} |", stringConcat, maxDepth + ) + stringConcat.append( + f"{prefix}-- value: {DataType._get_jvm_type_name(self.valueType)} " + + f"(valueContainsNull = {str(self.valueContainsNull).lower()})\n" + ) + DataType._data_type_build_formatted_string( + self.valueType, f"{prefix} |", stringConcat, maxDepth + ) + class StructField(DataType): """A field in :class:`StructType`. @@ -884,22 +1010,89 @@ def __repr__(self) -> str: return "StructField('%s', %s, %s)" % (self.name, self.dataType, str(self.nullable)) def jsonValue(self) -> Dict[str, Any]: + collationMetadata = self.getCollationMetadata() + metadata = ( + self.metadata + if not collationMetadata + else {**self.metadata, _COLLATIONS_METADATA_KEY: collationMetadata} + ) + return { "name": self.name, "type": self.dataType.jsonValue(), "nullable": self.nullable, - "metadata": self.metadata, + "metadata": metadata, } @classmethod def fromJson(cls, json: Dict[str, Any]) -> "StructField": + metadata = json.get("metadata") + collationsMap = {} + if metadata and _COLLATIONS_METADATA_KEY in metadata: + collationsMap = metadata[_COLLATIONS_METADATA_KEY] + for key, value in collationsMap.items(): + nameParts = value.split(".") + assert len(nameParts) == 2 + provider, name = nameParts[0], nameParts[1] + _assert_valid_collation_provider(provider) + collationsMap[key] = name + + metadata = { + key: value for key, value in metadata.items() if key != _COLLATIONS_METADATA_KEY + } + return StructField( json["name"], - _parse_datatype_json_value(json["type"]), + _parse_datatype_json_value(json["type"], json["name"], collationsMap), json.get("nullable", True), - json.get("metadata"), + metadata, ) + def getCollationsMap(self, metadata: Dict[str, Any]) -> Dict[str, str]: + if not metadata or _COLLATIONS_METADATA_KEY not in metadata: + return {} + + collationMetadata: Dict[str, str] = metadata[_COLLATIONS_METADATA_KEY] + collationsMap: Dict[str, str] = {} + + for key, value in collationMetadata.items(): + nameParts = value.split(".") + assert len(nameParts) == 2 + provider, name = nameParts[0], nameParts[1] + _assert_valid_collation_provider(provider) + collationsMap[key] = name + + return collationsMap + + def getCollationMetadata(self) -> Dict[str, str]: + def visitRecursively(dt: DataType, fieldPath: str) -> None: + if isinstance(dt, ArrayType): + processDataType(dt.elementType, fieldPath + ".element") + elif isinstance(dt, MapType): + processDataType(dt.keyType, fieldPath + 
".key") + processDataType(dt.valueType, fieldPath + ".value") + elif isinstance(dt, StringType) and self._isCollatedString(dt): + collationMetadata[fieldPath] = self.schemaCollationValue(dt) + + def processDataType(dt: DataType, fieldPath: str) -> None: + if self._isCollatedString(dt): + collationMetadata[fieldPath] = self.schemaCollationValue(dt) + else: + visitRecursively(dt, fieldPath) + + collationMetadata: Dict[str, str] = {} + visitRecursively(self.dataType, self.name) + return collationMetadata + + def _isCollatedString(self, dt: DataType) -> bool: + return isinstance(dt, StringType) and not dt.isUTF8BinaryCollation() + + def schemaCollationValue(self, dt: DataType) -> str: + assert isinstance(dt, StringType) + collationName = dt.collation + provider = StringType.collationProvider(collationName) + return f"{provider}.{collationName}" + def needConversion(self) -> bool: return self.dataType.needConversion() @@ -915,6 +1108,22 @@ def typeName(self) -> str: # type: ignore[override] message_parameters={}, ) + def _build_formatted_string( + self, + prefix: str, + stringConcat: StringConcat, + maxDepth: int = JVM_INT_MAX, + ) -> None: + if maxDepth > 0: + stringConcat.append( + f"{prefix}-- {escape_meta_characters(self.name)}: " + + f"{DataType._get_jvm_type_name(self.dataType)} " + + f"(nullable = {str(self.nullable).lower()})\n" + ) + DataType._data_type_build_formatted_string( + self.dataType, f"{prefix} |", stringConcat, maxDepth + ) + class StructType(DataType): """Struct type, consisting of a list of :class:`StructField`. @@ -1335,6 +1544,24 @@ def fromInternal(self, obj: Tuple) -> "Row": values = obj return _create_row(self.names, values) + def _build_formatted_string( + self, + prefix: str, + stringConcat: StringConcat, + maxDepth: int = JVM_INT_MAX, + ) -> None: + for field in self.fields: + field._build_formatted_string(prefix, stringConcat, maxDepth) + + def treeString(self, maxDepth: int = JVM_INT_MAX) -> str: + stringConcat = StringConcat() + stringConcat.append("root\n") + prefix = " |" + depth = maxDepth if maxDepth > 0 else JVM_INT_MAX + for field in self.fields: + field._build_formatted_string(prefix, stringConcat, depth) + return stringConcat.toString() + class VariantType(AtomicType): """ @@ -1521,6 +1748,19 @@ def toPython(self) -> Any: """ return VariantUtils.to_python(self.value, self.metadata) + def toJson(self, zone_id: str = "UTC") -> str: + """ + Convert the VariantVal to a JSON string. The zone ID represents the time zone that the + timestamp should be printed in. It is defaulted to UTC. The list of valid zone IDs can be + found by importing the `zoneinfo` module and running :code:`zoneinfo.available_timezones()`. + + Returns + ------- + str + A JSON string that represents the Variant. 
+ """ + return VariantUtils.to_json(self.value, self.metadata, zone_id) + _atomic_types: List[Type[DataType]] = [ StringType, @@ -1540,21 +1780,67 @@ def toPython(self) -> Any: TimestampNTZType, NullType, VariantType, + YearMonthIntervalType, + DayTimeIntervalType, ] -_all_atomic_types: Dict[str, Type[DataType]] = dict((t.typeName(), t) for t in _atomic_types) -_complex_types: List[Type[Union[ArrayType, MapType, StructType]]] = [ArrayType, MapType, StructType] -_all_complex_types: Dict[str, Type[Union[ArrayType, MapType, StructType]]] = dict( - (v.typeName(), v) for v in _complex_types -) +_complex_types: List[Type[Union[ArrayType, MapType, StructType]]] = [ + ArrayType, + MapType, + StructType, +] +_all_complex_types: Dict[str, Type[Union[ArrayType, MapType, StructType]]] = { + "array": ArrayType, + "map": MapType, + "struct": StructType, +} + +# Datatypes that can be directly parsed by mapping a json string without regex. +# This dict should be only used in json parsing. +# Note that: +# 1, CharType and VarcharType are not listed here, since they need regex; +# 2, DecimalType can be parsed by both mapping ('decimal') and regex ('decimal(10, 2)'); +# 3, CalendarIntervalType is not an atomic type, but can be mapped by 'interval'; +_all_mappable_types: Dict[str, Type[DataType]] = { + "string": StringType, + "binary": BinaryType, + "boolean": BooleanType, + "decimal": DecimalType, + "float": FloatType, + "double": DoubleType, + "byte": ByteType, + "short": ShortType, + "integer": IntegerType, + "long": LongType, + "date": DateType, + "timestamp": TimestampType, + "timestamp_ntz": TimestampNTZType, + "void": NullType, + "variant": VariantType, + "interval": CalendarIntervalType, +} -_COLLATED_STRING = re.compile(r"string\s+collate\s+([\w_]+|`[\w_]`)") _LENGTH_CHAR = re.compile(r"char\(\s*(\d+)\s*\)") _LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)") _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)") _INTERVAL_DAYTIME = re.compile(r"interval (day|hour|minute|second)( to (day|hour|minute|second))?") _INTERVAL_YEARMONTH = re.compile(r"interval (year|month)( to (year|month))?") +_COLLATIONS_METADATA_KEY = "__COLLATIONS" + + +def _drop_metadata(d: Union[DataType, StructField]) -> Union[DataType, StructField]: + assert isinstance(d, (DataType, StructField)) + if isinstance(d, StructField): + return StructField(d.name, _drop_metadata(d.dataType), d.nullable, None) + elif isinstance(d, StructType): + return StructType([cast(StructField, _drop_metadata(f)) for f in d.fields]) + elif isinstance(d, ArrayType): + return ArrayType(_drop_metadata(d.elementType), d.containsNull) + elif isinstance(d, MapType): + return MapType(_drop_metadata(d.keyType), _drop_metadata(d.valueType), d.valueContainsNull) + return d + def _parse_datatype_string(s: str) -> DataType: """ @@ -1600,35 +1886,48 @@ def _parse_datatype_string(s: str) -> DataType: ... ParseException:... 
""" - from py4j.java_gateway import JVMView + from pyspark.sql.utils import is_remote - sc = get_active_spark_context() + if is_remote(): + from pyspark.sql.connect.session import SparkSession - def from_ddl_schema(type_str: str) -> DataType: - return _parse_datatype_json_string( - cast(JVMView, sc._jvm).org.apache.spark.sql.types.StructType.fromDDL(type_str).json() + return cast( + DataType, + SparkSession.active()._client._analyze(method="ddl_parse", ddl_string=s).parsed, ) - def from_ddl_datatype(type_str: str) -> DataType: - return _parse_datatype_json_string( - cast(JVMView, sc._jvm) - .org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str) - .json() - ) + else: + from py4j.java_gateway import JVMView + + sc = get_active_spark_context() + + def from_ddl_schema(type_str: str) -> DataType: + return _parse_datatype_json_string( + cast(JVMView, sc._jvm) + .org.apache.spark.sql.types.StructType.fromDDL(type_str) + .json() + ) + + def from_ddl_datatype(type_str: str) -> DataType: + return _parse_datatype_json_string( + cast(JVMView, sc._jvm) + .org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str) + .json() + ) - try: - # DDL format, "fieldname datatype, fieldname datatype". - return from_ddl_schema(s) - except Exception as e: try: - # For backwards compatibility, "integer", "struct" and etc. - return from_ddl_datatype(s) - except BaseException: + # DDL format, "fieldname datatype, fieldname datatype". + return from_ddl_schema(s) + except Exception as e: try: - # For backwards compatibility, "fieldname: datatype, fieldname: datatype" case. - return from_ddl_datatype("struct<%s>" % s.strip()) + # For backwards compatibility, "integer", "struct" and etc. + return from_ddl_datatype(s) except BaseException: - raise e + try: + # For backwards compatibility, "fieldname: datatype, fieldname: datatype" case. + return from_ddl_datatype("struct<%s>" % s.strip()) + except BaseException: + raise e def _parse_datatype_json_string(json_string: str) -> DataType: @@ -1644,11 +1943,8 @@ def _parse_datatype_json_string(json_string: str) -> DataType: ... python_datatype = _parse_datatype_json_string(scala_datatype.json()) ... assert datatype == python_datatype ... - >>> for cls in _all_atomic_types.values(): - ... if cls is not VarcharType and cls is not CharType: - ... check_datatype(cls()) - ... else: - ... check_datatype(cls(1)) + >>> for cls in _all_mappable_types.values(): + ... check_datatype(cls()) >>> # Simple ArrayType. 
>>> simple_arraytype = ArrayType(StringType(), True) @@ -1689,12 +1985,18 @@ def _parse_datatype_json_string(json_string: str) -> DataType: return _parse_datatype_json_value(json.loads(json_string)) -def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: +def _parse_datatype_json_value( + json_value: Union[dict, str], + fieldPath: str = "", + collationsMap: Optional[Dict[str, str]] = None, +) -> DataType: if not isinstance(json_value, dict): - if json_value in _all_atomic_types.keys(): - return _all_atomic_types[json_value]() - elif json_value == "decimal": - return DecimalType() + if json_value in _all_mappable_types.keys(): + if collationsMap is not None and fieldPath in collationsMap: + _assert_valid_type_for_collation(fieldPath, json_value, collationsMap) + collation_name = collationsMap[fieldPath] + return StringType(collation_name) + return _all_mappable_types[json_value]() elif _FIXED_DECIMAL.match(json_value): m = _FIXED_DECIMAL.match(json_value) return DecimalType(int(m.group(1)), int(m.group(2))) # type: ignore[union-attr] @@ -1714,11 +2016,6 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: if first_field is not None and second_field is None: return YearMonthIntervalType(first_field) return YearMonthIntervalType(first_field, second_field) - elif json_value == "interval": - return CalendarIntervalType() - elif _COLLATED_STRING.match(json_value): - m = _COLLATED_STRING.match(json_value) - return StringType(m.group(1)) # type: ignore[union-attr] elif _LENGTH_CHAR.match(json_value): m = _LENGTH_CHAR.match(json_value) return CharType(int(m.group(1))) # type: ignore[union-attr] @@ -1733,7 +2030,15 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: else: tpe = json_value["type"] if tpe in _all_complex_types: - return _all_complex_types[tpe].fromJson(json_value) + if collationsMap is not None and fieldPath in collationsMap: + _assert_valid_type_for_collation(fieldPath, tpe, collationsMap) + + complex_type = _all_complex_types[tpe] + if complex_type is ArrayType: + return ArrayType.fromJson(json_value, fieldPath, collationsMap) + elif complex_type is MapType: + return MapType.fromJson(json_value, fieldPath, collationsMap) + return StructType.fromJson(json_value) elif tpe == "udt": return UserDefinedType.fromJson(json_value) else: @@ -1743,6 +2048,27 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: ) +def _assert_valid_type_for_collation( + fieldPath: str, fieldType: Any, collationMap: Dict[str, str] +) -> None: + if fieldPath in collationMap and fieldType != "string": + raise PySparkTypeError( + error_class="INVALID_JSON_DATA_TYPE_FOR_COLLATIONS", + message_parameters={"jsonType": fieldType}, + ) + + +def _assert_valid_collation_provider(provider: str) -> None: + if provider.lower() not in StringType.providers: + raise PySparkValueError( + error_class="COLLATION_INVALID_PROVIDER", + message_parameters={ + "provider": provider, + "supportedProviders": ", ".join(StringType.providers), + }, + ) + + # Mapping Python types to Spark SQL DataType _type_mappings = { type(None): NullType, @@ -1862,6 +2188,7 @@ def _infer_type( obj: Any, infer_dict_as_struct: bool = False, infer_array_from_first_element: bool = False, + infer_map_from_first_pair: bool = False, prefer_timestamp_ntz: bool = False, ) -> DataType: """Infer the DataType from obj""" @@ -1897,12 +2224,13 @@ def _infer_type( value, infer_dict_as_struct, infer_array_from_first_element, + infer_map_from_first_pair, prefer_timestamp_ntz, ), 
True, ) return struct - else: + elif infer_map_from_first_pair: for key, value in obj.items(): if key is not None and value is not None: return MapType( @@ -1910,28 +2238,72 @@ def _infer_type( key, infer_dict_as_struct, infer_array_from_first_element, + infer_map_from_first_pair, prefer_timestamp_ntz, ), _infer_type( value, infer_dict_as_struct, infer_array_from_first_element, + infer_map_from_first_pair, prefer_timestamp_ntz, ), True, ) return MapType(NullType(), NullType(), True) + else: + key_type: DataType = NullType() + value_type: DataType = NullType() + for key, value in obj.items(): + if key is not None: + key_type = _merge_type( + key_type, + _infer_type( + key, + infer_dict_as_struct, + infer_array_from_first_element, + infer_map_from_first_pair, + prefer_timestamp_ntz, + ), + ) + if value is not None: + value_type = _merge_type( + value_type, + _infer_type( + value, + infer_dict_as_struct, + infer_array_from_first_element, + infer_map_from_first_pair, + prefer_timestamp_ntz, + ), + ) + + return MapType(key_type, value_type, True) elif isinstance(obj, list): if len(obj) > 0: if infer_array_from_first_element: return ArrayType( - _infer_type(obj[0], infer_dict_as_struct, prefer_timestamp_ntz), True + _infer_type( + obj[0], + infer_dict_as_struct, + infer_array_from_first_element, + prefer_timestamp_ntz, + ), + True, ) else: return ArrayType( reduce( _merge_type, - (_infer_type(v, infer_dict_as_struct, prefer_timestamp_ntz) for v in obj), + ( + _infer_type( + v, + infer_dict_as_struct, + infer_array_from_first_element, + prefer_timestamp_ntz, + ) + for v in obj + ), ), True, ) @@ -1950,6 +2322,7 @@ def _infer_type( obj, infer_dict_as_struct=infer_dict_as_struct, infer_array_from_first_element=infer_array_from_first_element, + prefer_timestamp_ntz=prefer_timestamp_ntz, ) except TypeError: raise PySparkTypeError( @@ -1963,6 +2336,7 @@ def _infer_schema( names: Optional[List[str]] = None, infer_dict_as_struct: bool = False, infer_array_from_first_element: bool = False, + infer_map_from_first_pair: bool = False, prefer_timestamp_ntz: bool = False, ) -> StructType: """Infer the schema from dict/namedtuple/object""" @@ -2001,6 +2375,7 @@ def _infer_schema( v, infer_dict_as_struct, infer_array_from_first_element, + infer_map_from_first_pair, prefer_timestamp_ntz, ), True, diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 0d0fc9042e627..3d19a2b5458bd 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -27,7 +27,7 @@ from pyspark.util import PythonEvalType -from pyspark.sql.column import Column, _to_java_expr, _to_seq +from pyspark.sql.column import Column from pyspark.sql.types import ( DataType, StringType, @@ -205,54 +205,45 @@ def __init__( self.evalType = evalType self.deterministic = deterministic - @property - def returnType(self) -> DataType: - # This makes sure this is called after SparkContext is initialized. - # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. 
- # TODO: PythonEvalType.SQL_BATCHED_UDF - if self._returnType_placeholder is None: - if isinstance(self._returnType, DataType): - self._returnType_placeholder = self._returnType - else: - self._returnType_placeholder = _parse_datatype_string(self._returnType) - if self.evalType == PythonEvalType.SQL_ARROW_BATCHED_UDF: + @staticmethod + def _check_return_type(returnType: DataType, evalType: int) -> None: + if evalType == PythonEvalType.SQL_ARROW_BATCHED_UDF: try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": f"Invalid return type with Arrow-optimized Python UDF: " - f"{self._returnType_placeholder}" + f"{returnType}" }, ) elif ( - self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF - or self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF + evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF + or evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF ): try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ - "feature": f"Invalid return type with scalar Pandas UDFs: " - f"{self._returnType_placeholder}" + "feature": f"Invalid return type with scalar Pandas UDFs: " f"{returnType}" }, ) elif ( - self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF - or self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE + evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF + or evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE ): - if isinstance(self._returnType_placeholder, StructType): + if isinstance(returnType, StructType): try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": f"Invalid return type with grouped map Pandas UDFs or " - f"at groupby.applyInPandas(WithState): {self._returnType_placeholder}" + f"at groupby.applyInPandas(WithState): {returnType}" }, ) else: @@ -261,22 +252,21 @@ def returnType(self) -> DataType: message_parameters={ "eval_type": "SQL_GROUPED_MAP_PANDAS_UDF or " "SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE", - "return_type": str(self._returnType_placeholder), + "return_type": str(returnType), }, ) elif ( - self.evalType == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF - or self.evalType == PythonEvalType.SQL_MAP_ARROW_ITER_UDF + evalType == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF + or evalType == PythonEvalType.SQL_MAP_ARROW_ITER_UDF ): - if isinstance(self._returnType_placeholder, StructType): + if isinstance(returnType, StructType): try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ - "feature": f"Invalid return type in mapInPandas: " - f"{self._returnType_placeholder}" + "feature": f"Invalid return type in mapInPandas: " f"{returnType}" }, ) else: @@ -284,19 +274,19 @@ def returnType(self) -> DataType: error_class="INVALID_RETURN_TYPE_FOR_PANDAS_UDF", message_parameters={ "eval_type": "SQL_MAP_PANDAS_ITER_UDF or SQL_MAP_ARROW_ITER_UDF", - "return_type": str(self._returnType_placeholder), + "return_type": str(returnType), }, ) - elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF: - if isinstance(self._returnType_placeholder, StructType): + elif evalType == PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF: + 
if isinstance(returnType, StructType): try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": "Invalid return type with grouped map Arrow UDFs or " - f"at groupby.applyInArrow: {self._returnType_placeholder}" + f"at groupby.applyInArrow: {returnType}" }, ) else: @@ -304,19 +294,19 @@ def returnType(self) -> DataType: error_class="INVALID_RETURN_TYPE_FOR_ARROW_UDF", message_parameters={ "eval_type": "SQL_GROUPED_MAP_ARROW_UDF", - "return_type": str(self._returnType_placeholder), + "return_type": str(returnType), }, ) - elif self.evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: - if isinstance(self._returnType_placeholder, StructType): + elif evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: + if isinstance(returnType, StructType): try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": f"Invalid return type in cogroup.applyInPandas: " - f"{self._returnType_placeholder}" + f"{returnType}" }, ) else: @@ -324,19 +314,19 @@ def returnType(self) -> DataType: error_class="INVALID_RETURN_TYPE_FOR_PANDAS_UDF", message_parameters={ "eval_type": "SQL_COGROUPED_MAP_PANDAS_UDF", - "return_type": str(self._returnType_placeholder), + "return_type": str(returnType), }, ) - elif self.evalType == PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF: - if isinstance(self._returnType_placeholder, StructType): + elif evalType == PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF: + if isinstance(returnType, StructType): try: - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": "Invalid return type in cogroup.applyInArrow: " - f"{self._returnType_placeholder}" + f"{returnType}" }, ) else: @@ -344,30 +334,42 @@ def returnType(self) -> DataType: error_class="INVALID_RETURN_TYPE_FOR_ARROW_UDF", message_parameters={ "eval_type": "SQL_COGROUPED_MAP_ARROW_UDF", - "return_type": str(self._returnType_placeholder), + "return_type": str(returnType), }, ) - elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: + elif evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: try: # StructType is not yet allowed as a return type, explicitly check here to fail fast - if isinstance(self._returnType_placeholder, StructType): + if isinstance(returnType, StructType): raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": f"Invalid return type with grouped aggregate Pandas UDFs: " - f"{self._returnType_placeholder}" + f"{returnType}" }, ) - to_arrow_type(self._returnType_placeholder) + to_arrow_type(returnType) except TypeError: raise PySparkNotImplementedError( error_class="NOT_IMPLEMENTED", message_parameters={ "feature": f"Invalid return type with grouped aggregate Pandas UDFs: " - f"{self._returnType_placeholder}" + f"{returnType}" }, ) + @property + def returnType(self) -> DataType: + # Make sure this is called after SparkContext is initialized. + # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. 
+ # TODO: PythonEvalType.SQL_BATCHED_UDF + if self._returnType_placeholder is None: + if isinstance(self._returnType, DataType): + self._returnType_placeholder = self._returnType + else: + self._returnType_placeholder = _parse_datatype_string(self._returnType) + + UserDefinedFunction._check_return_type(self._returnType_placeholder, self.evalType) return self._returnType_placeholder @property @@ -395,6 +397,8 @@ def _create_judf(self, func: Callable[..., Any]) -> "JavaObject": return judf def __call__(self, *args: "ColumnOrName", **kwargs: "ColumnOrName") -> Column: + from pyspark.sql.classic.column import _to_java_expr, _to_seq + sc = get_active_spark_context() assert sc._jvm is not None diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py index 801ecc605e500..f560880202230 100644 --- a/python/pyspark/sql/udtf.py +++ b/python/pyspark/sql/udtf.py @@ -26,7 +26,6 @@ from pyspark.errors import PySparkAttributeError, PySparkPicklingError, PySparkTypeError from pyspark.util import PythonEvalType -from pyspark.sql.column import _to_java_column, _to_java_expr, _to_seq from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version from pyspark.sql.types import DataType, StructType, _parse_datatype_string from pyspark.sql.udf import _wrap_function @@ -374,6 +373,8 @@ def _create_judtf(self, func: Type) -> "JavaObject": return judtf def __call__(self, *args: "ColumnOrName", **kwargs: "ColumnOrName") -> "DataFrame": + from pyspark.sql.classic.column import _to_java_column, _to_java_expr, _to_seq + from pyspark.sql import DataFrame, SparkSession spark = SparkSession._getActiveSessionOrCreate() diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index be46203665714..76227851f9fe6 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -17,7 +17,17 @@ import inspect import functools import os -from typing import Any, Callable, Optional, Sequence, TYPE_CHECKING, cast, TypeVar, Union, Type +from typing import ( + Any, + Callable, + Optional, + List, + Sequence, + TYPE_CHECKING, + cast, + TypeVar, + Union, +) # For backward compatibility. 
from pyspark.errors import ( # noqa: F401 @@ -32,7 +42,7 @@ PySparkNotImplementedError, PySparkRuntimeError, ) -from pyspark.util import is_remote_only +from pyspark.util import is_remote_only, JVM_INT_MAX from pyspark.errors.exceptions.captured import CapturedException # noqa: F401 from pyspark.find_spark_home import _find_spark_home @@ -46,8 +56,6 @@ from pyspark import SparkContext from pyspark.sql.session import SparkSession from pyspark.sql.dataframe import DataFrame - from pyspark.sql.column import Column - from pyspark.sql.window import Window from pyspark.pandas._typing import IndexOpsLike, SeriesOrIndex has_numpy: bool = False @@ -124,6 +132,44 @@ class Java: implements = ["org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchFunction"] +# Python implementation of 'org.apache.spark.sql.catalyst.util.StringConcat' +class StringConcat: + def __init__(self, maxLength: int = JVM_INT_MAX - 15): + self.maxLength: int = maxLength + self.strings: List[str] = [] + self.length: int = 0 + + def atLimit(self) -> bool: + return self.length >= self.maxLength + + def append(self, s: str) -> None: + if s is not None: + sLen = len(s) + if not self.atLimit(): + available = self.maxLength - self.length + stringToAppend = s if available >= sLen else s[0:available] + self.strings.append(stringToAppend) + + self.length = min(self.length + sLen, JVM_INT_MAX - 15) + + def toString(self) -> str: + # finalLength = self.maxLength if self.atLimit() else self.length + return "".join(self.strings) + + +# Python implementation of 'org.apache.spark.util.SparkSchemaUtils.escapeMetaCharacters' +def escape_meta_characters(s: str) -> str: + return ( + s.replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") + .replace("\f", "\\f") + .replace("\b", "\\b") + .replace("\u000B", "\\v") + .replace("\u0007", "\\a") + ) + + def to_str(value: Any) -> Optional[str]: """ A wrapper over str(), but converts bool values to lower case strings. @@ -242,62 +288,111 @@ def wrapped(*args: Any, **kwargs: Any) -> Any: return cast(FuncT, wrapped) -def try_remote_window(f: FuncT) -> FuncT: +def get_active_spark_context() -> "SparkContext": + """Raise RuntimeError if SparkContext is not initialized, + otherwise, returns the active SparkContext.""" + from pyspark import SparkContext + + sc = SparkContext._active_spark_context + if sc is None or sc._jvm is None: + raise PySparkRuntimeError( + error_class="SESSION_OR_CONTEXT_NOT_EXISTS", + message_parameters={}, + ) + return sc + + +def try_remote_session_classmethod(f: FuncT) -> FuncT: """Mark API supported from Spark Connect.""" @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: - from pyspark.sql.connect.window import Window + from pyspark.sql.connect.session import SparkSession - return getattr(Window, f.__name__)(*args, **kwargs) + assert inspect.isclass(args[0]) + return getattr(SparkSession, f.__name__)(*args[1:], **kwargs) else: return f(*args, **kwargs) return cast(FuncT, wrapped) -def try_remote_windowspec(f: FuncT) -> FuncT: - """Mark API supported from Spark Connect.""" +def dispatch_df_method(f: FuncT) -> FuncT: + """ + For the usecases of direct DataFrame.union(df, ...), it checks if self + is a Connect DataFrame or Classic DataFrame, and dispatches. 
+ """ @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: - from pyspark.sql.connect.window import WindowSpec + from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame - return getattr(WindowSpec, f.__name__)(*args, **kwargs) + if isinstance(args[0], ConnectDataFrame): + return getattr(ConnectDataFrame, f.__name__)(*args, **kwargs) else: - return f(*args, **kwargs) + from pyspark.sql.classic.dataframe import DataFrame as ClassicDataFrame + + if isinstance(args[0], ClassicDataFrame): + return getattr(ClassicDataFrame, f.__name__)(*args, **kwargs) + + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": f"DataFrame.{f.__name__}"}, + ) return cast(FuncT, wrapped) -def get_active_spark_context() -> "SparkContext": - """Raise RuntimeError if SparkContext is not initialized, - otherwise, returns the active SparkContext.""" - from pyspark import SparkContext +def dispatch_col_method(f: FuncT) -> FuncT: + """ + For the usecases of direct Column.method(col, ...), it checks if self + is a Connect DataFrame or Classic DataFrame, and dispatches. + """ - sc = SparkContext._active_spark_context - if sc is None or sc._jvm is None: - raise PySparkRuntimeError( - error_class="SESSION_OR_CONTEXT_NOT_EXISTS", - message_parameters={}, + @functools.wraps(f) + def wrapped(*args: Any, **kwargs: Any) -> Any: + if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: + from pyspark.sql.connect.column import Column as ConnectColumn + + if isinstance(args[0], ConnectColumn): + return getattr(ConnectColumn, f.__name__)(*args, **kwargs) + else: + from pyspark.sql.classic.column import Column as ClassicColumn + + if isinstance(args[0], ClassicColumn): + return getattr(ClassicColumn, f.__name__)(*args, **kwargs) + + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": f"Column.{f.__name__}"}, ) - return sc + return cast(FuncT, wrapped) -def try_remote_session_classmethod(f: FuncT) -> FuncT: - """Mark API supported from Spark Connect.""" + +def dispatch_window_method(f: FuncT) -> FuncT: + """ + For the usecases of direct Window.method(col, ...), it checks if self + is a Connect Window or Classic Window, and dispatches. + """ @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: - from pyspark.sql.connect.session import SparkSession + from pyspark.sql.connect.window import Window as ConnectWindow - assert inspect.isclass(args[0]) - return getattr(SparkSession, f.__name__)(*args[1:], **kwargs) + return getattr(ConnectWindow, f.__name__)(*args, **kwargs) else: - return f(*args, **kwargs) + from pyspark.sql.classic.window import Window as ClassicWindow + + return getattr(ClassicWindow, f.__name__)(*args, **kwargs) + + raise PySparkNotImplementedError( + error_class="NOT_IMPLEMENTED", + message_parameters={"feature": f"Window.{f.__name__}"}, + ) return cast(FuncT, wrapped) @@ -309,15 +404,9 @@ def pyspark_column_op( Wrapper function for column_op to get proper Column class. 
""" from pyspark.pandas.base import column_op - from pyspark.sql.column import Column as PySparkColumn + from pyspark.sql.column import Column from pyspark.pandas.data_type_ops.base import _is_extension_dtypes - if is_remote(): - from pyspark.sql.connect.column import Column as ConnectColumn - - Column = ConnectColumn - else: - Column = PySparkColumn # type: ignore[assignment] result = column_op(getattr(Column, func_name))(left, right) # It works as expected on extension dtype, so we don't need to call `fillna` for this case. if (fillna is not None) and (_is_extension_dtypes(left) or _is_extension_dtypes(right)): @@ -326,39 +415,6 @@ def pyspark_column_op( return result.fillna(fillna) if fillna is not None else result -def get_column_class() -> Type["Column"]: - from pyspark.sql.column import Column as PySparkColumn - - if is_remote(): - from pyspark.sql.connect.column import Column as ConnectColumn - - return ConnectColumn # type: ignore[return-value] - else: - return PySparkColumn - - -def get_dataframe_class() -> Type["DataFrame"]: - from pyspark.sql.dataframe import DataFrame as PySparkDataFrame - - if is_remote(): - from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame - - return ConnectDataFrame # type: ignore[return-value] - else: - return PySparkDataFrame - - -def get_window_class() -> Type["Window"]: - from pyspark.sql.window import Window as PySparkWindow - - if is_remote(): - from pyspark.sql.connect.window import Window as ConnectWindow - - return ConnectWindow # type: ignore[return-value] - else: - return PySparkWindow - - def get_lit_sql_str(val: str) -> str: # Equivalent to `lit(val)._jc.expr().sql()` for string typed val # See `sql` definition in `sql/catalyst/src/main/scala/org/apache/spark/ diff --git a/python/pyspark/sql/variant_utils.py b/python/pyspark/sql/variant_utils.py index 11dc29503921f..95084fc7d932f 100644 --- a/python/pyspark/sql/variant_utils.py +++ b/python/pyspark/sql/variant_utils.py @@ -15,12 +15,15 @@ # limitations under the License. # +import base64 import decimal +import datetime import json import struct from array import array from typing import Any, Callable, Dict, List, Tuple from pyspark.errors import PySparkValueError +from zoneinfo import ZoneInfo class VariantUtils: @@ -86,19 +89,48 @@ class VariantUtils: DECIMAL8 = 9 # 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. DECIMAL16 = 10 + # Date value. Content is 4-byte little-endian signed integer that represents the number of days + # from the Unix epoch. + DATE = 11 + # Timestamp value. Content is 8-byte little-endian signed integer that represents the number of + # microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. This is a timezone-aware + # field and when reading into a Python datetime object defaults to the UTC timezone. + TIMESTAMP = 12 + # Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted + # as if the local time zone is UTC. + TIMESTAMP_NTZ = 13 + # 4-byte IEEE float. + FLOAT = 14 + # Binary value. The content is (4-byte little-endian unsigned integer representing the binary + # size) + (size bytes of binary content). + BINARY = 15 # Long string value. The content is (4-byte little-endian unsigned integer representing the # string size) + (size bytes of string content). 
LONG_STR = 16 U32_SIZE = 4 + EPOCH = datetime.datetime( + year=1970, month=1, day=1, hour=0, minute=0, second=0, tzinfo=datetime.timezone.utc + ) + EPOCH_NTZ = datetime.datetime(year=1970, month=1, day=1, hour=0, minute=0, second=0) + + MAX_DECIMAL4_PRECISION = 9 + MAX_DECIMAL4_VALUE = 10**MAX_DECIMAL4_PRECISION + MAX_DECIMAL8_PRECISION = 18 + MAX_DECIMAL8_VALUE = 10**MAX_DECIMAL8_PRECISION + MAX_DECIMAL16_PRECISION = 38 + MAX_DECIMAL16_VALUE = 10**MAX_DECIMAL16_PRECISION + @classmethod - def to_json(cls, value: bytes, metadata: bytes) -> str: + def to_json(cls, value: bytes, metadata: bytes, zone_id: str = "UTC") -> str: """ - Convert the VariantVal to a JSON string. + Convert the VariantVal to a JSON string. The `zone_id` parameter denotes the time zone that + timestamp fields should be parsed in. It defaults to "UTC". The list of valid zone IDs can + found by importing the `zoneinfo` module and running `zoneinfo.available_timezones()`. :return: JSON string """ - return cls._to_json(value, metadata, 0) + return cls._to_json(value, metadata, 0, zone_id) @classmethod def to_python(cls, value: bytes, metadata: bytes) -> str: @@ -117,7 +149,7 @@ def _read_long(cls, data: bytes, pos: int, num_bytes: int, signed: bool) -> int: @classmethod def _check_index(cls, pos: int, length: int) -> None: if pos < 0 or pos >= length: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) @classmethod def _get_type_info(cls, value: bytes, pos: int) -> Tuple[int, int]: @@ -137,14 +169,14 @@ def _get_metadata_key(cls, metadata: bytes, id: int) -> str: offset_size = ((metadata[0] >> 6) & 0x3) + 1 dict_size = cls._read_long(metadata, 1, offset_size, signed=False) if id >= dict_size: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) string_start = 1 + (dict_size + 2) * offset_size offset = cls._read_long(metadata, 1 + (id + 1) * offset_size, offset_size, signed=False) next_offset = cls._read_long( metadata, 1 + (id + 2) * offset_size, offset_size, signed=False ) if offset > next_offset: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) cls._check_index(string_start + next_offset - 1, len(metadata)) return metadata[string_start + offset : (string_start + next_offset)].decode("utf-8") @@ -155,7 +187,7 @@ def _get_boolean(cls, value: bytes, pos: int) -> bool: if basic_type != VariantUtils.PRIMITIVE or ( type_info != VariantUtils.TRUE and type_info != VariantUtils.FALSE ): - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) return type_info == VariantUtils.TRUE @classmethod @@ -163,16 +195,45 @@ def _get_long(cls, value: bytes, pos: int) -> int: cls._check_index(pos, len(value)) basic_type, type_info = cls._get_type_info(value, pos) if basic_type != VariantUtils.PRIMITIVE: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) if type_info == VariantUtils.INT1: return cls._read_long(value, pos + 1, 1, signed=True) elif type_info == VariantUtils.INT2: return cls._read_long(value, pos + 1, 2, signed=True) - elif type_info == VariantUtils.INT4: + elif type_info == VariantUtils.INT4 or type_info == VariantUtils.DATE: return cls._read_long(value, pos + 1, 4, signed=True) elif type_info 
== VariantUtils.INT8: return cls._read_long(value, pos + 1, 8, signed=True) - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) + + @classmethod + def _get_date(cls, value: bytes, pos: int) -> datetime.date: + cls._check_index(pos, len(value)) + basic_type, type_info = cls._get_type_info(value, pos) + if basic_type != VariantUtils.PRIMITIVE: + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) + if type_info == VariantUtils.DATE: + days_since_epoch = cls._read_long(value, pos + 1, 4, signed=True) + return datetime.date.fromordinal(VariantUtils.EPOCH.toordinal() + days_since_epoch) + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) + + @classmethod + def _get_timestamp(cls, value: bytes, pos: int, zone_id: str) -> datetime.datetime: + cls._check_index(pos, len(value)) + basic_type, type_info = cls._get_type_info(value, pos) + if basic_type != VariantUtils.PRIMITIVE: + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) + if type_info == VariantUtils.TIMESTAMP_NTZ: + microseconds_since_epoch = cls._read_long(value, pos + 1, 8, signed=True) + return VariantUtils.EPOCH_NTZ + datetime.timedelta( + microseconds=microseconds_since_epoch + ) + if type_info == VariantUtils.TIMESTAMP: + microseconds_since_epoch = cls._read_long(value, pos + 1, 8, signed=True) + return ( + VariantUtils.EPOCH + datetime.timedelta(microseconds=microseconds_since_epoch) + ).astimezone(ZoneInfo(zone_id)) + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) @classmethod def _get_string(cls, value: bytes, pos: int) -> str: @@ -191,35 +252,64 @@ def _get_string(cls, value: bytes, pos: int) -> str: length = cls._read_long(value, pos + 1, VariantUtils.U32_SIZE, signed=False) cls._check_index(start + length - 1, len(value)) return value[start : start + length].decode("utf-8") - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) @classmethod def _get_double(cls, value: bytes, pos: int) -> float: cls._check_index(pos, len(value)) basic_type, type_info = cls._get_type_info(value, pos) - if basic_type != VariantUtils.PRIMITIVE or type_info != VariantUtils.DOUBLE: - raise PySparkValueError(error_class="MALFORMED_VARIANT") - return struct.unpack("d", value[pos + 1 : pos + 9])[0] + if basic_type != VariantUtils.PRIMITIVE: + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) + if type_info == VariantUtils.FLOAT: + cls._check_index(pos + 4, len(value)) + return struct.unpack(" None: + # max_unscaled == 10**max_scale, but we pass a literal parameter to avoid redundant + # computation. 
+ if unscaled >= max_unscaled or unscaled <= -max_unscaled or scale > max_scale: + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) @classmethod def _get_decimal(cls, value: bytes, pos: int) -> decimal.Decimal: cls._check_index(pos, len(value)) basic_type, type_info = cls._get_type_info(value, pos) if basic_type != VariantUtils.PRIMITIVE: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) scale = value[pos + 1] unscaled = 0 if type_info == VariantUtils.DECIMAL4: unscaled = cls._read_long(value, pos + 2, 4, signed=True) + cls._check_decimal(unscaled, scale, cls.MAX_DECIMAL4_VALUE, cls.MAX_DECIMAL4_PRECISION) elif type_info == VariantUtils.DECIMAL8: unscaled = cls._read_long(value, pos + 2, 8, signed=True) + cls._check_decimal(unscaled, scale, cls.MAX_DECIMAL8_VALUE, cls.MAX_DECIMAL8_PRECISION) elif type_info == VariantUtils.DECIMAL16: cls._check_index(pos + 17, len(value)) unscaled = int.from_bytes(value[pos + 2 : pos + 18], byteorder="little", signed=True) + cls._check_decimal( + unscaled, scale, cls.MAX_DECIMAL16_VALUE, cls.MAX_DECIMAL16_PRECISION + ) else: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) return decimal.Decimal(unscaled) * (decimal.Decimal(10) ** (-scale)) + @classmethod + def _get_binary(cls, value: bytes, pos: int) -> bytes: + cls._check_index(pos, len(value)) + basic_type, type_info = cls._get_type_info(value, pos) + if basic_type != VariantUtils.PRIMITIVE or type_info != VariantUtils.BINARY: + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) + start = pos + 1 + VariantUtils.U32_SIZE + length = cls._read_long(value, pos + 1, VariantUtils.U32_SIZE, signed=False) + cls._check_index(start + length - 1, len(value)) + return bytes(value[start : start + length]) + @classmethod def _get_type(cls, value: bytes, pos: int) -> Any: """ @@ -244,7 +334,7 @@ def _get_type(cls, value: bytes, pos: int) -> Any: or type_info == VariantUtils.INT8 ): return int - elif type_info == VariantUtils.DOUBLE: + elif type_info == VariantUtils.DOUBLE or type_info == VariantUtils.FLOAT: return float elif ( type_info == VariantUtils.DECIMAL4 @@ -252,18 +342,24 @@ def _get_type(cls, value: bytes, pos: int) -> Any: or type_info == VariantUtils.DECIMAL16 ): return decimal.Decimal + elif type_info == VariantUtils.BINARY: + return bytes + elif type_info == VariantUtils.DATE: + return datetime.date + elif type_info == VariantUtils.TIMESTAMP or type_info == VariantUtils.TIMESTAMP_NTZ: + return datetime.datetime elif type_info == VariantUtils.LONG_STR: return str - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) @classmethod - def _to_json(cls, value: bytes, metadata: bytes, pos: int) -> Any: + def _to_json(cls, value: bytes, metadata: bytes, pos: int, zone_id: str) -> str: variant_type = cls._get_type(value, pos) if variant_type == dict: def handle_object(key_value_pos_list: List[Tuple[str, int]]) -> str: key_value_list = [ - json.dumps(key) + ":" + cls._to_json(value, metadata, value_pos) + json.dumps(key) + ":" + cls._to_json(value, metadata, value_pos, zone_id) for (key, value_pos) in key_value_pos_list ] return "{" + ",".join(key_value_list) + "}" @@ -273,19 +369,25 @@ def handle_object(key_value_pos_list: List[Tuple[str, int]]) -> str: def handle_array(value_pos_list: 
List[int]) -> str: value_list = [ - cls._to_json(value, metadata, value_pos) for value_pos in value_pos_list + cls._to_json(value, metadata, value_pos, zone_id) + for value_pos in value_pos_list ] return "[" + ",".join(value_list) + "]" return cls._handle_array(value, pos, handle_array) else: - value = cls._get_scalar(variant_type, value, metadata, pos) + value = cls._get_scalar(variant_type, value, metadata, pos, zone_id) if value is None: return "null" if type(value) == bool: return "true" if value else "false" if type(value) == str: return json.dumps(value) + if type(value) == bytes: + # decoding simply converts byte array to string + return '"' + base64.b64encode(value).decode("utf-8") + '"' + if type(value) == datetime.date or type(value) == datetime.datetime: + return '"' + str(value) + '"' return str(value) @classmethod @@ -311,10 +413,12 @@ def handle_array(value_pos_list: List[int]) -> List[Any]: return cls._handle_array(value, pos, handle_array) else: - return cls._get_scalar(variant_type, value, metadata, pos) + return cls._get_scalar(variant_type, value, metadata, pos, zone_id="UTC") @classmethod - def _get_scalar(cls, variant_type: Any, value: bytes, metadata: bytes, pos: int) -> Any: + def _get_scalar( + cls, variant_type: Any, value: bytes, metadata: bytes, pos: int, zone_id: str + ) -> Any: if isinstance(None, variant_type): return None elif variant_type == bool: @@ -327,8 +431,14 @@ def _get_scalar(cls, variant_type: Any, value: bytes, metadata: bytes, pos: int) return cls._get_double(value, pos) elif variant_type == decimal.Decimal: return cls._get_decimal(value, pos) + elif variant_type == bytes: + return cls._get_binary(value, pos) + elif variant_type == datetime.date: + return cls._get_date(value, pos) + elif variant_type == datetime.datetime: + return cls._get_timestamp(value, pos, zone_id) else: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) @classmethod def _handle_object( @@ -341,7 +451,7 @@ def _handle_object( cls._check_index(pos, len(value)) basic_type, type_info = cls._get_type_info(value, pos) if basic_type != VariantUtils.OBJECT: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) large_size = ((type_info >> 4) & 0x1) != 0 size_bytes = VariantUtils.U32_SIZE if large_size else 1 num_fields = cls._read_long(value, pos + 1, size_bytes, signed=False) @@ -370,7 +480,7 @@ def _handle_array(cls, value: bytes, pos: int, func: Callable[[List[int]], Any]) cls._check_index(pos, len(value)) basic_type, type_info = cls._get_type_info(value, pos) if basic_type != VariantUtils.ARRAY: - raise PySparkValueError(error_class="MALFORMED_VARIANT") + raise PySparkValueError(error_class="MALFORMED_VARIANT", message_parameters={}) large_size = ((type_info >> 2) & 0x1) != 0 size_bytes = VariantUtils.U32_SIZE if large_size else 1 num_fields = cls._read_long(value, pos + 1, size_bytes, signed=False) diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py index 42d50dc1b3bdf..22c9f697acde3 100644 --- a/python/pyspark/sql/window.py +++ b/python/pyspark/sql/window.py @@ -14,14 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# + +# mypy: disable-error-code="empty-body" + import sys -from typing import cast, Iterable, List, Tuple, TYPE_CHECKING, Union +from typing import List, TYPE_CHECKING, Union -from pyspark.sql.column import _to_seq, _to_java_column -from pyspark.sql.utils import ( - try_remote_window, - try_remote_windowspec, - get_active_spark_context, +from pyspark.sql.utils import dispatch_window_method +from pyspark.util import ( + JVM_LONG_MIN, + JVM_LONG_MAX, ) if TYPE_CHECKING: @@ -31,13 +33,6 @@ __all__ = ["Window", "WindowSpec"] -def _to_java_cols(cols: Tuple[Union["ColumnOrName", List["ColumnOrName_"]], ...]) -> "JavaObject": - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] # type: ignore[assignment] - sc = get_active_spark_context() - return _to_seq(sc, cast(Iterable["ColumnOrName"], cols), _to_java_column) - - class Window: """ Utility functions for defining window in DataFrames. @@ -62,19 +57,17 @@ class Window: >>> window = Window.orderBy("date").partitionBy("country").rangeBetween(-3, 3) """ - _JAVA_MIN_LONG = -(1 << 63) # -9223372036854775808 - _JAVA_MAX_LONG = (1 << 63) - 1 # 9223372036854775807 - _PRECEDING_THRESHOLD = max(-sys.maxsize, _JAVA_MIN_LONG) - _FOLLOWING_THRESHOLD = min(sys.maxsize, _JAVA_MAX_LONG) + _PRECEDING_THRESHOLD = max(-sys.maxsize, JVM_LONG_MIN) + _FOLLOWING_THRESHOLD = min(sys.maxsize, JVM_LONG_MAX) - unboundedPreceding: int = _JAVA_MIN_LONG + unboundedPreceding: int = JVM_LONG_MIN - unboundedFollowing: int = _JAVA_MAX_LONG + unboundedFollowing: int = JVM_LONG_MAX currentRow: int = 0 @staticmethod - @try_remote_window + @dispatch_window_method def partitionBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "WindowSpec": """ Creates a :class:`WindowSpec` with the partitioning defined. @@ -124,16 +117,10 @@ def partitionBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "WindowS | 3| b| 3| +---+--------+----------+ """ - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.partitionBy( - _to_java_cols(cols) - ) - return WindowSpec(jspec) + ... @staticmethod - @try_remote_window + @dispatch_window_method def orderBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "WindowSpec": """ Creates a :class:`WindowSpec` with the ordering defined. @@ -183,16 +170,10 @@ def orderBy(*cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "WindowSpec" | 3| b| 1| +---+--------+----------+ """ - from py4j.java_gateway import JVMView - - sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.orderBy( - _to_java_cols(cols) - ) - return WindowSpec(jspec) + ... @staticmethod - @try_remote_window + @dispatch_window_method def rowsBetween(start: int, end: int) -> "WindowSpec": """ Creates a :class:`WindowSpec` with the frame boundaries defined, @@ -266,20 +247,10 @@ def rowsBetween(start: int, end: int) -> "WindowSpec": +---+--------+---+ """ - from py4j.java_gateway import JVMView - - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.rowsBetween( - start, end - ) - return WindowSpec(jspec) + ... 
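# --- Aside: a minimal, standalone sketch of the dispatch idea behind `dispatch_window_method`
# --- used just above. The public class keeps only stub methods (`...`) and a decorator forwards
# --- each call by name to a backend chosen at call time. The backend classes, the `use_remote`
# --- flag, and `ToyWindow` are illustrative assumptions, not Spark's actual classes.
import functools
from typing import Any, Callable

use_remote = False  # stand-in for pyspark.sql.utils.is_remote()


class _ClassicBackend:
    @staticmethod
    def partitionBy(*cols: str) -> str:
        return f"classic window partitioned by {cols}"


class _ConnectBackend:
    @staticmethod
    def partitionBy(*cols: str) -> str:
        return f"connect window partitioned by {cols}"


def dispatch(f: Callable[..., Any]) -> Callable[..., Any]:
    @functools.wraps(f)
    def wrapped(*args: Any, **kwargs: Any) -> Any:
        # Pick the backend at call time and forward by method name.
        backend = _ConnectBackend if use_remote else _ClassicBackend
        return getattr(backend, f.__name__)(*args, **kwargs)

    return wrapped


class ToyWindow:
    @staticmethod
    @dispatch
    def partitionBy(*cols: str) -> str:
        ...  # stub body; the selected backend does the real work


print(ToyWindow.partitionBy("country"))  # prints: classic window partitioned by ('country',)
# --- End aside.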
@staticmethod - @try_remote_window + @dispatch_window_method def rangeBetween(start: int, end: int) -> "WindowSpec": """ Creates a :class:`WindowSpec` with the frame boundaries defined, @@ -356,17 +327,7 @@ def rangeBetween(start: int, end: int) -> "WindowSpec": +---+--------+---+ """ - from py4j.java_gateway import JVMView - - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - sc = get_active_spark_context() - jspec = cast(JVMView, sc._jvm).org.apache.spark.sql.expressions.Window.rangeBetween( - start, end - ) - return WindowSpec(jspec) + ... class WindowSpec: @@ -382,10 +343,11 @@ class WindowSpec: Supports Spark Connect. """ - def __init__(self, jspec: "JavaObject") -> None: - self._jspec = jspec + def __new__(cls, jspec: "JavaObject") -> "WindowSpec": + from pyspark.sql.classic.WindowSpec import WindowSpec # type: ignore[import-not-found] + + return WindowSpec.__new__(WindowSpec, jspec) - @try_remote_windowspec def partitionBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "WindowSpec": """ Defines the partitioning columns in a :class:`WindowSpec`. @@ -397,9 +359,8 @@ def partitionBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "W cols : str, :class:`Column` or list names of columns or expressions """ - return WindowSpec(self._jspec.partitionBy(_to_java_cols(cols))) + ... - @try_remote_windowspec def orderBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "WindowSpec": """ Defines the ordering columns in a :class:`WindowSpec`. @@ -411,9 +372,8 @@ def orderBy(self, *cols: Union["ColumnOrName", List["ColumnOrName_"]]) -> "Windo cols : str, :class:`Column` or list names of columns or expressions """ - return WindowSpec(self._jspec.orderBy(_to_java_cols(cols))) + ... - @try_remote_windowspec def rowsBetween(self, start: int, end: int) -> "WindowSpec": """ Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive). @@ -439,13 +399,8 @@ def rowsBetween(self, start: int, end: int) -> "WindowSpec": The frame is unbounded if this is ``Window.unboundedFollowing``, or any value greater than or equal to min(sys.maxsize, 9223372036854775807). """ - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - return WindowSpec(self._jspec.rowsBetween(start, end)) + ... - @try_remote_windowspec def rangeBetween(self, start: int, end: int) -> "WindowSpec": """ Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive). @@ -471,29 +426,4 @@ def rangeBetween(self, start: int, end: int) -> "WindowSpec": The frame is unbounded if this is ``Window.unboundedFollowing``, or any value greater than or equal to min(sys.maxsize, 9223372036854775807). 
""" - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - return WindowSpec(self._jspec.rangeBetween(start, end)) - - -def _test() -> None: - import doctest - from pyspark.sql import SparkSession - import pyspark.sql.window - - globs = pyspark.sql.window.__dict__.copy() - spark = SparkSession.builder.master("local[4]").appName("sql.window tests").getOrCreate() - globs["spark"] = spark - - (failure_count, test_count) = doctest.testmod( - pyspark.sql.window, globs=globs, optionflags=doctest.NORMALIZE_WHITESPACE - ) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() + ... diff --git a/python/pyspark/sql/worker/analyze_udtf.py b/python/pyspark/sql/worker/analyze_udtf.py index d0a24363c0c1e..7dafb87c42211 100644 --- a/python/pyspark/sql/worker/analyze_udtf.py +++ b/python/pyspark/sql/worker/analyze_udtf.py @@ -264,4 +264,7 @@ def invalid_analyze_result_field(field_name: str, expected_field: str) -> PySpar java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + # TODO: Remove the following two lines and use `Process.pid()` when we drop JDK 8. + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/worker/commit_data_source_write.py b/python/pyspark/sql/worker/commit_data_source_write.py index 530f18ef8288d..1d9e53083d4d9 100644 --- a/python/pyspark/sql/worker/commit_data_source_write.py +++ b/python/pyspark/sql/worker/commit_data_source_write.py @@ -60,14 +60,7 @@ def main(infile: IO, outfile: IO) -> None: # Receive the data source writer instance. writer = pickleSer._read_with_length(infile) - if not isinstance(writer, DataSourceWriter): - raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", - message_parameters={ - "expected": "an instance of DataSourceWriter", - "actual": f"'{type(writer).__name__}'", - }, - ) + assert isinstance(writer, DataSourceWriter) # Receive the commit messages. num_messages = read_int(infile) @@ -76,7 +69,7 @@ def main(infile: IO, outfile: IO) -> None: message = pickleSer._read_with_length(infile) if message is not None and not isinstance(message, WriterCommitMessage): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "an instance of WriterCommitMessage", "actual": f"'{type(message).__name__}'", @@ -90,9 +83,9 @@ def main(infile: IO, outfile: IO) -> None: # Commit or abort the Python data source write. # Note the commit messages can be None if there are failed tasks. if abort: - writer.abort(commit_messages) # type: ignore[arg-type] + writer.abort(commit_messages) else: - writer.commit(commit_messages) # type: ignore[arg-type] + writer.commit(commit_messages) # Send a status code back to JVM. 
write_int(0, outfile) @@ -117,4 +110,6 @@ def main(infile: IO, outfile: IO) -> None: java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/worker/create_data_source.py b/python/pyspark/sql/worker/create_data_source.py index 1f11b65f44c7e..d6b59b04393d8 100644 --- a/python/pyspark/sql/worker/create_data_source.py +++ b/python/pyspark/sql/worker/create_data_source.py @@ -75,7 +75,7 @@ def main(infile: IO, outfile: IO) -> None: data_source_cls = read_command(pickleSer, infile) if not (isinstance(data_source_cls, type) and issubclass(data_source_cls, DataSource)): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "a subclass of DataSource", "actual": f"'{type(data_source_cls).__name__}'", @@ -85,7 +85,7 @@ def main(infile: IO, outfile: IO) -> None: # Check the name method is a class method. if not inspect.ismethod(data_source_cls.name): raise PySparkTypeError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "'name()' method to be a classmethod", "actual": f"'{type(data_source_cls.name).__name__}'", @@ -98,7 +98,7 @@ def main(infile: IO, outfile: IO) -> None: # Check if the provider name matches the data source's name. if provider.lower() != data_source_cls.name().lower(): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": f"provider with name {data_source_cls.name()}", "actual": f"'{provider}'", @@ -111,7 +111,7 @@ def main(infile: IO, outfile: IO) -> None: user_specified_schema = _parse_datatype_json_string(utf8_deserializer.loads(infile)) if not isinstance(user_specified_schema, StructType): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "the user-defined schema to be a 'StructType'", "actual": f"'{type(data_source_cls).__name__}'", @@ -187,4 +187,6 @@ def main(infile: IO, outfile: IO) -> None: java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/worker/lookup_data_sources.py b/python/pyspark/sql/worker/lookup_data_sources.py index 7f0127b719463..6da9d5925f636 100644 --- a/python/pyspark/sql/worker/lookup_data_sources.py +++ b/python/pyspark/sql/worker/lookup_data_sources.py @@ -95,4 +95,6 @@ def main(infile: IO, outfile: IO) -> None: java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/worker/plan_data_source_read.py b/python/pyspark/sql/worker/plan_data_source_read.py index 6c0d48caefeb8..51a90bba14547 100644 --- a/python/pyspark/sql/worker/plan_data_source_read.py +++ b/python/pyspark/sql/worker/plan_data_source_read.py @@ -18,8 +18,9 @@ import os import sys import functools +import pyarrow as 
pa from itertools import islice -from typing import IO, List, Iterator, Iterable +from typing import IO, List, Iterator, Iterable, Tuple, Union from pyspark.accumulators import _accumulatorRegistry from pyspark.errors import PySparkAssertionError, PySparkRuntimeError @@ -31,7 +32,13 @@ ) from pyspark.sql import Row from pyspark.sql.connect.conversion import ArrowTableToRowsConversion, LocalDataToArrowConversion -from pyspark.sql.datasource import DataSource, InputPartition +from pyspark.sql.datasource import ( + DataSource, + DataSourceReader, + DataSourceStreamReader, + InputPartition, +) +from pyspark.sql.datasource_internal import _streamReader from pyspark.sql.pandas.types import to_arrow_schema from pyspark.sql.types import ( _parse_datatype_json_string, @@ -51,6 +58,78 @@ ) +def records_to_arrow_batches( + output_iter: Iterator[Tuple], + max_arrow_batch_size: int, + return_type: StructType, + data_source: DataSource, +) -> Iterable[pa.RecordBatch]: + """ + Convert an iterator of Python tuples to an iterator of pyarrow record batches. + + For each python tuple, check the types of each field and append it to the records batch. + + """ + + def batched(iterator: Iterator, n: int) -> Iterator: + return iter(functools.partial(lambda it: list(islice(it, n)), iterator), []) + + pa_schema = to_arrow_schema(return_type) + column_names = return_type.fieldNames() + column_converters = [ + LocalDataToArrowConversion._create_converter(field.dataType) for field in return_type.fields + ] + # Convert the results from the `reader.read` method to an iterator of arrow batches. + num_cols = len(column_names) + col_mapping = {name: i for i, name in enumerate(column_names)} + col_name_set = set(column_names) + for batch in batched(output_iter, max_arrow_batch_size): + pylist: List[List] = [[] for _ in range(num_cols)] + for result in batch: + # Validate the output row schema. + if hasattr(result, "__len__") and len(result) != num_cols: + raise PySparkRuntimeError( + error_class="DATA_SOURCE_RETURN_SCHEMA_MISMATCH", + message_parameters={ + "expected": str(num_cols), + "actual": str(len(result)), + }, + ) + + # Validate the output row type. + if not isinstance(result, (list, tuple)): + raise PySparkRuntimeError( + error_class="DATA_SOURCE_INVALID_RETURN_TYPE", + message_parameters={ + "type": type(result).__name__, + "name": data_source.name(), + "supported_types": "tuple, list, `pyspark.sql.types.Row`", + }, + ) + + # Assign output values by name of the field, not position, if the result is a + # named `Row` object. + if isinstance(result, Row) and hasattr(result, "__fields__"): + # Check if the names are the same as the schema. + if set(result.__fields__) != col_name_set: + raise PySparkRuntimeError( + error_class="DATA_SOURCE_RETURN_SCHEMA_MISMATCH", + message_parameters={ + "expected": str(column_names), + "actual": str(result.__fields__), + }, + ) + # Assign the values by name. + for name in column_names: + idx = col_mapping[name] + pylist[idx].append(column_converters[idx](result[name])) + else: + for col in range(num_cols): + pylist[col].append(column_converters[col](result[col])) + batch = pa.RecordBatch.from_arrays(pylist, schema=pa_schema) + yield batch + + def main(infile: IO, outfile: IO) -> None: """ Main method for planning a data source read. 
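# --- Aside: a standalone sketch of the chunk-and-convert idea implemented by
# --- `records_to_arrow_batches` above: slice an iterator of Python tuples into fixed-size
# --- chunks and emit one pyarrow RecordBatch per chunk. The two-column layout used here
# --- (id: int64, name: string) is an assumption for illustration only; the real helper derives
# --- column names, types, and converters from the declared return schema and validates rows.
from itertools import islice
from typing import Iterable, Iterator, Tuple

import pyarrow as pa


def tuples_to_record_batches(
    rows: Iterator[Tuple[int, str]], batch_size: int
) -> Iterable[pa.RecordBatch]:
    while True:
        # Take at most `batch_size` rows from the iterator.
        chunk = list(islice(rows, batch_size))
        if not chunk:
            return
        # Build one Arrow array per column, then assemble a RecordBatch.
        ids = pa.array([r[0] for r in chunk], type=pa.int64())
        names = pa.array([r[1] for r in chunk], type=pa.string())
        yield pa.RecordBatch.from_arrays([ids, names], names=["id", "name"])


# Example: three rows with batch_size=2 yield batches of 2 and 1 rows.
for batch in tuples_to_record_batches(iter([(1, "a"), (2, "b"), (3, "c")]), 2):
    print(batch.num_rows)
# --- End aside.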
@@ -113,7 +192,7 @@ def main(infile: IO, outfile: IO) -> None: schema = _parse_datatype_json_string(schema_json) if not isinstance(schema, StructType): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "an output schema of type 'StructType'", "actual": f"'{type(schema).__name__}'", @@ -130,26 +209,27 @@ def main(infile: IO, outfile: IO) -> None: is_streaming = read_bool(infile) # Instantiate data source reader. - reader = ( - data_source.streamReader(schema=schema) - if is_streaming - else data_source.reader(schema=schema) - ) - - # Wrap the data source read logic in an mapInArrow UDF. - import pyarrow as pa + if is_streaming: + reader: Union[DataSourceReader, DataSourceStreamReader] = _streamReader( + data_source, schema + ) + else: + reader = data_source.reader(schema=schema) + # Validate the reader. + if not isinstance(reader, DataSourceReader): + raise PySparkAssertionError( + error_class="DATA_SOURCE_TYPE_MISMATCH", + message_parameters={ + "expected": "an instance of DataSourceReader", + "actual": f"'{type(reader).__name__}'", + }, + ) # Create input converter. converter = ArrowTableToRowsConversion._create_converter(BinaryType()) # Create output converter. return_type = schema - pa_schema = to_arrow_schema(return_type) - column_names = return_type.fieldNames() - column_converters = [ - LocalDataToArrowConversion._create_converter(field.dataType) - for field in return_type.fields - ] def data_source_read_func(iterator: Iterable[pa.RecordBatch]) -> Iterable[pa.RecordBatch]: partition_bytes = None @@ -176,7 +256,7 @@ def data_source_read_func(iterator: Iterable[pa.RecordBatch]) -> Iterable[pa.Rec f"but found '{type(partition).__name__}'." ) - output_iter = reader.read(partition) # type: ignore[attr-defined] + output_iter = reader.read(partition) # type: ignore[arg-type] # Validate the output iterator. if not isinstance(output_iter, Iterator): @@ -189,58 +269,9 @@ def data_source_read_func(iterator: Iterable[pa.RecordBatch]) -> Iterable[pa.Rec }, ) - def batched(iterator: Iterator, n: int) -> Iterator: - return iter(functools.partial(lambda it: list(islice(it, n)), iterator), []) - - # Convert the results from the `reader.read` method to an iterator of arrow batches. - num_cols = len(column_names) - col_mapping = {name: i for i, name in enumerate(column_names)} - col_name_set = set(column_names) - for batch in batched(output_iter, max_arrow_batch_size): - pylist: List[List] = [[] for _ in range(num_cols)] - for result in batch: - # Validate the output row schema. - if hasattr(result, "__len__") and len(result) != num_cols: - raise PySparkRuntimeError( - error_class="DATA_SOURCE_RETURN_SCHEMA_MISMATCH", - message_parameters={ - "expected": str(num_cols), - "actual": str(len(result)), - }, - ) - - # Validate the output row type. - if not isinstance(result, (list, tuple)): - raise PySparkRuntimeError( - error_class="DATA_SOURCE_INVALID_RETURN_TYPE", - message_parameters={ - "type": type(result).__name__, - "name": data_source.name(), - "supported_types": "tuple, list, `pyspark.sql.types.Row`", - }, - ) - - # Assign output values by name of the field, not position, if the result is a - # named `Row` object. - if isinstance(result, Row) and hasattr(result, "__fields__"): - # Check if the names are the same as the schema. 
- if set(result.__fields__) != col_name_set: - raise PySparkRuntimeError( - error_class="PYTHON_DATA_SOURCE_READ_RETURN_SCHEMA_MISMATCH", - message_parameters={ - "expected": str(column_names), - "actual": str(result.__fields__), - }, - ) - # Assign the values by name. - for name in column_names: - idx = col_mapping[name] - pylist[idx].append(column_converters[idx](result[name])) - else: - for col in range(num_cols): - pylist[col].append(column_converters[col](result[col])) - - yield pa.RecordBatch.from_arrays(pylist, schema=pa_schema) + return records_to_arrow_batches( + output_iter, max_arrow_batch_size, return_type, data_source + ) command = (data_source_read_func, return_type) pickleSer._write_with_length(command, outfile) @@ -248,7 +279,7 @@ def batched(iterator: Iterator, n: int) -> Iterator: if not is_streaming: # The partitioning of python batch source read is determined before query execution. try: - partitions = reader.partitions() # type: ignore[attr-defined] + partitions = reader.partitions() # type: ignore[call-arg] if not isinstance(partitions, list): raise PySparkRuntimeError( error_class="DATA_SOURCE_TYPE_MISMATCH", @@ -267,9 +298,9 @@ def batched(iterator: Iterator, n: int) -> Iterator: }, ) if len(partitions) == 0: - partitions = [None] + partitions = [None] # type: ignore[list-item] except NotImplementedError: - partitions = [None] + partitions = [None] # type: ignore[list-item] # Return the serialized partition values. write_int(len(partitions), outfile) @@ -299,4 +330,6 @@ def batched(iterator: Iterator, n: int) -> Iterator: java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/worker/python_streaming_sink_runner.py b/python/pyspark/sql/worker/python_streaming_sink_runner.py index ba0a8037de602..7d03157d705d6 100644 --- a/python/pyspark/sql/worker/python_streaming_sink_runner.py +++ b/python/pyspark/sql/worker/python_streaming_sink_runner.py @@ -21,7 +21,6 @@ from pyspark.accumulators import _accumulatorRegistry from pyspark.errors import PySparkAssertionError, PySparkRuntimeError -from pyspark.util import local_connect_and_auth from pyspark.serializers import ( read_bool, read_int, @@ -34,12 +33,13 @@ _parse_datatype_json_string, StructType, ) -from pyspark.util import handle_worker_exception +from pyspark.util import handle_worker_exception, local_connect_and_auth from pyspark.worker_util import ( check_python_version, read_command, pickleSer, send_accumulator_updates, + setup_broadcasts, setup_memory_limits, setup_spark_files, utf8_deserializer, @@ -47,9 +47,18 @@ def main(infile: IO, outfile: IO) -> None: + """ + Main method for committing or aborting a data source streaming write operation. + + This process is invoked from the `PythonStreamingSinkCommitRunner.runInPython` + method in the StreamingWrite implementation of the PythonDataSourceV2. It is + responsible for invoking either the `commit` or the `abort` method on a data source + writer instance, given a list of commit messages. 
+ """ try: check_python_version(infile) setup_spark_files(infile) + setup_broadcasts(infile) memory_limit_mb = int(os.environ.get("PYSPARK_PLANNER_MEMORY_MB", "-1")) setup_memory_limits(memory_limit_mb) @@ -61,7 +70,7 @@ def main(infile: IO, outfile: IO) -> None: if not isinstance(data_source, DataSource): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "a Python data source instance of type 'DataSource'", "actual": f"'{type(data_source).__name__}'", @@ -72,7 +81,7 @@ def main(infile: IO, outfile: IO) -> None: schema = _parse_datatype_json_string(schema_json) if not isinstance(schema, StructType): raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", + error_class="DATA_SOURCE_TYPE_MISMATCH", message_parameters={ "expected": "an output schema of type 'StructType'", "actual": f"'{type(schema).__name__}'", @@ -82,36 +91,36 @@ def main(infile: IO, outfile: IO) -> None: overwrite = read_bool(infile) # Instantiate data source reader. try: + # Create the data source writer instance. writer = data_source.streamWriter(schema=schema, overwrite=overwrite) - # Initialization succeed. + + # Receive the commit messages. + num_messages = read_int(infile) + commit_messages = [] + for _ in range(num_messages): + message = pickleSer._read_with_length(infile) + if message is not None and not isinstance(message, WriterCommitMessage): + raise PySparkAssertionError( + error_class="DATA_SOURCE_TYPE_MISMATCH", + message_parameters={ + "expected": "an instance of WriterCommitMessage", + "actual": f"'{type(message).__name__}'", + }, + ) + commit_messages.append(message) + + batch_id = read_long(infile) + abort = read_bool(infile) + + # Commit or abort the Python data source write. + # Note the commit messages can be None if there are failed tasks. + if abort: + writer.abort(commit_messages, batch_id) + else: + writer.commit(commit_messages, batch_id) + # Send a status code back to JVM. write_int(0, outfile) outfile.flush() - - # handle method call from socket - while True: - num_messages = read_int(infile) - commit_messages = [] - for _ in range(num_messages): - message = pickleSer._read_with_length(infile) - if message is not None and not isinstance(message, WriterCommitMessage): - raise PySparkAssertionError( - error_class="PYTHON_DATA_SOURCE_TYPE_MISMATCH", - message_parameters={ - "expected": "an instance of WriterCommitMessage", - "actual": f"'{type(message).__name__}'", - }, - ) - commit_messages.append(message) - batch_id = read_long(infile) - abort = read_bool(infile) - # Commit or abort the Python data source write. - # Note the commit messages can be None if there are failed tasks. 
- if abort: - writer.abort(commit_messages, batch_id) # type: ignore[arg-type] - else: - writer.commit(commit_messages, batch_id) # type: ignore[arg-type] - write_int(0, outfile) - outfile.flush() except Exception as e: error_msg = "data source {} throw exception: {}".format(data_source.name, e) raise PySparkRuntimeError( @@ -137,4 +146,6 @@ def main(infile: IO, outfile: IO) -> None: java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/sql/worker/write_into_data_source.py b/python/pyspark/sql/worker/write_into_data_source.py index ad8717cb33b5c..212a2754ec9f0 100644 --- a/python/pyspark/sql/worker/write_into_data_source.py +++ b/python/pyspark/sql/worker/write_into_data_source.py @@ -29,7 +29,12 @@ SpecialLengths, ) from pyspark.sql import Row -from pyspark.sql.datasource import DataSource, WriterCommitMessage, CaseInsensitiveDict +from pyspark.sql.datasource import ( + DataSource, + DataSourceWriter, + WriterCommitMessage, + CaseInsensitiveDict, +) from pyspark.sql.types import ( _parse_datatype_json_string, StructType, @@ -162,6 +167,14 @@ def main(infile: IO, outfile: IO) -> None: else: # Instantiate the data source writer. writer = data_source.writer(schema, overwrite) # type: ignore[assignment] + if not isinstance(writer, DataSourceWriter): + raise PySparkAssertionError( + error_class="DATA_SOURCE_TYPE_MISMATCH", + message_parameters={ + "expected": "an instance of DataSourceWriter", + "actual": f"'{type(writer).__name__}'", + }, + ) # Create a function that can be used in mapInArrow. import pyarrow as pa @@ -229,4 +242,6 @@ def batch_to_rows() -> Iterator[Row]: java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/pyspark/testing/connectutils.py b/python/pyspark/testing/connectutils.py index 191505741eb40..2f18cd8a6ccdc 100644 --- a/python/pyspark/testing/connectutils.py +++ b/python/pyspark/testing/connectutils.py @@ -45,6 +45,13 @@ googleapis_common_protos_requirement_message = str(e) have_googleapis_common_protos = googleapis_common_protos_requirement_message is None +graphviz_requirement_message = None +try: + import graphviz +except ImportError as e: + graphviz_requirement_message = str(e) +have_graphviz: bool = graphviz_requirement_message is None + from pyspark import Row, SparkConf from pyspark.util import is_remote_only from pyspark.testing.utils import PySparkErrorTestUtils @@ -170,6 +177,8 @@ def conf(cls): conf = SparkConf(loadDefaults=False) # Make the server terminate reattachable streams every 1 second and 123 bytes, # to make the tests exercise reattach. 
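Note on the connectutils change above: it follows the existing optional-dependency pattern — import once at module load, remember the error message, expose a have_graphviz flag. A dependent test can then skip itself with a precise reason; the test class and its body below are hypothetical:

import unittest

from pyspark.testing.connectutils import graphviz_requirement_message, have_graphviz


class HypotheticalPlanGraphTests(unittest.TestCase):
    @unittest.skipIf(not have_graphviz, graphviz_requirement_message)
    def test_render_plan_graph(self):
        import graphviz  # guaranteed importable past the skipIf guard

        dot = graphviz.Digraph(comment="query plan")
        dot.node("scan", "Scan parquet")
        dot.node("filter", "Filter")
        dot.edge("scan", "filter")
        self.assertIn("scan -> filter", dot.source)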
+ if conf._jconf is not None: + conf._jconf.remove("spark.master") conf.set("spark.connect.execute.reattachable.senderMaxStreamDuration", "1s") conf.set("spark.connect.execute.reattachable.senderMaxStreamSize", "123") return conf diff --git a/python/pyspark/testing/mlutils.py b/python/pyspark/testing/mlutils.py index 8981e97ea49ba..aa3e23bccb198 100644 --- a/python/pyspark/testing/mlutils.py +++ b/python/pyspark/testing/mlutils.py @@ -99,6 +99,11 @@ def tearDownClass(cls): class MockDataset(DataFrame): + def __new__(cls) -> "DataFrame": + self = object.__new__(cls) + self.__init__() + return self + def __init__(self): self.index = 0 diff --git a/python/pyspark/testing/sqlutils.py b/python/pyspark/testing/sqlutils.py index 690d5c37b22e4..9f07c44c084cf 100644 --- a/python/pyspark/testing/sqlutils.py +++ b/python/pyspark/testing/sqlutils.py @@ -247,6 +247,29 @@ def function(self, *functions): for f in functions: self.spark.sql("DROP FUNCTION IF EXISTS %s" % f) + @contextmanager + def temp_env(self, pairs): + assert isinstance(pairs, dict), "pairs should be a dictionary." + + keys = pairs.keys() + new_values = pairs.values() + old_values = [os.environ.get(key, None) for key in keys] + for key, new_value in zip(keys, new_values): + if new_value is None: + if key in os.environ: + del os.environ[key] + else: + os.environ[key] = new_value + try: + yield + finally: + for key, old_value in zip(keys, old_values): + if old_value is None: + if key in os.environ: + del os.environ[key] + else: + os.environ[key] = old_value + @staticmethod def assert_close(a, b): c = [j[0] for j in b] @@ -258,6 +281,7 @@ class ReusedSQLTestCase(ReusedPySparkTestCase, SQLTestUtils, PySparkErrorTestUti @classmethod def setUpClass(cls): super(ReusedSQLTestCase, cls).setUpClass() + cls._legacy_sc = cls.sc cls.spark = SparkSession(cls.sc) cls.tempdir = tempfile.NamedTemporaryFile(delete=False) os.unlink(cls.tempdir.name) diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py index fe25136864eef..c74291524daed 100644 --- a/python/pyspark/testing/utils.py +++ b/python/pyspark/testing/utils.py @@ -38,7 +38,7 @@ have_scipy = False have_numpy = False try: - import scipy.sparse # noqa: F401 + import scipy # noqa: F401 have_scipy = True except ImportError: @@ -287,7 +287,7 @@ def check_error( error_class: str, message_parameters: Optional[Dict[str, str]] = None, query_context_type: Optional[QueryContextType] = None, - pyspark_fragment: Optional[str] = None, + fragment: Optional[str] = None, ): query_context = exception.getQueryContext() assert bool(query_context) == (query_context_type is not None), ( @@ -326,10 +326,10 @@ def check_error( ) if actual == QueryContextType.DataFrame: assert ( - pyspark_fragment is not None - ), "`pyspark_fragment` is required when QueryContextType is DataFrame." - expected = pyspark_fragment - actual = actual_context.pysparkFragment() + fragment is not None + ), "`fragment` is required when QueryContextType is DataFrame." 
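Note on the new temp_env helper in sqlutils above: it scopes environment-variable overrides to a with block and restores (or removes) each key afterwards, with a None value meaning "unset this key". A small usage sketch in a ReusedSQLTestCase-based test; the variable name is illustrative:

import os

from pyspark.testing.sqlutils import ReusedSQLTestCase


class HypotheticalEnvTests(ReusedSQLTestCase):
    def test_env_is_scoped(self):
        with self.temp_env({"HYPOTHETICAL_FLAG": "1", "TZ": None}):
            # Overrides are visible only inside the block.
            self.assertEqual(os.environ.get("HYPOTHETICAL_FLAG"), "1")
            self.assertNotIn("TZ", os.environ)
        # On exit the previous values (or their absence) are restored.
        self.assertIsNone(os.environ.get("HYPOTHETICAL_FLAG"))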
+ expected = fragment + actual = actual_context.fragment() self.assertEqual( expected, actual, @@ -829,12 +829,7 @@ def assertDataFrameEqual( actual, expected, almost=True, rtol=rtol, atol=atol, check_row_order=checkRowOrder ) - from pyspark.sql.utils import get_dataframe_class - - # if is_remote(), allow Connect DataFrame - SparkDataFrame = get_dataframe_class() - - if not isinstance(actual, (DataFrame, SparkDataFrame, list)): + if not isinstance(actual, (DataFrame, list)): raise PySparkAssertionError( error_class="INVALID_TYPE_DF_EQUALITY_ARG", message_parameters={ @@ -843,7 +838,7 @@ def assertDataFrameEqual( "actual_type": type(actual), }, ) - elif not isinstance(expected, (DataFrame, SparkDataFrame, list)): + elif not isinstance(expected, (DataFrame, list)): raise PySparkAssertionError( error_class="INVALID_TYPE_DF_EQUALITY_ARG", message_parameters={ diff --git a/python/pyspark/util.py b/python/pyspark/util.py index bf1cf5b595533..49766913e6ee2 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -71,6 +71,16 @@ from pyspark.sql import SparkSession +JVM_BYTE_MIN: int = -(1 << 7) +JVM_BYTE_MAX: int = (1 << 7) - 1 +JVM_SHORT_MIN: int = -(1 << 15) +JVM_SHORT_MAX: int = (1 << 15) - 1 +JVM_INT_MIN: int = -(1 << 31) +JVM_INT_MAX: int = (1 << 31) - 1 +JVM_LONG_MIN: int = -(1 << 63) +JVM_LONG_MAX: int = (1 << 63) - 1 + + def print_exec(stream: TextIO) -> None: ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, stream) @@ -107,6 +117,22 @@ def majorMinorVersion(sparkVersion: str) -> Tuple[int, int]: ) +class LogUtils: + """ + Utils for querying structured Spark logs with Spark SQL. + """ + + LOG_SCHEMA = ( + "ts TIMESTAMP, " + "level STRING, " + "msg STRING, " + "context map, " + "exception STRUCT>>," + "logger STRING" + ) + + def fail_on_stopiteration(f: Callable) -> Callable: """ Wraps the input function to fail on 'StopIteration' by raising a 'RuntimeError' @@ -747,6 +773,9 @@ def is_remote_only() -> bool: """ global _is_remote_only + if "SPARK_SKIP_CONNECT_COMPAT_TESTS" in os.environ: + return True + if _is_remote_only is not None: return _is_remote_only try: diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 41f6c35bc4452..e9c259e68a27a 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -1868,4 +1868,7 @@ def process(): java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] (sock_file, _) = local_connect_and_auth(java_port, auth_secret) + # TODO: Remove the following two lines and use `Process.pid()` when we drop JDK 8. + write_int(os.getpid(), sock_file) + sock_file.flush() main(sock_file, sock_file) diff --git a/python/run-tests b/python/run-tests index 401fcae3e350c..b5492a080d6a5 100755 --- a/python/run-tests +++ b/python/run-tests @@ -21,9 +21,9 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 8, 0))') +PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 9, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then - echo "Python versions prior to 3.8 are not supported." + echo "Python versions prior to 3.9 are not supported." exit -1 fi diff --git a/python/run-tests.py b/python/run-tests.py index ebdd4a9a21798..64ac48e210db4 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -62,13 +62,15 @@ def get_valid_filename(s): # Find out where the assembly jars are located. 
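Note on the pyspark.util additions above: LogUtils.LOG_SCHEMA is a DDL string for loading Spark's structured JSON logs back into a DataFrame (the angle-bracketed type parameters of the context map and the nested exception struct do not survive in the text above, so the sketch below relies only on the flat columns). The log path is made up:

from pyspark.sql import SparkSession
from pyspark.util import LogUtils

spark = SparkSession.builder.getOrCreate()

# Parse a structured (JSON-lines) Spark log with the documented schema.
logs = spark.read.schema(LogUtils.LOG_SCHEMA).json("/tmp/spark-logs/driver.log")

logs.filter(logs.level == "ERROR").select("ts", "logger", "msg").show(truncate=False)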
# TODO: revisit for Scala 2.13 -for scala in ["2.13"]: - build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) - if os.path.isdir(build_dir): - SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") - break -else: - raise RuntimeError("Cannot find assembly build directory, please build Spark first.") +SPARK_DIST_CLASSPATH = "" +if "SPARK_SKIP_CONNECT_COMPAT_TESTS" not in os.environ: + for scala in ["2.13"]: + build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) + if os.path.isdir(build_dir): + SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") + break + else: + raise RuntimeError("Cannot find assembly build directory, please build Spark first.") def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_output): @@ -100,6 +102,8 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_ if "SPARK_CONNECT_TESTING_REMOTE" in os.environ: env.update({"SPARK_CONNECT_TESTING_REMOTE": os.environ["SPARK_CONNECT_TESTING_REMOTE"]}) + if "SPARK_SKIP_CONNECT_COMPAT_TESTS" in os.environ: + env.update({"SPARK_SKIP_JVM_REQUIRED_TESTS": os.environ["SPARK_SKIP_CONNECT_COMPAT_TESTS"]}) # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is # recognized by the tempfile module to override the default system temp directory. diff --git a/python/test_support/sql/streaming/time/text-with-time-test.txt b/python/test_support/sql/streaming/time/text-with-time-test.txt new file mode 100644 index 0000000000000..cf9edcafe5a49 --- /dev/null +++ b/python/test_support/sql/streaming/time/text-with-time-test.txt @@ -0,0 +1,10 @@ +2024-05-24 15:03:20;1 +2024-05-24 15:03:21;2 +2024-05-24 15:03:24;3 +2024-05-24 15:03:25;3 +2024-05-24 15:03:31;4 +2024-05-24 15:03:31;1 +2024-05-24 15:03:32;3 +2024-05-24 15:03:45;2 +2024-05-24 15:03:46;5 +2024-05-24 15:03:50;6 \ No newline at end of file diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 385734c557a38..ead3188aa6494 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -16,7 +16,16 @@ */ package org.apache.spark.deploy.k8s -private[spark] object Constants { +import org.apache.spark.annotation.{DeveloperApi, Stable} + +/** + * :: DeveloperApi :: + * + * This is used in both K8s module and Spark K8s Operator. 
+ */ +@Stable +@DeveloperApi +object Constants { // Labels val SPARK_VERSION_LABEL = "spark-version" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala index 9fdd9518d2d81..deb178eb90e17 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala @@ -22,14 +22,15 @@ import io.fabric8.kubernetes.api.model.{LocalObjectReference, LocalObjectReferen import org.apache.commons.lang3.StringUtils import org.apache.spark.{SPARK_VERSION, SparkConf} +import org.apache.spark.annotation.{DeveloperApi, Since, Unstable} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.features.DriverServiceFeatureStep._ import org.apache.spark.deploy.k8s.submit._ -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CONFIG, EXECUTOR_ENV_REGEX} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.ConfigEntry import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -import org.apache.spark.util.Utils +import org.apache.spark.util.{Clock, SystemClock, Utils} /** * Structure containing metadata for Kubernetes logic to build Spark pods. @@ -77,18 +78,43 @@ private[spark] abstract class KubernetesConf(val sparkConf: SparkConf) { def getOption(key: String): Option[String] = sparkConf.getOption(key) } -private[spark] class KubernetesDriverConf( +/** + * :: DeveloperApi :: + * + * Used for K8s operations internally and Spark K8s operator. + */ +@Unstable +@DeveloperApi +@Since("4.0.0") +class KubernetesDriverConf( sparkConf: SparkConf, val appId: String, val mainAppResource: MainAppResource, val mainClass: String, val appArgs: Array[String], - val proxyUser: Option[String]) - extends KubernetesConf(sparkConf) { + val proxyUser: Option[String], + clock: Clock = new SystemClock()) + extends KubernetesConf(sparkConf) with Logging { def driverNodeSelector: Map[String, String] = KubernetesUtils.parsePrefixedKeyValuePairs(sparkConf, KUBERNETES_DRIVER_NODE_SELECTOR_PREFIX) + lazy val driverServiceName: String = { + val preferredServiceName = s"$resourceNamePrefix$DRIVER_SVC_POSTFIX" + if (preferredServiceName.length <= MAX_SERVICE_NAME_LENGTH) { + preferredServiceName + } else { + val randomServiceId = KubernetesUtils.uniqueID(clock) + val shorterServiceName = s"spark-$randomServiceId$DRIVER_SVC_POSTFIX" + logWarning(log"Driver's hostname would preferably be " + + log"${MDC(LogKeys.PREFERRED_SERVICE_NAME, preferredServiceName)}, but this is too long " + + log"(must be <= ${MDC(LogKeys.MAX_SERVICE_NAME_LENGTH, MAX_SERVICE_NAME_LENGTH)} " + + log"characters). 
Falling back to use " + + log"${MDC(LogKeys.SHORTER_SERVICE_NAME, shorterServiceName)} as the driver service's name.") + shorterServiceName + } + } + override val resourceNamePrefix: String = { val custom = if (Utils.isTesting) get(KUBERNETES_DRIVER_POD_NAME_PREFIX) else None custom.getOrElse(KubernetesConf.getResourceNamePrefix(appName)) @@ -100,8 +126,9 @@ private[spark] class KubernetesDriverConf( SPARK_APP_ID_LABEL -> appId, SPARK_APP_NAME_LABEL -> KubernetesConf.getAppNameLabel(appName), SPARK_ROLE_LABEL -> SPARK_POD_DRIVER_ROLE) - val driverCustomLabels = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, KUBERNETES_DRIVER_LABEL_PREFIX) + val driverCustomLabels = + KubernetesUtils.parsePrefixedKeyValuePairs(sparkConf, KUBERNETES_DRIVER_LABEL_PREFIX) + .map { case(k, v) => (k, Utils.substituteAppNExecIds(v, appId, "")) } presetLabels.keys.foreach { key => require( @@ -173,8 +200,9 @@ private[spark] class KubernetesExecutorConf( SPARK_ROLE_LABEL -> SPARK_POD_EXECUTOR_ROLE, SPARK_RESOURCE_PROFILE_ID_LABEL -> resourceProfileId.toString) - val executorCustomLabels = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, KUBERNETES_EXECUTOR_LABEL_PREFIX) + val executorCustomLabels = + KubernetesUtils.parsePrefixedKeyValuePairs(sparkConf, KUBERNETES_EXECUTOR_LABEL_PREFIX) + .map { case(k, v) => (k, Utils.substituteAppNExecIds(v, appId, executorId)) } presetLabels.keys.foreach { key => require( @@ -215,10 +243,10 @@ private[spark] class KubernetesExecutorConf( if (executorEnvRegex.pattern.matcher(key).matches()) { true } else { - logWarning(log"Invalid key: ${MDC(CONFIG, key)}, " + + logWarning(log"Invalid key: ${MDC(LogKeys.CONFIG, key)}, " + log"a valid environment variable name must consist of alphabetic characters, " + log"digits, '_', '-', or '.', and must not start with a digit. " + - log"Regex used for validation is '${MDC(EXECUTOR_ENV_REGEX, executorEnvRegex)}'") + log"Regex used for validation is '${MDC(LogKeys.EXECUTOR_ENV_REGEX, executorEnvRegex)}'") false } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesDriverSpec.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesDriverSpec.scala index a603cb08ba9a1..0fd2cf16e74ed 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesDriverSpec.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesDriverSpec.scala @@ -18,7 +18,18 @@ package org.apache.spark.deploy.k8s import io.fabric8.kubernetes.api.model.HasMetadata -private[spark] case class KubernetesDriverSpec( +import org.apache.spark.annotation.{DeveloperApi, Since, Unstable} + +/** + * :: DeveloperApi :: + * + * Spec for driver pod and resources, used for K8s operations internally + * and Spark K8s operator. 
+ */ +@Unstable +@DeveloperApi +@Since("3.3.0") +case class KubernetesDriverSpec( pod: SparkPod, driverPreKubernetesResources: Seq[HasMetadata], driverKubernetesResources: Seq[HasMetadata], diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala index 50ecefdb6a5dc..c2fcfe179bbeb 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala @@ -33,7 +33,7 @@ import org.apache.spark.annotation.{DeveloperApi, Since, Unstable} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.k8s.Config.KUBERNETES_FILE_UPLOAD_PATH import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.POD_ID +import org.apache.spark.internal.LogKeys.POD_ID import org.apache.spark.launcher.SparkLauncher import org.apache.spark.resource.ResourceUtils import org.apache.spark.util.{Clock, SystemClock, Utils} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtils.scala index baa519658c2e1..ee2108e8234d3 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtils.scala @@ -19,9 +19,17 @@ package org.apache.spark.deploy.k8s import java.lang.Long.parseLong import org.apache.spark.SparkConf +import org.apache.spark.annotation.{DeveloperApi, Since, Unstable} import org.apache.spark.deploy.k8s.Config._ -private[spark] object KubernetesVolumeUtils { +/** + * :: DeveloperApi :: + * + * A utility class used for K8s operations internally and Spark K8s operator. + */ +@Unstable +@DeveloperApi +object KubernetesVolumeUtils { /** * Extract Spark volume configuration properties with a given name prefix. 
* @@ -29,6 +37,7 @@ private[spark] object KubernetesVolumeUtils { * @param prefix the given property name prefix * @return a Map storing with volume name as key and spec as value */ + @Since("3.0.0") def parseVolumesWithPrefix(sparkConf: SparkConf, prefix: String): Seq[KubernetesVolumeSpec] = { val properties = sparkConf.getAllWithPrefix(prefix).toMap diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala index 3763aeadea0ef..79f76e96474e3 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala @@ -32,7 +32,8 @@ import okhttp3.OkHttpClient import org.apache.spark.SparkConf import org.apache.spark.annotation.{DeveloperApi, Since, Stable} import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.K8S_CONTEXT import org.apache.spark.internal.config.ConfigEntry import org.apache.spark.util.ThreadUtils @@ -84,9 +85,9 @@ object SparkKubernetesClientFactory extends Logging { // Allow for specifying a context used to auto-configure from the users K8S config file val kubeContext = sparkConf.get(KUBERNETES_CONTEXT).filter(_.nonEmpty) - logInfo("Auto-configuring K8S client using " + - kubeContext.map("context " + _).getOrElse("current context") + - " from users K8S config file") + logInfo(log"Auto-configuring K8S client using " + + log"${MDC(K8S_CONTEXT, kubeContext.map("context " + _).getOrElse("current context"))}" + + log" from users K8S config file") // if backoff limit is not set then set it to 3 if (getSystemPropertyOrEnvVar(KUBERNETES_REQUEST_RETRY_BACKOFFLIMIT_SYSTEM_PROPERTY) == null) { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala index de15bf9b24d90..a8706370eead6 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala @@ -25,7 +25,7 @@ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CONFIG, CONFIG2, CONFIG3, CONFIG4, CONFIG5} +import org.apache.spark.internal.LogKeys.{CONFIG, CONFIG2, CONFIG3, CONFIG4, CONFIG5} import org.apache.spark.internal.config.{PYSPARK_DRIVER_PYTHON, PYSPARK_PYTHON} import org.apache.spark.launcher.SparkLauncher diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStep.scala index cba4f442371c9..71f8340f5b5b6 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStep.scala +++ 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStep.scala @@ -20,15 +20,13 @@ import scala.jdk.CollectionConverters._ import io.fabric8.kubernetes.api.model.{HasMetadata, ServiceBuilder} -import org.apache.spark.deploy.k8s.{KubernetesDriverConf, KubernetesUtils, SparkPod} +import org.apache.spark.deploy.k8s.{KubernetesDriverConf, SparkPod} import org.apache.spark.deploy.k8s.Config.{KUBERNETES_DNS_LABEL_NAME_MAX_LENGTH, KUBERNETES_DRIVER_SERVICE_IP_FAMILIES, KUBERNETES_DRIVER_SERVICE_IP_FAMILY_POLICY} import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.internal.{config, Logging} -import org.apache.spark.util.{Clock, SystemClock} private[spark] class DriverServiceFeatureStep( - kubernetesConf: KubernetesDriverConf, - clock: Clock = new SystemClock()) + kubernetesConf: KubernetesDriverConf) extends KubernetesFeatureConfigStep with Logging { import DriverServiceFeatureStep._ @@ -39,17 +37,7 @@ private[spark] class DriverServiceFeatureStep( s"$DRIVER_HOST_KEY is not supported in Kubernetes mode, as the driver's hostname will be " + "managed via a Kubernetes service.") - private val preferredServiceName = s"${kubernetesConf.resourceNamePrefix}$DRIVER_SVC_POSTFIX" - private val resolvedServiceName = if (preferredServiceName.length <= MAX_SERVICE_NAME_LENGTH) { - preferredServiceName - } else { - val randomServiceId = KubernetesUtils.uniqueID(clock = clock) - val shorterServiceName = s"spark-$randomServiceId$DRIVER_SVC_POSTFIX" - logWarning(s"Driver's hostname would preferably be $preferredServiceName, but this is " + - s"too long (must be <= $MAX_SERVICE_NAME_LENGTH characters). Falling back to use " + - s"$shorterServiceName as the driver service's name.") - shorterServiceName - } + private val resolvedServiceName = kubernetesConf.driverServiceName private val ipFamilyPolicy = kubernetesConf.sparkConf.get(KUBERNETES_DRIVER_SERVICE_IP_FAMILY_POLICY) private val ipFamilies = diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala index 662f5ddbd4a7e..042f984933a7e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala @@ -31,7 +31,8 @@ import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.KubernetesUtils.addOwnerReference -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{APP_ID, APP_NAME, SUBMISSION_ID} import org.apache.spark.util.Utils /** @@ -203,8 +204,9 @@ private[spark] class Client( } } } else { - logInfo(s"Deployed Spark application ${conf.appName} with application ID ${conf.appId} " + - s"and submission ID $sId into Kubernetes") + logInfo(log"Deployed Spark application ${MDC(APP_NAME, conf.appName)} with " + + log"application ID ${MDC(APP_ID, conf.appId)} and " + + log"submission ID ${MDC(SUBMISSION_ID, sId)} into Kubernetes") } } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientUtils.scala 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientUtils.scala index beb7ff6bfe22c..d6b1da39bcbb5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientUtils.scala @@ -28,23 +28,34 @@ import scala.jdk.CollectionConverters._ import io.fabric8.kubernetes.api.model.{ConfigMap, ConfigMapBuilder, KeyToPath} import org.apache.spark.SparkConf +import org.apache.spark.annotation.{DeveloperApi, Since, Unstable} import org.apache.spark.deploy.k8s.{Config, Constants, KubernetesUtils} import org.apache.spark.deploy.k8s.Config.{KUBERNETES_DNS_SUBDOMAIN_NAME_MAX_LENGTH, KUBERNETES_NAMESPACE} import org.apache.spark.deploy.k8s.Constants.ENV_SPARK_CONF_DIR import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CONFIG, PATH, PATHS} +import org.apache.spark.internal.LogKeys.{CONFIG, PATH, PATHS} import org.apache.spark.util.ArrayImplicits._ -private[spark] object KubernetesClientUtils extends Logging { +/** + * :: DeveloperApi :: + * + * A utility class used for K8s operations internally and Spark K8s operator. + */ +@Unstable +@DeveloperApi +object KubernetesClientUtils extends Logging { // Config map name can be KUBERNETES_DNS_SUBDOMAIN_NAME_MAX_LENGTH chars at max. + @Since("3.3.0") def configMapName(prefix: String): String = { val suffix = "-conf-map" s"${prefix.take(KUBERNETES_DNS_SUBDOMAIN_NAME_MAX_LENGTH - suffix.length)}$suffix" } + @Since("3.1.0") val configMapNameExecutor: String = configMapName(s"spark-exec-${KubernetesUtils.uniqueID()}") + @Since("3.1.0") val configMapNameDriver: String = configMapName(s"spark-drv-${KubernetesUtils.uniqueID()}") private def buildStringFromPropertiesMap(configMapName: String, @@ -62,6 +73,7 @@ private[spark] object KubernetesClientUtils extends Logging { /** * Build, file -> 'file's content' map of all the selected files in SPARK_CONF_DIR. */ + @Since("3.1.1") def buildSparkConfDirFilesMap( configMapName: String, sparkConf: SparkConf, @@ -77,6 +89,7 @@ private[spark] object KubernetesClientUtils extends Logging { } } + @Since("3.1.0") def buildKeyToPathObjects(confFilesMap: Map[String, String]): Seq[KeyToPath] = { confFilesMap.map { case (fileName: String, _: String) => @@ -89,6 +102,7 @@ private[spark] object KubernetesClientUtils extends Logging { * Build a Config Map that will hold the content for environment variable SPARK_CONF_DIR * on remote pods. 
*/ + @Since("3.1.0") def buildConfigMap(configMapName: String, confFileMap: Map[String, String], withLabels: Map[String, String] = Map()): ConfigMap = { val configMapNameSpace = @@ -141,8 +155,8 @@ private[spark] object KubernetesClientUtils extends Logging { } } if (truncatedMap.nonEmpty) { - logInfo(s"Spark configuration files loaded from $confDir :" + - s" ${truncatedMap.keys.mkString(",")}") + logInfo(log"Spark configuration files loaded from ${MDC(PATH, confDir)} : " + + log"${MDC(PATHS, truncatedMap.keys.mkString(","))}") } if (skippedFiles.nonEmpty) { logWarning(log"Skipped conf file(s) ${MDC(PATHS, skippedFiles.mkString(","))}, due to " + diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala index 3b69754b9cdf1..12626a8676efe 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverBuilder.scala @@ -19,12 +19,22 @@ package org.apache.spark.deploy.k8s.submit import io.fabric8.kubernetes.client.KubernetesClient import org.apache.spark.SparkException +import org.apache.spark.annotation.{DeveloperApi, Since, Unstable} import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.features._ import org.apache.spark.util.Utils -private[spark] class KubernetesDriverBuilder { +/** + * ::DeveloperApi:: + * + * KubernetesDriverBuilder builds k8s spec for driver, used for K8s operations internally + * and Spark K8s operator. + */ +@Unstable +@DeveloperApi +class KubernetesDriverBuilder { + @Since("3.0.0") def buildFromFeatures( conf: KubernetesDriverConf, client: KubernetesClient): KubernetesDriverSpec = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala index 3227a72a8371b..465c5e605b8cc 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala @@ -23,7 +23,8 @@ import io.fabric8.kubernetes.client.Watcher.Action import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.KubernetesDriverConf import org.apache.spark.deploy.k8s.KubernetesUtils._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{APP_ID, APP_NAME, POD_PHASE, POD_STATE, STATUS, SUBMISSION_ID} private[k8s] trait LoggingPodStatusWatcher extends Watcher[Pod] { def watchOrStop(submissionId: String): Boolean @@ -83,7 +84,8 @@ private[k8s] class LoggingPodStatusWatcherImpl(conf: KubernetesDriverConf) } private def logLongStatus(): Unit = { - logInfo("State changed, new state: " + pod.map(formatPodState).getOrElse("unknown")) + logInfo(log"State changed, new state: " + + log"${MDC(POD_STATE, pod.map(formatPodState).getOrElse("unknown"))}") } private def hasCompleted(): Boolean = { @@ -96,22 +98,22 @@ private[k8s] class LoggingPodStatusWatcherImpl(conf: KubernetesDriverConf) } override def watchOrStop(sId: String): Boolean = { - logInfo(s"Waiting for application ${conf.appName} with application 
ID $appId " + - s"and submission ID $sId to finish...") + logInfo(log"Waiting for application ${MDC(APP_NAME, conf.appName)}} with application ID " + + log"${MDC(APP_ID, appId)} and submission ID ${MDC(SUBMISSION_ID, sId)} to finish...") val interval = conf.get(REPORT_INTERVAL) synchronized { while (!podCompleted && !resourceTooOldReceived) { wait(interval) - logInfo(s"Application status for $appId (phase: $phase)") + logInfo(log"Application status for ${MDC(APP_ID, appId)} (phase: ${MDC(POD_PHASE, phase)})") } } if(podCompleted) { logInfo( - pod.map { p => s"Container final statuses:\n\n${containersDescription(p)}" } - .getOrElse("No containers were found in the driver pod.")) - logInfo(s"Application ${conf.appName} with application ID $appId " + - s"and submission ID $sId finished") + pod.map { p => log"Container final statuses:\n\n${MDC(STATUS, containersDescription(p))}" } + .getOrElse(log"No containers were found in the driver pod.")) + logInfo(log"Application ${MDC(APP_NAME, conf.appName)} with application ID " + + log"${MDC(APP_ID, appId)} and submission ID ${MDC(SUBMISSION_ID, sId)} finished") } podCompleted } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala index a2e01fa2d9a0e..398bb76376cfb 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala @@ -16,15 +16,38 @@ */ package org.apache.spark.deploy.k8s.submit -private[spark] sealed trait MainAppResource +import org.apache.spark.annotation.{DeveloperApi, Since, Stable} -private[spark] sealed trait NonJVMResource +/** + * :: DeveloperApi :: + * + * All traits and classes in this file are used by K8s module and Spark K8s operator. 
+ */ + +@Stable +@DeveloperApi +@Since("2.3.0") +sealed trait MainAppResource + +@Stable +@DeveloperApi +@Since("2.4.0") +sealed trait NonJVMResource -private[spark] case class JavaMainAppResource(primaryResource: Option[String]) +@Stable +@DeveloperApi +@Since("3.0.0") +case class JavaMainAppResource(primaryResource: Option[String]) extends MainAppResource -private[spark] case class PythonMainAppResource(primaryResource: String) +@Stable +@DeveloperApi +@Since("2.4.0") +case class PythonMainAppResource(primaryResource: String) extends MainAppResource with NonJVMResource -private[spark] case class RMainAppResource(primaryResource: String) +@Stable +@DeveloperApi +@Since("2.4.0") +case class RMainAppResource(primaryResource: String) extends MainAppResource with NonJVMResource diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index a48e1fba99546..ef3547fd389fd 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -33,8 +33,7 @@ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.KubernetesConf import org.apache.spark.deploy.k8s.KubernetesUtils.addOwnerReference -import org.apache.spark.internal.{Logging, LogKey, MDC} -import org.apache.spark.internal.LogKey.{COUNT, EXECUTOR_IDS, TIMEOUT} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler.cluster.SchedulerBackendUtils.DEFAULT_NUMBER_EXECUTORS @@ -145,7 +144,7 @@ class ExecutorPodsAllocator( onNewSnapshots(applicationId, schedulerBackend, executorPodsSnapshot) if (failureTracker.numFailedExecutors > maxNumExecutorFailures) { logError(log"Max number of executor failures " + - log"(${MDC(LogKey.MAX_EXECUTOR_FAILURES, maxNumExecutorFailures)}) reached") + log"(${MDC(LogKeys.MAX_EXECUTOR_FAILURES, maxNumExecutorFailures)}) reached") stopApplication(EXCEED_MAX_EXECUTOR_FAILURES) } } @@ -211,10 +210,11 @@ class ExecutorPodsAllocator( } if (timedOut.nonEmpty) { - logWarning(log"Executors with ids ${MDC(EXECUTOR_IDS, timedOut.mkString(","))}} were not " + - log"detected in the Kubernetes cluster after ${MDC(TIMEOUT, podCreationTimeout)} ms " + - log"despite the fact that a previous allocation attempt tried to create them. " + - log"The executors may have been deleted but the application missed the deletion event.") + logWarning(log"Executors with ids ${MDC(LogKeys.EXECUTOR_IDS, timedOut.mkString(","))}} " + + log"were not detected in the Kubernetes cluster after " + + log"${MDC(LogKeys.TIMEOUT, podCreationTimeout)} ms despite the fact that a previous " + + log"allocation attempt tried to create them. 
The executors may have been deleted but the " + + log"application missed the deletion event.") newlyCreatedExecutors --= timedOut if (shouldDeleteExecutors) { @@ -283,7 +283,7 @@ class ExecutorPodsAllocator( val newFailedExecutorIds = currentFailedExecutorIds.diff(failedExecutorIds) if (newFailedExecutorIds.nonEmpty) { - logWarning(log"${MDC(COUNT, newFailedExecutorIds.size)} new failed executors.") + logWarning(log"${MDC(LogKeys.COUNT, newFailedExecutorIds.size)} new failed executors.") newFailedExecutorIds.foreach { _ => failureTracker.registerExecutorFailure() } } failedExecutorIds = failedExecutorIds ++ currentFailedExecutorIds @@ -343,7 +343,8 @@ class ExecutorPodsAllocator( val toDelete = newlyCreatedToDelete ++ pendingToDelete if (toDelete.nonEmpty) { - logInfo(s"Deleting ${toDelete.size} excess pod requests (${toDelete.mkString(",")}).") + logInfo(log"Deleting ${MDC(LogKeys.COUNT, toDelete.size)} excess pod requests " + + log"(${MDC(LogKeys.RESOURCE_PROFILE_IDS, toDelete.mkString(","))}).") _deletedExecutorIds = _deletedExecutorIds ++ toDelete Utils.tryLogNonFatalError { @@ -397,9 +398,11 @@ class ExecutorPodsAllocator( val numMissingPodsForRpId = targetNum - podCountForRpId val numExecutorsToAllocate = math.min(math.min(numMissingPodsForRpId, podAllocationSize), sharedSlotFromPendingPods) - logInfo(s"Going to request $numExecutorsToAllocate executors from Kubernetes for " + - s"ResourceProfile Id: $rpId, target: $targetNum, known: $podCountForRpId, " + - s"sharedSlotFromPendingPods: $sharedSlotFromPendingPods.") + logInfo(log"Going to request ${MDC(LogKeys.COUNT, numExecutorsToAllocate)} executors from" + + log" Kubernetes for ResourceProfile Id: ${MDC(LogKeys.RESOURCE_PROFILE_ID, rpId)}, " + + log"target: ${MDC(LogKeys.NUM_POD_TARGET, targetNum)}, " + + log"known: ${MDC(LogKeys.NUM_POD, podCountForRpId)}, sharedSlotFromPendingPods: " + + log"${MDC(LogKeys.NUM_POD_SHARED_SLOT, sharedSlotFromPendingPods)}.") requestNewExecutors(numExecutorsToAllocate, applicationId, rpId, k8sKnownPVCNames) } } @@ -428,7 +431,8 @@ class ExecutorPodsAllocator( .filterNot(pvc => pvcsInUse.contains(pvc.getMetadata.getName)) .filter(pvc => now - Instant.parse(pvc.getMetadata.getCreationTimestamp).toEpochMilli > podAllocationDelay) - logInfo(s"Found ${reusablePVCs.size} reusable PVCs from ${createdPVCs.size} PVCs") + logInfo(log"Found ${MDC(LogKeys.COUNT, reusablePVCs.size)} reusable PVCs from " + + log"${MDC(LogKeys.TOTAL, createdPVCs.size)} PVCs") reusablePVCs } catch { case _: KubernetesClientException => @@ -449,7 +453,8 @@ class ExecutorPodsAllocator( val reusablePVCs = getReusablePVCs(applicationId, pvcsInUse) for ( _ <- 0 until numExecutorsToAllocate) { if (reusablePVCs.isEmpty && podAllocOnPVC && maxPVCs <= PVC_COUNTER.get()) { - logInfo(s"Wait to reuse one of the existing ${PVC_COUNTER.get()} PVCs.") + logInfo( + log"Wait to reuse one of the existing ${MDC(LogKeys.COUNT, PVC_COUNTER.get())} PVCs.") return } val newExecutorId = EXECUTOR_ID_COUNTER.incrementAndGet() @@ -480,8 +485,9 @@ class ExecutorPodsAllocator( addOwnerReference(driverPod.get, Seq(resource)) } val pvc = resource.asInstanceOf[PersistentVolumeClaim] - logInfo(s"Trying to create PersistentVolumeClaim ${pvc.getMetadata.getName} with " + - s"StorageClass ${pvc.getSpec.getStorageClassName}") + logInfo(log"Trying to create PersistentVolumeClaim " + + log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)} with " + + log"StorageClass ${MDC(LogKeys.CLASS_NAME, pvc.getSpec.getStorageClassName)}") 
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create() PVC_COUNTER.incrementAndGet() } @@ -519,7 +525,8 @@ class ExecutorPodsAllocator( if (volume.nonEmpty) { val matchedPVC = reusablePVCs.remove(index) replacedResources.add(pvc) - logInfo(s"Reuse PersistentVolumeClaim ${matchedPVC.getMetadata.getName}") + logInfo(log"Reuse PersistentVolumeClaim " + + log"${MDC(LogKeys.PVC_METADATA_NAME, matchedPVC.getMetadata.getName)}") volume.get.getPersistentVolumeClaim.setClaimName(matchedPVC.getMetadata.getName) } } @@ -535,7 +542,7 @@ class ExecutorPodsAllocator( } catch { case e: Exception => logError(log"Cannot get the creationTimestamp of the pod: " + - log"${MDC(LogKey.POD_ID, state.pod)}", e) + log"${MDC(LogKeys.POD_ID, state.pod)}", e) true } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala index 5590311bf6614..0d79efa06e497 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala @@ -30,7 +30,8 @@ import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.KubernetesUtils._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.EXECUTOR_ID import org.apache.spark.scheduler.ExecutorExited import org.apache.spark.util.Utils @@ -99,8 +100,9 @@ private[spark] class ExecutorPodsLifecycleManager( if (onFinalNonDeletedState(succeeded, execId, schedulerBackend, deleteFromK8s)) { execIdsRemovedInThisRound += execId if (schedulerBackend.isExecutorActive(execId.toString)) { - logInfo(s"Snapshot reported succeeded executor with id $execId, " + - "even though the application has not requested for it to be removed.") + logInfo(log"Snapshot reported succeeded executor with id " + + log"${MDC(EXECUTOR_ID, execId)}, even though the application has not " + + log"requested for it to be removed.") } else { logDebug(s"Snapshot reported succeeded executor with id $execId," + s" pod name ${state.pod.getMetadata.getName}.") diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala index 99cef671d2e41..44daf57d76ebf 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala @@ -25,7 +25,7 @@ import io.fabric8.kubernetes.api.model.Pod import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{POD_NAME, POD_NAMESPACE, POD_PHASE} +import org.apache.spark.internal.LogKeys.{POD_NAME, POD_NAMESPACE, POD_PHASE} /** * An immutable view of the current executor pods that are running in the cluster. 
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorRollPlugin.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorRollPlugin.scala index 1c0de8e2afded..e84aae1f27b3b 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorRollPlugin.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorRollPlugin.scala @@ -27,7 +27,7 @@ import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, import org.apache.spark.deploy.k8s.Config.{EXECUTOR_ROLL_INTERVAL, EXECUTOR_ROLL_POLICY, ExecutorRollPolicy, MINIMUM_TASKS_PER_EXECUTOR_BEFORE_ROLLING} import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CLASS_NAME, CONFIG, INTERVAL} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, CONFIG, EXECUTOR_ID, INTERVAL} import org.apache.spark.internal.config.DECOMMISSION_ENABLED import org.apache.spark.scheduler.ExecutorDecommissionInfo import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_SECOND @@ -82,7 +82,7 @@ class ExecutorRollDriverPlugin extends DriverPlugin with Logging { choose(executorSummaryList, policy) match { case Some(id) => // Use decommission to be safe. - logInfo(s"Ask to decommission executor $id") + logInfo(log"Ask to decommission executor ${MDC(EXECUTOR_ID, id)}") val now = System.currentTimeMillis() scheduler.decommissionExecutor( id, diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala index 3235d922204b2..ddcdc2cf663ac 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -25,7 +25,8 @@ import org.apache.spark.{SparkConf, SparkContext, SparkMasterRegex} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesUtils, SparkKubernetesClientFactory} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants.DEFAULT_EXECUTOR_CONTAINER_NAME -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.MASTER_URL import org.apache.spark.internal.config.TASK_MAX_FAILURES import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} import org.apache.spark.scheduler.local.LocalSchedulerBackend @@ -61,7 +62,7 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit if (threads == "*") localCpuCount else threads.toInt case _ => 1 } - logInfo(s"Running Spark with ${sc.conf.get(KUBERNETES_DRIVER_MASTER_URL)}") + logInfo(log"Running Spark with ${MDC(MASTER_URL, sc.conf.get(KUBERNETES_DRIVER_MASTER_URL))}") val schedulerImpl = scheduler.asInstanceOf[TaskSchedulerImpl] // KubernetesClusterSchedulerBackend respects `spark.app.id` while LocalSchedulerBackend // does not. Propagate `spark.app.id` via `spark.test.appId` to match the behavior. 
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index daf8d5e3f58a2..4e4634504a0f3 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -32,6 +32,8 @@ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit.KubernetesClientUtils import org.apache.spark.deploy.security.HadoopDelegationTokenManager +import org.apache.spark.internal.LogKeys.{COUNT, HOST_PORT, TOTAL} +import org.apache.spark.internal.MDC import org.apache.spark.internal.config.SCHEDULER_MIN_REGISTERED_RESOURCES_RATIO import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.{RpcAddress, RpcCallContext} @@ -255,9 +257,10 @@ private[spark] class KubernetesClusterSchedulerBackend( .withLabel(SPARK_ROLE_LABEL, SPARK_POD_EXECUTOR_ROLE) .withLabelIn(SPARK_EXECUTOR_ID_LABEL, executorIds: _*) - if (!running.list().getItems().isEmpty()) { - logInfo(s"Forcefully deleting ${running.list().getItems().size()} pods " + - s"(out of ${executorIds.size}) that are still running after graceful shutdown period.") + if (!running.list().getItems.isEmpty) { + logInfo(log"Forcefully deleting ${MDC(COUNT, running.list().getItems.size())} pods " + + log"(out of ${MDC(TOTAL, executorIds.size)}) that are still running after graceful " + + log"shutdown period.") running.delete() } } @@ -353,7 +356,7 @@ private[spark] class KubernetesClusterSchedulerBackend( execIDRequester -= rpcAddress // Expected, executors re-establish a connection with an ID case _ => - logInfo(s"No executor found for ${rpcAddress}") + logInfo(log"No executor found for ${MDC(HOST_PORT, rpcAddress)}") } } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/shuffle/KubernetesLocalDiskShuffleExecutorComponents.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/shuffle/KubernetesLocalDiskShuffleExecutorComponents.scala index 376218df57702..2728385874f6d 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/shuffle/KubernetesLocalDiskShuffleExecutorComponents.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/shuffle/KubernetesLocalDiskShuffleExecutorComponents.scala @@ -27,7 +27,7 @@ import org.apache.commons.io.FileExistsException import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.deploy.k8s.Config.KUBERNETES_DRIVER_REUSE_PVC -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.{SHUFFLE_CHECKSUM_ALGORITHM, SHUFFLE_CHECKSUM_ENABLED} import org.apache.spark.shuffle.ShuffleChecksumUtils.{compareChecksums, getChecksumFileName} import org.apache.spark.shuffle.api.{ShuffleExecutorComponents, ShuffleMapOutputWriter, SingleSpillShuffleMapOutputWriter} @@ -54,7 +54,8 @@ class KubernetesLocalDiskShuffleExecutorComponents(sparkConf: SparkConf) KubernetesLocalDiskShuffleExecutorComponents.recoverDiskStore(sparkConf, blockManager) } } else { - logInfo(s"Skip recovery because ${KUBERNETES_DRIVER_REUSE_PVC.key} is disabled.") + logInfo(log"Skip recovery because 
${MDC(LogKeys.CONFIG, KUBERNETES_DRIVER_REUSE_PVC.key)} " + + log"is disabled.") } } @@ -94,20 +95,23 @@ object KubernetesLocalDiskShuffleExecutorComponents extends Logging { .partition(_.getName.contains(".checksum")) val (indexFiles, dataFiles) = files.partition(_.getName.endsWith(".index")) - logInfo(s"Found ${dataFiles.size} data files, ${indexFiles.size} index files, " + - s"and ${checksumFiles.size} checksum files.") + logInfo(log"Found ${MDC(LogKeys.NUM_DATA_FILE, dataFiles.length)} data files, " + + log"${MDC(LogKeys.NUM_INDEX_FILE, indexFiles.length)} index files, " + + log"and ${MDC(LogKeys.NUM_CHECKSUM_FILE, checksumFiles.length)} checksum files.") // Build a hashmap with checksum file name as a key val checksumFileMap = new mutable.HashMap[String, File]() val algorithm = conf.get(SHUFFLE_CHECKSUM_ALGORITHM) checksumFiles.foreach { f => - logInfo(s"${f.getName} -> ${f.getAbsolutePath}") + logInfo(log"${MDC(LogKeys.FILE_NAME, f.getName)} -> " + + log"${MDC(LogKeys.FILE_ABSOLUTE_PATH, f.getAbsolutePath)}") checksumFileMap.put(f.getName, f) } // Build a hashmap with shuffle data file name as a key val indexFileMap = new mutable.HashMap[String, File]() indexFiles.foreach { f => - logInfo(s"${f.getName.replace(".index", ".data")} -> ${f.getAbsolutePath}") + logInfo(log"${MDC(LogKeys.FILE_NAME, f.getName.replace(".index", ".data"))} -> " + + log"${MDC(LogKeys.FILE_ABSOLUTE_PATH, f.getAbsolutePath)}") indexFileMap.put(f.getName.replace(".index", ".data"), f) } @@ -116,7 +120,7 @@ object KubernetesLocalDiskShuffleExecutorComponents extends Logging { val level = StorageLevel.DISK_ONLY val checksumDisabled = !conf.get(SHUFFLE_CHECKSUM_ENABLED) (dataFiles ++ indexFiles).foreach { f => - logInfo(s"Try to recover ${f.getAbsolutePath}") + logInfo(log"Try to recover ${MDC(LogKeys.FILE_ABSOLUTE_PATH, f.getAbsolutePath)}") try { val id = BlockId(f.getName) // To make it sure to handle only shuffle blocks @@ -129,7 +133,8 @@ object KubernetesLocalDiskShuffleExecutorComponents extends Logging { val decryptedSize = f.length() bm.TempFileBasedBlockStoreUpdater(id, level, classTag, f, decryptedSize).save() } else { - logInfo(s"Ignore ${f.getAbsolutePath} due to the verification failure.") + logInfo(log"Ignore ${MDC(LogKeys.FILE_ABSOLUTE_PATH, f.getAbsolutePath)} " + + log"due to the verification failure.") } } else { logInfo("Ignore a non-shuffle block file.") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesConfSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesConfSuite.scala index 9963db016ad9b..3c53e9b74f924 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesConfSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesConfSuite.scala @@ -40,7 +40,9 @@ class KubernetesConfSuite extends SparkFunSuite { "execNodeSelectorKey2" -> "execNodeSelectorValue2") private val CUSTOM_LABELS = Map( "customLabel1Key" -> "customLabel1Value", - "customLabel2Key" -> "customLabel2Value") + "customLabel2Key" -> "customLabel2Value", + "customLabel3Key" -> "{{APP_ID}}", + "customLabel4Key" -> "{{EXECUTOR_ID}}") private val CUSTOM_ANNOTATIONS = Map( "customAnnotation1Key" -> "customAnnotation1Value", "customAnnotation2Key" -> "customAnnotation2Value", @@ -95,7 +97,9 @@ class KubernetesConfSuite extends SparkFunSuite { SPARK_APP_ID_LABEL -> KubernetesTestConf.APP_ID, SPARK_APP_NAME_LABEL -> 
KubernetesConf.getAppNameLabel(conf.appName), SPARK_ROLE_LABEL -> SPARK_POD_DRIVER_ROLE) ++ - CUSTOM_LABELS) + CUSTOM_LABELS.map { + case (k, v) => (k, Utils.substituteAppNExecIds(v, conf.appId, "")) + }) assert(conf.annotations === CUSTOM_ANNOTATIONS.map { case (k, v) => (k, Utils.substituteAppNExecIds(v, conf.appId, "")) }) @@ -165,7 +169,10 @@ class KubernetesConfSuite extends SparkFunSuite { SPARK_APP_ID_LABEL -> KubernetesTestConf.APP_ID, SPARK_APP_NAME_LABEL -> KubernetesConf.getAppNameLabel(conf.appName), SPARK_ROLE_LABEL -> SPARK_POD_EXECUTOR_ROLE, - SPARK_RESOURCE_PROFILE_ID_LABEL -> DEFAULT_RESOURCE_PROFILE_ID.toString) ++ CUSTOM_LABELS) + SPARK_RESOURCE_PROFILE_ID_LABEL -> DEFAULT_RESOURCE_PROFILE_ID.toString) ++ + CUSTOM_LABELS.map { + case (k, v) => (k, Utils.substituteAppNExecIds(v, conf.appId, EXECUTOR_ID)) + }) assert(conf.annotations === CUSTOM_ANNOTATIONS.map { case (k, v) => (k, Utils.substituteAppNExecIds(v, conf.appId, EXECUTOR_ID)) }) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala index d6a60b1edea2f..b70b9348d23b4 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala @@ -22,6 +22,7 @@ import io.fabric8.kubernetes.api.model.Pod import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.submit.{JavaMainAppResource, MainAppResource} +import org.apache.spark.util.{Clock, SystemClock} /** * Builder methods for KubernetesConf that allow easy control over what to return for a few @@ -52,7 +53,8 @@ object KubernetesTestConf { secretEnvNamesToKeyRefs: Map[String, String] = Map.empty, secretNamesToMountPaths: Map[String, String] = Map.empty, volumes: Seq[KubernetesVolumeSpec] = Seq.empty, - proxyUser: Option[String] = None): KubernetesDriverConf = { + proxyUser: Option[String] = None, + clock: Clock = new SystemClock()): KubernetesDriverConf = { val conf = sparkConf.clone() resourceNamePrefix.foreach { prefix => @@ -67,7 +69,7 @@ object KubernetesTestConf { setPrefixedConfigs(conf, KUBERNETES_DRIVER_SECRET_KEY_REF_PREFIX, secretEnvNamesToKeyRefs) setVolumeSpecs(conf, KUBERNETES_DRIVER_VOLUMES_PREFIX, volumes) - new KubernetesDriverConf(conf, appId, mainAppResource, mainClass, appArgs, proxyUser) + new KubernetesDriverConf(conf, appId, mainAppResource, mainClass, appArgs, proxyUser, clock) } // scalastyle:on argcount diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala index f102851e6c3b9..bf022ac630158 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala @@ -35,11 +35,13 @@ import org.apache.spark.util.Utils class BasicDriverFeatureStepSuite extends SparkFunSuite { - private val CUSTOM_DRIVER_LABELS = Map("labelkey" -> "labelvalue") + private val CUSTOM_DRIVER_LABELS = Map( + "labelkey" -> "labelvalue", + "customAppIdLabelKey" -> "{{APP_ID}}") private val CONTAINER_IMAGE_PULL_POLICY = 
"IfNotPresent" private val DRIVER_ANNOTATIONS = Map( "customAnnotation" -> "customAnnotationValue", - "yunikorn.apache.org/app-id" -> "{{APPID}}") + "customAppIdAnnotation" -> "{{APP_ID}}") private val DRIVER_ENVS = Map( "customDriverEnv1" -> "customDriverEnv1Value", "customDriverEnv2" -> "customDriverEnv2Value") @@ -121,10 +123,11 @@ class BasicDriverFeatureStepSuite extends SparkFunSuite { assert(driverPodMetadata.getName === "spark-driver-pod") // Check custom and preset labels are as expected + val labels = driverPodMetadata.getLabels CUSTOM_DRIVER_LABELS.foreach { case (k, v) => - assert(driverPodMetadata.getLabels.get(k) === v) + assert(labels.get(k) === Utils.substituteAppNExecIds(v, KubernetesTestConf.APP_ID, "")) } - assert(driverPodMetadata.getLabels === kubernetesConf.labels.asJava) + assert(labels === kubernetesConf.labels.asJava) val annotations = driverPodMetadata.getAnnotations.asScala DRIVER_ANNOTATIONS.foreach { case (k, v) => diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala index 06d322c9d19b5..d69fcf89e1337 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala @@ -109,17 +109,18 @@ class DriverServiceFeatureStepSuite extends SparkFunSuite { } test("Long prefixes should switch to using a generated unique name.") { + val clock = new ManualClock() val sparkConf = new SparkConf(false) .set(KUBERNETES_NAMESPACE, "my-namespace") - val kconf = KubernetesTestConf.createDriverConf( - sparkConf = sparkConf, - resourceNamePrefix = Some(LONG_RESOURCE_NAME_PREFIX), - labels = DRIVER_LABELS) - val clock = new ManualClock() // Ensure that multiple services created at the same time generate unique names. 
val services = (1 to 10).map { _ => - val configurationStep = new DriverServiceFeatureStep(kconf, clock = clock) + val kconf = KubernetesTestConf.createDriverConf( + sparkConf = sparkConf, + resourceNamePrefix = Some(LONG_RESOURCE_NAME_PREFIX), + labels = DRIVER_LABELS, + clock = clock) + val configurationStep = new DriverServiceFeatureStep(kconf) val serviceName = configurationStep .getAdditionalKubernetesResources() .head @@ -130,11 +131,11 @@ class DriverServiceFeatureStepSuite extends SparkFunSuite { val hostAddress = configurationStep .getAdditionalPodSystemProperties()(DRIVER_HOST_ADDRESS.key) - (serviceName -> hostAddress) - }.toMap + Tuple3(kconf, serviceName, hostAddress) + } assert(services.size === 10) - services.foreach { case (name, address) => + services.foreach { case (kconf, name, address) => assert(!name.startsWith(kconf.resourceNamePrefix)) assert(!address.startsWith(kconf.resourceNamePrefix)) assert(InternetDomainName.isValid(address)) diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/BasicTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/BasicTestsSuite.scala index d6911aadfa237..0dafe30c364ae 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/BasicTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/BasicTestsSuite.scala @@ -102,16 +102,18 @@ private[spark] trait BasicTestsSuite { k8sSuite: KubernetesSuite => sparkAppConf .set("spark.kubernetes.driver.label.label1", "label1-value") .set("spark.kubernetes.driver.label.label2", "label2-value") + .set("spark.kubernetes.driver.label.customAppIdLabelKey", "{{APP_ID}}") .set("spark.kubernetes.driver.annotation.annotation1", "annotation1-value") .set("spark.kubernetes.driver.annotation.annotation2", "annotation2-value") - .set("spark.kubernetes.driver.annotation.yunikorn.apache.org/app-id", "{{APP_ID}}") + .set("spark.kubernetes.driver.annotation.customAppIdAnnotation", "{{APP_ID}}") .set("spark.kubernetes.driverEnv.ENV1", "VALUE1") .set("spark.kubernetes.driverEnv.ENV2", "VALUE2") .set("spark.kubernetes.executor.label.label1", "label1-value") .set("spark.kubernetes.executor.label.label2", "label2-value") + .set("spark.kubernetes.executor.label.customAppIdLabelKey", "{{APP_ID}}") .set("spark.kubernetes.executor.annotation.annotation1", "annotation1-value") .set("spark.kubernetes.executor.annotation.annotation2", "annotation2-value") - .set("spark.kubernetes.executor.annotation.yunikorn.apache.org/app-id", "{{APP_ID}}") + .set("spark.kubernetes.executor.annotation.customAppIdAnnotation", "{{APP_ID}}") .set("spark.executorEnv.ENV1", "VALUE1") .set("spark.executorEnv.ENV2", "VALUE2") diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala index 1b9b5310c2ee2..ae5f037c6b7d4 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala @@ -175,7 +175,7 @@ private[spark] trait DecommissionSuite { k8sSuite: KubernetesSuite 
=> expectedDriverLogOnCompletion = Seq( "Finished waiting, stopping Spark", "Decommission executors", - "Remove reason statistics: (gracefully decommissioned: 1, decommision unfinished: 0, " + + "Remove reason statistics: (gracefully decommissioned: 1, decommission unfinished: 0, " + "driver killed: 0, unexpectedly exited: 0)."), appArgs = Array.empty[String], driverPodChecker = doBasicDriverPyPodCheck, diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala index 5f95b8daa66cb..c0f5e0fe265d7 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala @@ -38,7 +38,7 @@ import org.apache.spark.deploy.k8s.integrationtest.DepsTestsSuite.{DEPS_TIMEOUT, import org.apache.spark.deploy.k8s.integrationtest.KubernetesSuite._ import org.apache.spark.deploy.k8s.integrationtest.Utils.getExamplesJarName import org.apache.spark.deploy.k8s.integrationtest.backend.minikube.Minikube -import org.apache.spark.internal.{LogKey, MDC} +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.internal.config.{ARCHIVES, PYSPARK_DRIVER_PYTHON, PYSPARK_PYTHON} private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => @@ -327,7 +327,7 @@ private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => s3client.createBucket(createBucketRequest) } catch { case e: Exception => - logError(log"Failed to create bucket ${MDC(LogKey.BUCKET, BUCKET)}", e) + logError(log"Failed to create bucket ${MDC(LogKeys.BUCKET, BUCKET)}", e) throw new SparkException(s"Failed to create bucket $BUCKET.", e) } } diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index 868461fd5b9e1..0b0b30e5e04fd 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -589,7 +589,8 @@ class KubernetesSuite extends SparkFunSuite assert(pod.getMetadata.getLabels.get("label2") === "label2-value") assert(pod.getMetadata.getAnnotations.get("annotation1") === "annotation1-value") assert(pod.getMetadata.getAnnotations.get("annotation2") === "annotation2-value") - val appId = pod.getMetadata.getAnnotations.get("yunikorn.apache.org/app-id") + val appIdLabel = pod.getMetadata.getLabels.get("customAppIdLabelKey") + val appIdAnnotation = pod.getMetadata.getAnnotations.get("customAppIdAnnotation") val container = pod.getSpec.getContainers.get(0) val envVars = container @@ -601,7 +602,8 @@ class KubernetesSuite extends SparkFunSuite .toMap assert(envVars("ENV1") === "VALUE1") assert(envVars("ENV2") === "VALUE2") - assert(appId === envVars(ENV_APPLICATION_ID)) + assert(appIdLabel === envVars(ENV_APPLICATION_ID)) + assert(appIdAnnotation === envVars(ENV_APPLICATION_ID)) } private def deleteDriverPod(): Unit = { diff --git 
a/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpFilter.java b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpFilter.java new file mode 100644 index 0000000000000..60e880d1ac4aa --- /dev/null +++ b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpFilter.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn; + +import org.apache.hadoop.classification.InterfaceAudience.Public; +import org.apache.hadoop.classification.VisibleForTesting; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.util.Time; + +import jakarta.servlet.*; +import jakarta.servlet.http.Cookie; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import java.io.IOException; +import java.net.*; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; + +// This class is copied from Hadoop 3.4.0 +// org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter +// +// Modification: +// Migrate from javax.servlet to jakarta.servlet +// Copy constant string definitions to strip external dependency +// - RM_HA_URLS +// - PROXY_USER_COOKIE_NAME +@Public +public class AmIpFilter implements Filter { + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(AmIpFilter.class); + + @Deprecated + public static final String PROXY_HOST = "PROXY_HOST"; + @Deprecated + public static final String PROXY_URI_BASE = "PROXY_URI_BASE"; + public static final String PROXY_HOSTS = "PROXY_HOSTS"; + public static final String PROXY_HOSTS_DELIMITER = ","; + public static final String PROXY_URI_BASES = "PROXY_URI_BASES"; + public static final String PROXY_URI_BASES_DELIMITER = ","; + private static final String PROXY_PATH = "/proxy"; + // RM_HA_URLS is defined in AmFilterInitializer in the original Hadoop code + private static final String RM_HA_URLS = "RM_HA_URLS"; + // WebAppProxyServlet is defined in WebAppProxyServlet in the original Hadoop code + public static final String PROXY_USER_COOKIE_NAME = "proxy-user"; + // update the proxy IP list about every 5 min + private static long updateInterval = TimeUnit.MINUTES.toMillis(5); + + private String[] proxyHosts; + private Set proxyAddresses = null; + private long lastUpdate; + @VisibleForTesting + Map proxyUriBases; + String[] rmUrls = null; + + @Override + public void init(FilterConfig conf) throws ServletException { + // Maintain for backwards compatibility + if (conf.getInitParameter(PROXY_HOST) != null + && conf.getInitParameter(PROXY_URI_BASE) != null) { + 
proxyHosts = new String[]{conf.getInitParameter(PROXY_HOST)}; + proxyUriBases = new HashMap<>(1); + proxyUriBases.put("dummy", conf.getInitParameter(PROXY_URI_BASE)); + } else { + proxyHosts = conf.getInitParameter(PROXY_HOSTS) + .split(PROXY_HOSTS_DELIMITER); + + String[] proxyUriBasesArr = conf.getInitParameter(PROXY_URI_BASES) + .split(PROXY_URI_BASES_DELIMITER); + proxyUriBases = new HashMap<>(proxyUriBasesArr.length); + for (String proxyUriBase : proxyUriBasesArr) { + try { + URL url = new URL(proxyUriBase); + proxyUriBases.put(url.getHost() + ":" + url.getPort(), proxyUriBase); + } catch(MalformedURLException e) { + LOG.warn(proxyUriBase + " does not appear to be a valid URL", e); + } + } + } + + if (conf.getInitParameter(RM_HA_URLS) != null) { + rmUrls = conf.getInitParameter(RM_HA_URLS).split(","); + } + } + + protected Set getProxyAddresses() throws ServletException { + long now = Time.monotonicNow(); + synchronized(this) { + if (proxyAddresses == null || (lastUpdate + updateInterval) <= now) { + proxyAddresses = new HashSet<>(); + for (String proxyHost : proxyHosts) { + try { + for (InetAddress add : InetAddress.getAllByName(proxyHost)) { + LOG.debug("proxy address is: {}", add.getHostAddress()); + proxyAddresses.add(add.getHostAddress()); + } + lastUpdate = now; + } catch (UnknownHostException e) { + LOG.warn("Could not locate " + proxyHost + " - skipping", e); + } + } + if (proxyAddresses.isEmpty()) { + throw new ServletException("Could not locate any of the proxy hosts"); + } + } + return proxyAddresses; + } + } + + @Override + public void destroy() { + // Empty + } + + @Override + public void doFilter(ServletRequest req, ServletResponse resp, + FilterChain chain) throws IOException, ServletException { + ProxyUtils.rejectNonHttpRequests(req); + + HttpServletRequest httpReq = (HttpServletRequest)req; + HttpServletResponse httpResp = (HttpServletResponse)resp; + + LOG.debug("Remote address for request is: {}", httpReq.getRemoteAddr()); + + if (!getProxyAddresses().contains(httpReq.getRemoteAddr())) { + StringBuilder redirect = new StringBuilder(findRedirectUrl()); + + redirect.append(httpReq.getRequestURI()); + + int insertPoint = redirect.indexOf(PROXY_PATH); + + if (insertPoint >= 0) { + // Add /redirect as the second component of the path so that the RM web + // proxy knows that this request was a redirect. 
+ insertPoint += PROXY_PATH.length(); + redirect.insert(insertPoint, "/redirect"); + } + // add the query parameters on the redirect if there were any + String queryString = httpReq.getQueryString(); + if (queryString != null && !queryString.isEmpty()) { + redirect.append("?"); + redirect.append(queryString); + } + + ProxyUtils.sendRedirect(httpReq, httpResp, redirect.toString()); + } else { + String user = null; + + if (httpReq.getCookies() != null) { + for (Cookie c: httpReq.getCookies()) { + if (PROXY_USER_COOKIE_NAME.equals(c.getName())){ + user = c.getValue(); + break; + } + } + } + if (user == null) { + LOG.debug("Could not find {} cookie, so user will not be set", + PROXY_USER_COOKIE_NAME); + + chain.doFilter(req, resp); + } else { + AmIpPrincipal principal = new AmIpPrincipal(user); + ServletRequest requestWrapper = new AmIpServletRequestWrapper(httpReq, + principal); + + chain.doFilter(requestWrapper, resp); + } + } + } + + @VisibleForTesting + public String findRedirectUrl() throws ServletException { + String addr = null; + if (proxyUriBases.size() == 1) { + // external proxy or not RM HA + addr = proxyUriBases.values().iterator().next(); + } else if (rmUrls != null) { + for (String url : rmUrls) { + String host = proxyUriBases.get(url); + if (isValidUrl(host)) { + addr = host; + break; + } + } + } + + if (addr == null) { + throw new ServletException( + "Could not determine the proxy server for redirection"); + } + return addr; + } + + @VisibleForTesting + public boolean isValidUrl(String url) { + boolean isValid = false; + try { + HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); + conn.connect(); + isValid = conn.getResponseCode() == HttpURLConnection.HTTP_OK; + // If security is enabled, any valid RM which can give 401 Unauthorized is + // good enough to access. Since AM doesn't have enough credential, auth + // cannot be completed and hence 401 is fine in such case. + if (!isValid && UserGroupInformation.isSecurityEnabled()) { + isValid = (conn.getResponseCode() == HttpURLConnection.HTTP_UNAUTHORIZED) + || (conn.getResponseCode() == HttpURLConnection.HTTP_FORBIDDEN); + return isValid; + } + } catch (Exception e) { + LOG.warn("Failed to connect to " + url + ": " + e.toString()); + } + return isValid; + } + + @VisibleForTesting + protected static void setUpdateInterval(long updateInterval) { + AmIpFilter.updateInterval = updateInterval; + } +} diff --git a/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpPrincipal.java b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpPrincipal.java new file mode 100644 index 0000000000000..9d5a5e3b04568 --- /dev/null +++ b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpPrincipal.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn; + +import java.security.Principal; + +// This class is copied from Hadoop 3.4.0 +// org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpPrincipal +public class AmIpPrincipal implements Principal { + private final String name; + + public AmIpPrincipal(String name) { + this.name = name; + } + + @Override + public String getName() { + return name; + } +} diff --git a/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpServletRequestWrapper.java b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpServletRequestWrapper.java new file mode 100644 index 0000000000000..9082378fe89c7 --- /dev/null +++ b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/AmIpServletRequestWrapper.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn; + +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletRequestWrapper; +import java.security.Principal; + +// This class is copied from Hadoop 3.4.0 +// org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpServletRequestWrapper +// +// Modification: +// Migrate from javax.servlet to jakarta.servlet +public class AmIpServletRequestWrapper extends HttpServletRequestWrapper { + private final AmIpPrincipal principal; + + public AmIpServletRequestWrapper(HttpServletRequest request, + AmIpPrincipal principal) { + super(request); + this.principal = principal; + } + + @Override + public Principal getUserPrincipal() { + return principal; + } + + @Override + public String getRemoteUser() { + return principal.getName(); + } + + @Override + public boolean isUserInRole(String role) { + // No role info so far + return false; + } + +} diff --git a/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/ProxyUtils.java b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/ProxyUtils.java new file mode 100644 index 0000000000000..c7a49a76c655f --- /dev/null +++ b/resource-managers/yarn/src/main/java/org/apache/spark/deploy/yarn/ProxyUtils.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn; + +import org.apache.hadoop.yarn.webapp.MimeType; +import org.apache.hadoop.yarn.webapp.hamlet2.Hamlet; + +import jakarta.servlet.ServletException; +import jakarta.servlet.ServletRequest; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.EnumSet; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; + +// Class containing general purpose proxy utilities +// +// This class is copied from Hadoop 3.4.0 +// org.apache.hadoop.yarn.server.webproxy.ProxyUtils +// +// Modification: +// Migrate from javax.servlet to jakarta.servlet +public class ProxyUtils { + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(ProxyUtils.class); + public static final String E_HTTP_HTTPS_ONLY = + "This filter only works for HTTP/HTTPS"; + public static final String LOCATION = "Location"; + + public static class __ implements Hamlet.__ { + // Empty + } + + public static class Page extends Hamlet { + Page(PrintWriter out) { + super(out, 0, false); + } + + public HTML html() { + return new HTML<>("html", null, EnumSet.of(EOpt.ENDTAG)); + } + } + + /** + * Handle redirects with a status code that can in future support verbs other + * than GET, thus supporting full REST functionality. + *
<p>
+ * The target URL is included in the redirect text returned + *
<p>
      + * At the end of this method, the output stream is closed. + * + * @param request request (hence: the verb and any other information + * relevant to a redirect) + * @param response the response + * @param target the target URL -unencoded + * + */ + public static void sendRedirect(HttpServletRequest request, + HttpServletResponse response, + String target) + throws IOException { + LOG.debug("Redirecting {} {} to {}", + request.getMethod(), + request.getRequestURI(), + target); + String location = response.encodeRedirectURL(target); + response.setStatus(HttpServletResponse.SC_FOUND); + response.setHeader(LOCATION, location); + response.setContentType(MimeType.HTML); + PrintWriter writer = response.getWriter(); + Page p = new Page(writer); + p.html() + .head().title("Moved").__() + .body() + .h1("Moved") + .div() + .__("Content has moved ") + .a(location, "here").__() + .__().__(); + writer.close(); + } + + + /** + * Output 404 with appropriate message. + * @param resp the http response. + * @param message the message to include on the page. + * @throws IOException on any error. + */ + public static void notFound(HttpServletResponse resp, String message) + throws IOException { + resp.setStatus(HttpServletResponse.SC_NOT_FOUND); + resp.setContentType(MimeType.HTML); + Page p = new Page(resp.getWriter()); + p.html().h1(message).__(); + } + + /** + * Reject any request that isn't from an HTTP servlet + * @param req request + * @throws ServletException if the request is of the wrong type + */ + public static void rejectNonHttpRequests(ServletRequest req) throws + ServletException { + if (!(req instanceof HttpServletRequest)) { + throw new ServletException(E_HTTP_HTTPS_ONLY); + } + } +} diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index eb944244fc9da..11d22a3225d8a 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -43,8 +43,7 @@ import org.apache.spark.deploy.{ExecutorFailureTracker, SparkHadoopUtil} import org.apache.spark.deploy.history.HistoryServer import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.deploy.yarn.config._ -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{EXIT_CODE, FAILURES, HOST_PORT} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.metrics.{MetricsSystem, MetricsSystemInstances} @@ -220,7 +219,7 @@ private[spark] class ApplicationMaster( "APPMASTER", sparkConf.get(APP_CALLER_CONTEXT), Option(appAttemptId.getApplicationId.toString), attemptID).setCurrentContext() - logInfo("ApplicationAttemptId: " + appAttemptId) + logInfo(log"ApplicationAttemptId: ${MDC(LogKeys.APP_ATTEMPT_ID, appAttemptId)}") // During shutdown, we may not be able to create an FileSystem object. So, pre-create here. 
val stagingDirPath = new Path(System.getenv("SPARK_YARN_STAGING_DIR")) @@ -368,8 +367,9 @@ private[spark] class ApplicationMaster( final def unregister(status: FinalApplicationStatus, diagnostics: String = null): Unit = { synchronized { if (registered && !unregistered) { - logInfo(s"Unregistering ApplicationMaster with $status" + - Option(diagnostics).map(msg => s" (diag message: $msg)").getOrElse("")) + logInfo(log"Unregistering ApplicationMaster with ${MDC(LogKeys.APP_STATE, status)}" + + Option(diagnostics).map( + msg => log" (diag message: ${MDC(LogKeys.MESSAGE, msg)})").getOrElse(log"")) unregistered = true client.unregister(status, Option(diagnostics).getOrElse("")) } @@ -387,8 +387,9 @@ private[spark] class ApplicationMaster( finalStatus = FinalApplicationStatus.FAILED exitCode = ApplicationMaster.EXIT_SC_NOT_INITED } - logInfo(s"Final app status: $finalStatus, exitCode: $exitCode" + - Option(msg).map(msg => s", (reason: $msg)").getOrElse("")) + logInfo(log"Final app status: ${MDC(LogKeys.APP_STATE, finalStatus)}, " + + log"exitCode: ${MDC(LogKeys.EXIT_CODE, exitCode)}" + + Option(msg).map(msg => log", (reason: ${MDC(LogKeys.REASON, msg)})").getOrElse(log"")) finalMsg = ComStrUtils.abbreviate(msg, sparkConf.get(AM_FINAL_MSG_LIMIT).toInt) finished = true if (!inShutdown && Thread.currentThread() != reporterThread && reporterThread != null) { @@ -481,8 +482,8 @@ private[spark] class ApplicationMaster( // the allocator is ready to service requests. rpcEnv.setupEndpoint("YarnAM", new AMEndpoint(rpcEnv, driverRef)) if (_sparkConf.get(SHUFFLE_SERVICE_ENABLED)) { - logInfo("Initializing service data for shuffle service using name '" + - s"${_sparkConf.get(SHUFFLE_SERVICE_NAME)}'") + logInfo(log"Initializing service data for shuffle service using name '" + + log"${MDC(LogKeys.SHUFFLE_SERVICE_NAME, _sparkConf.get(SHUFFLE_SERVICE_NAME))}'") } allocator.allocateResources() val ms = MetricsSystem.createMetricsSystem(MetricsSystemInstances.APPLICATION_MASTER, sparkConf) @@ -526,9 +527,9 @@ private[spark] class ApplicationMaster( userClassThread.join() } catch { case e: SparkException if e.getCause().isInstanceOf[TimeoutException] => - logError( - s"SparkContext did not initialize after waiting for $totalWaitTime ms. " + - "Please check earlier log output for errors. Failing the application.") + logError(log"SparkContext did not initialize after waiting for " + + log"${MDC(LogKeys.TIMEOUT, totalWaitTime)} ms. " + + log"Please check earlier log output for errors. 
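Most of the changes in this file, like those in the Kubernetes scheduler backend above, follow the same structured-logging migration: s-interpolated strings become log-interpolated messages whose variable parts are wrapped in MDC with a LogKeys entry, and longer messages are built by concatenating log parts with +. A minimal sketch of the call pattern, assuming spark-core on the classpath (the class and method names here are illustrative, not from the patch):

  import org.apache.spark.internal.{Logging, LogKeys, MDC}

  // Hypothetical component, for illustration only.
  class ReporterExample extends Logging {
    def report(appId: String, count: Int): Unit = {
      // Each interpolated value is wrapped in MDC with a LogKeys constant so it is
      // also emitted as a structured field; message parts are joined with +.
      logInfo(log"Application ${MDC(LogKeys.APP_ID, appId)} is running " +
        log"${MDC(LogKeys.COUNT, count)} executor(s)")
    }
  }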
Failing the application.") finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_SC_NOT_INITED, "Timed out waiting for SparkContext.") @@ -597,8 +598,8 @@ private[spark] class ApplicationMaster( ApplicationMaster.EXIT_REPORTER_FAILURE, "Exception was thrown " + s"$failureCount time(s) from Reporter thread.") } else { - logWarning( - log"Reporter thread fails ${MDC(FAILURES, failureCount)} time(s) in a row.", e) + logWarning(log"Reporter thread fails ${MDC(LogKeys.FAILURES, failureCount)} " + + log"time(s) in a row.", e) } } try { @@ -656,8 +657,9 @@ private[spark] class ApplicationMaster( t.setDaemon(true) t.setName("Reporter") t.start() - logInfo(s"Started progress reporter thread with (heartbeat : $heartbeatInterval, " + - s"initial allocation : $initialAllocationInterval) intervals") + logInfo(log"Started progress reporter thread with " + + log"(heartbeat: ${MDC(LogKeys.HEARTBEAT_INTERVAL, heartbeatInterval)}, initial allocation: " + + log"${MDC(LogKeys.INITIAL_HEARTBEAT_INTERVAL, initialAllocationInterval)}) intervals") t } @@ -683,18 +685,18 @@ private[spark] class ApplicationMaster( try { val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES) if (!preserveFiles) { - logInfo("Deleting staging directory " + stagingDirPath) + logInfo(log"Deleting staging directory ${MDC(LogKeys.PATH, stagingDirPath)}") fs.delete(stagingDirPath, true) } } catch { case ioe: IOException => - logError("Failed to cleanup staging dir " + stagingDirPath, ioe) + logError(log"Failed to cleanup staging dir ${MDC(LogKeys.PATH, stagingDirPath)}", ioe) } } /** Add the Yarn IP filter that is required for properly securing the UI. */ private def addAmIpFilter(driver: Option[RpcEndpointRef], proxyBase: String) = { - val amFilter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter" + val amFilter = classOf[AmIpFilter].getName val params = client.getAmIpFilterParams(yarnConf, proxyBase) driver match { case Some(d) => @@ -734,7 +736,8 @@ private[spark] class ApplicationMaster( override def run(): Unit = { try { if (!Modifier.isStatic(mainMethod.getModifiers)) { - logError(s"Could not find static main method in object ${args.userClass}") + logError(log"Could not find static main method in object " + + log"${MDC(LogKeys.CLASS_NAME, args.userClass)}") finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_EXCEPTION_USER_CLASS) } else { mainMethod.invoke(null, userArgs.toArray) @@ -748,7 +751,7 @@ private[spark] class ApplicationMaster( // Reporter thread can interrupt to stop user class case SparkUserAppException(exitCode) => val msg = log"User application exited with status " + - log"${MDC(EXIT_CODE, exitCode)}" + log"${MDC(LogKeys.EXIT_CODE, exitCode)}" logError(msg) finish(FinalApplicationStatus.FAILED, exitCode, msg.message) case cause: Throwable => @@ -791,9 +794,9 @@ private[spark] class ApplicationMaster( override def onStart(): Unit = { driver.send(RegisterClusterManager(self)) - // if deployment mode for yarn Application is client + // if deployment mode for yarn Application is managed client // then send the AM Log Info to spark driver - if (!isClusterMode) { + if (!isClusterMode && !sparkConf.get(YARN_UNMANAGED_AM)) { val hostPort = YarnContainerInfoHelper.getNodeManagerHttpAddress(None) val yarnAMID = "yarn-am" val info = new MiscellaneousProcessDetails(hostPort, @@ -831,7 +834,8 @@ private[spark] class ApplicationMaster( } case KillExecutors(executorIds) => - logInfo(s"Driver requested to kill executor(s) ${executorIds.mkString(", ")}.") + logInfo(log"Driver requested to kill 
executor(s) " + + log"${MDC(LogKeys.EXECUTOR_IDS, executorIds.mkString(", "))}.") Option(allocator) match { case Some(a) => executorIds.foreach(a.killExecutor) case None => logWarning("Container allocator is not ready to kill executors yet.") @@ -854,15 +858,17 @@ private[spark] class ApplicationMaster( if (!(isClusterMode || sparkConf.get(YARN_UNMANAGED_AM))) { if (shutdown || !clientModeTreatDisconnectAsFailed) { if (exitCode == 0) { - logInfo(s"Driver terminated or disconnected! Shutting down. $remoteAddress") + logInfo(log"Driver terminated or disconnected! Shutting down. " + + log"${MDC(LogKeys.HOST_PORT, remoteAddress)}") finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS) } else { - logError(log"Driver terminated with exit code ${MDC(EXIT_CODE, exitCode)}! " + - log"Shutting down. ${MDC(HOST_PORT, remoteAddress)}") + logError(log"Driver terminated with exit code ${MDC(LogKeys.EXIT_CODE, exitCode)}! " + + log"Shutting down. ${MDC(LogKeys.HOST_PORT, remoteAddress)}") finish(FinalApplicationStatus.FAILED, exitCode) } } else { - logError(s"Application Master lost connection with driver! Shutting down. $remoteAddress") + logError(log"Application Master lost connection with driver! Shutting down. " + + log"${MDC(LogKeys.HOST_PORT, remoteAddress)}") finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_DISCONNECTED) } } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index bed7c859003a0..b2c4d97bc7b07 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -55,8 +55,7 @@ import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.deploy.yarn.ResourceRequestHelper._ import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.deploy.yarn.config._ -import org.apache.spark.internal.{Logging, LogKey, MDC} -import org.apache.spark.internal.LogKey.{APP_ID, CONFIG, CONFIG2, PATH} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Python._ import org.apache.spark.launcher.{JavaModuleOptions, LauncherBackend, SparkAppHandle, YarnCommandBuilderUtils} @@ -142,7 +141,8 @@ private[spark] class Client( val principal = sparkConf.get(PRINCIPAL).orNull require((principal == null) == (keytab == null), "Both principal and keytab must be defined, or neither.") - logInfo(s"Kerberos credentials: principal = $principal, keytab = $keytab") + logInfo(log"Kerberos credentials: principal = ${MDC(LogKeys.PRINCIPAL, principal)}, " + + log"keytab = ${MDC(LogKeys.KEYTAB, keytab)}") // Generate a file name that can be used for the keytab file, that does not conflict // with any user file. 
Some(new File(keytab).getName() + "-" + UUID.randomUUID().toString) @@ -229,7 +229,7 @@ private[spark] class Client( val appContext = createApplicationSubmissionContext(newApp, containerContext) // Finally, submit and monitor the application - logInfo(s"Submitting application $appId to ResourceManager") + logInfo(log"Submitting application ${MDC(LogKeys.APP_ID, appId)} to ResourceManager") yarnClient.submitApplication(appContext) launcherBackend.setAppId(appId.toString) reportLauncherState(SparkAppHandle.State.SUBMITTED) @@ -254,11 +254,11 @@ private[spark] class Client( try { val fs = stagingDirPath.getFileSystem(hadoopConf) if (fs.delete(stagingDirPath, true)) { - logInfo(s"Deleted staging directory $stagingDirPath") + logInfo(log"Deleted staging directory ${MDC(LogKeys.PATH, stagingDirPath)}") } } catch { case ioe: IOException => - logWarning(log"Failed to cleanup staging dir ${MDC(PATH, stagingDirPath)}", ioe) + logWarning(log"Failed to cleanup staging dir ${MDC(LogKeys.PATH, stagingDirPath)}", ioe) } } @@ -332,7 +332,7 @@ private[spark] class Client( appContext.setLogAggregationContext(logAggregationContext) } catch { case NonFatal(e) => - logWarning(log"Ignoring ${MDC(CONFIG, ROLLED_LOG_INCLUDE_PATTERN.key)}} " + + logWarning(log"Ignoring ${MDC(LogKeys.CONFIG, ROLLED_LOG_INCLUDE_PATTERN.key)}} " + log"because the version of YARN does not support it", e) } } @@ -371,14 +371,16 @@ private[spark] class Client( // SPARK-37205: this regex is used to grep a list of configurations and send them to YARN RM // for fetching delegation tokens. See YARN-5910 for more details. sparkConf.get(config.AM_TOKEN_CONF_REGEX).foreach { regex => - logInfo(s"Processing token conf (spark.yarn.am.tokenConfRegex) with regex $regex") + logInfo(log"Processing token conf (spark.yarn.am.tokenConfRegex) with " + + log"regex ${MDC(LogKeys.TOKEN_REGEX, regex)}") val dob = new DataOutputBuffer() val copy = new Configuration(false) copy.clear() hadoopConf.asScala.foreach { entry => if (entry.getKey.matches(regex)) { copy.set(entry.getKey, entry.getValue) - logInfo(s"Captured key: ${entry.getKey} -> value: ${entry.getValue}") + logInfo(log"Captured key: ${MDC(LogKeys.KEY, entry.getKey)} -> " + + log"value: ${MDC(LogKeys.VALUE, entry.getValue)}") } } copy.write(dob); @@ -403,8 +405,8 @@ private[spark] class Client( */ private def verifyClusterResources(newAppResponse: GetNewApplicationResponse): Unit = { val maxMem = newAppResponse.getMaximumResourceCapability.getMemorySize - logInfo("Verifying our application has not requested more than the maximum " + - s"memory capability of the cluster ($maxMem MB per container)") + logInfo(log"Verifying our application has not requested more than the maximum memory " + + log"capability of the cluster (${MDC(LogKeys.MAX_MEMORY_SIZE, maxMem)} MB per container)") val executorMem = executorMemory + executorOffHeapMemory + executorMemoryOverhead + pysparkWorkerMemory if (executorMem > maxMem) { @@ -421,9 +423,8 @@ private[spark] class Client( "Please check the values of 'yarn.scheduler.maximum-allocation-mb' and/or " + "'yarn.nodemanager.resource.memory-mb'.") } - logInfo("Will allocate AM container, with %d MB memory including %d MB overhead".format( - amMem, - amMemoryOverhead)) + logInfo(log"Will allocate AM container, with ${MDC(LogKeys.MEMORY_SIZE, amMem)} MB memory " + + log"including ${MDC(LogKeys.OVERHEAD_MEMORY_SIZE, amMemoryOverhead)} MB overhead") // We could add checks to make sure the entire cluster has enough resources but that involves // getting all the node reports and 
computing ourselves. @@ -447,7 +448,8 @@ private[spark] class Client( var destPath = srcPath if (force || !compareFs(srcFs, destFs) || "file".equals(srcFs.getScheme)) { destPath = new Path(destDir, destName.getOrElse(srcPath.getName())) - logInfo(s"Uploading resource $srcPath -> $destPath") + logInfo(log"Uploading resource ${MDC(LogKeys.SRC_PATH, srcPath)} -> " + + log"${MDC(LogKeys.TARGET_PATH, destPath)}") try { FileUtil.copy(srcFs, srcPath, destFs, destPath, false, hadoopConf) } catch { @@ -458,7 +460,8 @@ private[spark] class Client( replication.foreach(repl => destFs.setReplication(destPath, repl)) destFs.setPermission(destPath, new FsPermission(APP_FILE_PERMISSION)) } else { - logInfo(s"Source and destination file systems are the same. Not copying $srcPath") + logInfo(log"Source and destination file systems are the same. " + + log"Not copying ${MDC(LogKeys.SRC_PATH, srcPath)}") } // Resolve any symlinks in the URI path so using a "current" symlink to point to a specific // version shows the specific version in the distributed cache configuration @@ -558,11 +561,11 @@ private[spark] class Client( val uriStr = uri.toString() val fileName = new File(uri.getPath).getName if (distributedUris.contains(uriStr)) { - logWarning(log"Same path resource ${MDC(LogKey.URI, uri)} added multiple times " + + logWarning(log"Same path resource ${MDC(LogKeys.URI, uri)} added multiple times " + log"to distributed cache.") false } else if (distributedNames.contains(fileName)) { - logWarning(log"Same name resource ${MDC(LogKey.URI, uri)} added multiple times " + + logWarning(log"Same name resource ${MDC(LogKeys.URI, uri)} added multiple times " + log"to distributed cache") false } else { @@ -701,8 +704,9 @@ private[spark] class Client( case None => // No configuration, so fall back to uploading local jar files. logWarning( - log"Neither ${MDC(CONFIG, SPARK_JARS.key)} nor ${MDC(CONFIG2, SPARK_ARCHIVE.key)}} " + - log"is set, falling back to uploading libraries under SPARK_HOME.") + log"Neither ${MDC(LogKeys.CONFIG, SPARK_JARS.key)} nor " + + log"${MDC(LogKeys.CONFIG2, SPARK_ARCHIVE.key)}} is set, falling back to uploading " + + log"libraries under SPARK_HOME.") val jarsDir = new File(YarnCommandBuilderUtils.findJarsDir( sparkConf.getenv("SPARK_HOME"))) val jarsArchive = File.createTempFile(LOCALIZED_LIB_DIR, ".zip", @@ -881,7 +885,7 @@ private[spark] class Client( if (dir.isDirectory()) { val files = dir.listFiles() if (files == null) { - logWarning(log"Failed to list files under directory ${MDC(PATH, dir)}") + logWarning(log"Failed to list files under directory ${MDC(LogKeys.PATH, dir)}") } else { files.foreach { file => if (file.isFile && !hadoopConfFiles.contains(file.getName())) { @@ -1070,7 +1074,8 @@ private[spark] class Client( sparkConf)) } if (sparkConf.get(AM_JAVA_OPTIONS).isDefined) { - logWarning(log"${MDC(CONFIG, AM_JAVA_OPTIONS.key)} will not take effect in cluster mode") + logWarning(log"${MDC(LogKeys.CONFIG, AM_JAVA_OPTIONS.key)} will not take effect " + + log"in cluster mode") } } else { // Validate and include yarn am specific java options in yarn-client mode. 
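One detail worth noting in the hunks above: when a single message carries two values of the same kind, the migrated call sites use distinct keys (LogKeys.CONFIG and LogKeys.CONFIG2 here, COUNT and TOTAL earlier in the Kubernetes backend) so the structured fields do not collide, and the logWarning/logError overloads that take a Throwable still accept log-interpolated messages. A small sketch under the same assumptions as the example above (hypothetical class and parameter names):

  import org.apache.spark.internal.{Logging, LogKeys, MDC}

  // Hypothetical component, for illustration only.
  class FallbackExample extends Logging {
    def warnFallback(jarsKey: String, archiveKey: String, cause: Throwable): Unit = {
      // Two config keys in one message -> two distinct LogKeys entries,
      // passed alongside the exception via the (message, throwable) overload.
      logWarning(log"Neither ${MDC(LogKeys.CONFIG, jarsKey)} nor " +
        log"${MDC(LogKeys.CONFIG2, archiveKey)} is set, falling back to defaults.", cause)
    }
  }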
@@ -1202,21 +1207,22 @@ private[spark] class Client( getApplicationReport() } catch { case e: ApplicationNotFoundException => - logError(log"Application ${MDC(APP_ID, appId)} not found.") + logError(log"Application ${MDC(LogKeys.APP_ID, appId)} not found.") cleanupStagingDir() return YarnAppReport(YarnApplicationState.KILLED, FinalApplicationStatus.KILLED, None) case NonFatal(e) if !e.isInstanceOf[InterruptedIOException] => - val msg = s"Failed to contact YARN for application $appId." + val msg = log"Failed to contact YARN for application ${MDC(LogKeys.APP_ID, appId)}." logError(msg, e) // Don't necessarily clean up staging dir because status is unknown return YarnAppReport(YarnApplicationState.FAILED, FinalApplicationStatus.FAILED, - Some(msg)) + Some(msg.message)) } val state = report.getYarnApplicationState reportsSinceLastLog += 1 if (logApplicationReport) { if (lastState != state || reportsSinceLastLog >= reportsTillNextLog) { - logInfo(s"Application report for $appId (state: $state)") + logInfo(log"Application report for ${MDC(LogKeys.APP_ID, appId)} " + + log"(state: ${MDC(LogKeys.APP_STATE, state)})") reportsSinceLastLog = 0 } @@ -1225,7 +1231,8 @@ private[spark] class Client( if (log.isDebugEnabled) { logDebug(formatReportDetails(report, getDriverLogsLink(report))) } else if (lastState != state) { - logInfo(formatReportDetails(report, getDriverLogsLink(report))) + logInfo(log"${MDC(LogKeys.REPORT_DETAILS, + formatReportDetails(report, getDriverLogsLink(report)))}") } } @@ -1347,7 +1354,7 @@ private[spark] class Client( .getOrElse(IMap.empty) } catch { case e: Exception => - logWarning(log"Unable to get driver log links for ${MDC(APP_ID, appId)}: ", e) + logWarning(log"Unable to get driver log links for ${MDC(LogKeys.APP_ID, appId)}: ", e) // Include the full stack trace only at DEBUG level to reduce verbosity logDebug(s"Unable to get driver log links for $appId", e) IMap.empty @@ -1367,8 +1374,10 @@ private[spark] class Client( if (!launcherBackend.isConnected() && fireAndForget) { val report = getApplicationReport() val state = report.getYarnApplicationState - logInfo(s"Application report for $appId (state: $state)") - logInfo(formatReportDetails(report, getDriverLogsLink(report))) + logInfo(log"Application report for ${MDC(LogKeys.APP_ID, appId)} " + + log"(state: ${MDC(LogKeys.APP_STATE, state)})") + logInfo(log"${MDC(LogKeys.REPORT_DETAILS, + formatReportDetails(report, getDriverLogsLink(report)))}") if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) { throw new SparkException(s"Application $appId finished with status: $state") } @@ -1376,7 +1385,7 @@ private[spark] class Client( val YarnAppReport(appState, finalState, diags) = monitorApplication() if (appState == YarnApplicationState.FAILED || finalState == FinalApplicationStatus.FAILED) { diags.foreach { err => - logError(s"Application diagnostics message: $err") + logError(log"Application diagnostics message: ${MDC(LogKeys.ERROR, err)}") } throw new SparkException(s"Application $appId finished with failed status") } @@ -1674,8 +1683,8 @@ private[spark] object Client extends Logging { def getClusterPath(conf: SparkConf, path: String): String = { val localPath = conf.get(GATEWAY_ROOT_PATH) val clusterPath = conf.get(REPLACEMENT_ROOT_PATH) - if (localPath != null && clusterPath != null) { - path.replace(localPath, clusterPath) + if (localPath.isDefined && clusterPath.isDefined) { + path.replace(localPath.get, clusterPath.get) } else { path } diff --git 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala index 202ef36166d2a..62753f35ae76c 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala @@ -19,7 +19,8 @@ package org.apache.spark.deploy.yarn import scala.collection.mutable.ArrayBuffer -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.ARGS // TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware ! private[spark] class ClientArguments(args: Array[String]) extends Logging { @@ -75,7 +76,7 @@ private[spark] class ClientArguments(args: Array[String]) extends Logging { } if (verbose) { - logInfo(s"Parsed user args for YARN application: [${userArgs.mkString(" ")}]") + logInfo(log"Parsed user args for YARN application: [${MDC(ARGS, userArgs.mkString(" "))}]") } } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 81b210a2297a5..983ab5b4341b8 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -38,7 +38,8 @@ import org.apache.hadoop.yarn.ipc.YarnRPC import org.apache.hadoop.yarn.util.Records import org.apache.spark.{SecurityManager, SparkConf, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys.{EXECUTOR_ENVS, EXECUTOR_LAUNCH_COMMANDS, EXECUTOR_RESOURCES} import org.apache.spark.internal.config._ import org.apache.spark.network.util.JavaUtils import org.apache.spark.util.Utils @@ -68,21 +69,23 @@ private[yarn] class ExecutorRunnable( startContainer() } - def launchContextDebugInfo(): String = { + def launchContextDebugInfo(): MessageWithContext = { val commands = prepareCommand() val env = prepareEnvironment() - s""" - |=============================================================================== - |Default YARN executor launch context: - | env: - |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString} - | command: - | ${Utils.redactCommandLineArgs(sparkConf, commands).mkString(" \\ \n ")} - | - | resources: - |${localResources.map { case (k, v) => s" $k -> $v\n" }.mkString} - |===============================================================================""".stripMargin + // scalastyle:off line.size.limit + log""" + |=============================================================================== + |Default YARN executor launch context: + | env: + |${MDC(EXECUTOR_ENVS, Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString)} + | command: + | ${MDC(EXECUTOR_LAUNCH_COMMANDS, Utils.redactCommandLineArgs(sparkConf, commands).mkString(" \\ \n "))} + | + | resources: + |${MDC(EXECUTOR_RESOURCES, localResources.map { case (k, v) => s" $k -> $v\n" }.mkString)} + |===============================================================================""".stripMargin + // scalastyle:on line.size.limit } def startContainer(): java.util.Map[String, ByteBuffer] = { diff --git 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ResourceRequestHelper.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ResourceRequestHelper.scala index 755a69520ce41..a747f99f1b85d 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ResourceRequestHelper.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ResourceRequestHelper.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, RESOURCE_NAME} +import org.apache.spark.internal.LogKeys.{ERROR, RESOURCE_NAME} import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceID import org.apache.spark.resource.ResourceUtils.{AMOUNT, FPGA, GPU} diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/SparkRackResolver.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/SparkRackResolver.scala index e0d66af348e29..618f0dc8a4daa 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/SparkRackResolver.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/SparkRackResolver.scala @@ -29,7 +29,8 @@ import org.apache.hadoop.yarn.util.RackResolver import org.apache.logging.log4j.{Level, LogManager} import org.apache.logging.log4j.core.Logger -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.NODE_LOCATION /** * Re-implement YARN's [[RackResolver]] for hadoop releases without YARN-9332. @@ -77,8 +78,8 @@ private[spark] class SparkRackResolver(conf: Configuration) extends Logging { val rNameList = dnsToSwitchMapping.resolve(hostNames.toList.asJava).asScala if (rNameList == null || rNameList.isEmpty) { hostNames.foreach(nodes += new NodeBase(_, NetworkTopology.DEFAULT_RACK)) - logInfo(s"Got an error when resolving hostNames. " + - s"Falling back to ${NetworkTopology.DEFAULT_RACK} for all") + logInfo(log"Got an error when resolving hostNames. 
" + + log"Falling back to ${MDC(NODE_LOCATION, NetworkTopology.DEFAULT_RACK)} for all") } else { for ((hostName, rName) <- hostNames.zip(rNameList)) { if (Strings.isNullOrEmpty(rName)) { diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index efe766be8356d..c86195d0ef31e 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -41,7 +41,7 @@ import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.deploy.yarn.config._ import org.apache.spark.executor.ExecutorExitCode import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{APP_STATE, CONFIG, CONFIG2, CONFIG3, CONTAINER_ID, ERROR, EXECUTOR_ID, HOST, REASON} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceProfile import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID @@ -195,7 +195,8 @@ private[yarn] class YarnAllocator( case (true, false) => true case (true, true) => logWarning(log"Yarn Executor Decommissioning is supported only " + - log"when ${MDC(CONFIG, SHUFFLE_SERVICE_ENABLED.key)} is set to false. See: SPARK-39018.") + log"when ${MDC(LogKeys.CONFIG, SHUFFLE_SERVICE_ENABLED.key)} is set to false. " + + log"See: SPARK-39018.") false case (false, _) => false } @@ -313,7 +314,8 @@ private[yarn] class YarnAllocator( if (!rpIdToYarnResource.containsKey(rp.id)) { // track the resource profile if not already there getOrUpdateRunningExecutorForRPId(rp.id) - logInfo(s"Resource profile ${rp.id} doesn't exist, adding it") + logInfo(log"Resource profile ${MDC(LogKeys.RESOURCE_PROFILE_ID, rp.id)} doesn't exist, " + + log"adding it") val resourcesWithDefaults = ResourceProfile.getResourcesForClusterManager(rp.id, rp.executorResources, @@ -399,8 +401,8 @@ private[yarn] class YarnAllocator( val res = resourceProfileToTotalExecs.map { case (rp, numExecs) => createYarnResourceForResourceProfile(rp) if (numExecs != getOrUpdateTargetNumExecutorsForRPId(rp.id)) { - logInfo(s"Driver requested a total number of $numExecs executor(s) " + - s"for resource profile id: ${rp.id}.") + logInfo(log"Driver requested a total number of ${MDC(LogKeys.COUNT, numExecs)} " + + log"executor(s) for resource profile id: ${MDC(LogKeys.RESOURCE_PROFILE_ID, rp.id)}.") targetNumExecutorsPerResourceProfileId(rp.id) = numExecs allocatorNodeHealthTracker.setSchedulerExcludedNodes(excludedNodes) true @@ -421,7 +423,8 @@ private[yarn] class YarnAllocator( val (_, rpId) = containerIdToExecutorIdAndResourceProfileId(container.getId) internalReleaseContainer(container) getOrUpdateRunningExecutorForRPId(rpId).remove(executorId) - case _ => logWarning(log"Attempted to kill unknown executor ${MDC(EXECUTOR_ID, executorId)}!") + case _ => logWarning(log"Attempted to kill unknown executor " + + log"${MDC(LogKeys.EXECUTOR_ID, executorId)}!") } } @@ -520,12 +523,13 @@ private[yarn] class YarnAllocator( if (missing > 0) { val resource = rpIdToYarnResource.get(rpId) if (log.isInfoEnabled()) { - var requestContainerMessage = s"Will request $missing executor container(s) for " + - s" ResourceProfile Id: $rpId, each with " + - s"${resource.getVirtualCores} core(s) and " + - s"${resource.getMemorySize} MB memory." 
- if (resource.getResources().nonEmpty) { - requestContainerMessage ++= s" with custom resources: $resource" + var requestContainerMessage = log"Will request ${MDC(LogKeys.COUNT, missing)} executor " + + log"container(s) for ResourceProfile Id: ${MDC(LogKeys.RESOURCE_PROFILE_ID, rpId)}, " + + log"each with ${MDC(LogKeys.VIRTUAL_CORES, resource.getVirtualCores)} core(s) and " + + log"${MDC(LogKeys.MEMORY_SIZE, resource.getMemorySize)} MB memory." + if (resource.getResources.nonEmpty) { + requestContainerMessage = requestContainerMessage + + log" with custom resources: ${MDC(LogKeys.RESOURCE, resource)}" } logInfo(requestContainerMessage) } @@ -536,7 +540,8 @@ private[yarn] class YarnAllocator( } val cancelledContainers = staleRequests.size if (cancelledContainers > 0) { - logInfo(s"Canceled $cancelledContainers container request(s) (locality no longer needed)") + logInfo(log"Canceled ${MDC(LogKeys.COUNT, cancelledContainers)} container request(s) " + + log"(locality no longer needed)") } // consider the number of new containers and cancelled stale containers available @@ -570,8 +575,8 @@ private[yarn] class YarnAllocator( amClient.removeContainerRequest(nonLocal) } if (numToCancel > 0) { - logInfo(s"Canceled $numToCancel unlocalized container requests to " + - s"resubmit with locality") + logInfo(log"Canceled ${MDC(LogKeys.COUNT, numToCancel)} unlocalized container " + + log"requests to resubmit with locality") } } @@ -582,16 +587,20 @@ private[yarn] class YarnAllocator( if (log.isInfoEnabled()) { val (localized, anyHost) = newLocalityRequests.partition(_.getNodes() != null) if (anyHost.nonEmpty) { - logInfo(s"Submitted ${anyHost.size} unlocalized container requests.") + logInfo(log"Submitted ${MDC(LogKeys.COUNT, anyHost.size)}} unlocalized container " + + log"requests.") } localized.foreach { request => - logInfo(s"Submitted container request for host ${hostStr(request)}.") + logInfo(log"Submitted container request for host " + + log"${MDC(LogKeys.HOST, hostStr(request))}.") } } } else if (numPendingAllocate > 0 && missing < 0) { val numToCancel = math.min(numPendingAllocate, -missing) - logInfo(s"Canceling requests for $numToCancel executor container(s) to have a new " + - s"desired total ${getOrUpdateTargetNumExecutorsForRPId(rpId)} executors.") + logInfo(log"Canceling requests for ${MDC(LogKeys.COUNT, numToCancel)} executor " + + log"container(s) to have a new desired total " + + log"${MDC(LogKeys.NUM_EXECUTOR_DESIRED, + getOrUpdateTargetNumExecutorsForRPId(rpId))} executors.") // cancel pending allocate requests by taking locality preference into account val cancelRequests = (staleRequests ++ anyHostRequests ++ localRequests).take(numToCancel) cancelRequests.foreach(amClient.removeContainerRequest) @@ -697,8 +706,9 @@ private[yarn] class YarnAllocator( runAllocatedContainers(containersToUse) - logInfo("Received %d containers from YARN, launching executors on %d of them." 
- .format(allocatedContainers.size, containersToUse.size)) + logInfo(log"Received ${MDC(LogKeys.COUNT, allocatedContainers.size)} containers from YARN, " + + log"launching executors on ${MDC(LogKeys.NUM_EXECUTOR_LAUNCH, containersToUse.size)} " + + log"of them.") } /** @@ -751,8 +761,10 @@ private[yarn] class YarnAllocator( val executorId = executorIdCounter.toString val yarnResourceForRpId = rpIdToYarnResource.get(rpId) assert(container.getResource.getMemorySize >= yarnResourceForRpId.getMemorySize) - logInfo(s"Launching container $containerId on host $executorHostname " + - s"for executor with ID $executorId for ResourceProfile Id $rpId") + logInfo(log"Launching container ${MDC(LogKeys.CONTAINER_ID, containerId)} " + + log"on host ${MDC(LogKeys.HOST, executorHostname)} for " + + log"executor with ID ${MDC(LogKeys.EXECUTOR_ID, executorId)} for " + + log"ResourceProfile Id ${MDC(LogKeys.RESOURCE_PROFILE_ID, rpId)}") val rp = rpIdToResourceProfile(rpId) val defaultResources = ResourceProfile.getDefaultProfileExecutorResources(sparkConf) @@ -790,8 +802,8 @@ private[yarn] class YarnAllocator( getOrUpdateNumExecutorsStartingForRPId(rpId).decrementAndGet() launchingExecutorContainerIds.remove(containerId) if (NonFatal(e)) { - logError(log"Failed to launch executor ${MDC(EXECUTOR_ID, executorId)} " + - log"on container ${MDC(CONTAINER_ID, containerId)}", e) + logError(log"Failed to launch executor ${MDC(LogKeys.EXECUTOR_ID, executorId)} " + + log"on container ${MDC(LogKeys.CONTAINER_ID, containerId)}", e) // Assigned container should be released immediately // to avoid unnecessary resource occupation. amClient.releaseAssignedContainer(containerId) @@ -805,9 +817,9 @@ private[yarn] class YarnAllocator( updateInternalState(rpId, executorId, container) } } else { - logInfo(("Skip launching executorRunnable as running executors count: %d " + - "reached target executors count: %d.").format(rpRunningExecs, - getOrUpdateTargetNumExecutorsForRPId(rpId))) + logInfo(log"Skip launching executorRunnable as running executors count: " + + log"${MDC(LogKeys.COUNT, rpRunningExecs)} reached target executors count: " + + log"${MDC(LogKeys.NUM_EXECUTOR_TARGET, getOrUpdateTargetNumExecutorsForRPId(rpId))}.") } } } @@ -849,47 +861,47 @@ private[yarn] class YarnAllocator( case Some((executorId, _)) => getOrUpdateRunningExecutorForRPId(rpId).remove(executorId) case None => logWarning(log"Cannot find executorId for container: " + - log"${MDC(CONTAINER_ID, containerId)}") + log"${MDC(LogKeys.CONTAINER_ID, containerId)}") } - logInfo("Completed container %s%s (state: %s, exit status: %s)".format( - containerId, - onHostStr, - completedContainer.getState, - completedContainer.getExitStatus)) + logInfo(log"Completed container ${MDC(LogKeys.CONTAINER_ID, containerId)}" + + log"${MDC(LogKeys.HOST, onHostStr)} " + + log"(state: ${MDC(LogKeys.CONTAINER_STATE, completedContainer.getState)}, " + + log"exit status: ${MDC(LogKeys.EXIT_CODE, completedContainer.getExitStatus)}") val exitStatus = completedContainer.getExitStatus val (exitCausedByApp, containerExitReason) = exitStatus match { case _ if shutdown => - (false, log"Executor for container ${MDC(CONTAINER_ID, containerId)} exited after " + - log"Application shutdown.") + (false, log"Executor for container ${MDC(LogKeys.CONTAINER_ID, containerId)} " + + log"exited after Application shutdown.") case ContainerExitStatus.SUCCESS => - (false, log"Executor for container ${MDC(CONTAINER_ID, containerId)} exited because " + - log"of a YARN event (e.g., preemption) and not because 
of an error in the running " + - log"job.") + (false, log"Executor for container ${MDC(LogKeys.CONTAINER_ID, containerId)} " + + log"exited because of a YARN event (e.g., preemption) and not because of an " + + log"error in the running job.") case ContainerExitStatus.PREEMPTED => // Preemption is not the fault of the running tasks, since YARN preempts containers // merely to do resource sharing, and tasks that fail due to preempted executors could // just as easily finish on any other executor. See SPARK-8167. - (false, log"Container ${MDC(CONTAINER_ID, containerId)}${MDC(HOST, onHostStr)} " + - log"was preempted.") + (false, log"Container ${MDC(LogKeys.CONTAINER_ID, containerId)}" + + log"${MDC(LogKeys.HOST, onHostStr)} was preempted.") // Should probably still count memory exceeded exit codes towards task failures case ContainerExitStatus.KILLED_EXCEEDED_VMEM => val vmemExceededPattern = raw"$MEM_REGEX of $MEM_REGEX virtual memory used".r val diag = vmemExceededPattern.findFirstIn(completedContainer.getDiagnostics) .map(_.concat(".")).getOrElse("") val message = log"Container killed by YARN for exceeding virtual memory limits. " + - log"${MDC(ERROR, diag)} Consider boosting " + - log"${MDC(CONFIG, EXECUTOR_MEMORY_OVERHEAD.key)} or boosting " + - log"${MDC(CONFIG2, YarnConfiguration.NM_VMEM_PMEM_RATIO)} or disabling " + - log"${MDC(CONFIG3, YarnConfiguration.NM_VMEM_CHECK_ENABLED)} because of YARN-4714." + log"${MDC(LogKeys.ERROR, diag)} Consider boosting " + + log"${MDC(LogKeys.CONFIG, EXECUTOR_MEMORY_OVERHEAD.key)} or boosting " + + log"${MDC(LogKeys.CONFIG2, YarnConfiguration.NM_VMEM_PMEM_RATIO)} or disabling " + + log"${MDC(LogKeys.CONFIG3, YarnConfiguration.NM_VMEM_CHECK_ENABLED)} " + + log"because of YARN-4714." (true, message) case ContainerExitStatus.KILLED_EXCEEDED_PMEM => val pmemExceededPattern = raw"$MEM_REGEX of $MEM_REGEX physical memory used".r val diag = pmemExceededPattern.findFirstIn(completedContainer.getDiagnostics) .map(_.concat(".")).getOrElse("") val message = log"Container killed by YARN for exceeding physical memory limits. " + - log"${MDC(ERROR, diag)} Consider boosting " + - log"${MDC(CONFIG, EXECUTOR_MEMORY_OVERHEAD.key)}." + log"${MDC(LogKeys.ERROR, diag)} Consider boosting " + + log"${MDC(LogKeys.CONFIG, EXECUTOR_MEMORY_OVERHEAD.key)}." (true, message) case other_exit_status => val exitStatus = completedContainer.getExitStatus @@ -900,17 +912,19 @@ private[yarn] class YarnAllocator( // SPARK-26269: follow YARN's behaviour, see details in // org.apache.hadoop.yarn.util.Apps#shouldCountTowardsNodeBlacklisting if (NOT_APP_AND_SYSTEM_FAULT_EXIT_STATUS.contains(other_exit_status)) { - (false, log"Container marked as failed: ${MDC(CONTAINER_ID, containerId)}" + - log"${MDC(HOST, onHostStr)}. Exit status: ${MDC(APP_STATE, exitStatus)}. " + - log"Possible causes: ${MDC(REASON, sparkExitCodeReason)} " + - log"Diagnostics: ${MDC(ERROR, completedContainer.getDiagnostics)}.") + (false, log"Container marked as failed: ${MDC(LogKeys.CONTAINER_ID, containerId)}" + + log"${MDC(LogKeys.HOST, onHostStr)}. " + + log"Exit status: ${MDC(LogKeys.EXIT_CODE, exitStatus)}. " + + log"Possible causes: ${MDC(LogKeys.REASON, sparkExitCodeReason)} " + + log"Diagnostics: ${MDC(LogKeys.ERROR, completedContainer.getDiagnostics)}.") } else { // completed container from a bad node allocatorNodeHealthTracker.handleResourceAllocationFailure(hostOpt) - (true, log"Container from a bad node: ${MDC(CONTAINER_ID, containerId)}" + - log"${MDC(HOST, onHostStr)}. 
Exit status: ${MDC(APP_STATE, exitStatus)}. " + - log"Possible causes: ${MDC(REASON, sparkExitCodeReason)} " + - log"Diagnostics: ${MDC(ERROR, completedContainer.getDiagnostics)}.") + (true, log"Container from a bad node: ${MDC(LogKeys.CONTAINER_ID, containerId)}" + + log"${MDC(LogKeys.HOST, onHostStr)}. " + + log"Exit status: ${MDC(LogKeys.EXIT_CODE, exitStatus)}. " + + log"Possible causes: ${MDC(LogKeys.REASON, sparkExitCodeReason)} " + + log"Diagnostics: ${MDC(LogKeys.ERROR, completedContainer.getDiagnostics)}.") } } if (exitCausedByApp) { @@ -981,7 +995,7 @@ private[yarn] class YarnAllocator( context.reply(releasedExecutorLossReasons.remove(eid).get) } else { logWarning(log"Tried to get the loss reason for non-existent executor " + - log"${MDC(EXECUTOR_ID, eid)}") + log"${MDC(LogKeys.EXECUTOR_ID, eid)}") context.sendFailure( new SparkException(s"Fail to find loss reason for non-existent executor $eid")) } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocatorNodeHealthTracker.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocatorNodeHealthTracker.scala index 22937ed8117a1..6938c0d7f8020 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocatorNodeHealthTracker.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocatorNodeHealthTracker.scala @@ -25,7 +25,8 @@ import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.spark.SparkConf import org.apache.spark.deploy.ExecutorFailureTracker import org.apache.spark.deploy.yarn.config._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{FAILURES, HOST, NODES} import org.apache.spark.internal.config._ import org.apache.spark.scheduler.HealthTracker @@ -90,7 +91,8 @@ private[spark] class YarnAllocatorNodeHealthTracker( private def updateAllocationExcludedNodes(hostname: String): Unit = { val failuresOnHost = failureTracker.numFailuresOnHost(hostname) if (failuresOnHost > maxFailuresPerHost) { - logInfo(s"excluding $hostname as YARN allocation failed $failuresOnHost times") + logInfo(log"excluding ${MDC(HOST, hostname)} as YARN allocation failed " + + log"${MDC(FAILURES, failuresOnHost)} times") allocatorExcludedNodeList.put( hostname, failureTracker.clock.getTimeMillis() + excludeOnFailureTimeoutMillis) @@ -125,10 +127,12 @@ private[spark] class YarnAllocatorNodeHealthTracker( val additions = (nodesToExclude -- currentExcludededYarnNodes).toList.sorted val removals = (currentExcludededYarnNodes -- nodesToExclude).toList.sorted if (additions.nonEmpty) { - logInfo(s"adding nodes to YARN application master's excluded node list: $additions") + logInfo(log"adding nodes to YARN application master's " + + log"excluded node list: ${MDC(NODES, additions)}") } if (removals.nonEmpty) { - logInfo(s"removing nodes from YARN application master's excluded node list: $removals") + logInfo(log"removing nodes from YARN application master's " + + log"excluded node list: ${MDC(NODES, removals)}") } if (additions.nonEmpty || removals.nonEmpty) { // Note YARNs api for excluding nodes is updateBlacklist. 
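The YARN-side hunks above all apply the same migration: plain s"..." interpolation inside log calls is replaced by the structured log"..." interpolator, with each interpolated value wrapped in MDC(LogKeys.<KEY>, value) so it is emitted as a keyed field rather than as raw text. A minimal sketch of the pattern, assuming a hypothetical class and method name (the imports, the HOST/FAILURES log keys, and the logInfo(log"..." + log"...") call style are taken from the hunks above):

package org.apache.spark.deploy.yarn

import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys

private[yarn] class AllocationLogger extends Logging {
  // Hypothetical helper illustrating the structured-logging call style used in this patch.
  def reportHostExclusion(hostname: String, failuresOnHost: Int): Unit = {
    // Each value is tagged with a LogKeys entry; message fragments are concatenated
    // with `+` instead of being built via String.format or s"..." interpolation.
    logInfo(log"excluding ${MDC(LogKeys.HOST, hostname)} as YARN allocation failed " +
      log"${MDC(LogKeys.FAILURES, failuresOnHost)} times")
  }
}
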
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala index c119610080199..51e5e0bfb9087 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala @@ -121,14 +121,14 @@ package object config extends Logging { "with the corresponding path in cluster machines.") .version("1.5.0") .stringConf - .createWithDefault(null) + .createOptional private[spark] val REPLACEMENT_ROOT_PATH = ConfigBuilder("spark.yarn.config.replacementPath") .doc(s"Path to use as a replacement for ${GATEWAY_ROOT_PATH.key} when launching processes " + "in the YARN cluster.") .version("1.5.0") .stringConf - .createWithDefault(null) + .createOptional private[spark] val QUEUE_NAME = ConfigBuilder("spark.yarn.queue") .version("1.0.0") diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index ccc0bc9f715e4..8032d782cf4fc 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -26,8 +26,8 @@ import org.apache.hadoop.yarn.api.records.{FinalApplicationStatus, YarnApplicati import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnAppReport} import org.apache.spark.deploy.yarn.config._ -import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.APP_STATE +import org.apache.spark.internal.{config, Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{APP_ID, APP_STATE} import org.apache.spark.launcher.SparkAppHandle import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ @@ -99,7 +99,7 @@ private[spark] class YarnClientSchedulerBackend( throw new SparkException(exceptionMsg) } if (state == YarnApplicationState.RUNNING) { - logInfo(s"Application ${appId.get} has started running.") + logInfo(log"Application ${MDC(APP_ID, appId.get)} has started running.") } } @@ -120,7 +120,7 @@ private[spark] class YarnClientSchedulerBackend( logError(log"YARN application has exited unexpectedly with state " + log"${MDC(APP_STATE, state)}! 
Check the YARN application logs for more details.") diags.foreach { err => - logError(s"Diagnostics message: $err") + logError(log"Diagnostics message: ${MDC(LogKeys.ERROR, err)}") } allowInterrupt = false sc.stop() diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala index d7f285aeb892b..cd81f11510fee 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala @@ -30,7 +30,7 @@ import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId} import org.apache.spark.SparkContext import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.{config, Logging, MDC} -import org.apache.spark.internal.LogKey.{EXECUTOR_ID, HOST_PORT, REASON} +import org.apache.spark.internal.LogKeys import org.apache.spark.internal.config.UI._ import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc._ @@ -221,7 +221,9 @@ private[spark] abstract class YarnSchedulerBackend( if (hasFilter) { // SPARK-26255: Append user provided filters(spark.ui.filters) with yarn filter. val allFilters = Seq(filterName) ++ conf.get(UI_FILTERS) - logInfo(s"Add WebUI Filter. $filterName, $filterParams, $proxyBase") + logInfo(log"Add WebUI Filter. ${MDC(LogKeys.UI_FILTER, filterName)}, " + + log"${MDC(LogKeys.UI_FILTER_PARAMS, filterParams)}, " + + log"${MDC(LogKeys.UI_PROXY_BASE, proxyBase)}") // For already installed handlers, prepend the filter. scheduler.sc.ui.foreach { ui => @@ -306,8 +308,8 @@ private[spark] abstract class YarnSchedulerBackend( .recover { case NonFatal(e) => logWarning(log"Attempted to get executor loss reason for executor id " + - log"${MDC(EXECUTOR_ID, executorId)} at RPC address " + - log"${MDC(HOST_PORT, executorRpcAddress)}, but got no response. " + + log"${MDC(LogKeys.EXECUTOR_ID, executorId)} at RPC address " + + log"${MDC(LogKeys.HOST_PORT, executorRpcAddress)}, but got no response. 
" + log"Marking as agent lost.", e) RemoveExecutor(executorId, ExecutorProcessLost()) }(ThreadUtils.sameThread) @@ -332,7 +334,7 @@ private[spark] abstract class YarnSchedulerBackend( override def receive: PartialFunction[Any, Unit] = { case RegisterClusterManager(am) => - logInfo(s"ApplicationMaster registered as $am") + logInfo(log"ApplicationMaster registered as ${MDC(LogKeys.RPC_ENDPOINT_REF, am)}") amEndpoint = Option(am) reset() @@ -346,7 +348,7 @@ private[spark] abstract class YarnSchedulerBackend( case r @ RemoveExecutor(executorId, reason) => if (!stopped.get) { logWarning(log"Requesting driver to remove executor " + - log"${MDC(EXECUTOR_ID, executorId)} for reason ${MDC(REASON, reason)}") + log"${MDC(LogKeys.EXECUTOR_ID, executorId)} for reason ${MDC(LogKeys.REASON, reason)}") driverEndpoint.send(r) } @@ -364,7 +366,8 @@ private[spark] abstract class YarnSchedulerBackend( am.ask[Boolean](r).andThen { case Success(b) => context.reply(b) case Failure(NonFatal(e)) => - logError(s"Sending $r to AM was unsuccessful", e) + logError( + log"Sending ${MDC(LogKeys.REQUEST_EXECUTORS, r)} to AM was unsuccessful", e) context.sendFailure(e) }(ThreadUtils.sameThread) case None => @@ -378,7 +381,7 @@ private[spark] abstract class YarnSchedulerBackend( am.ask[Boolean](k).andThen { case Success(b) => context.reply(b) case Failure(NonFatal(e)) => - logError(s"Sending $k to AM was unsuccessful", e) + logError(log"Sending ${MDC(LogKeys.KILL_EXECUTORS, k)} to AM was unsuccessful", e) context.sendFailure(e) }(ThreadUtils.sameThread) case None => @@ -395,7 +398,8 @@ private[spark] abstract class YarnSchedulerBackend( override def onDisconnected(remoteAddress: RpcAddress): Unit = { if (amEndpoint.exists(_.address == remoteAddress)) { - logWarning(log"ApplicationMaster has disassociated: ${MDC(HOST_PORT, remoteAddress)}") + logWarning(log"ApplicationMaster has disassociated: " + + log"${MDC(LogKeys.HOST_PORT, remoteAddress)}") amEndpoint = None } } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/AmIpFilterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/AmIpFilterSuite.scala new file mode 100644 index 0000000000000..e25bd665dec0d --- /dev/null +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/AmIpFilterSuite.scala @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.deploy.yarn + +import java.io.{IOException, PrintWriter, StringWriter} +import java.net.HttpURLConnection +import java.util +import java.util.{Collections, Locale} +import java.util.concurrent.TimeUnit +import java.util.concurrent.atomic.AtomicBoolean + +import scala.jdk.CollectionConverters._ + +import jakarta.servlet.{FilterChain, FilterConfig, ServletContext, ServletException, ServletOutputStream, ServletRequest, ServletResponse} +import jakarta.servlet.http.{Cookie, HttpServlet, HttpServletRequest, HttpServletResponse} +import jakarta.ws.rs.core.MediaType +import org.eclipse.jetty.server.{Server, ServerConnector} +import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} +import org.eclipse.jetty.util.thread.QueuedThreadPool +import org.mockito.Mockito.{mock, when} +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.SparkFunSuite + +// A port of org.apache.hadoop.yarn.server.webproxy.amfilter.TestAmFilter +class AmIpFilterSuite extends SparkFunSuite { + + private val proxyHost = "localhost" + private val proxyUri = "http://bogus" + + class TestAmIpFilter extends AmIpFilter { + override def getProxyAddresses: util.Set[String] = Set(proxyHost).asJava + } + + class DummyFilterConfig (val map: util.Map[String, String]) extends FilterConfig { + override def getFilterName: String = "dummy" + + override def getInitParameter(arg0: String): String = map.get(arg0) + + override def getInitParameterNames: util.Enumeration[String] = + Collections.enumeration(map.keySet) + + override def getServletContext: ServletContext = null + } + + test("filterNullCookies") { + val request = mock(classOf[HttpServletRequest]) + + when(request.getCookies).thenReturn(null) + when(request.getRemoteAddr).thenReturn(proxyHost) + + val response = mock(classOf[HttpServletResponse]) + val invoked = new AtomicBoolean + + val chain = new FilterChain() { + @throws[IOException] + @throws[ServletException] + override def doFilter(req: ServletRequest, resp: ServletResponse): Unit = { + invoked.set(true) + } + } + + val params = new util.HashMap[String, String] + params.put(AmIpFilter.PROXY_HOST, proxyHost) + params.put(AmIpFilter.PROXY_URI_BASE, proxyUri) + val conf = new DummyFilterConfig(params) + val filter = new TestAmIpFilter + filter.init(conf) + filter.doFilter(request, response, chain) + assert(invoked.get) + filter.destroy() + } + + test("testFindRedirectUrl") { + class EchoServlet extends HttpServlet { + @throws[IOException] + @throws[ServletException] + override def doGet(request: HttpServletRequest, response: HttpServletResponse): Unit = { + response.setContentType(MediaType.TEXT_PLAIN + "; charset=utf-8") + val out = response.getWriter + request.getParameterNames.asScala.toSeq.sorted.foreach { key => + out.print(key) + out.print(':') + out.print(request.getParameter(key)) + out.print('\n') + } + out.close() + } + } + + def withHttpEchoServer(body: String => Unit): Unit = { + val server = new Server(0) + server.getThreadPool.asInstanceOf[QueuedThreadPool].setMaxThreads(20) + val context = new ServletContextHandler + context.setContextPath("/foo") + server.setHandler(context) + val servletPath = "/bar" + context.addServlet(new ServletHolder(new EchoServlet), servletPath) + server.getConnectors.head.asInstanceOf[ServerConnector].setHost("localhost") + try { + server.start() + body(server.getURI.toString + servletPath) + } finally { + server.stop() + } + } + + // generate a valid URL + withHttpEchoServer { rm1Url 
=> + val rm1 = "rm1" + val rm2 = "rm2" + // invalid url + val rm2Url = "host2:8088" + + val filter = new TestAmIpFilter + // make sure findRedirectUrl() go to HA branch + filter.proxyUriBases = Map(rm1 -> rm1Url, rm2 -> rm2Url).asJava + filter.rmUrls = Array[String](rm1, rm2) + + assert(filter.findRedirectUrl === rm1Url) + } + } + + test("testProxyUpdate") { + var params = new util.HashMap[String, String] + params.put(AmIpFilter.PROXY_HOSTS, proxyHost) + params.put(AmIpFilter.PROXY_URI_BASES, proxyUri) + + var conf = new DummyFilterConfig(params) + val filter = new AmIpFilter + val updateInterval = TimeUnit.SECONDS.toMillis(1) + AmIpFilter.setUpdateInterval(updateInterval) + filter.init(conf) + + // check that the configuration was applied + assert(filter.getProxyAddresses.contains("127.0.0.1")) + + // change proxy configurations + params = new util.HashMap[String, String] + params.put(AmIpFilter.PROXY_HOSTS, "unknownhost") + params.put(AmIpFilter.PROXY_URI_BASES, proxyUri) + conf = new DummyFilterConfig(params) + filter.init(conf) + + // configurations shouldn't be updated now + assert(!filter.getProxyAddresses.isEmpty) + // waiting for configuration update + eventually(timeout(5.seconds), interval(500.millis)) { + assertThrows[ServletException] { + filter.getProxyAddresses.isEmpty + } + } + } + + test("testFilter") { + var doFilterRequest: String = null + var servletWrapper: AmIpServletRequestWrapper = null + + val params = new util.HashMap[String, String] + params.put(AmIpFilter.PROXY_HOST, proxyHost) + params.put(AmIpFilter.PROXY_URI_BASE, proxyUri) + val config = new DummyFilterConfig(params) + + // dummy filter + val chain = new FilterChain() { + @throws[IOException] + @throws[ServletException] + override def doFilter(req: ServletRequest, resp: ServletResponse): Unit = { + doFilterRequest = req.getClass.getName + req match { + case wrapper: AmIpServletRequestWrapper => servletWrapper = wrapper + case _ => + } + } + } + val testFilter = new AmIpFilter + testFilter.init(config) + + val response = new HttpServletResponseForTest + + // Test request should implements HttpServletRequest + val failRequest = mock(classOf[ServletRequest]) + val throws = intercept[ServletException] { + testFilter.doFilter(failRequest, response, chain) + } + assert(ProxyUtils.E_HTTP_HTTPS_ONLY === throws.getMessage) + + + // request with HttpServletRequest + val request = mock(classOf[HttpServletRequest]) + when(request.getRemoteAddr).thenReturn("nowhere") + when(request.getRequestURI).thenReturn("/app/application_00_0") + + // address "redirect" is not in host list for non-proxy connection + testFilter.doFilter(request, response, chain) + assert(HttpURLConnection.HTTP_MOVED_TEMP === response.status) + var redirect = response.getHeader(ProxyUtils.LOCATION) + assert("http://bogus/app/application_00_0" === redirect) + + // address "redirect" is not in host list for proxy connection + when(request.getRequestURI).thenReturn("/proxy/application_00_0") + testFilter.doFilter(request, response, chain) + assert(HttpURLConnection.HTTP_MOVED_TEMP === response.status) + redirect = response.getHeader(ProxyUtils.LOCATION) + assert("http://bogus/proxy/redirect/application_00_0" === redirect) + + // check for query parameters + when(request.getRequestURI).thenReturn("/proxy/application_00_0") + when(request.getQueryString).thenReturn("id=0") + testFilter.doFilter(request, response, chain) + assert(HttpURLConnection.HTTP_MOVED_TEMP === response.status) + redirect = response.getHeader(ProxyUtils.LOCATION) + 
assert("http://bogus/proxy/redirect/application_00_0?id=0" === redirect) + + // "127.0.0.1" contains in host list. Without cookie + when(request.getRemoteAddr).thenReturn("127.0.0.1") + testFilter.doFilter(request, response, chain) + assert(doFilterRequest.contains("HttpServletRequest")) + + // cookie added + val cookies = Array[Cookie](new Cookie(AmIpFilter.PROXY_USER_COOKIE_NAME, "user")) + + when(request.getCookies).thenReturn(cookies) + testFilter.doFilter(request, response, chain) + + assert(doFilterRequest === classOf[AmIpServletRequestWrapper].getName) + // request contains principal from cookie + assert(servletWrapper.getUserPrincipal.getName === "user") + assert(servletWrapper.getRemoteUser === "user") + assert(!servletWrapper.isUserInRole("")) + } + + private class HttpServletResponseForTest extends HttpServletResponse { + private var redirectLocation = "" + var status = 0 + private var contentType: String = _ + final private val headers = new util.HashMap[String, String](1) + private var body: StringWriter = _ + + def getRedirect: String = redirectLocation + + @throws[IOException] + override def sendRedirect(location: String): Unit = redirectLocation = location + + override def setDateHeader(name: String, date: Long): Unit = {} + + override def addDateHeader(name: String, date: Long): Unit = {} + + override def addCookie(cookie: Cookie): Unit = {} + + override def containsHeader(name: String): Boolean = false + + override def encodeURL(url: String): String = null + + override def encodeRedirectURL(url: String): String = url + + override def encodeUrl(url: String): String = null + + override def encodeRedirectUrl(url: String): String = null + + @throws[IOException] + override def sendError(sc: Int, msg: String): Unit = {} + + @throws[IOException] + override def sendError(sc: Int): Unit = {} + + override def setStatus(status: Int): Unit = this.status = status + + override def setStatus(sc: Int, sm: String): Unit = {} + + override def getStatus: Int = 0 + + override def setContentType(contentType: String): Unit = this.contentType = contentType + + override def setBufferSize(size: Int): Unit = {} + + override def getBufferSize: Int = 0 + + @throws[IOException] + override def flushBuffer(): Unit = {} + + override def resetBuffer(): Unit = {} + + override def isCommitted: Boolean = false + + override def reset(): Unit = {} + + override def setLocale(loc: Locale): Unit = {} + + override def getLocale: Locale = null + + override def setHeader(name: String, value: String): Unit = headers.put(name, value) + + override def addHeader(name: String, value: String): Unit = {} + + override def setIntHeader(name: String, value: Int): Unit = {} + + override def addIntHeader(name: String, value: Int): Unit = {} + + override def getHeader(name: String): String = headers.get(name) + + override def getHeaders(name: String): util.Collection[String] = null + + override def getHeaderNames: util.Collection[String] = null + + override def getCharacterEncoding: String = null + + override def getContentType: String = null + + @throws[IOException] + override def getOutputStream: ServletOutputStream = null + + @throws[IOException] + override def getWriter: PrintWriter = { + body = new StringWriter + new PrintWriter(body) + } + + override def setCharacterEncoding(charset: String): Unit = {} + + override def setContentLength(len: Int): Unit = {} + + override def setContentLengthLong(len: Long): Unit = {} + } + +} diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index 28d205f03e0fa..b7233e6e9bf3d 100755 
--- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -98,6 +98,10 @@ spark_rotate_log () . "${SPARK_HOME}/bin/load-spark-env.sh" if [ "$SPARK_IDENT_STRING" = "" ]; then + # if for some reason the shell doesn't have $USER defined + # (e.g., ssh'd in to execute a command) + # let's get the effective username and use that + USER=${USER:-$(id -nu)} export SPARK_IDENT_STRING="$USER" fi diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index e2b178d34b568..85a4633e80502 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -69,6 +69,35 @@ lexer grammar SqlBaseLexer; public void markUnclosedComment() { has_unclosed_bracketed_comment = true; } + + /** + * When greater than zero, it's in the middle of parsing ARRAY/MAP/STRUCT type. + */ + public int complex_type_level_counter = 0; + + /** + * Increase the counter by one when hits KEYWORD 'ARRAY', 'MAP', 'STRUCT'. + */ + public void incComplexTypeLevelCounter() { + complex_type_level_counter++; + } + + /** + * Decrease the counter by one when hits close tag '>' && the counter greater than zero + * which means we are in the middle of complex type parsing. Otherwise, it's a dangling + * GT token and we do nothing. + */ + public void decComplexTypeLevelCounter() { + if (complex_type_level_counter > 0) complex_type_level_counter--; + } + + /** + * If the counter is zero, it's a shift right operator. It can be closing tags of an complex + * type definition, such as MAP>. + */ + public boolean isShiftRightOperator() { + return complex_type_level_counter == 0 ? true : false; + } } SEMICOLON: ';'; @@ -100,14 +129,16 @@ ANTI: 'ANTI'; ANY: 'ANY'; ANY_VALUE: 'ANY_VALUE'; ARCHIVE: 'ARCHIVE'; -ARRAY: 'ARRAY'; +ARRAY: 'ARRAY' {incComplexTypeLevelCounter();}; AS: 'AS'; ASC: 'ASC'; AT: 'AT'; AUTHORIZATION: 'AUTHORIZATION'; +BEGIN: 'BEGIN'; BETWEEN: 'BETWEEN'; BIGINT: 'BIGINT'; BINARY: 'BINARY'; +BINDING: 'BINDING'; BOOLEAN: 'BOOLEAN'; BOTH: 'BOTH'; BUCKET: 'BUCKET'; @@ -115,6 +146,7 @@ BUCKETS: 'BUCKETS'; BY: 'BY'; BYTE: 'BYTE'; CACHE: 'CACHE'; +CALLED: 'CALLED'; CASCADE: 'CASCADE'; CASE: 'CASE'; CAST: 'CAST'; @@ -137,9 +169,11 @@ COMMENT: 'COMMENT'; COMMIT: 'COMMIT'; COMPACT: 'COMPACT'; COMPACTIONS: 'COMPACTIONS'; +COMPENSATION: 'COMPENSATION'; COMPUTE: 'COMPUTE'; CONCATENATE: 'CONCATENATE'; CONSTRAINT: 'CONSTRAINT'; +CONTAINS: 'CONTAINS'; COST: 'COST'; CREATE: 'CREATE'; CROSS: 'CROSS'; @@ -166,10 +200,12 @@ DECIMAL: 'DECIMAL'; DECLARE: 'DECLARE'; DEFAULT: 'DEFAULT'; DEFINED: 'DEFINED'; +DEFINER: 'DEFINER'; DELETE: 'DELETE'; DELIMITED: 'DELIMITED'; DESC: 'DESC'; DESCRIBE: 'DESCRIBE'; +DETERMINISTIC: 'DETERMINISTIC'; DFS: 'DFS'; DIRECTORIES: 'DIRECTORIES'; DIRECTORY: 'DIRECTORY'; @@ -182,6 +218,7 @@ ELSE: 'ELSE'; END: 'END'; ESCAPE: 'ESCAPE'; ESCAPED: 'ESCAPED'; +EVOLUTION: 'EVOLUTION'; EXCEPT: 'EXCEPT'; EXCHANGE: 'EXCHANGE'; EXCLUDE: 'EXCLUDE'; @@ -227,6 +264,7 @@ INDEX: 'INDEX'; INDEXES: 'INDEXES'; INNER: 'INNER'; INPATH: 'INPATH'; +INPUT: 'INPUT'; INPUTFORMAT: 'INPUTFORMAT'; INSERT: 'INSERT'; INTERSECT: 'INTERSECT'; @@ -234,10 +272,12 @@ INTERVAL: 'INTERVAL'; INT: 'INT'; INTEGER: 'INTEGER'; INTO: 'INTO'; +INVOKER: 'INVOKER'; IS: 'IS'; ITEMS: 'ITEMS'; JOIN: 'JOIN'; KEYS: 'KEYS'; +LANGUAGE: 'LANGUAGE'; LAST: 'LAST'; LATERAL: 'LATERAL'; LAZY: 'LAZY'; @@ -256,7 +296,7 @@ LOCKS: 'LOCKS'; LOGICAL: 'LOGICAL'; LONG: 'LONG'; 
MACRO: 'MACRO'; -MAP: 'MAP'; +MAP: 'MAP' {incComplexTypeLevelCounter();}; MATCHED: 'MATCHED'; MERGE: 'MERGE'; MICROSECOND: 'MICROSECOND'; @@ -265,6 +305,7 @@ MILLISECOND: 'MILLISECOND'; MILLISECONDS: 'MILLISECONDS'; MINUTE: 'MINUTE'; MINUTES: 'MINUTES'; +MODIFIES: 'MODIFIES'; MONTH: 'MONTH'; MONTHS: 'MONTHS'; MSCK: 'MSCK'; @@ -297,8 +338,6 @@ OVERWRITE: 'OVERWRITE'; PARTITION: 'PARTITION'; PARTITIONED: 'PARTITIONED'; PARTITIONS: 'PARTITIONS'; -PERCENTILE_CONT: 'PERCENTILE_CONT'; -PERCENTILE_DISC: 'PERCENTILE_DISC'; PERCENTLIT: 'PERCENT'; PIVOT: 'PIVOT'; PLACING: 'PLACING'; @@ -311,6 +350,7 @@ PURGE: 'PURGE'; QUARTER: 'QUARTER'; QUERY: 'QUERY'; RANGE: 'RANGE'; +READS: 'READS'; REAL: 'REAL'; RECORDREADER: 'RECORDREADER'; RECORDWRITER: 'RECORDWRITER'; @@ -325,6 +365,8 @@ REPLACE: 'REPLACE'; RESET: 'RESET'; RESPECT: 'RESPECT'; RESTRICT: 'RESTRICT'; +RETURN: 'RETURN'; +RETURNS: 'RETURNS'; REVOKE: 'REVOKE'; RIGHT: 'RIGHT'; RLIKE: 'RLIKE' | 'REGEXP'; @@ -338,6 +380,7 @@ SECOND: 'SECOND'; SECONDS: 'SECONDS'; SCHEMA: 'SCHEMA'; SCHEMAS: 'SCHEMAS'; +SECURITY: 'SECURITY'; SELECT: 'SELECT'; SEMI: 'SEMI'; SEPARATED: 'SEPARATED'; @@ -356,12 +399,14 @@ SOME: 'SOME'; SORT: 'SORT'; SORTED: 'SORTED'; SOURCE: 'SOURCE'; +SPECIFIC: 'SPECIFIC'; +SQL: 'SQL'; START: 'START'; STATISTICS: 'STATISTICS'; STORED: 'STORED'; STRATIFY: 'STRATIFY'; STRING: 'STRING'; -STRUCT: 'STRUCT'; +STRUCT: 'STRUCT' {incComplexTypeLevelCounter();}; SUBSTR: 'SUBSTR'; SUBSTRING: 'SUBSTRING'; SYNC: 'SYNC'; @@ -438,8 +483,11 @@ NEQ : '<>'; NEQJ: '!='; LT : '<'; LTE : '<=' | '!>'; -GT : '>'; +GT : '>' {decComplexTypeLevelCounter();}; GTE : '>=' | '!<'; +SHIFT_LEFT: '<<'; +SHIFT_RIGHT: '>>' {isShiftRightOperator()}?; +SHIFT_RIGHT_UNSIGNED: '>>>' {isShiftRightOperator()}?; PLUS: '+'; MINUS: '-'; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 3d008516589b2..54eff14b6d4df 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -42,6 +42,28 @@ options { tokenVocab = SqlBaseLexer; } public boolean double_quoted_identifiers = false; } +compoundOrSingleStatement + : singleStatement + | singleCompoundStatement + ; + +singleCompoundStatement + : beginEndCompoundBlock SEMICOLON? EOF + ; + +beginEndCompoundBlock + : BEGIN compoundBody END + ; + +compoundBody + : (compoundStatements+=compoundStatement SEMICOLON)* + ; + +compoundStatement + : statement + | beginEndCompoundBlock + ; + singleStatement : statement SEMICOLON* EOF ; @@ -77,29 +99,31 @@ statement | USE identifierReference #use | USE namespace identifierReference #useNamespace | SET CATALOG (errorCapturingIdentifier | stringLit) #setCatalog - | CREATE namespace (IF NOT EXISTS)? identifierReference + | CREATE namespace (IF errorCapturingNot EXISTS)? identifierReference (commentSpec | locationSpec | (WITH (DBPROPERTIES | PROPERTIES) propertyList))* #createNamespace | ALTER namespace identifierReference SET (DBPROPERTIES | PROPERTIES) propertyList #setNamespaceProperties + | ALTER namespace identifierReference + UNSET (DBPROPERTIES | PROPERTIES) propertyList #unsetNamespaceProperties | ALTER namespace identifierReference SET locationSpec #setNamespaceLocation | DROP namespace (IF EXISTS)? identifierReference (RESTRICT | CASCADE)? #dropNamespace | SHOW namespaces ((FROM | IN) multipartIdentifier)? (LIKE? pattern=stringLit)? 
#showNamespaces - | createTableHeader (LEFT_PAREN createOrReplaceTableColTypeList RIGHT_PAREN)? tableProvider? + | createTableHeader (LEFT_PAREN colDefinitionList RIGHT_PAREN)? tableProvider? createTableClauses (AS? query)? #createTable - | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier + | CREATE TABLE (IF errorCapturingNot EXISTS)? target=tableIdentifier LIKE source=tableIdentifier (tableProvider | rowFormat | createFileFormat | locationSpec | (TBLPROPERTIES tableProps=propertyList))* #createTableLike - | replaceTableHeader (LEFT_PAREN createOrReplaceTableColTypeList RIGHT_PAREN)? tableProvider? + | replaceTableHeader (LEFT_PAREN colDefinitionList RIGHT_PAREN)? tableProvider? createTableClauses (AS? query)? #replaceTable | ANALYZE TABLE identifierReference partitionSpec? COMPUTE STATISTICS @@ -141,7 +165,7 @@ statement SET SERDE stringLit (WITH SERDEPROPERTIES propertyList)? #setTableSerDe | ALTER TABLE identifierReference (partitionSpec)? SET SERDEPROPERTIES propertyList #setTableSerDe - | ALTER (TABLE | VIEW) identifierReference ADD (IF NOT EXISTS)? + | ALTER (TABLE | VIEW) identifierReference ADD (IF errorCapturingNot EXISTS)? partitionSpecLocation+ #addTablePartition | ALTER TABLE identifierReference from=partitionSpec RENAME TO to=partitionSpec #renameTablePartition @@ -153,9 +177,10 @@ statement | DROP TABLE (IF EXISTS)? identifierReference PURGE? #dropTable | DROP VIEW (IF EXISTS)? identifierReference #dropView | CREATE (OR REPLACE)? (GLOBAL? TEMPORARY)? - VIEW (IF NOT EXISTS)? identifierReference + VIEW (IF errorCapturingNot EXISTS)? identifierReference identifierCommentList? (commentSpec | + schemaBinding | (PARTITIONED ON identifierList) | (TBLPROPERTIES propertyList))* AS query #createView @@ -163,9 +188,15 @@ statement tableIdentifier (LEFT_PAREN colTypeList RIGHT_PAREN)? tableProvider (OPTIONS propertyList)? #createTempViewUsing | ALTER VIEW identifierReference AS? query #alterViewQuery - | CREATE (OR REPLACE)? TEMPORARY? FUNCTION (IF NOT EXISTS)? + | ALTER VIEW identifierReference schemaBinding #alterViewSchemaBinding + | CREATE (OR REPLACE)? TEMPORARY? FUNCTION (IF errorCapturingNot EXISTS)? identifierReference AS className=stringLit (USING resource (COMMA resource)*)? #createFunction + | CREATE (OR REPLACE)? TEMPORARY? FUNCTION (IF errorCapturingNot EXISTS)? + identifierReference LEFT_PAREN parameters=colDefinitionList? RIGHT_PAREN + (RETURNS (dataType | TABLE LEFT_PAREN returnParams=colTypeList RIGHT_PAREN))? + routineCharacteristics + RETURN (query | expression) #createUserDefinedFunction | DROP TEMPORARY? FUNCTION (IF EXISTS)? identifierReference #dropFunction | DECLARE (OR REPLACE)? VARIABLE? identifierReference dataType? variableDefaultExpression? #createVariable @@ -224,7 +255,7 @@ statement | SET .*? #setConfiguration | RESET configKey #resetQuotedConfiguration | RESET .*? #resetConfiguration - | CREATE INDEX (IF NOT EXISTS)? identifier ON TABLE? + | CREATE INDEX (IF errorCapturingNot EXISTS)? identifier ON TABLE? identifierReference (USING indexType=identifier)? LEFT_PAREN columns=multipartIdentifierPropertyList RIGHT_PAREN (OPTIONS options=propertyList)? #createIndex @@ -315,7 +346,7 @@ unsupportedHiveNativeCommands ; createTableHeader - : CREATE TEMPORARY? EXTERNAL? TABLE (IF NOT EXISTS)? identifierReference + : CREATE TEMPORARY? EXTERNAL? TABLE (IF errorCapturingNot EXISTS)? 
identifierReference ; replaceTableHeader @@ -342,6 +373,10 @@ locationSpec : LOCATION stringLit ; +schemaBinding + : WITH SCHEMA (BINDING | COMPENSATION | EVOLUTION | TYPE EVOLUTION) + ; + commentSpec : COMMENT stringLit ; @@ -351,8 +386,8 @@ query ; insertInto - : INSERT OVERWRITE TABLE? identifierReference (partitionSpec (IF NOT EXISTS)?)? ((BY NAME) | identifierList)? #insertOverwriteTable - | INSERT INTO TABLE? identifierReference partitionSpec? (IF NOT EXISTS)? ((BY NAME) | identifierList)? #insertIntoTable + : INSERT OVERWRITE TABLE? identifierReference (partitionSpec (IF errorCapturingNot EXISTS)?)? ((BY NAME) | identifierList)? #insertOverwriteTable + | INSERT INTO TABLE? identifierReference partitionSpec? (IF errorCapturingNot EXISTS)? ((BY NAME) | identifierList)? #insertIntoTable | INSERT INTO TABLE? identifierReference REPLACE whereClause #insertIntoReplaceWhere | INSERT OVERWRITE LOCAL? DIRECTORY path=stringLit rowFormat? createFileFormat? #insertOverwriteHiveDir | INSERT OVERWRITE LOCAL? DIRECTORY (path=stringLit)? tableProvider (OPTIONS options=propertyList)? #insertOverwriteDir @@ -389,6 +424,7 @@ describeFuncName | comparisonOperator | arithmeticOperator | predicateOperator + | shiftOperator | BANG ; @@ -480,7 +516,7 @@ dmlStatementNoWith | fromClause multiInsertQueryBody+ #multiInsertQuery | DELETE FROM identifierReference tableAlias whereClause? #deleteFromTable | UPDATE identifierReference tableAlias setClause whereClause? #updateTable - | MERGE INTO target=identifierReference targetAlias=tableAlias + | MERGE (WITH SCHEMA EVOLUTION)? INTO target=identifierReference targetAlias=tableAlias USING (source=identifierReference | LEFT_PAREN sourceQuery=query RIGHT_PAREN) sourceAlias=tableAlias ON mergeCondition=booleanExpression @@ -588,11 +624,11 @@ matchedClause : WHEN MATCHED (AND matchedCond=booleanExpression)? THEN matchedAction ; notMatchedClause - : WHEN NOT MATCHED (BY TARGET)? (AND notMatchedCond=booleanExpression)? THEN notMatchedAction + : WHEN errorCapturingNot MATCHED (BY TARGET)? (AND notMatchedCond=booleanExpression)? THEN notMatchedAction ; notMatchedBySourceClause - : WHEN NOT MATCHED BY SOURCE (AND notMatchedBySourceCond=booleanExpression)? THEN notMatchedBySourceAction + : WHEN errorCapturingNot MATCHED BY SOURCE (AND notMatchedBySourceCond=booleanExpression)? THEN notMatchedBySourceAction ; matchedAction @@ -838,9 +874,11 @@ tableArgumentPartitioning : ((WITH SINGLE PARTITION) | ((PARTITION | DISTRIBUTE) BY (((LEFT_PAREN partition+=expression (COMMA partition+=expression)* RIGHT_PAREN)) + | (expression (COMMA invalidMultiPartitionExpression=expression)+) | partition+=expression))) ((ORDER | SORT) BY (((LEFT_PAREN sortItem (COMMA sortItem)* RIGHT_PAREN) + | (sortItem (COMMA invalidMultiSortItem=sortItem)+) | sortItem)))? ; @@ -956,15 +994,20 @@ booleanExpression ; predicate - : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression - | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN - | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN - | NOT? kind=RLIKE pattern=valueExpression - | NOT? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) (LEFT_PAREN RIGHT_PAREN | LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN) - | NOT? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=stringLit)? - | IS NOT? kind=NULL - | IS NOT? kind=(TRUE | FALSE | UNKNOWN) - | IS NOT? kind=DISTINCT FROM right=valueExpression + : errorCapturingNot? kind=BETWEEN lower=valueExpression AND upper=valueExpression + | errorCapturingNot? 
kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN + | errorCapturingNot? kind=IN LEFT_PAREN query RIGHT_PAREN + | errorCapturingNot? kind=RLIKE pattern=valueExpression + | errorCapturingNot? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) (LEFT_PAREN RIGHT_PAREN | LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN) + | errorCapturingNot? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=stringLit)? + | IS errorCapturingNot? kind=NULL + | IS errorCapturingNot? kind=(TRUE | FALSE | UNKNOWN) + | IS errorCapturingNot? kind=DISTINCT FROM right=valueExpression + ; + +errorCapturingNot + : NOT + | BANG ; valueExpression @@ -972,12 +1015,19 @@ valueExpression | operator=(MINUS | PLUS | TILDE) valueExpression #arithmeticUnary | left=valueExpression operator=(ASTERISK | SLASH | PERCENT | DIV) right=valueExpression #arithmeticBinary | left=valueExpression operator=(PLUS | MINUS | CONCAT_PIPE) right=valueExpression #arithmeticBinary + | left=valueExpression shiftOperator right=valueExpression #shiftExpression | left=valueExpression operator=AMPERSAND right=valueExpression #arithmeticBinary | left=valueExpression operator=HAT right=valueExpression #arithmeticBinary | left=valueExpression operator=PIPE right=valueExpression #arithmeticBinary | left=valueExpression comparisonOperator right=valueExpression #comparison ; +shiftOperator + : SHIFT_LEFT + | SHIFT_RIGHT + | SHIFT_RIGHT_UNSIGNED + ; + datetimeUnit : YEAR | QUARTER | MONTH | WEEK | DAY | DAYOFYEAR @@ -1143,7 +1193,7 @@ qualifiedColTypeWithPosition ; colDefinitionDescriptorWithPosition - : NOT NULL + : errorCapturingNot NULL | defaultExpression | commentSpec | colPosition @@ -1162,19 +1212,19 @@ colTypeList ; colType - : colName=errorCapturingIdentifier dataType (NOT NULL)? commentSpec? + : colName=errorCapturingIdentifier dataType (errorCapturingNot NULL)? commentSpec? ; -createOrReplaceTableColTypeList - : createOrReplaceTableColType (COMMA createOrReplaceTableColType)* +colDefinitionList + : colDefinition (COMMA colDefinition)* ; -createOrReplaceTableColType +colDefinition : colName=errorCapturingIdentifier dataType colDefinitionOption* ; colDefinitionOption - : NOT NULL + : errorCapturingNot NULL | defaultExpression | generationExpression | commentSpec @@ -1189,9 +1239,49 @@ complexColTypeList ; complexColType - : errorCapturingIdentifier COLON? dataType (NOT NULL)? commentSpec? + : errorCapturingIdentifier COLON? dataType (errorCapturingNot NULL)? commentSpec? 
+ ; + +routineCharacteristics + : (routineLanguage + | specificName + | deterministic + | sqlDataAccess + | nullCall + | commentSpec + | rightsClause)* + ; + +routineLanguage + : LANGUAGE (SQL | IDENTIFIER) + ; + +specificName + : SPECIFIC specific=errorCapturingIdentifier ; +deterministic + : DETERMINISTIC + | errorCapturingNot DETERMINISTIC + ; + +sqlDataAccess + : access=NO SQL + | access=CONTAINS SQL + | access=READS SQL DATA + | access=MODIFIES SQL DATA + ; + +nullCall + : RETURNS NULL ON NULL INPUT + | CALLED ON NULL INPUT + ; + +rightsClause + : SQL SECURITY INVOKER + | SQL SECURITY DEFINER + ; + whenClause : WHEN condition=expression THEN result=expression ; @@ -1296,7 +1386,7 @@ alterColumnAction : TYPE dataType | commentSpec | colPosition - | setOrDrop=(SET | DROP) NOT NULL + | setOrDrop=(SET | DROP) errorCapturingNot NULL | SET defaultExpression | dropDefault=DROP DEFAULT ; @@ -1339,16 +1429,19 @@ ansiNonReserved | ARRAY | ASC | AT + | BEGIN | BETWEEN | BIGINT | BINARY | BINARY_HEX + | BINDING | BOOLEAN | BUCKET | BUCKETS | BY | BYTE | CACHE + | CALLED | CASCADE | CATALOG | CATALOGS @@ -1365,8 +1458,10 @@ ansiNonReserved | COMMIT | COMPACT | COMPACTIONS + | COMPENSATION | COMPUTE | CONCATENATE + | CONTAINS | COST | CUBE | CURRENT @@ -1387,10 +1482,12 @@ ansiNonReserved | DECLARE | DEFAULT | DEFINED + | DEFINER | DELETE | DELIMITED | DESC | DESCRIBE + | DETERMINISTIC | DFS | DIRECTORIES | DIRECTORY @@ -1399,6 +1496,7 @@ ansiNonReserved | DOUBLE | DROP | ESCAPED + | EVOLUTION | EXCHANGE | EXCLUDE | EXISTS @@ -1430,13 +1528,16 @@ ansiNonReserved | INDEX | INDEXES | INPATH + | INPUT | INPUTFORMAT | INSERT | INT | INTEGER | INTERVAL + | INVOKER | ITEMS | KEYS + | LANGUAGE | LAST | LAZY | LIKE @@ -1461,6 +1562,7 @@ ansiNonReserved | MILLISECONDS | MINUTE | MINUTES + | MODIFIES | MONTH | MONTHS | MSCK @@ -1494,6 +1596,7 @@ ansiNonReserved | QUARTER | QUERY | RANGE + | READS | REAL | RECORDREADER | RECORDWRITER @@ -1507,6 +1610,8 @@ ansiNonReserved | RESET | RESPECT | RESTRICT + | RETURN + | RETURNS | REVOKE | RLIKE | ROLE @@ -1519,6 +1624,7 @@ ansiNonReserved | SCHEMAS | SECOND | SECONDS + | SECURITY | SEMI | SEPARATED | SERDE @@ -1534,6 +1640,7 @@ ansiNonReserved | SORT | SORTED | SOURCE + | SPECIFIC | START | STATISTICS | STORED @@ -1638,10 +1745,12 @@ nonReserved | ASC | AT | AUTHORIZATION + | BEGIN | BETWEEN | BIGINT | BINARY | BINARY_HEX + | BINDING | BOOLEAN | BOTH | BUCKET @@ -1649,6 +1758,7 @@ nonReserved | BY | BYTE | CACHE + | CALLED | CASCADE | CASE | CAST @@ -1671,9 +1781,11 @@ nonReserved | COMMIT | COMPACT | COMPACTIONS + | COMPENSATION | COMPUTE | CONCATENATE | CONSTRAINT + | CONTAINS | COST | CREATE | CUBE @@ -1699,10 +1811,12 @@ nonReserved | DECLARE | DEFAULT | DEFINED + | DEFINER | DELETE | DELIMITED | DESC | DESCRIBE + | DETERMINISTIC | DFS | DIRECTORIES | DIRECTORY @@ -1715,6 +1829,7 @@ nonReserved | END | ESCAPE | ESCAPED + | EVOLUTION | EXCHANGE | EXCLUDE | EXECUTE @@ -1757,15 +1872,18 @@ nonReserved | INDEX | INDEXES | INPATH + | INPUT | INPUTFORMAT | INSERT | INT | INTEGER | INTERVAL | INTO + | INVOKER | IS | ITEMS | KEYS + | LANGUAGE | LAST | LAZY | LEADING @@ -1792,6 +1910,7 @@ nonReserved | MILLISECONDS | MINUTE | MINUTES + | MODIFIES | MONTH | MONTHS | MSCK @@ -1822,8 +1941,6 @@ nonReserved | PARTITION | PARTITIONED | PARTITIONS - | PERCENTILE_CONT - | PERCENTILE_DISC | PERCENTLIT | PIVOT | PLACING @@ -1836,6 +1953,7 @@ nonReserved | QUARTER | QUERY | RANGE + | READS | REAL | RECORDREADER | RECORDWRITER @@ -1850,6 +1968,8 @@ nonReserved | RESET | RESPECT | 
RESTRICT + | RETURN + | RETURNS | REVOKE | RLIKE | ROLE @@ -1862,6 +1982,7 @@ nonReserved | SCHEMAS | SECOND | SECONDS + | SECURITY | SELECT | SEPARATED | SERDE @@ -1878,6 +1999,8 @@ nonReserved | SORT | SORTED | SOURCE + | SPECIFIC + | SQL | START | STATISTICS | STORED diff --git a/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/Identifier.java b/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/Identifier.java index 88c51d6c43ddf..efb71f196f5f7 100644 --- a/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/Identifier.java +++ b/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/Identifier.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java b/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java index 17895e73d9fcf..ba883b8042c4d 100644 --- a/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java +++ b/sql/api/src/main/java/org/apache/spark/sql/connector/catalog/IdentifierImpl.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/api/src/main/java/org/apache/spark/sql/types/DataTypes.java b/sql/api/src/main/java/org/apache/spark/sql/types/DataTypes.java index 32c20dedac4c6..0034b8e715183 100644 --- a/sql/api/src/main/java/org/apache/spark/sql/types/DataTypes.java +++ b/sql/api/src/main/java/org/apache/spark/sql/types/DataTypes.java @@ -259,4 +259,22 @@ public static StructType createStructType(StructField[] fields) { return StructType$.MODULE$.apply(fields); } + + /** + * Creates a CharType with the given length. + * + * @since 4.0.0 + */ + public static CharType createCharType(int length) { + return new CharType(length); + } + + /** + * Creates a VarcharType with the given length. + * + * @since 4.0.0 + */ + public static VarcharType createVarcharType(int length) { + return new VarcharType(length); + } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/ObservationBase.scala b/sql/api/src/main/scala/org/apache/spark/sql/ObservationBase.scala new file mode 100644 index 0000000000000..4789ae8975d12 --- /dev/null +++ b/sql/api/src/main/scala/org/apache/spark/sql/ObservationBase.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import scala.jdk.CollectionConverters.MapHasAsJava + +/** + * Helper class to simplify usage of `Dataset.observe(String, Column, Column*)`: + * + * {{{ + * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it + * val observation = Observation("my metrics") + * val observed_ds = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid")) + * observed_ds.write.parquet("ds.parquet") + * val metrics = observation.get + * }}} + * + * This collects the metrics while the first action is executed on the observed dataset. Subsequent + * actions do not modify the metrics returned by [[get]]. Retrieval of the metric via [[get]] + * blocks until the first action has finished and metrics become available. + * + * This class does not support streaming datasets. + * + * @param name name of the metric + * @since 3.3.0 + */ +abstract class ObservationBase(val name: String) { + + if (name.isEmpty) throw new IllegalArgumentException("Name must not be empty") + + @volatile protected var metrics: Option[Map[String, Any]] = None + + /** + * (Scala-specific) Get the observed metrics. This waits for the observed dataset to finish + * its first action. Only the result of the first action is available. Subsequent actions do not + * modify the result. + * + * @return the observed metrics as a `Map[String, Any]` + * @throws InterruptedException interrupted while waiting + */ + @throws[InterruptedException] + def get: Map[String, _] = { + synchronized { + // we need to loop as wait might return without us calling notify + // https://en.wikipedia.org/w/index.php?title=Spurious_wakeup&oldid=992601610 + while (this.metrics.isEmpty) { + wait() + } + } + + this.metrics.get + } + + /** + * (Java-specific) Get the observed metrics. This waits for the observed dataset to finish + * its first action. Only the result of the first action is available. Subsequent actions do not + * modify the result. + * + * @return the observed metrics as a `java.util.Map[String, Object]` + * @throws InterruptedException interrupted while waiting + */ + @throws[InterruptedException] + def getAsJava: java.util.Map[String, AnyRef] = { + get.map { case (key, value) => (key, value.asInstanceOf[Object]) }.asJava + } + + /** + * Get the observed metrics. This returns the metrics if they are available, otherwise an empty. + * + * @return the observed metrics as a `Map[String, Any]` + */ + @throws[InterruptedException] + private[sql] def getOrEmpty: Map[String, _] = { + synchronized { + if (metrics.isEmpty) { + wait(100) // Wait for 100ms to see if metrics are available + } + metrics.getOrElse(Map.empty) + } + } + + /** + * Set the observed metrics and notify all waiting threads to resume. + * + * @return `true` if all waiting threads were notified, `false` if otherwise. 
+ */ + private[spark] def setMetricsAndNotify(metrics: Option[Map[String, Any]]): Boolean = { + synchronized { + this.metrics = metrics + if(metrics.isDefined) { + notifyAll() + true + } else { + false + } + } + } +} diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index 16ac283eccb15..c507e952630f6 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.encoders import scala.collection.mutable import scala.reflect.classTag -import org.apache.spark.sql.Row +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, CalendarIntervalEncoder, DateEncoder, DayTimeIntervalEncoder, EncoderField, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, TimestampEncoder, UDTEncoder, VariantEncoder, YearMonthIntervalEncoder} -import org.apache.spark.sql.errors.ExecutionErrors +import org.apache.spark.sql.errors.{DataTypeErrorsBase, ExecutionErrors} import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.types._ import org.apache.spark.util.ArrayImplicits._ @@ -59,7 +59,7 @@ import org.apache.spark.util.ArrayImplicits._ * StructType -> org.apache.spark.sql.Row * }}} */ -object RowEncoder { +object RowEncoder extends DataTypeErrorsBase { def encoderFor(schema: StructType): AgnosticEncoder[Row] = { encoderFor(schema, lenient = false) } @@ -124,5 +124,11 @@ object RowEncoder { field.nullable, field.metadata) }.toImmutableArraySeq) + + case _ => + throw new AnalysisException( + errorClass = "UNSUPPORTED_DATA_TYPE_FOR_ENCODER", + messageParameters = Map("dataType" -> toSQLType(dataType)) + ) } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/QueryContexts.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/QueryContexts.scala index 1c2456f00bcdc..2b3f4674539e3 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/QueryContexts.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/QueryContexts.scala @@ -145,36 +145,30 @@ case class DataFrameQueryContext( override def stopIndex: Int = throw SparkUnsupportedOperationException() override val fragment: String = { - stackTrace.headOption.map { firstElem => - val methodName = firstElem.getMethodName - if (methodName.length > 1 && methodName(0) == '$') { - methodName.substring(1) - } else { - methodName - } - }.getOrElse("") + pysparkErrorContext.map(_._1).getOrElse { + stackTrace.headOption.map { firstElem => + val methodName = firstElem.getMethodName + if (methodName.length > 1 && methodName(0) == '$') { + methodName.substring(1) + } else { + methodName + } + }.getOrElse("") + } } - override val callSite: String = stackTrace.tail.mkString("\n") - - val pysparkFragment: String = pysparkErrorContext.map(_._1).getOrElse("") - val pysparkCallSite: String = pysparkErrorContext.map(_._2).getOrElse("") - - val (displayedFragment, displayedCallsite) = if (pysparkErrorContext.nonEmpty) { - (pysparkFragment, pysparkCallSite) - } else { - (fragment, callSite) - } + 
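A minimal sketch of the blocking `get()` / `setMetricsAndNotify()` handshake defined by `ObservationBase` above; the package placement (needed only so the `private[spark]` setter resolves) and the `DemoObservation` subclass are assumptions made purely for illustration.

package org.apache.spark.sql

object ObservationHandshakeSketch {
  // Hypothetical concrete subclass; ObservationBase itself is abstract.
  private class DemoObservation(name: String) extends ObservationBase(name)

  def main(args: Array[String]): Unit = {
    val obs = new DemoObservation("my metrics")
    // get blocks until metrics have been published via setMetricsAndNotify.
    val reader = new Thread(() => println(s"observed: ${obs.get}"))
    reader.start()
    // Stand-in for the listener that publishes metrics once the first action finishes.
    obs.setMetricsAndNotify(Some(Map("rows" -> 42L)))
    reader.join()
  }
}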
override val callSite: String = pysparkErrorContext.map( + _._2).getOrElse(stackTrace.tail.mkString("\n")) override lazy val summary: String = { val builder = new StringBuilder builder ++= "== DataFrame ==\n" builder ++= "\"" - builder ++= displayedFragment + builder ++= fragment builder ++= "\"" builder ++= " was called from\n" - builder ++= displayedCallsite + builder ++= callSite builder += '\n' builder.result() diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/origin.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/origin.scala index 9d3968b025350..4ecbfd631e7e8 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/origin.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/trees/origin.scala @@ -85,3 +85,20 @@ object CurrentOrigin { ret } } + +/** + * Provides detailed error context information on PySpark. + */ +object PySparkCurrentOrigin { + private val pysparkErrorContext = new ThreadLocal[Option[(String, String)]]() { + override def initialValue(): Option[(String, String)] = None + } + + def set(fragment: String, callSite: String): Unit = { + pysparkErrorContext.set(Some((fragment, callSite))) + } + + def get(): Option[(String, String)] = pysparkErrorContext.get() + + def clear(): Unit = pysparkErrorContext.remove() +} diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index 8db8c3cd39d74..0447d813e26a5 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -198,7 +198,7 @@ trait SparkDateTimeUtils { } private val zoneInfoClassName = "sun.util.calendar.ZoneInfo" - private val getOffsetsByWallHandle = { + private lazy val getOffsetsByWallHandle = { val lookup = MethodHandles.lookup() val classType = SparkClassUtils.classForName(zoneInfoClassName) val methodName = "getOffsetsByWall" diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkParserUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkParserUtils.scala index a4ce5fb120340..7597cb1d9087d 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkParserUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkParserUtils.scala @@ -16,8 +16,7 @@ */ package org.apache.spark.sql.catalyst.util -import java.lang.{Long => JLong} -import java.nio.CharBuffer +import java.lang.{Long => JLong, StringBuilder => JStringBuilder} import org.antlr.v4.runtime.{ParserRuleContext, Token} import org.antlr.v4.runtime.misc.Interval @@ -26,16 +25,10 @@ import org.antlr.v4.runtime.tree.TerminalNode import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} trait SparkParserUtils { - val U16_CHAR_PATTERN = """\\u([a-fA-F0-9]{4})(?s).*""".r - val U32_CHAR_PATTERN = """\\U([a-fA-F0-9]{8})(?s).*""".r - val OCTAL_CHAR_PATTERN = """\\([01][0-7]{2})(?s).*""".r - val ESCAPED_CHAR_PATTERN = """\\((?s).)(?s).*""".r /** Unescape backslash-escaped string enclosed by quotes. 
*/ def unescapeSQLString(b: String): String = { - val sb = new StringBuilder(b.length()) - - def appendEscapedChar(n: Char): Unit = { + def appendEscapedChar(n: Char, sb: JStringBuilder): Unit = { n match { case '0' => sb.append('\u0000') case 'b' => sb.append('\b') @@ -50,22 +43,64 @@ trait SparkParserUtils { } } - if (b.startsWith("r") || b.startsWith("R")) { + def allCharsAreHex(s: String, start: Int, length: Int): Boolean = { + val end = start + length + var i = start + while (i < end) { + val c = s.charAt(i) + val cIsHex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') + if (!cIsHex) { + return false + } + i += 1 + } + true + } + + def isThreeDigitOctalEscape(s: String, start: Int): Boolean = { + val firstChar = s.charAt(start) + val secondChar = s.charAt(start + 1) + val thirdChar = s.charAt(start + 2) + (firstChar == '0' || firstChar == '1') && + (secondChar >= '0' && secondChar <= '7') && + (thirdChar >= '0' && thirdChar <= '7') + } + + val isRawString = { + val firstChar = b.charAt(0) + firstChar == 'r' || firstChar == 'R' + } + + if (isRawString) { + // Skip the 'r' or 'R' and the first and last quotations enclosing the string literal. b.substring(2, b.length - 1) + } else if (b.indexOf('\\') == -1) { + // Fast path for the common case where the string has no escaped characters, + // in which case we just skip the first and last quotations enclosing the string literal. + b.substring(1, b.length - 1) } else { + val sb = new JStringBuilder(b.length()) // Skip the first and last quotations enclosing the string literal. - val charBuffer = CharBuffer.wrap(b, 1, b.length - 1) - - while (charBuffer.remaining() > 0) { - charBuffer match { - case U16_CHAR_PATTERN(cp) => + var i = 1 + val length = b.length - 1 + while (i < length) { + val c = b.charAt(i) + if (c != '\\' || i + 1 == length) { + // Either a regular character or a backslash at the end of the string: + sb.append(c) + i += 1 + } else { + // A backslash followed by at least one character: + i += 1 + val cAfterBackslash = b.charAt(i) + if (cAfterBackslash == 'u' && i + 1 + 4 <= length && allCharsAreHex(b, i + 1, 4)) { // \u0000 style 16-bit unicode character literals. - sb.append(Integer.parseInt(cp, 16).toChar) - charBuffer.position(charBuffer.position() + 6) - case U32_CHAR_PATTERN(cp) => + sb.append(Integer.parseInt(b, i + 1, i + 1 + 4, 16).toChar) + i += 1 + 4 + } else if (cAfterBackslash == 'U' && i + 1 + 8 <= length && allCharsAreHex(b, i + 1, 8)) { // \U00000000 style 32-bit unicode character literals. // Use Long to treat codePoint as unsigned in the range of 32-bit. - val codePoint = JLong.parseLong(cp, 16) + val codePoint = JLong.parseLong(b, i + 1, i + 1 + 8, 16) if (codePoint < 0x10000) { sb.append((codePoint & 0xFFFF).toChar) } else { @@ -74,21 +109,18 @@ trait SparkParserUtils { sb.append(highSurrogate.toChar) sb.append(lowSurrogate.toChar) } - charBuffer.position(charBuffer.position() + 10) - case OCTAL_CHAR_PATTERN(cp) => + i += 1 + 8 + } else if (i + 3 <= length && isThreeDigitOctalEscape(b, i)) { // \000 style character literals. - sb.append(Integer.parseInt(cp, 8).toChar) - charBuffer.position(charBuffer.position() + 4) - case ESCAPED_CHAR_PATTERN(c) => - // escaped character literals. - appendEscapedChar(c.charAt(0)) - charBuffer.position(charBuffer.position() + 2) - case _ => - // non-escaped character literals. 
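A small sketch of the behaviour the rewritten `unescapeSQLString` is expected to preserve; `UnescapeSketch` merely mixes in the public trait, and the input literals are illustrative.

import org.apache.spark.sql.catalyst.util.SparkParserUtils

object UnescapeSketch extends SparkParserUtils {
  def main(args: Array[String]): Unit = {
    // Fast path: no backslash present, only the enclosing quotes are stripped.
    println(unescapeSQLString("'hello'"))  // hello
    // Simple escape and three-digit octal escape.
    println(unescapeSQLString("'a\\tb'"))  // a<TAB>b
    println(unescapeSQLString("'\\101'"))  // A (octal 101 = 65)
    // Raw string: the r prefix and quotes are stripped, escapes stay verbatim.
    println(unescapeSQLString("r'\\n'"))   // \n (two characters, not a newline)
  }
}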
- sb.append(charBuffer.get()) + sb.append(Integer.parseInt(b, i, i + 3, 8).toChar) + i += 3 + } else { + appendEscapedChar(cAfterBackslash, sb) + i += 1 + } } } - sb.toString() + sb.toString } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index aa8826dd48b66..edb1ee371b156 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.catalyst.util +import java.util.HexFormat import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.internal.Logging @@ -101,11 +102,16 @@ object SparkStringUtils extends Logging { truncatedString(seq, "", sep, "", maxFields) } + private final lazy val SPACE_DELIMITED_UPPERCASE_HEX = + HexFormat.of().withDelimiter(" ").withUpperCase() + /** * Returns a pretty string of the byte array which prints each byte as a hex digit and add spaces * between them. For example, [1A C0]. */ - def getHexString(bytes: Array[Byte]): String = bytes.map("%02X".format(_)).mkString("[", " ", "]") + def getHexString(bytes: Array[Byte]): String = { + s"[${SPACE_DELIMITED_UPPERCASE_HEX.formatHex(bytes)}]" + } def sideBySide(left: String, right: String): Seq[String] = { sideBySide(left.split("\n").toImmutableArraySeq, right.split("\n").toImmutableArraySeq) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala index d59b52a3818ac..9f57f8375c54d 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala @@ -162,6 +162,9 @@ class Iso8601TimestampFormatter( protected lazy val formatter: DateTimeFormatter = getOrCreateFormatter(pattern, locale, isParsing) + @transient + private lazy val zonedFormatter: DateTimeFormatter = formatter.withZone(zoneId) + @transient protected lazy val legacyFormatter = TimestampFormatter.getLegacyFormatter( pattern, zoneId, locale, legacyFormat) @@ -231,7 +234,7 @@ class Iso8601TimestampFormatter( override def format(instant: Instant): String = { try { - formatter.withZone(zoneId).format(instant) + zonedFormatter.format(instant) } catch checkFormattedDiff(toJavaTimestamp(instantToMicros(instant)), (t: Timestamp) => format(t)) } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/errors/DataTypeErrorsBase.scala b/sql/api/src/main/scala/org/apache/spark/sql/errors/DataTypeErrorsBase.scala index d1d9dd806b3b8..930f92db26826 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/errors/DataTypeErrorsBase.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/errors/DataTypeErrorsBase.scala @@ -20,7 +20,7 @@ import java.util.Locale import org.apache.spark.QueryContext import org.apache.spark.sql.catalyst.util.{AttributeNameParser, QuotingUtils} -import org.apache.spark.sql.types.{AbstractDataType, DataType, TypeCollection} +import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String private[sql] trait DataTypeErrorsBase { @@ -50,6 +50,7 @@ private[sql] trait DataTypeErrorsBase { def toSQLType(t: AbstractDataType): String = t match { case TypeCollection(types) => types.map(toSQLType).mkString("(", " or ", ")") + case u: UserDefinedType[_] => s"UDT(${toSQLType(u.sqlType)})" case dt: DataType => 
quoteByDefault(dt.sql) case at => quoteByDefault(at.simpleString.toUpperCase(Locale.ROOT)) } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala b/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala index 9d0d4ea799746..e7ae9f2bfb7bb 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala @@ -36,6 +36,12 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase { new ParseException(errorClass = "_LEGACY_ERROR_TEMP_0001", ctx) } + def parserStackOverflow(parserRuleContext: ParserRuleContext): Throwable = { + throw new ParseException( + errorClass = "FAILED_TO_PARSE_TOO_COMPLEX", + ctx = parserRuleContext) + } + def insertOverwriteDirectoryUnsupportedError(): Throwable = { SparkException.internalError("INSERT OVERWRITE DIRECTORY is not supported.") } @@ -283,7 +289,7 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase { def nestedTypeMissingElementTypeError( dataType: String, ctx: PrimitiveDataTypeContext): Throwable = { - dataType match { + dataType.toUpperCase(Locale.ROOT) match { case "ARRAY" => new ParseException( errorClass = "INCOMPLETE_TYPE_DEFINITION.ARRAY", @@ -540,6 +546,12 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase { new ParseException(errorClass = "_LEGACY_ERROR_TEMP_0052", ctx) } + def temporaryViewWithSchemaBindingMode(ctx: StatementContext): Throwable = { + new ParseException(errorClass = "UNSUPPORTED_FEATURE.TEMPORARY_VIEW_WITH_SCHEMA_BINDING_MODE", + messageParameters = Map.empty, + ctx) + } + def parameterMarkerNotAllowed(statement: String, origin: Origin): Throwable = { new ParseException( command = origin.sqlText, @@ -562,19 +574,19 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase { ctx) } - def createFuncWithBothIfNotExistsAndReplaceError(ctx: CreateFunctionContext): Throwable = { + def createFuncWithBothIfNotExistsAndReplaceError(ctx: ParserRuleContext): Throwable = { new ParseException( - errorClass = "INVALID_SQL_SYNTAX.CREATE_FUNC_WITH_IF_NOT_EXISTS_AND_REPLACE", + errorClass = "INVALID_SQL_SYNTAX.CREATE_ROUTINE_WITH_IF_NOT_EXISTS_AND_REPLACE", ctx) } - def defineTempFuncWithIfNotExistsError(ctx: CreateFunctionContext): Throwable = { + def defineTempFuncWithIfNotExistsError(ctx: ParserRuleContext): Throwable = { new ParseException( errorClass = "INVALID_SQL_SYNTAX.CREATE_TEMP_FUNC_WITH_IF_NOT_EXISTS", ctx) } - def unsupportedFunctionNameError(funcName: Seq[String], ctx: CreateFunctionContext): Throwable = { + def unsupportedFunctionNameError(funcName: Seq[String], ctx: ParserRuleContext): Throwable = { new ParseException( errorClass = "INVALID_SQL_SYNTAX.MULTI_PART_NAME", messageParameters = Map( @@ -585,7 +597,7 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase { def specifyingDBInCreateTempFuncError( databaseName: String, - ctx: CreateFunctionContext): Throwable = { + ctx: ParserRuleContext): Throwable = { new ParseException( errorClass = "INVALID_SQL_SYNTAX.CREATE_TEMP_FUNC_WITH_DATABASE", messageParameters = Map("database" -> toSQLId(databaseName)), diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractMapType.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractMapType.scala new file mode 100644 index 0000000000000..62f422f6f80a7 --- /dev/null +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractMapType.scala @@ -0,0 +1,43 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.internal.types + +import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType} + + +/** + * Use AbstractMapType(AbstractDataType, AbstractDataType) + * for defining expected types for expression parameters. + */ +case class AbstractMapType( + keyType: AbstractDataType, + valueType: AbstractDataType + ) extends AbstractDataType { + + override private[sql] def defaultConcreteType: DataType = + MapType(keyType.defaultConcreteType, valueType.defaultConcreteType, valueContainsNull = true) + + override private[sql] def acceptsType(other: DataType): Boolean = { + other.isInstanceOf[MapType] && + keyType.acceptsType(other.asInstanceOf[MapType].keyType) && + valueType.acceptsType(other.asInstanceOf[MapType].valueType) + } + + override private[spark] def simpleString: String = + s"map<${keyType.simpleString}, ${valueType.simpleString}>" +} diff --git a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala index 6403295fe20c4..0828c2d6fc104 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/internal/types/AbstractStringType.scala @@ -17,13 +17,14 @@ package org.apache.spark.sql.internal.types +import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} /** * StringTypeCollated is an abstract class for StringType with collation support. */ abstract class AbstractStringType extends AbstractDataType { - override private[sql] def defaultConcreteType: DataType = StringType + override private[sql] def defaultConcreteType: DataType = SqlApiConf.get.defaultStringType override private[sql] def simpleString: String = "string" } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala b/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala index f662b685c4e4f..4dc2ca875ef0e 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala @@ -108,6 +108,29 @@ private[sql] trait StatefulProcessorHandle extends Serializable { userKeyEnc: Encoder[K], valEncoder: Encoder[V]): MapState[K, V] + /** + * Function to create new or return existing map state variable of given type + * with ttl. State values will not be returned past ttlDuration, and will be eventually removed + * from the state store. 
Any values in mapState which have expired after ttlDuration will not be + returned on get() and will be eventually removed from the state. + * + * The user must ensure to call this function only within the `init()` method of the + * StatefulProcessor. + * + * @param stateName - name of the state variable + * @param userKeyEnc - spark sql encoder for the map key + * @param valEncoder - SQL encoder for state variable + * @param ttlConfig - the ttl configuration (time to live duration etc.) + * @tparam K - type of key for map state variable + * @tparam V - type of value for map state variable + * @return - instance of MapState of type [K,V] that can be used to store state persistently + */ + def getMapState[K, V]( + stateName: String, + userKeyEnc: Encoder[K], + valEncoder: Encoder[V], + ttlConfig: TTLConfig): MapState[K, V] + /** Function to return queryInfo for currently running task */ def getQueryInfo(): QueryInfo diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala index 16cf6224ce27b..12c7905f62d1a 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -117,7 +117,8 @@ object DataType { private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r - private val COLLATED_STRING_TYPE = """string\s+collate\s+([\w_]+|`[\w_]`)""".r + + val COLLATIONS_METADATA_KEY = "__COLLATIONS" def fromDDL(ddl: String): DataType = { parseTypeWithFallback( @@ -182,9 +183,6 @@ object DataType { /** Given the string representation of a type, return its DataType */ private def nameToType(name: String): DataType = { name match { - case COLLATED_STRING_TYPE(collation) => - val collationId = CollationFactory.collationNameToId(collation) - StringType(collationId) case "decimal" => DecimalType.USER_DEFAULT case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt) case CHAR_TYPE(length) => CharType(length.toInt) @@ -208,26 +206,40 @@ object DataType { } // NOTE: Map fields must be sorted in alphabetical order to keep consistent with the Python side.
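A minimal sketch of requesting a TTL-backed map state through the `getMapState` overload added above; the package placement (so the `private[sql]` handle trait resolves), the state name, the key/value types and the one-hour TTL are all illustrative assumptions, and the call is meant to run inside `StatefulProcessor.init()`.

package org.apache.spark.sql.streaming.sketch

import java.time.Duration

import org.apache.spark.sql.Encoders
import org.apache.spark.sql.streaming.{MapState, StatefulProcessorHandle, TTLConfig}

object MapStateWithTtlSketch {
  // Intended to be invoked from StatefulProcessor.init(), which receives the handle.
  def createCountsState(handle: StatefulProcessorHandle): MapState[String, Long] = {
    handle.getMapState[String, Long](
      stateName = "counts",
      userKeyEnc = Encoders.STRING,
      valEncoder = Encoders.scalaLong,
      ttlConfig = TTLConfig(Duration.ofHours(1)))
  }
}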
- private[sql] def parseDataType(json: JValue): DataType = json match { + private[sql] def parseDataType( + json: JValue, + fieldPath: String = "", + collationsMap: Map[String, String] = Map.empty): DataType = json match { case JString(name) => - nameToType(name) + collationsMap.get(fieldPath) match { + case Some(collation) => + assertValidTypeForCollations(fieldPath, name, collationsMap) + stringTypeWithCollation(collation) + case _ => nameToType(name) + } case JSortedObject( ("containsNull", JBool(n)), ("elementType", t: JValue), ("type", JString("array"))) => - ArrayType(parseDataType(t), n) + assertValidTypeForCollations(fieldPath, "array", collationsMap) + val elementType = parseDataType(t, fieldPath + ".element", collationsMap) + ArrayType(elementType, n) case JSortedObject( ("keyType", k: JValue), ("type", JString("map")), ("valueContainsNull", JBool(n)), ("valueType", v: JValue)) => - MapType(parseDataType(k), parseDataType(v), n) + assertValidTypeForCollations(fieldPath, "map", collationsMap) + val keyType = parseDataType(k, fieldPath + ".key", collationsMap) + val valueType = parseDataType(v, fieldPath + ".value", collationsMap) + MapType(keyType, valueType, n) case JSortedObject( ("fields", JArray(fields)), ("type", JString("struct"))) => + assertValidTypeForCollations(fieldPath, "struct", collationsMap) StructType(fields.map(parseStructField)) // Scala/Java UDT @@ -253,11 +265,18 @@ object DataType { private def parseStructField(json: JValue): StructField = json match { case JSortedObject( - ("metadata", metadata: JObject), + ("metadata", JObject(metadataFields)), ("name", JString(name)), ("nullable", JBool(nullable)), ("type", dataType: JValue)) => - StructField(name, parseDataType(dataType), nullable, Metadata.fromJObject(metadata)) + val collationsMap = getCollationsMap(metadataFields) + val metadataWithoutCollations = + JObject(metadataFields.filterNot(_._1 == COLLATIONS_METADATA_KEY)) + StructField( + name, + parseDataType(dataType, name, collationsMap), + nullable, + Metadata.fromJObject(metadataWithoutCollations)) // Support reading schema when 'metadata' is missing. case JSortedObject( ("name", JString(name)), @@ -270,8 +289,43 @@ object DataType { ("type", dataType: JValue)) => StructField(name, parseDataType(dataType)) case other => throw new SparkIllegalArgumentException( - errorClass = "_LEGACY_ERROR_TEMP_3250", - messageParameters = Map("other" -> compact(render(other)))) + errorClass = "INVALID_JSON_DATA_TYPE", + messageParameters = Map("invalidType" -> compact(render(other)))) + } + + private def assertValidTypeForCollations( + fieldPath: String, + fieldType: String, + collationMap: Map[String, String]): Unit = { + if (collationMap.contains(fieldPath) && fieldType != "string") { + throw new SparkIllegalArgumentException( + errorClass = "INVALID_JSON_DATA_TYPE_FOR_COLLATIONS", + messageParameters = Map("jsonType" -> fieldType)) + } + } + + /** + * Returns a map of field path to collation name. 
+ */ + private def getCollationsMap(metadataFields: List[JField]): Map[String, String] = { + val collationsJsonOpt = metadataFields.find(_._1 == COLLATIONS_METADATA_KEY).map(_._2) + collationsJsonOpt match { + case Some(JObject(fields)) => + fields.collect { + case (fieldPath, JString(collation)) => + collation.split("\\.", 2) match { + case Array(provider: String, collationName: String) => + CollationFactory.assertValidProvider(provider) + fieldPath -> collationName + } + }.toMap + + case _ => Map.empty + } + } + + private def stringTypeWithCollation(collationName: String): StringType = { + StringType(CollationFactory.collationNameToId(collationName)) } protected[types] def buildFormattedString( @@ -354,6 +408,41 @@ object DataType { } } + /** + * Check if `from` is equal to `to` type except for collations, which are checked to be + * compatible so that data of type `from` can be interpreted as of type `to`. + */ + private[sql] def equalsIgnoreCompatibleCollation( + from: DataType, + to: DataType): Boolean = { + (from, to) match { + // String types with possibly different collations are compatible. + case (_: StringType, _: StringType) => true + + case (ArrayType(fromElement, fromContainsNull), ArrayType(toElement, toContainsNull)) => + (fromContainsNull == toContainsNull) && + equalsIgnoreCompatibleCollation(fromElement, toElement) + + case (MapType(fromKey, fromValue, fromContainsNull), + MapType(toKey, toValue, toContainsNull)) => + fromContainsNull == toContainsNull && + // Map keys cannot change collation. + fromKey == toKey && + equalsIgnoreCompatibleCollation(fromValue, toValue) + + case (StructType(fromFields), StructType(toFields)) => + fromFields.length == toFields.length && + fromFields.zip(toFields).forall { case (fromField, toField) => + fromField.name == toField.name && + fromField.nullable == toField.nullable && + fromField.metadata == toField.metadata && + equalsIgnoreCompatibleCollation(fromField.dataType, toField.dataType) + } + + case (fromDataType, toDataType) => fromDataType == toDataType + } + } + /** * Returns true if the two data types share the same "shape", i.e. the types * are the same, but the field names don't need to be the same. diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala index 2ffd0f13ca10f..70e03905d4b05 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala @@ -49,6 +49,13 @@ sealed class Metadata private[types] (private[types] val map: Map[String, Any]) /** Tests whether this Metadata contains a binding for a key. */ def contains(key: String): Boolean = map.contains(key) + /** + * Tests whether this Metadata is empty. + * + * @since 4.0.0 + */ + def isEmpty: Boolean = map.isEmpty + /** Gets a Long. 
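A short sketch of how the new `equalsIgnoreCompatibleCollation` check above behaves; the package placement (so the `private[sql]` helper resolves) and the use of `UTF8_LCASE` as the non-default collation are illustrative assumptions.

package org.apache.spark.sql.types.sketch

import org.apache.spark.sql.catalyst.util.CollationFactory
import org.apache.spark.sql.types.{DataType, MapType, StringType}

object CollationCompatSketch {
  def main(args: Array[String]): Unit = {
    val lcase = StringType(CollationFactory.UTF8_LCASE_COLLATION_ID)
    // Map values may change collation and still be considered compatible ...
    println(DataType.equalsIgnoreCompatibleCollation(
      MapType(StringType, StringType), MapType(StringType, lcase)))  // true
    // ... but map keys must keep their exact type, collation included.
    println(DataType.equalsIgnoreCompatibleCollation(
      MapType(StringType, StringType), MapType(lcase, StringType)))  // false
  }
}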
*/ def getLong(key: String): Long = get(key) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala index 47d85b2c645c8..6ec55db008c75 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import org.json4s.JsonAST.{JString, JValue} + import org.apache.spark.annotation.Stable import org.apache.spark.sql.catalyst.util.CollationFactory @@ -36,8 +38,12 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa */ def supportsBinaryEquality: Boolean = CollationFactory.fetchCollation(collationId).supportsBinaryEquality + + def isUTF8BinaryCollation: Boolean = + collationId == CollationFactory.UTF8_BINARY_COLLATION_ID + def isUTF8BinaryLcaseCollation: Boolean = - collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID + collationId == CollationFactory.UTF8_LCASE_COLLATION_ID /** * Support for Binary Ordering implies that strings are considered equal only @@ -54,9 +60,14 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa * If this is an UTF8_BINARY collation output is `string` due to backwards compatibility. */ override def typeName: String = - if (collationId == 0) "string" + if (isUTF8BinaryCollation) "string" else s"string collate ${CollationFactory.fetchCollation(collationId).collationName}" + // Due to backwards compatibility and compatibility with other readers + // all string types are serialized in json as regular strings and + // the collation information is written to struct field metadata + override def jsonValue: JValue = JString("string") + override def equals(obj: Any): Boolean = obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId == collationId diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala index 66f9557db213e..3ff96fea9ee04 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StructField.scala @@ -17,11 +17,15 @@ package org.apache.spark.sql.types +import scala.collection.mutable + +import org.json4s.{JObject, JString} import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ +import org.apache.spark.SparkException import org.apache.spark.annotation.Stable -import org.apache.spark.sql.catalyst.util.{QuotingUtils, StringConcat} +import org.apache.spark.sql.catalyst.util.{CollationFactory, QuotingUtils, StringConcat} import org.apache.spark.sql.catalyst.util.ResolveDefaultColumnsUtils.{CURRENT_DEFAULT_COLUMN_METADATA_KEY, EXISTS_DEFAULT_COLUMN_METADATA_KEY} import org.apache.spark.util.SparkSchemaUtils @@ -63,7 +67,61 @@ case class StructField( ("name" -> name) ~ ("type" -> dataType.jsonValue) ~ ("nullable" -> nullable) ~ - ("metadata" -> metadata.jsonValue) + ("metadata" -> metadataJson) + } + + private def metadataJson: JValue = { + val metadataJsonValue = metadata.jsonValue + metadataJsonValue match { + case JObject(fields) if collationMetadata.nonEmpty => + val collationFields = collationMetadata.map(kv => kv._1 -> JString(kv._2)).toList + JObject(fields :+ (DataType.COLLATIONS_METADATA_KEY -> JObject(collationFields))) + + case _ => metadataJsonValue + } + } + + /** Map of field path to collation name. 
*/ + private lazy val collationMetadata: Map[String, String] = { + val fieldToCollationMap = mutable.Map[String, String]() + + def visitRecursively(dt: DataType, path: String): Unit = dt match { + case at: ArrayType => + processDataType(at.elementType, path + ".element") + + case mt: MapType => + processDataType(mt.keyType, path + ".key") + processDataType(mt.valueType, path + ".value") + + case st: StringType if isCollatedString(st) => + fieldToCollationMap(path) = schemaCollationValue(st) + + case _ => + } + + def processDataType(dt: DataType, path: String): Unit = { + if (isCollatedString(dt)) { + fieldToCollationMap(path) = schemaCollationValue(dt) + } else { + visitRecursively(dt, path) + } + } + + visitRecursively(dataType, name) + fieldToCollationMap.toMap + } + + private def isCollatedString(dt: DataType): Boolean = dt match { + case st: StringType => !st.isUTF8BinaryCollation + case _ => false + } + + private def schemaCollationValue(dt: DataType): String = dt match { + case st: StringType => + val collation = CollationFactory.fetchCollation(st.collationId) + collation.identifier().toStringWithoutVersion() + case _ => + throw SparkException.internalError(s"Unexpected data type $dt") } /** diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/UDTRegistration.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/UDTRegistration.scala index 42c8c783e54c7..9219c1d139b99 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/UDTRegistration.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/UDTRegistration.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.annotation.{DeveloperApi, Since} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.errors.DataTypeErrors import org.apache.spark.util.SparkClassUtils @@ -58,7 +58,8 @@ object UDTRegistration extends Serializable with Logging { */ def register(userClass: String, udtClass: String): Unit = { if (udtMap.contains(userClass)) { - logWarning(s"Cannot register UDT for ${userClass}, which is already registered.") + logWarning(log"Cannot register UDT for ${MDC(LogKeys.CLASS_NAME, userClass)}, " + + log"which is already registered.") } else { // When register UDT with class name, we can't check if the UDT class is an UserDefinedType, // or not. The check is deferred. diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala index e4ab802c5bd82..7ec00bde0b25f 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.sql.types -import scala.collection.immutable.IndexedSeq - /** * Rule that defines which upcasts are allow in Spark. 
*/ diff --git a/sql/api/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala index d9bd3b0e612b6..6852fe09ef96b 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala @@ -51,7 +51,7 @@ private[sql] object ArrowUtils { case BinaryType if !largeVarTypes => ArrowType.Binary.INSTANCE case _: StringType if largeVarTypes => ArrowType.LargeUtf8.INSTANCE case BinaryType if largeVarTypes => ArrowType.LargeBinary.INSTANCE - case DecimalType.Fixed(precision, scale) => new ArrowType.Decimal(precision, scale) + case DecimalType.Fixed(precision, scale) => new ArrowType.Decimal(precision, scale, 8 * 16) case DateType => new ArrowType.Date(DateUnit.DAY) case TimestampType if timeZoneId == null => throw SparkException.internalError("Missing timezoneId where it is mandatory.") diff --git a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt index b398bfea784fc..0f3cb3d9d3ac2 100644 --- a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-jdk21-results.txt @@ -2,10 +2,10 @@ CalendarInterval ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor CalendarInterval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Call setInterval & getInterval 1024 1024 0 131.1 7.6 1.0X +Call setInterval & getInterval 1028 1028 1 130.6 7.7 1.0X diff --git a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt index efcd1362dd227..28e1630ae9624 100644 --- a/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt +++ b/sql/catalyst/benchmarks/CalendarIntervalBenchmark-results.txt @@ -2,10 +2,10 @@ CalendarInterval ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor CalendarInterval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Call setInterval & getInterval 1020 1021 1 131.6 7.6 1.0X +Call setInterval & getInterval 1092 1093 1 122.9 8.1 1.0X diff --git a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt index 5fc70c010384d..0c5014db37346 100644 --- a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-jdk21-results.txt @@ -1,105 +1,105 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Use HashSet 1 1 0 1389.9 0.7 1.0X -Use EnumSet 5 5 0 212.5 4.7 0.2X +Use HashSet 1 1 0 1391.6 0.7 1.0X +Use EnumSet 2 2 0 441.1 2.3 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 464.2 2.2 1.0X -Use EnumSet 2 2 0 544.3 1.8 1.2X +Use HashSet 2 2 0 494.0 2.0 1.0X +Use EnumSet 2 2 0 564.3 1.8 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 3 0 413.1 2.4 1.0X -Use EnumSet 2 2 0 563.5 1.8 1.4X +Use HashSet 2 2 0 486.6 2.1 1.0X +Use EnumSet 2 2 0 502.8 2.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 9 10 2 108.0 9.3 1.0X -Use EnumSet 2 2 0 544.5 1.8 5.0X +Use HashSet 9 9 0 114.6 8.7 1.0X +Use EnumSet 2 2 0 424.0 2.4 3.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 10 10 0 98.3 10.2 1.0X -Use EnumSet 2 2 0 544.5 1.8 5.5X +Use HashSet 10 10 0 100.0 10.0 1.0X +Use EnumSet 2 2 0 423.9 2.4 4.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 0 0 0 406.5 2.5 1.0X -Use EnumSet 1 1 0 136.5 7.3 0.3X +Use HashSet 0 0 0 407.9 2.5 1.0X +Use EnumSet 1 1 0 136.9 7.3 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 1 1 0 102.2 9.8 1.0X -Use EnumSet 0 0 0 291.4 3.4 2.9X +Use HashSet 1 1 0 102.8 9.7 1.0X +Use EnumSet 0 0 0 291.7 3.4 2.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 3 items Set: 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 9 10 0 10.6 94.4 1.0X -Use EnumSet 1 1 0 132.3 7.6 12.5X +Use HashSet 10 10 0 10.5 95.5 1.0X +Use EnumSet 1 1 0 132.6 7.5 12.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 38 38 0 2.7 376.8 1.0X -Use EnumSet 1 1 0 144.3 6.9 54.4X +Use HashSet 30 30 0 3.3 300.1 1.0X +Use EnumSet 1 1 0 144.8 6.9 43.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 58 59 1 1.7 580.1 1.0X -Use EnumSet 1 1 0 132.6 7.5 76.9X +Use HashSet 59 61 1 1.7 594.5 1.0X +Use EnumSet 1 1 0 129.9 7.7 77.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 4 4 0 269.5 3.7 1.0X -Use EnumSet 5 5 0 212.9 4.7 0.8X +Use HashSet 4 4 0 230.7 4.3 1.0X +Use EnumSet 6 6 0 179.2 5.6 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 12 12 1 85.4 11.7 1.0X -Use EnumSet 6 6 0 167.6 6.0 2.0X +Use HashSet 13 14 0 75.4 13.3 1.0X +Use EnumSet 7 7 0 147.3 6.8 2.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 27 28 0 36.6 27.3 1.0X -Use EnumSet 6 6 0 169.5 5.9 4.6X +Use HashSet 27 28 1 37.6 26.6 1.0X +Use EnumSet 7 7 0 149.6 6.7 4.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 58 58 1 17.3 57.9 1.0X -Use EnumSet 6 6 0 166.4 6.0 9.6X +Use HashSet 48 49 1 20.7 48.3 1.0X +Use EnumSet 7 7 0 147.4 6.8 7.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Use HashSet 79 79 1 12.7 78.8 1.0X -Use EnumSet 6 6 0 157.9 6.3 12.4X +Use HashSet 79 80 1 12.6 79.2 1.0X +Use EnumSet 7 7 0 140.3 7.1 11.1X diff --git a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt index 31c2877a42c21..1714661841022 100644 --- a/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt +++ b/sql/catalyst/benchmarks/EnumTypeSetBenchmark-results.txt @@ -1,105 +1,105 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 1 1 0 1435.7 0.7 1.0X -Use EnumSet 2 2 0 516.0 1.9 0.4X +Use HashSet 1 1 0 1391.5 0.7 1.0X +Use EnumSet 2 2 0 503.5 2.0 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 501.5 2.0 1.0X -Use EnumSet 2 2 0 481.2 2.1 1.0X +Use HashSet 2 2 0 509.3 2.0 1.0X +Use EnumSet 2 2 0 488.5 2.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 501.2 2.0 1.0X -Use EnumSet 2 2 0 564.9 1.8 1.1X +Use HashSet 2 2 0 501.9 2.0 1.0X +Use EnumSet 2 2 0 564.7 1.8 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 9 9 1 107.7 9.3 1.0X -Use EnumSet 2 2 0 598.6 1.7 5.6X +Use HashSet 8 8 0 122.9 8.1 1.0X +Use EnumSet 2 2 0 545.7 1.8 4.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 10 10 0 98.2 10.2 1.0X -Use EnumSet 2 2 0 587.5 1.7 6.0X +Use HashSet 9 9 0 107.8 9.3 1.0X +Use EnumSet 2 2 0 545.7 1.8 5.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 
64-Core Processor Test create empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 0 0 0 408.3 2.4 1.0X -Use EnumSet 0 0 0 291.8 3.4 0.7X +Use HashSet 0 0 0 395.7 2.5 1.0X +Use EnumSet 1 1 0 132.7 7.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 2 2 0 59.7 16.7 1.0X -Use EnumSet 1 1 0 150.1 6.7 2.5X +Use HashSet 2 2 0 59.6 16.8 1.0X +Use EnumSet 1 1 0 151.1 6.6 2.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 10 11 0 9.6 104.4 1.0X -Use EnumSet 1 1 0 132.4 7.6 13.8X +Use HashSet 10 10 0 10.2 98.3 1.0X +Use EnumSet 1 1 0 132.4 7.6 13.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 36 37 1 2.8 362.5 1.0X -Use EnumSet 1 1 0 132.3 7.6 48.0X +Use HashSet 35 36 1 2.8 351.5 1.0X +Use EnumSet 1 1 0 132.4 7.6 46.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 58 59 1 1.7 582.1 1.0X -Use EnumSet 1 1 0 127.2 7.9 74.1X +Use HashSet 61 62 1 1.6 607.7 1.0X +Use EnumSet 1 1 0 127.2 7.9 77.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use empty Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 4 4 0 274.1 3.6 1.0X -Use EnumSet 5 5 0 216.0 4.6 0.8X +Use HashSet 4 4 0 248.3 4.0 1.0X +Use EnumSet 5 5 0 188.3 5.3 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 1 item Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 10 11 0 97.0 10.3 1.0X -Use EnumSet 6 7 0 162.4 6.2 1.7X +Use HashSet 11 11 0 92.9 10.8 1.0X +Use EnumSet 7 7 0 141.6 7.1 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 3 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 32 33 1 30.8 32.5 1.0X -Use EnumSet 6 6 0 155.8 6.4 5.1X +Use HashSet 32 32 0 31.7 31.5 1.0X +Use EnumSet 6 7 0 154.0 6.5 4.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 5 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Use HashSet 61 63 3 16.4 61.1 1.0X -Use EnumSet 7 7 0 153.4 6.5 9.4X +Use HashSet 58 59 1 17.2 58.1 1.0X +Use EnumSet 7 7 0 140.3 7.1 8.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test create and contains use 10 items Set: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Use HashSet 83 85 2 12.0 83.0 1.0X -Use EnumSet 7 7 0 152.6 6.6 12.7X +Use HashSet 84 85 1 11.9 83.8 1.0X +Use EnumSet 7 7 0 144.7 6.9 12.1X diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..3d16c874e8c9b --- /dev/null +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt @@ -0,0 +1,24 @@ +================================================================================================ +Escape +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Legacy 6996 7009 9 0.1 6996.5 1.0X +New 771 776 3 1.3 770.7 9.1X + + +================================================================================================ +Unescape +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Legacy 5127 5137 6 0.2 5127.3 1.0X +New 579 583 4 1.7 579.3 8.9X + + diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt new file mode 100644 index 0000000000000..7cfa134652c27 --- /dev/null +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt @@ -0,0 +1,24 @@ +================================================================================================ +Escape +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Legacy 6966 6978 12 0.1 6965.9 1.0X +New 725 730 4 1.4 725.4 9.6X + + +================================================================================================ +Unescape +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Legacy 6665 6677 11 0.2 6664.6 1.0X +New 602 606 2 1.7 602.1 11.1X + + diff --git a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt index fa1a8ea23b99f..f5ac49b25f6e1 100644 --- a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk21-results.txt @@ -1,10 +1,10 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -arrayOfAny 6 6 0 1620.1 0.6 1.0X -arrayOfAnyAsObject 6 6 0 1620.1 0.6 1.0X +arrayOfAny 6 6 1 1619.9 0.6 1.0X +arrayOfAnyAsObject 6 6 0 1619.8 0.6 1.0X arrayOfAnyAsSeq 215 216 1 46.5 21.5 0.0X arrayOfInt 270 271 1 37.0 27.0 0.0X -arrayOfIntAsObject 249 250 1 40.1 24.9 0.0X +arrayOfIntAsObject 250 251 1 40.0 25.0 0.0X diff --git a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt index 70ab4313ee8d7..5431cc0ccd8bb 100644 --- a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt +++ b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt @@ -1,10 +1,10 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -arrayOfAny 6 6 0 1620.2 0.6 1.0X +arrayOfAny 6 6 0 1620.1 0.6 1.0X arrayOfAnyAsObject 6 6 0 1620.1 0.6 1.0X arrayOfAnyAsSeq 155 155 1 64.7 15.5 0.0X -arrayOfInt 252 252 0 39.7 25.2 0.0X -arrayOfIntAsObject 249 250 0 40.2 24.9 0.0X +arrayOfInt 253 254 1 39.6 25.3 0.0X +arrayOfIntAsObject 252 253 1 39.7 25.2 0.0X diff --git a/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt index 27d1cc6f3b036..1a1d7bb5627e0 100644 --- a/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/HashBenchmark-jdk21-results.txt @@ -2,69 +2,69 @@ single ints ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 
Hash For single ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 2165 2180 22 248.0 4.0 1.0X -codegen version 3582 3583 1 149.9 6.7 0.6X -codegen version 64-bit 3404 3413 13 157.7 6.3 0.6X -codegen HiveHash version 2837 2857 28 189.3 5.3 0.8X +interpreted version 2149 2153 5 249.8 4.0 1.0X +codegen version 3579 3579 1 150.0 6.7 0.6X +codegen version 64-bit 3401 3403 2 157.8 6.3 0.6X +codegen HiveHash version 2799 2802 5 191.8 5.2 0.8X ================================================================================================ single longs ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For single longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 3483 3487 6 154.1 6.5 1.0X -codegen version 5078 5081 4 105.7 9.5 0.7X -codegen version 64-bit 4143 4148 7 129.6 7.7 0.8X -codegen HiveHash version 3320 3346 37 161.7 6.2 1.0X +interpreted version 2761 2793 45 194.4 5.1 1.0X +codegen version 5093 5095 4 105.4 9.5 0.5X +codegen version 64-bit 4112 4115 4 130.6 7.7 0.7X +codegen HiveHash version 3215 3216 1 167.0 6.0 0.9X ================================================================================================ normal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For normal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 1422 1436 19 1.5 678.2 1.0X -codegen version 1881 1881 1 1.1 896.9 0.8X -codegen version 64-bit 739 741 2 2.8 352.3 1.9X -codegen HiveHash version 3770 3771 1 0.6 1797.7 0.4X +interpreted version 1462 1462 1 1.4 696.9 1.0X +codegen version 1868 1868 1 1.1 890.7 0.8X +codegen version 64-bit 732 734 1 2.9 349.0 2.0X +codegen HiveHash version 3733 3734 2 0.6 1780.0 0.4X ================================================================================================ array ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 989 992 3 0.1 7543.5 1.0X -codegen version 3498 3499 0 0.0 26691.2 0.3X -codegen version 64-bit 2397 2398 1 0.1 18289.8 0.4X -codegen HiveHash version 726 726 0 0.2 5535.4 1.4X +interpreted version 1084 1084 0 0.1 8269.4 1.0X +codegen version 3681 3688 10 0.0 28080.6 0.3X +codegen version 64-bit 2527 2527 0 0.1 19280.1 0.4X +codegen HiveHash version 810 810 0 0.2 6178.0 1.3X 
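A note on the HashBenchmark tables in this file: the "interpreted version" rows evaluate the catalyst hash expressions directly, while the "codegen" rows exercise generated code for 32-bit Murmur3, the 64-bit xxHash64 variant, and HiveHash. A minimal sketch of the interpreted path, with expression names assumed from the row labels rather than taken from this diff:

    import org.apache.spark.sql.catalyst.expressions.{HiveHash, Literal, Murmur3Hash, XxHash64}

    // Interpreted evaluation over literal inputs; the benchmark's codegen rows run the
    // same expressions through generated Java instead.
    val inputs  = Seq(Literal(1), Literal(2L))
    val murmur3 = new Murmur3Hash(inputs).eval()   // 32-bit Murmur3, default seed 42
    val xxHash  = new XxHash64(inputs).eval()      // 64-bit xxHash64, default seed 42
    val hive    = HiveHash(inputs).eval()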
================================================================================================ map ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 0 0 0 93.0 10.7 1.0X -codegen version 233 233 0 0.0 56943.4 0.0X -codegen version 64-bit 159 160 0 0.0 38922.6 0.0X -codegen HiveHash version 26 26 0 0.2 6303.0 0.0X +interpreted version 0 0 0 84.9 11.8 1.0X +codegen version 260 260 0 0.0 63397.9 0.0X +codegen version 64-bit 176 176 0 0.0 43056.2 0.0X +codegen HiveHash version 29 29 0 0.1 6968.9 0.0X diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt index e004245c2731a..a864b60913439 100644 --- a/sql/catalyst/benchmarks/HashBenchmark-results.txt +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -2,69 +2,69 @@ single ints ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For single ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 2253 2253 0 238.3 4.2 1.0X -codegen version 3602 3608 9 149.1 6.7 0.6X -codegen version 64-bit 3457 3466 13 155.3 6.4 0.7X -codegen HiveHash version 3002 3005 4 178.8 5.6 0.8X +interpreted version 2174 2175 1 246.9 4.1 1.0X +codegen version 3591 3602 17 149.5 6.7 0.6X +codegen version 64-bit 3475 3475 0 154.5 6.5 0.6X +codegen HiveHash version 2849 2852 4 188.5 5.3 0.8X ================================================================================================ single longs ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For single longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 2886 2890 5 186.0 5.4 1.0X -codegen version 5308 5313 7 101.1 9.9 0.5X -codegen version 64-bit 3728 3730 2 144.0 6.9 0.8X -codegen HiveHash version 3382 3384 2 158.7 6.3 0.9X +interpreted version 3000 3001 2 179.0 5.6 1.0X +codegen version 5207 5220 17 103.1 9.7 0.6X +codegen version 64-bit 3619 3645 36 148.3 6.7 0.8X +codegen HiveHash version 3408 3456 69 157.6 6.3 0.9X ================================================================================================ normal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For normal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -interpreted version 2491 2493 2 0.8 1188.0 1.0X -codegen version 2253 2253 0 0.9 1074.1 1.1X -codegen version 64-bit 699 700 2 3.0 333.2 3.6X -codegen HiveHash version 3677 3678 1 0.6 1753.4 0.7X +interpreted version 2521 2524 3 0.8 1202.3 1.0X +codegen version 2232 2232 0 0.9 1064.4 1.1X +codegen version 64-bit 700 701 2 3.0 333.8 3.6X +codegen HiveHash version 3672 3682 14 0.6 1750.8 0.7X ================================================================================================ array ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 936 937 1 0.1 7139.3 1.0X -codegen version 3374 3382 11 0.0 25742.7 0.3X -codegen version 64-bit 2282 2284 4 0.1 17406.8 0.4X -codegen HiveHash version 686 687 2 0.2 5231.1 1.4X +interpreted version 971 976 4 0.1 7410.9 1.0X +codegen version 3558 3582 34 0.0 27147.3 0.3X +codegen version 64-bit 2357 2363 9 0.1 17985.0 0.4X +codegen HiveHash version 721 723 4 0.2 5497.9 1.3X ================================================================================================ map ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash For map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -interpreted version 0 0 0 90.0 11.1 1.0X -codegen version 229 230 1 0.0 55936.5 0.0X -codegen version 64-bit 155 155 0 0.0 37821.3 0.0X -codegen HiveHash version 25 25 0 0.2 6172.4 0.0X +interpreted version 0 0 0 90.3 11.1 1.0X +codegen version 213 214 0 0.0 52051.3 0.0X +codegen version 64-bit 144 144 1 0.0 35164.7 0.0X +codegen HiveHash version 24 24 3 0.2 5812.5 0.0X diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt index a88ce01f163df..f7dc5d3a8a87d 100644 --- a/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-jdk21-results.txt @@ -2,76 +2,76 @@ Benchmark for MurMurHash 3 and xxHash64 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 10 10 0 206.7 4.8 1.0X -xxHash 64-bit 10 10 0 201.7 5.0 1.0X +Murmur3_x86_32 10 10 0 206.8 4.8 1.0X +xxHash 64-bit 10 10 0 201.8 5.0 1.0X HiveHasher 14 14 0 152.3 6.6 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 
64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 16: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ Murmur3_x86_32 14 14 0 145.8 6.9 1.0X -xxHash 64-bit 12 12 0 168.8 5.9 1.2X -HiveHasher 23 23 0 91.6 10.9 0.6X +xxHash 64-bit 12 12 0 169.6 5.9 1.2X +HiveHasher 23 23 0 92.0 10.9 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 24: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 20 20 0 104.8 9.5 1.0X -xxHash 64-bit 14 14 0 145.6 6.9 1.4X -HiveHasher 33 33 0 63.0 15.9 0.6X +Murmur3_x86_32 20 20 0 104.5 9.6 1.0X +xxHash 64-bit 15 15 0 144.0 6.9 1.4X +HiveHasher 33 34 1 62.9 15.9 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 31: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 34 34 0 61.9 16.1 1.0X -xxHash 64-bit 27 27 0 78.2 12.8 1.3X -HiveHasher 43 43 0 48.4 20.6 0.8X +Murmur3_x86_32 31 31 0 68.3 14.7 1.0X +xxHash 64-bit 27 27 0 77.5 12.9 1.1X +HiveHasher 43 43 0 48.3 20.7 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 95: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 71 72 0 29.4 34.1 1.0X -xxHash 64-bit 60 61 1 34.9 28.7 1.2X -HiveHasher 156 156 0 13.4 74.4 0.5X +Murmur3_x86_32 68 69 0 30.6 32.6 1.0X +xxHash 64-bit 57 58 0 36.5 27.4 1.2X +HiveHasher 156 156 1 13.4 74.5 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 287: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 207 207 0 10.1 98.6 1.0X -xxHash 64-bit 105 106 0 19.9 50.3 2.0X -HiveHasher 530 530 0 4.0 252.7 0.4X +Murmur3_x86_32 205 208 1 10.2 97.9 1.0X +xxHash 64-bit 102 102 0 20.6 48.4 2.0X +HiveHasher 529 530 0 4.0 252.5 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 1055: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 743 743 0 2.8 354.3 1.0X -xxHash 64-bit 294 295 0 7.1 140.4 2.5X -HiveHasher 2029 2030 1 1.0 967.6 0.4X +Murmur3_x86_32 713 713 1 2.9 339.8 1.0X +xxHash 64-bit 292 293 0 7.2 139.4 2.4X +HiveHasher 2030 
2030 0 1.0 967.8 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 2079: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 1451 1452 0 1.4 692.1 1.0X -xxHash 64-bit 548 548 1 3.8 261.1 2.7X -HiveHasher 4022 4034 17 0.5 1917.9 0.4X +Murmur3_x86_32 1379 1381 2 1.5 657.7 1.0X +xxHash 64-bit 559 564 9 3.8 266.5 2.5X +HiveHasher 4022 4024 4 0.5 1917.6 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8223: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 5784 5785 0 0.4 2758.2 1.0X -xxHash 64-bit 2057 2057 0 1.0 980.6 2.8X -HiveHasher 15983 15984 1 0.1 7621.4 0.4X +Murmur3_x86_32 5701 5704 5 0.4 2718.2 1.0X +xxHash 64-bit 2067 2068 1 1.0 985.6 2.8X +HiveHasher 15981 15982 1 0.1 7620.3 0.4X diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt index 86886ad928bbf..6c649e7b0d42d 100644 --- a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt +++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt @@ -2,76 +2,76 @@ Benchmark for MurMurHash 3 and xxHash64 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ Murmur3_x86_32 11 11 0 184.1 5.4 1.0X -xxHash 64-bit 10 10 0 214.7 4.7 1.2X -HiveHasher 14 14 0 146.2 6.8 0.8X +xxHash 64-bit 10 10 0 214.5 4.7 1.2X +HiveHasher 14 14 0 146.3 6.8 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 16: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 17 17 0 123.4 8.1 1.0X -xxHash 64-bit 12 12 0 176.3 5.7 1.4X -HiveHasher 24 25 0 85.9 11.6 0.7X +Murmur3_x86_32 17 17 0 123.5 8.1 1.0X +xxHash 64-bit 12 12 0 176.5 5.7 1.4X +HiveHasher 24 25 1 85.7 11.7 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 24: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 23 23 0 89.5 11.2 1.0X -xxHash 64-bit 14 14 0 145.9 6.9 1.6X +Murmur3_x86_32 23 24 0 89.5 11.2 1.0X +xxHash 64-bit 14 14 0 146.1 6.8 1.6X HiveHasher 35 35 0 59.8 16.7 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on 
Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 31: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 35 35 0 60.2 16.6 1.0X -xxHash 64-bit 27 27 0 76.9 13.0 1.3X +Murmur3_x86_32 35 36 0 59.1 16.9 1.0X +xxHash 64-bit 27 28 0 76.3 13.1 1.3X HiveHasher 45 45 0 47.0 21.3 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 95: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 87 88 0 24.0 41.7 1.0X -xxHash 64-bit 62 62 0 34.0 29.5 1.4X +Murmur3_x86_32 87 87 0 24.2 41.4 1.0X +xxHash 64-bit 63 64 0 33.0 30.3 1.4X HiveHasher 160 160 0 13.1 76.1 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 287: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 243 243 0 8.6 115.9 1.0X -xxHash 64-bit 107 107 0 19.6 50.9 2.3X -HiveHasher 534 534 0 3.9 254.4 0.5X +Murmur3_x86_32 243 243 0 8.6 115.8 1.0X +xxHash 64-bit 122 122 0 17.2 58.2 2.0X +HiveHasher 533 534 0 3.9 254.4 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 1055: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 873 873 0 2.4 416.1 1.0X -xxHash 64-bit 296 296 1 7.1 141.0 3.0X -HiveHasher 2035 2035 0 1.0 970.5 0.4X +Murmur3_x86_32 872 873 1 2.4 415.9 1.0X +xxHash 64-bit 397 398 1 5.3 189.4 2.2X +HiveHasher 2036 2036 0 1.0 970.7 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 2079: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 1704 1704 0 1.2 812.4 1.0X -xxHash 64-bit 551 551 1 3.8 262.6 3.1X -HiveHasher 4029 4029 0 0.5 1921.1 0.4X +Murmur3_x86_32 1704 1713 14 1.2 812.4 1.0X +xxHash 64-bit 776 778 4 2.7 370.0 2.2X +HiveHasher 4028 4029 1 0.5 1920.9 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Hash byte arrays with length 8223: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Murmur3_x86_32 6704 6705 1 0.3 3196.6 1.0X -xxHash 64-bit 2054 2055 0 1.0 979.6 3.3X -HiveHasher 15993 15996 4 0.1 7626.0 0.4X +Murmur3_x86_32 6698 6699 2 0.3 3194.0 1.0X 
+xxHash 64-bit 3021 3021 0 0.7 1440.4 2.2X +HiveHasher 15982 15984 3 0.1 7620.8 0.4X diff --git a/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..afa3efa7a919b --- /dev/null +++ b/sql/catalyst/benchmarks/HexBenchmark-jdk21-results.txt @@ -0,0 +1,14 @@ +================================================================================================ +UnHex Comparison +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Cardinality 1000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Common Codecs 4755 4766 13 0.2 4755.0 1.0X +Java 4018 4048 45 0.2 4018.3 1.2X +Spark 3473 3476 3 0.3 3472.8 1.4X +Spark Binary 2625 2628 3 0.4 2624.6 1.8X + + diff --git a/sql/catalyst/benchmarks/HexBenchmark-results.txt b/sql/catalyst/benchmarks/HexBenchmark-results.txt new file mode 100644 index 0000000000000..55a6a07fed406 --- /dev/null +++ b/sql/catalyst/benchmarks/HexBenchmark-results.txt @@ -0,0 +1,14 @@ +================================================================================================ +UnHex Comparison +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Cardinality 1000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Common Codecs 4881 4897 25 0.2 4880.8 1.0X +Java 4220 4226 9 0.2 4220.0 1.2X +Spark 3954 3956 2 0.3 3954.5 1.2X +Spark Binary 2738 2750 11 0.4 2737.9 1.8X + + diff --git a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt index 92a440cc261b2..650028b464207 100644 --- a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-jdk21-results.txt @@ -2,13 +2,13 @@ unsafe projection ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor unsafe projection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -single long 1323 1324 1 202.8 4.9 1.0X -single nullable long 2364 2376 18 113.6 8.8 0.6X -7 primitive types 7098 7104 8 37.8 26.4 0.2X -7 nullable primitive types 9958 9959 1 27.0 37.1 0.1X +single long 1326 1327 1 202.4 4.9 1.0X +single nullable long 2360 2374 19 113.7 8.8 0.6X +7 primitive types 7076 7081 8 37.9 26.4 0.2X +7 nullable primitive types 10618 10621 5 25.3 39.6 0.1X diff --git a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt index 8bb62ff118481..066c5f9a6f82a 100644 --- a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt +++ b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt @@ -2,13 +2,13 @@ 
unsafe projection ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor unsafe projection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -single long 1289 1290 1 208.2 4.8 1.0X -single nullable long 2431 2431 1 110.4 9.1 0.5X -7 primitive types 6975 6977 2 38.5 26.0 0.2X -7 nullable primitive types 10329 10331 3 26.0 38.5 0.1X +single long 1287 1290 4 208.5 4.8 1.0X +single nullable long 2432 2433 2 110.4 9.1 0.5X +7 primitive types 6968 6970 3 38.5 26.0 0.2X +7 nullable primitive types 10256 10290 48 26.2 38.2 0.1X diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java index 8fe59cb7fae5d..07a9409bc57a2 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java @@ -112,6 +112,32 @@ public static boolean isLuhnNumber(UTF8String numberString) { return checkSum % 10 == 0; } + /** + * Function to validate a given UTF8 string according to Unicode rules. + * + * @param utf8String + * the input string to validate against possible invalid byte sequences + * @return + * the original string if the input string is a valid UTF8String, throw exception otherwise. + */ + public static UTF8String validateUTF8String(UTF8String utf8String) { + if (utf8String.isValid()) return utf8String; + else throw QueryExecutionErrors.invalidUTF8StringError(utf8String); + } + + /** + * Function to try to validate a given UTF8 string according to Unicode rules. + * + * @param utf8String + * the input string to validate against possible invalid byte sequences + * @return + * the original string if the input string is a valid UTF8String, null otherwise. + */ + public static UTF8String tryValidateUTF8String(UTF8String utf8String) { + if (utf8String.isValid()) return utf8String; + else return null; + } + public static byte[] aesEncrypt(byte[] input, byte[] key, UTF8String mode, diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java index 6a74f64d44849..c057c36ca8204 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java @@ -19,16 +19,16 @@ import java.io.Closeable; import java.io.IOException; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.memory.MemoryBlock; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - /** * RowBasedKeyValueBatch stores key value pairs in contiguous memory region. 
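Regarding the validateUTF8String / tryValidateUTF8String helpers added to ExpressionImplUtils above: both are thin wrappers over UTF8String.isValid(); the strict variant raises a query execution error for malformed input, while the "try" variant degrades to null. A minimal standalone sketch of that distinction (without the Spark error-class plumbing):

    import org.apache.spark.unsafe.types.UTF8String

    // 0xC0 can never start a valid UTF-8 sequence, so isValid() returns false here.
    val invalid = UTF8String.fromBytes(Array[Byte](0x61, 0xC0.toByte))
    val valid   = UTF8String.fromString("abc")

    def tryValidate(s: UTF8String): UTF8String = if (s.isValid) s else null
    assert(tryValidate(valid) eq valid)
    assert(tryValidate(invalid) == null)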
* @@ -48,7 +48,8 @@ * */ public abstract class RowBasedKeyValueBatch extends MemoryConsumer implements Closeable { - protected static final Logger logger = LoggerFactory.getLogger(RowBasedKeyValueBatch.class); + protected static final SparkLogger logger = + SparkLoggerFactory.getLogger(RowBasedKeyValueBatch.class); private static final int DEFAULT_CAPACITY = 1 << 16; @@ -127,7 +128,8 @@ private boolean acquirePage(long requiredSize) { try { page = allocatePage(requiredSize); } catch (SparkOutOfMemoryError e) { - logger.warn("Failed to allocate page ({} bytes).", requiredSize); + logger.warn("Failed to allocate page ({} bytes).", + MDC.of(LogKeys.PAGE_SIZE$.MODULE$, requiredSize)); return false; } base = page.getBaseObject(); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index 6325ba68af5b7..8741c206f2bb4 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -21,12 +21,14 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; +import java.util.Map; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.KryoSerializable; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; +import org.apache.spark.SparkIllegalArgumentException; import org.apache.spark.SparkUnsupportedOperationException; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.types.*; @@ -155,6 +157,17 @@ public UnsafeRow() {} public void pointTo(Object baseObject, long baseOffset, int sizeInBytes) { assert numFields >= 0 : "numFields (" + numFields + ") should >= 0"; assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; + if (baseObject instanceof byte[] bytes) { + int offsetInByteArray = (int) (baseOffset - Platform.BYTE_ARRAY_OFFSET); + if (offsetInByteArray < 0 || sizeInBytes < 0 || + bytes.length < offsetInByteArray + sizeInBytes) { + throw new SparkIllegalArgumentException( + "INTERNAL_ERROR", + Map.of("message", "Invalid byte array backed UnsafeRow: byte array length=" + + bytes.length + ", offset=" + offsetInByteArray + ", byte size=" + sizeInBytes) + ); + } + } this.baseObject = baseObject; this.baseOffset = baseOffset; this.sizeInBytes = sizeInBytes; diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/CatalogPlugin.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/CatalogPlugin.java index 8ca4f568b9f18..23f3acc7230fa 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/CatalogPlugin.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/CatalogPlugin.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java index 65f31229764fe..6606748e6d6f9 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java @@ -1,20 +1,18 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.apache.spark.sql.connector.catalog; diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java index e42424268b44d..09cbda2aa1e16 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java @@ -1,20 +1,18 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.apache.spark.sql.connector.catalog; diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/SupportsIndex.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/SupportsIndex.java index 734b290775581..3417ef7f8e805 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/SupportsIndex.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/SupportsIndex.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/TableIndex.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/TableIndex.java index 977ed8d6c7528..b9eba54848023 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/TableIndex.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/index/TableIndex.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/Cast.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/Cast.java index 291f94ec75a8a..25d0c0466aca4 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/Cast.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/Cast.java @@ -29,14 +29,30 @@ @Evolving public class Cast extends ExpressionWithToString { private Expression expression; + + /** + * Original data type of given expression + */ + private DataType expressionDataType; + + /** + * Target data type, i.e. 
data type in which expression will be cast + */ private DataType dataType; + @Deprecated public Cast(Expression expression, DataType dataType) { + this(expression, null, dataType); + } + + public Cast(Expression expression, DataType expressionDataType, DataType targetDataType) { this.expression = expression; - this.dataType = dataType; + this.expressionDataType = expressionDataType; + this.dataType = targetDataType; } public Expression expression() { return expression; } + public DataType expressionDataType() { return expressionDataType; } public DataType dataType() { return dataType; } @Override diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java index fd1b8f5dd1eeb..14e2112b7201a 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java @@ -65,7 +65,6 @@ protected String escapeSpecialCharsForLikePattern(String str) { switch (c) { case '_' -> builder.append("\\_"); case '%' -> builder.append("\\%"); - case '\'' -> builder.append("\\\'"); default -> builder.append(c); } } @@ -79,7 +78,7 @@ public String build(Expression expr) { } else if (expr instanceof NamedReference namedReference) { return visitNamedReference(namedReference); } else if (expr instanceof Cast cast) { - return visitCast(build(cast.expression()), cast.dataType()); + return visitCast(build(cast.expression()), cast.expressionDataType(), cast.dataType()); } else if (expr instanceof Extract extract) { return visitExtract(extract.field(), build(extract.source())); } else if (expr instanceof SortOrder sortOrder) { @@ -212,7 +211,7 @@ protected String visitContains(String l, String r) { return l + " LIKE '%" + escapeSpecialCharsForLikePattern(value) + "%' ESCAPE '\\'"; } - private String inputToSQL(Expression input) { + protected String inputToSQL(Expression input) { if (input.children().length > 1) { return "(" + build(input) + ")"; } else { @@ -231,8 +230,8 @@ protected String visitBinaryArithmetic(String name, String l, String r) { return l + " " + name + " " + r; } - protected String visitCast(String l, DataType dataType) { - return "CAST(" + l + " AS " + dataType.typeName() + ")"; + protected String visitCast(String expr, DataType exprDataType, DataType targetDataType) { + return "CAST(" + expr + " AS " + targetDataType.typeName() + ")"; } protected String visitAnd(String name, String l, String r) { @@ -356,7 +355,7 @@ private String joinListToString( return joiner.toString(); } - private String[] expressionsToStringArray(Expression[] expressions) { + protected String[] expressionsToStringArray(Expression[] expressions) { String[] result = new String[expressions.length]; for (int i = 0; i < expressions.length; i++) { result[i] = build(expressions[i]); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/util/CaseInsensitiveStringMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/util/CaseInsensitiveStringMap.java index 00a3de692fbf4..ec461f9740019 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/util/CaseInsensitiveStringMap.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/util/CaseInsensitiveStringMap.java @@ -17,12 +17,6 @@ package org.apache.spark.sql.util; -import org.apache.spark.SparkIllegalArgumentException; -import org.apache.spark.SparkUnsupportedOperationException; -import 
org.apache.spark.annotation.Experimental; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -31,6 +25,14 @@ import java.util.Objects; import java.util.Set; +import org.apache.spark.annotation.Experimental; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; +import org.apache.spark.SparkIllegalArgumentException; +import org.apache.spark.SparkUnsupportedOperationException; + /** * Case-insensitive map of string keys to string values. *
<p>
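The logging changes in this file (and in RowBasedKeyValueBatch.java above) migrate from slf4j to Spark's structured SparkLogger, where each message placeholder is paired with an MDC key from LogKeys so structured log backends can index it. A rough Scala rendering of the same pattern; LogKeys.KEY$.MODULE$ in the Java hunk below is simply LogKeys.KEY from Scala, and Scala call sites normally use the logging string interpolator instead:

    import org.apache.spark.internal.{LogKeys, MDC, SparkLogger, SparkLoggerFactory}
    import org.apache.spark.sql.util.CaseInsensitiveStringMap

    val logger: SparkLogger =
      SparkLoggerFactory.getLogger(classOf[CaseInsensitiveStringMap])
    // The {} placeholder is backed by an MDC entry keyed by LogKeys.KEY.
    logger.warn("Converting duplicated key {} into CaseInsensitiveStringMap.",
      MDC.of(LogKeys.KEY, "someDuplicatedKey"))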
      @@ -43,7 +45,8 @@ */ @Experimental public class CaseInsensitiveStringMap implements Map { - private static final Logger logger = LoggerFactory.getLogger(CaseInsensitiveStringMap.class); + private static final SparkLogger logger = + SparkLoggerFactory.getLogger(CaseInsensitiveStringMap.class); public static CaseInsensitiveStringMap empty() { return new CaseInsensitiveStringMap(new HashMap<>(0)); @@ -59,8 +62,8 @@ public CaseInsensitiveStringMap(Map originalMap) { for (Map.Entry entry : originalMap.entrySet()) { String key = toLowerCase(entry.getKey()); if (delegate.containsKey(key)) { - logger.warn("Converting duplicated key " + entry.getKey() + - " into CaseInsensitiveStringMap."); + logger.warn("Converting duplicated key {} into CaseInsensitiveStringMap.", + MDC.of(LogKeys.KEY$.MODULE$, entry.getKey())); } delegate.put(key, entry.getValue()); } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 4163af9bfda58..721e6a60befe2 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -49,26 +49,43 @@ public int numElements() { return length; } + /** + * Sets all the appropriate null bits in the input UnsafeArrayData. + * + * @param arrayData The UnsafeArrayData to set the null bits for + * @return The UnsafeArrayData with the null bits set + */ + private UnsafeArrayData setNullBits(UnsafeArrayData arrayData) { + if (data.hasNull()) { + for (int i = 0; i < length; i++) { + if (data.isNullAt(i)) { + arrayData.setNullAt(i); + } + } + } + return arrayData; + } + @Override public ArrayData copy() { DataType dt = data.dataType(); if (dt instanceof BooleanType) { - return UnsafeArrayData.fromPrimitiveArray(toBooleanArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toBooleanArray())); } else if (dt instanceof ByteType) { - return UnsafeArrayData.fromPrimitiveArray(toByteArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toByteArray())); } else if (dt instanceof ShortType) { - return UnsafeArrayData.fromPrimitiveArray(toShortArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toShortArray())); } else if (dt instanceof IntegerType || dt instanceof DateType || dt instanceof YearMonthIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toIntArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toIntArray())); } else if (dt instanceof LongType || dt instanceof TimestampType || dt instanceof DayTimeIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toLongArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toLongArray())); } else if (dt instanceof FloatType) { - return UnsafeArrayData.fromPrimitiveArray(toFloatArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toFloatArray())); } else if (dt instanceof DoubleType) { - return UnsafeArrayData.fromPrimitiveArray(toDoubleArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toDoubleArray())); } else { return new GenericArrayData(toObjectArray(dt)).copy(); // ensure the elements are copied. 
} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala index 4ac62b987b151..1b2013d87eedf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala @@ -156,6 +156,8 @@ object StructFilters { Some(Literal(true, BooleanType)) case sources.AlwaysFalse() => Some(Literal(false, BooleanType)) + case _: sources.CollatedFilter => + None } translate(filter) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index e741387d7657f..ba6764444bdf3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -254,7 +254,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor TypeCoercion.typeCoercionRules } - override def batches: Seq[Batch] = Seq( + private def earlyBatches: Seq[Batch] = Seq( Batch("Substitution", fixedPoint, new SubstituteExecuteImmediate(catalogManager), // This rule optimizes `UpdateFields` expression chains so looks more like optimization rule. @@ -274,7 +274,10 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor Batch("Simple Sanity Check", Once, LookupFunctions), Batch("Keep Legacy Outputs", Once, - KeepLegacyOutputs), + KeepLegacyOutputs) + ) + + override def batches: Seq[Batch] = earlyBatches ++ Seq( Batch("Resolution", fixedPoint, new ResolveCatalogs(catalogManager) :: ResolveInsertInto :: @@ -319,17 +322,19 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor ResolveTimeZone :: ResolveRandomSeed :: ResolveBinaryArithmetic :: - ResolveIdentifierClause :: + new ResolveIdentifierClause(earlyBatches) :: ResolveUnion :: ResolveRowLevelCommandAssignments :: RewriteDeleteFromTable :: RewriteUpdateTable :: RewriteMergeIntoTable :: + MoveParameterizedQueriesDown :: BindParameters :: typeCoercionRules() ++ Seq( ResolveWithCTE, ExtractDistributedSequenceID) ++ + Seq(ResolveUpdateEventTimeWatermarkColumn) ++ extendedResolutionRules : _*), Batch("Remove TempResolvedColumn", Once, RemoveTempResolvedColumn), Batch("Post-Hoc Resolution", Once, @@ -339,11 +344,16 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor new ResolveHints.RemoveAllHints), Batch("Nondeterministic", Once, PullOutNondeterministic), - Batch("UDF", Once, + Batch("ScalaUDF Null Handling", fixedPoint, + // `HandleNullInputsForUDF` may wrap the `ScalaUDF` with `If` expression to return null for + // null inputs, so the result can be null even if `ScalaUDF#nullable` is false. We need to + // run `UpdateAttributeNullability` to update nullability of the UDF output attribute in + // downstream operators. After updating attribute nullability, `ScalaUDF`s in downstream + // operators may need null handling as well, so we should run these two rules repeatedly. 
HandleNullInputsForUDF, - ResolveEncodersInUDF), - Batch("UpdateNullability", Once, UpdateAttributeNullability), + Batch("UDF", Once, + ResolveEncodersInUDF), Batch("Subquery", Once, UpdateOuterReferences), Batch("Cleanup", fixedPoint, @@ -1659,7 +1669,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor case u: UpdateTable => resolveReferencesInUpdate(u) - case m @ MergeIntoTable(targetTable, sourceTable, _, _, _, _) + case m @ MergeIntoTable(targetTable, sourceTable, _, _, _, _, _) if !m.resolved && targetTable.resolved && sourceTable.resolved => EliminateSubqueryAliases(targetTable) match { @@ -2201,11 +2211,19 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor val alias = SubqueryAlias.generateSubqueryName(s"_${tableArgs.size}") // Propagate the column indexes for TABLE arguments to the PythonUDTF instance. + val f: FunctionTableSubqueryArgumentExpression = tableArgs.head._1 val tvfWithTableColumnIndexes = tvf match { case g @ Generate(pyudtf: PythonUDTF, _, _, _, _, _) - if tableArgs.head._1.partitioningExpressionIndexes.nonEmpty => - val partitionColumnIndexes = - PythonUDTFPartitionColumnIndexes(tableArgs.head._1.partitioningExpressionIndexes) + if f.extraProjectedPartitioningExpressions.nonEmpty => + val partitionColumnIndexes = if (f.selectedInputExpressions.isEmpty) { + PythonUDTFPartitionColumnIndexes(f.partitioningExpressionIndexes) + } else { + // If the UDTF specified 'select' expression(s), we added a projection to compute + // them plus the 'partitionBy' expression(s) afterwards. + PythonUDTFPartitionColumnIndexes( + (0 until f.extraProjectedPartitioningExpressions.length) + .map(_ + f.selectedInputExpressions.length)) + } g.copy(generator = pyudtf.copy( pythonUDTFPartitionColumnIndexes = Some(partitionColumnIndexes))) case _ => tvf @@ -4002,6 +4020,8 @@ object EliminateEventTimeWatermark extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsWithPruning( _.containsPattern(EVENT_TIME_WATERMARK)) { case EventTimeWatermark(_, _, child) if child.resolved && !child.isStreaming => child + case UpdateEventTimeWatermarkColumn(_, _, child) if child.resolved && !child.isStreaming => + child } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 10bff5e6e59a2..9f3eee5198a16 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import scala.collection.mutable import org.apache.spark.SparkException +import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.catalyst.expressions._ @@ -41,7 +42,7 @@ import org.apache.spark.util.Utils /** * Throws user facing errors when passed invalid queries that fail to analyze. 
*/ -trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsBase { +trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsBase with Logging { protected def isView(nameParts: Seq[String]): Boolean @@ -110,9 +111,8 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB } /** Check and throw exception when a given resolved plan contains LateralColumnAliasReference. */ - private def checkNotContainingLCA(exprSeq: Seq[NamedExpression], plan: LogicalPlan): Unit = { - if (!plan.resolved) return - exprSeq.foreach(_.transformDownWithPruning(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) { + private def checkNotContainingLCA(exprs: Seq[Expression], plan: LogicalPlan): Unit = { + exprs.foreach(_.transformDownWithPruning(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) { case lcaRef: LateralColumnAliasReference => throw SparkException.internalError("Resolved plan should not contain any " + s"LateralColumnAliasReference.\nDebugging information: plan:\n$plan", @@ -143,54 +143,22 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB errorClass, missingCol, orderedCandidates, a.origin) } - private def checkUnreferencedCTERelations( - cteMap: mutable.Map[Long, (CTERelationDef, Int, mutable.Map[Long, Int])], - visited: mutable.Map[Long, Boolean], - cteId: Long): Unit = { - if (visited(cteId)) { - return - } - val (cteDef, _, refMap) = cteMap(cteId) - refMap.foreach { case (id, _) => - checkUnreferencedCTERelations(cteMap, visited, id) - } - checkAnalysis0(cteDef.child) - visited(cteId) = true - } - def checkAnalysis(plan: LogicalPlan): Unit = { - val inlineCTE = InlineCTE(alwaysInline = true) - val cteMap = mutable.HashMap.empty[Long, (CTERelationDef, Int, mutable.Map[Long, Int])] - inlineCTE.buildCTEMap(plan, cteMap) - val visited: mutable.Map[Long, Boolean] = mutable.Map.empty.withDefaultValue(false) - cteMap.foreach { case (cteId, (relation, refCount, _)) => - // If a CTE relation is never used, it will disappear after inline. Here we explicitly check - // analysis for it, to make sure the entire query plan is valid. - try { - // If a CTE relation ref count is 0, the other CTE relations that reference it - // should also be checked by checkAnalysis0. This code will also guarantee the leaf - // relations that do not reference any others are checked first. - if (refCount == 0) { - checkUnreferencedCTERelations(cteMap, visited, cteId) - } - } catch { - case e: AnalysisException => - throw new ExtendedAnalysisException(e, relation.child) - } - } - // Inline all CTEs in the plan to help check query plan structures in subqueries. - var inlinedPlan: Option[LogicalPlan] = None - try { - inlinedPlan = Some(inlineCTE(plan)) + // We should inline all CTE relations to restore the original plan shape, as the analysis check + // may need to match certain plan shapes. For dangling CTE relations, they will still be kept + // in the original `WithCTE` node, as we need to perform analysis check for them as well. 
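// Illustration of the dangling-CTE behavior described above (hedged sketch, assuming a
// SparkSession `spark` and hypothetical table/column names): the CTE is never referenced,
// so plain inlining would drop it, but keepDanglingRelations = true keeps it in the plan
// and its unresolved column still surfaces as an analysis error:
//   spark.sql("WITH unused AS (SELECT missing_col FROM some_table) SELECT 1")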
+ val inlineCTE = InlineCTE(alwaysInline = true, keepDanglingRelations = true) + val inlinedPlan: LogicalPlan = try { + inlineCTE(plan) } catch { case e: AnalysisException => throw new ExtendedAnalysisException(e, plan) } try { - checkAnalysis0(inlinedPlan.get) + checkAnalysis0(inlinedPlan) } catch { case e: AnalysisException => - throw new ExtendedAnalysisException(e, inlinedPlan.get) + throw new ExtendedAnalysisException(e, inlinedPlan) } plan.setAnalyzed() } @@ -286,6 +254,14 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB hof.invalidFormat(checkRes) } + case hof: HigherOrderFunction + if hof.resolved && hof.functions + .exists(_.exists(_.isInstanceOf[PythonUDF])) => + val u = hof.functions.flatMap(_.find(_.isInstanceOf[PythonUDF])).head + hof.failAnalysis( + errorClass = "UNSUPPORTED_FEATURE.LAMBDA_FUNCTION_WITH_PYTHON_UDF", + messageParameters = Map("funcName" -> toSQLExpr(u))) + // If an attribute can't be resolved as a map key of string type, either the key should be // surrounded with single quotes, or there is a typo in the attribute name. case GetMapValue(map, key: Attribute) if isMapWithStringKey(map) && !key.resolved => @@ -299,6 +275,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB // Early checks for column definitions, to produce better error messages ColumnDefinition.checkColumnDefinitions(operator) + var stagedError: Option[() => Unit] = None getAllExpressions(operator).foreach(_.foreachUp { case a: Attribute if !a.resolved => failUnresolvedAttribute(operator, a, "UNRESOLVED_COLUMN") @@ -337,12 +314,14 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB s"Cannot resolve the runtime replaceable expression ${toSQLExpr(e)}. " + s"The replacement is unresolved: ${toSQLExpr(e.replacement)}.") + // `Grouping` and `GroupingID` are considered as of having lower priority than the other + // nodes which cause errors. case g: Grouping => - g.failAnalysis( - errorClass = "UNSUPPORTED_GROUPING_EXPRESSION", messageParameters = Map.empty) + if (stagedError.isEmpty) stagedError = Some(() => g.failAnalysis( + errorClass = "UNSUPPORTED_GROUPING_EXPRESSION", messageParameters = Map.empty)) case g: GroupingID => - g.failAnalysis( - errorClass = "UNSUPPORTED_GROUPING_EXPRESSION", messageParameters = Map.empty) + if (stagedError.isEmpty) stagedError = Some(() => g.failAnalysis( + errorClass = "UNSUPPORTED_GROUPING_EXPRESSION", messageParameters = Map.empty)) case e: Expression if e.children.exists(_.isInstanceOf[WindowFunction]) && !e.isInstanceOf[WindowExpression] && e.resolved => @@ -401,6 +380,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case _ => }) + if (stagedError.isDefined) stagedError.get.apply() operator match { case RelationTimeTravel(u: UnresolvedRelation, _, _) => @@ -789,17 +769,10 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB msg = s"Found the unresolved operator: ${o.simpleString(SQLConf.get.maxToStringFields)}", context = o.origin.getQueryContext, summary = o.origin.context.summary) - // If the plan is resolved, the resolved Project, Aggregate or Window should have restored or - // resolved all lateral column alias references. Add check for extra safe. 
- case p @ Project(pList, _) - if pList.exists(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) => - checkNotContainingLCA(pList, p) - case agg @ Aggregate(_, aggList, _) - if aggList.exists(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) => - checkNotContainingLCA(aggList, agg) - case w @ Window(pList, _, _, _) - if pList.exists(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) => - checkNotContainingLCA(pList, w) + // If the plan is resolved, all lateral column alias references should have been either + // restored or resolved. Add check for extra safe. + case o if o.expressions.exists(_.containsPattern(LATERAL_COLUMN_ALIAS_REFERENCE)) => + checkNotContainingLCA(o.expressions, o) case _ => } } @@ -919,13 +892,36 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB // SPARK-18504/SPARK-18814: Block cases where GROUP BY columns // are not part of the correlated columns. + + // Note: groupByCols does not contain outer refs - grouping by an outer ref is always ok val groupByCols = AttributeSet(agg.groupingExpressions.flatMap(_.references)) - // Collect the local references from the correlated predicate in the subquery. - val subqueryColumns = getCorrelatedPredicates(query).flatMap(_.references) - .filterNot(conditions.flatMap(_.references).contains) - val correlatedCols = AttributeSet(subqueryColumns) - val invalidCols = groupByCols -- correlatedCols - // GROUP BY columns must be a subset of columns in the predicates + // Collect the inner query attributes that are guaranteed to have a single value for each + // outer row. See comment on getCorrelatedEquivalentInnerColumns. + val correlatedEquivalentCols = getCorrelatedEquivalentInnerColumns(query) + val nonEquivalentGroupByCols = groupByCols -- correlatedEquivalentCols + + val invalidCols = if (!SQLConf.get.getConf( + SQLConf.LEGACY_SCALAR_SUBQUERY_ALLOW_GROUP_BY_NON_EQUALITY_CORRELATED_PREDICATE)) { + nonEquivalentGroupByCols + } else { + // Legacy incorrect logic for checking for invalid group-by columns (see SPARK-48503). + // Allows any inner attribute that appears in a correlated predicate, even if it is a + // non-equality predicate or under an operator that can change the values of the attribute + // (see comments on getCorrelatedEquivalentInnerColumns for examples). + val subqueryColumns = getCorrelatedPredicates(query).flatMap(_.references) + .filterNot(conditions.flatMap(_.references).contains) + val correlatedCols = AttributeSet(subqueryColumns) + val invalidColsLegacy = groupByCols -- correlatedCols + if (!nonEquivalentGroupByCols.isEmpty && invalidColsLegacy.isEmpty) { + logWarning("Using legacy behavior for " + + s"${SQLConf.LEGACY_SCALAR_SUBQUERY_ALLOW_GROUP_BY_NON_EQUALITY_CORRELATED_PREDICATE + .key}. Query would be rejected with non-legacy behavior but is allowed by " + + s"legacy behavior. Query may be invalid and return wrong results if the scalar " + + s"subquery's group-by outputs multiple rows.") + } + invalidColsLegacy + } + if (invalidCols.nonEmpty) { expr.failAnalysis( errorClass = "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY." + @@ -1387,6 +1383,13 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB aggregated, canContainOuter && SQLConf.get.getConf(SQLConf.DECORRELATE_OFFSET_ENABLED)) + // We always inline CTE relations before analysis check, and only un-referenced CTE + // relations will be kept in the plan. 
Here we should simply skip them and check the + // children, as un-referenced CTE relations won't be executed anyway and doesn't need to + // be restricted by the current subquery correlation limitations. + case _: WithCTE | _: CTERelationDef => + plan.children.foreach(p => checkPlan(p, aggregated, canContainOuter)) + // Category 4: Any other operators not in the above 3 categories // cannot be on a correlation path, that is they are allowed only // under a correlation point but they and their descendant operators @@ -1410,7 +1413,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB if (struct.findNestedField( fieldNames, includeCollections = true, alter.conf.resolver).isDefined) { alter.failAnalysis( - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", messageParameters = Map( "op" -> op, "fieldNames" -> toSQLId(fieldNames), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index 795e8a696b017..276062ce211d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -22,10 +22,10 @@ import javax.annotation.Nullable import scala.annotation.tailrec import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} -import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, DataType, StringType} +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StringType} object CollationTypeCasts extends TypeCoercionRule { override val transform: PartialFunction[Expression, Expression] = { @@ -45,12 +45,52 @@ object CollationTypeCasts extends TypeCoercionRule { caseWhenExpr.elseValue.map(e => castStringType(e, outputStringType).getOrElse(e)) CaseWhen(newBranches, newElseValue) + case stringLocate: StringLocate => + stringLocate.withNewChildren(collateToSingleType( + Seq(stringLocate.first, stringLocate.second)) :+ stringLocate.third) + + case substringIndex: SubstringIndex => + substringIndex.withNewChildren( + collateToSingleType( + Seq(substringIndex.first, substringIndex.second)) :+ substringIndex.third) + case eltExpr: Elt => eltExpr.withNewChildren(eltExpr.children.head +: collateToSingleType(eltExpr.children.tail)) + case overlayExpr: Overlay => + overlayExpr.withNewChildren(collateToSingleType(Seq(overlayExpr.input, overlayExpr.replace)) + ++ Seq(overlayExpr.pos, overlayExpr.len)) + + case regExpReplace: RegExpReplace => + val Seq(subject, rep) = collateToSingleType(Seq(regExpReplace.subject, regExpReplace.rep)) + val newChildren = Seq(subject, regExpReplace.regexp, rep, regExpReplace.pos) + regExpReplace.withNewChildren(newChildren) + + case stringPadExpr @ (_: StringRPad | _: StringLPad) => + val Seq(str, len, pad) = stringPadExpr.children + val Seq(newStr, newPad) = collateToSingleType(Seq(str, pad)) + stringPadExpr.withNewChildren(Seq(newStr, len, newPad)) + + case raiseError: RaiseError => + val newErrorParams = raiseError.errorParms.dataType match { + case 
MapType(StringType, StringType, _) => raiseError.errorParms + case _ => Cast(raiseError.errorParms, MapType(StringType, StringType)) + } + raiseError.withNewChildren(Seq(raiseError.errorClass, newErrorParams)) + + case framelessOffsetWindow @ (_: Lag | _: Lead) => + val Seq(input, offset, default) = framelessOffsetWindow.children + val Seq(newInput, newDefault) = collateToSingleType(Seq(input, default)) + framelessOffsetWindow.withNewChildren(Seq(newInput, offset, newDefault)) + case otherExpr @ ( _: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least | - _: Coalesce | _: BinaryExpression | _: ConcatWs) => + _: Coalesce | _: ArrayContains | _: ArrayExcept | _: ConcatWs | _: Mask | _: StringReplace | + _: StringTranslate | _: StringTrim | _: StringTrimLeft | _: StringTrimRight | + _: ArrayIntersect | _: ArrayPosition | _: ArrayRemove | _: ArrayUnion | _: ArraysOverlap | + _: Contains | _: EndsWith | _: EqualNullSafe | _: EqualTo | _: FindInSet | _: GreaterThan | + _: GreaterThanOrEqual | _: LessThan | _: LessThanOrEqual | _: StartsWith | _: StringInstr | + _: ToNumber | _: TryToNumber) => val newChildren = collateToSingleType(otherExpr.children) otherExpr.withNewChildren(newChildren) } @@ -99,7 +139,10 @@ object CollationTypeCasts extends TypeCoercionRule { * complex DataTypes with collated StringTypes (e.g. ArrayType) */ def getOutputCollation(expr: Seq[Expression]): StringType = { - val explicitTypes = expr.filter(_.isInstanceOf[Collate]) + val explicitTypes = expr.filter { + case _: Collate => true + case _ => false + } .map(_.dataType.asInstanceOf[StringType].collationId) .distinct @@ -114,17 +157,22 @@ object CollationTypeCasts extends TypeCoercionRule { ) // Only implicit or default collations present case 0 => - val implicitTypes = expr.map(_.dataType) + val implicitTypes = expr.filter { + case Literal(_, _: StringType) => false + case cast: Cast if cast.getTagValue(Cast.USER_SPECIFIED_CAST).isEmpty => + cast.child.dataType.isInstanceOf[StringType] + case _ => true + } + .map(_.dataType) .filter(hasStringType) - .map(extractStringType) - .filter(dt => dt.collationId != SQLConf.get.defaultStringType.collationId) - .distinctBy(_.collationId) + .map(extractStringType(_).collationId) + .distinct if (implicitTypes.length > 1) { throw QueryCompilationErrors.implicitCollationMismatchError() } else { - implicitTypes.headOption.getOrElse(SQLConf.get.defaultStringType) + implicitTypes.headOption.map(StringType(_)).getOrElse(SQLConf.get.defaultStringType) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala index 6e27192ead328..c10e000a098c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala @@ -136,6 +136,9 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { getAttrCandidates: () => Seq[Attribute], throws: Boolean, includeLastResort: Boolean): Expression = { + + val resolver = conf.resolver + def innerResolve(e: Expression, isTopLevel: Boolean): Expression = withOrigin(e.origin) { if (e.resolved) return e val resolved = e match { @@ -149,7 +152,7 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { case GetViewColumnByNameAndOrdinal( viewName, colName, ordinal, expectedNumCandidates, viewDDL) => val 
attrCandidates = getAttrCandidates() - val matched = attrCandidates.filter(a => conf.resolver(a.name, colName)) + val matched = attrCandidates.filter(a => resolver(a.name, colName)) if (matched.length != expectedNumCandidates) { throw QueryCompilationErrors.incompatibleViewSchemaChangeError( viewName, colName, expectedNumCandidates, matched, viewDDL) @@ -183,7 +186,7 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { case u @ UnresolvedExtractValue(child, fieldName) => val newChild = innerResolve(child, isTopLevel = false) if (newChild.resolved) { - ExtractValue(newChild, fieldName, conf.resolver) + ExtractValue(newChild, fieldName, resolver) } else { u.copy(child = newChild) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index 9ad8368d007e7..6524ff9b2c57a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -92,7 +92,7 @@ object DecimalPrecision extends TypeCoercionRule { val resultType = widerDecimalType(p1, s1, p2, s2) val newE1 = if (e1.dataType == resultType) e1 else Cast(e1, resultType) val newE2 = if (e2.dataType == resultType) e2 else Cast(e2, resultType) - b.makeCopy(Array(newE1, newE2)) + b.withNewChildren(Seq(newE1, newE2)) } /** @@ -211,21 +211,21 @@ object DecimalPrecision extends TypeCoercionRule { case (l: Literal, r) if r.dataType.isInstanceOf[DecimalType] && l.dataType.isInstanceOf[IntegralType] && literalPickMinimumPrecision => - b.makeCopy(Array(Cast(l, DataTypeUtils.fromLiteral(l)), r)) + b.withNewChildren(Seq(Cast(l, DataTypeUtils.fromLiteral(l)), r)) case (l, r: Literal) if l.dataType.isInstanceOf[DecimalType] && r.dataType.isInstanceOf[IntegralType] && literalPickMinimumPrecision => - b.makeCopy(Array(l, Cast(r, DataTypeUtils.fromLiteral(r)))) + b.withNewChildren(Seq(l, Cast(r, DataTypeUtils.fromLiteral(r)))) // Promote integers inside a binary expression with fixed-precision decimals to decimals, // and fixed-precision decimals in an expression with floats / doubles to doubles case (l @ IntegralTypeExpression(), r @ DecimalExpression(_, _)) => - b.makeCopy(Array(Cast(l, DecimalType.forType(l.dataType)), r)) + b.withNewChildren(Seq(Cast(l, DecimalType.forType(l.dataType)), r)) case (l @ DecimalExpression(_, _), r @ IntegralTypeExpression()) => - b.makeCopy(Array(l, Cast(r, DecimalType.forType(r.dataType)))) + b.withNewChildren(Seq(l, Cast(r, DecimalType.forType(r.dataType)))) case (l, r @ DecimalExpression(_, _)) if isFloat(l.dataType) => - b.makeCopy(Array(l, Cast(r, DoubleType))) + b.withNewChildren(Seq(l, Cast(r, DoubleType))) case (l @ DecimalExpression(_, _), r) if isFloat(r.dataType) => - b.makeCopy(Array(Cast(l, DoubleType), r)) + b.withNewChildren(Seq(Cast(l, DoubleType), r)) case _ => b } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala index d696ff45b9b7f..0fa11b9c45038 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala @@ -38,28 +38,30 @@ case class RelationWrapper(cls: Class[_], outputAttrIds: Seq[Long]) object DeduplicateRelations 
extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { val newPlan = renewDuplicatedRelations(mutable.HashSet.empty, plan)._1 - if (newPlan.find(p => p.resolved && p.missingInput.nonEmpty).isDefined) { - // Wait for `ResolveMissingReferences` to resolve missing attributes first - return newPlan - } + + // Wait for `ResolveMissingReferences` to resolve missing attributes first + def noMissingInput(p: LogicalPlan) = !p.exists(_.missingInput.nonEmpty) + newPlan.resolveOperatorsUpWithPruning( _.containsAnyPattern(JOIN, LATERAL_JOIN, AS_OF_JOIN, INTERSECT, EXCEPT, UNION, COMMAND), ruleId) { case p: LogicalPlan if !p.childrenResolved => p // To resolve duplicate expression IDs for Join. - case j @ Join(left, right, _, _, _) if !j.duplicateResolved => + case j @ Join(left, right, _, _, _) if !j.duplicateResolved && noMissingInput(right) => j.copy(right = dedupRight(left, right)) // Resolve duplicate output for LateralJoin. - case j @ LateralJoin(left, right, _, _) if right.resolved && !j.duplicateResolved => + case j @ LateralJoin(left, right, _, _) + if right.resolved && !j.duplicateResolved && noMissingInput(right.plan) => j.copy(right = right.withNewPlan(dedupRight(left, right.plan))) // Resolve duplicate output for AsOfJoin. - case j @ AsOfJoin(left, right, _, _, _, _, _) if !j.duplicateResolved => + case j @ AsOfJoin(left, right, _, _, _, _, _) + if !j.duplicateResolved && noMissingInput(right) => j.copy(right = dedupRight(left, right)) // intersect/except will be rewritten to join at the beginning of optimizer. Here we need to // deduplicate the right side plan, so that we won't produce an invalid self-join later. - case i @ Intersect(left, right, _) if !i.duplicateResolved => + case i @ Intersect(left, right, _) if !i.duplicateResolved && noMissingInput(right) => i.copy(right = dedupRight(left, right)) - case e @ Except(left, right, _) if !e.duplicateResolved => + case e @ Except(left, right, _) if !e.duplicateResolved && noMissingInput(right) => e.copy(right = dedupRight(left, right)) // Only after we finish by-name resolution for Union case u: Union if !u.byName && !u.duplicateResolved => @@ -77,7 +79,8 @@ object DeduplicateRelations extends Rule[LogicalPlan] { } } u.copy(children = newChildren) - case merge: MergeIntoTable if !merge.duplicateResolved => + case merge: MergeIntoTable + if !merge.duplicateResolved && noMissingInput(merge.sourceTable) => merge.copy(sourceTable = dedupRight(merge.targetTable, merge.sourceTable)) } } @@ -252,12 +255,18 @@ object DeduplicateRelations extends Rule[LogicalPlan] { val newRightGroup = rewriteAttrs(c.rightGroup, rightAttrMap) val newLeftOrder = rewriteAttrs(c.leftOrder, leftAttrMap) val newRightOrder = rewriteAttrs(c.rightOrder, rightAttrMap) - val newKeyDes = c.keyDeserializer.asInstanceOf[UnresolvedDeserializer] - .copy(inputAttributes = newLeftGroup) - val newLeftDes = c.leftDeserializer.asInstanceOf[UnresolvedDeserializer] - .copy(inputAttributes = newLeftAttr) - val newRightDes = c.rightDeserializer.asInstanceOf[UnresolvedDeserializer] - .copy(inputAttributes = newRightAttr) + val newKeyDes = c.keyDeserializer match { + case u: UnresolvedDeserializer => u.copy(inputAttributes = newLeftGroup) + case e: Expression => e.withNewChildren(rewriteAttrs(e.children, leftAttrMap)) + } + val newLeftDes = c.leftDeserializer match { + case u: UnresolvedDeserializer => u.copy(inputAttributes = newLeftAttr) + case e: Expression => e.withNewChildren(rewriteAttrs(e.children, leftAttrMap)) + } + val newRightDes = c.rightDeserializer 
match { + case u: UnresolvedDeserializer => u.copy(inputAttributes = newRightAttr) + case e: Expression => e.withNewChildren(rewriteAttrs(e.children, rightAttrMap)) + } c.copy(keyDeserializer = newKeyDes, leftDeserializer = newLeftDes, rightDeserializer = newRightDes, leftGroup = newLeftGroup, rightGroup = newRightGroup, leftAttr = newLeftAttr, rightAttr = newRightAttr, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index c56d04b570e53..8a5a32c173bbf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -25,7 +25,7 @@ import scala.reflect.ClassTag import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.FUNCTION_NAME +import org.apache.spark.internal.LogKeys.FUNCTION_NAME import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.expressions._ @@ -381,7 +381,7 @@ object FunctionRegistry { expression[PosExplode]("posexplode"), expressionGeneratorOuter[PosExplode]("posexplode_outer"), expression[Rand]("rand"), - expression[Rand]("random", true), + expression[Rand]("random", true, Some("3.0.0")), expression[Randn]("randn"), expression[Stack]("stack"), expression[CaseWhen]("when"), @@ -416,7 +416,7 @@ object FunctionRegistry { expression[Log1p]("log1p"), expression[Log2]("log2"), expression[Log]("ln"), - expression[Remainder]("mod", true), + expression[Remainder]("mod", true, Some("2.3.0")), expression[UnaryMinus]("negative", true), expression[Pi]("pi"), expression[Pmod]("pmod"), @@ -451,6 +451,7 @@ object FunctionRegistry { // "try_*" function which always return Null instead of runtime error. 
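// [Editor's sketch, not part of the patch] Intended contract of the try_* functions registered
// below, including the newly added try_remainder: return NULL instead of raising a runtime
// error. Shown as an expectation under an assumed local SparkSession, not as a test from this patch.
val spark = org.apache.spark.sql.SparkSession.builder().master("local[1]").getOrCreate()
spark.sql("SELECT try_remainder(5, 0) AS r").show()  // expected: NULL, where `5 % 0` would fail under ANSI mode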
expression[TryAdd]("try_add"), expression[TryDivide]("try_divide"), + expression[TryRemainder]("try_remainder"), expression[TrySubtract]("try_subtract"), expression[TryMultiply]("try_multiply"), expression[TryElementAt]("try_element_at"), @@ -529,8 +530,8 @@ object FunctionRegistry { expressionBuilder("endswith", EndsWithExpressionBuilder), expression[Base64]("base64"), expression[BitLength]("bit_length"), - expression[Length]("char_length", true), - expression[Length]("character_length", true), + expression[Length]("char_length", true, Some("2.3.0")), + expression[Length]("character_length", true, Some("2.3.0")), expression[ConcatWs]("concat_ws"), expression[Decode]("decode"), expression[Elt]("elt"), @@ -558,7 +559,7 @@ object FunctionRegistry { expressionBuilder("lpad", LPadExpressionBuilder), expression[StringTrimLeft]("ltrim"), expression[JsonTuple]("json_tuple"), - expression[StringLocate]("position", true), + expression[StringLocate]("position", true, Some("2.3.0")), expression[FormatString]("printf", true), expression[RegExpExtract]("regexp_extract"), expression[RegExpExtractAll]("regexp_extract_all"), @@ -600,6 +601,10 @@ object FunctionRegistry { expression[RegExpCount]("regexp_count"), expression[RegExpSubStr]("regexp_substr"), expression[RegExpInStr]("regexp_instr"), + expression[IsValidUTF8]("is_valid_utf8"), + expression[MakeValidUTF8]("make_valid_utf8"), + expression[ValidateUTF8]("validate_utf8"), + expression[TryValidateUTF8]("try_validate_utf8"), // url functions expression[UrlEncode]("url_encode"), @@ -700,7 +705,7 @@ object FunctionRegistry { expression[MapConcat]("map_concat"), expression[Size]("size"), expression[Slice]("slice"), - expression[Size]("cardinality", true), + expression[Size]("cardinality", true, Some("2.4.0")), expression[ArraysZip]("arrays_zip"), expression[SortArray]("sort_array"), expression[Shuffle]("shuffle"), @@ -749,11 +754,11 @@ object FunctionRegistry { expression[InputFileBlockLength]("input_file_block_length"), expression[MonotonicallyIncreasingID]("monotonically_increasing_id"), expression[CurrentDatabase]("current_database"), - expression[CurrentDatabase]("current_schema", true), + expression[CurrentDatabase]("current_schema", true, Some("3.4.0")), expression[CurrentCatalog]("current_catalog"), expression[CurrentUser]("current_user"), - expression[CurrentUser]("user", setAlias = true), - expression[CurrentUser]("session_user", setAlias = true), + expression[CurrentUser]("user", true, Some("3.4.0")), + expression[CurrentUser]("session_user", true, Some("4.0.0")), expression[CallMethodViaReflection]("reflect"), expression[CallMethodViaReflection]("java_method", true), expression[SparkVersion]("version"), @@ -799,6 +804,9 @@ object FunctionRegistry { expression[BitwiseNot]("~"), expression[BitwiseOr]("|"), expression[BitwiseXor]("^"), + expression[ShiftLeft]("<<", true, Some("4.0.0")), + expression[ShiftRight](">>", true, Some("4.0.0")), + expression[ShiftRightUnsigned](">>>", true, Some("4.0.0")), expression[BitwiseCount]("bit_count"), expression[BitAndAgg]("bit_and"), expression[BitOrAgg]("bit_or"), @@ -821,7 +829,9 @@ object FunctionRegistry { expression[JsonObjectKeys]("json_object_keys"), // Variant - expression[ParseJson]("parse_json"), + expressionBuilder("parse_json", ParseJsonExpressionBuilder), + expressionBuilder("try_parse_json", TryParseJsonExpressionBuilder), + expression[IsVariantNull]("is_variant_null"), expressionBuilder("variant_get", VariantGetExpressionBuilder), expressionBuilder("try_variant_get", 
TryVariantGetExpressionBuilder), expression[SchemaOfVariant]("schema_of_variant"), @@ -854,7 +864,11 @@ object FunctionRegistry { // Xml expression[XmlToStructs]("from_xml"), expression[SchemaOfXml]("schema_of_xml"), - expression[StructsToXml]("to_xml") + expression[StructsToXml]("to_xml"), + + // Avro + expression[FromAvro]("from_avro"), + expression[ToAvro]("to_avro") ) val builtin: SimpleFunctionRegistry = { @@ -953,7 +967,14 @@ object FunctionRegistry { since: Option[String] = None): (String, (ExpressionInfo, FunctionBuilder)) = { val info = FunctionRegistryBase.expressionInfo[T](name, since) val funcBuilder = (expressions: Seq[Expression]) => { - assert(expressions.forall(_.resolved), "function arguments must be resolved.") + val (lambdas, others) = expressions.partition(_.isInstanceOf[LambdaFunction]) + if (lambdas.nonEmpty && !builder.supportsLambda) { + throw new AnalysisException( + errorClass = "INVALID_LAMBDA_FUNCTION_CALL.NON_HIGHER_ORDER_FUNCTION", + messageParameters = Map( + "class" -> builder.getClass.getCanonicalName)) + } + assert(others.forall(_.resolved), "function arguments must be resolved.") val rearrangedExpressions = rearrangeExpressions(name, builder, expressions) val expr = builder.build(name, rearrangedExpressions) if (setAlias) expr.setTagValue(FUNC_ALIAS, name) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala index 7338ef21a713c..68c6ae9c03e3c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HintErrorLogger.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{QUERY_HINT, RELATION_NAME, UNSUPPORTED_HINT_REASON} +import org.apache.spark.internal.LogKeys.{QUERY_HINT, RELATION_NAME, UNSUPPORTED_HINT_REASON} import org.apache.spark.sql.catalyst.plans.logical.{HintErrorHandler, HintInfo} /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala index ced7123dfcc14..f04b7799e35ea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveIdentifierClause.scala @@ -20,19 +20,24 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{AliasHelper, EvalHelper, Expression} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} import org.apache.spark.sql.catalyst.trees.TreePattern.UNRESOLVED_IDENTIFIER import org.apache.spark.sql.types.StringType /** * Resolves the identifier expressions and builds the original plans/expressions. 
*/ -object ResolveIdentifierClause extends Rule[LogicalPlan] with AliasHelper with EvalHelper { +class ResolveIdentifierClause(earlyBatches: Seq[RuleExecutor[LogicalPlan]#Batch]) + extends Rule[LogicalPlan] with AliasHelper with EvalHelper { + + private val executor = new RuleExecutor[LogicalPlan] { + override def batches: Seq[Batch] = earlyBatches.asInstanceOf[Seq[Batch]] + } override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUpWithPruning( _.containsAnyPattern(UNRESOLVED_IDENTIFIER)) { case p: PlanWithUnresolvedIdentifier if p.identifierExpr.resolved => - p.planBuilder.apply(evalIdentifierExpr(p.identifierExpr)) + executor.execute(p.planBuilder.apply(evalIdentifierExpr(p.identifierExpr))) case other => other.transformExpressionsWithPruning(_.containsAnyPattern(UNRESOLVED_IDENTIFIER)) { case e: ExpressionWithUnresolvedIdentifier if e.identifierExpr.resolved => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala index 4f5a11835c337..7ea90854932e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, IntegerLiteral, Literal, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, AppendColumns, LogicalPlan} import org.apache.spark.sql.catalyst.trees.TreePattern.{LATERAL_COLUMN_ALIAS_REFERENCE, UNRESOLVED_ATTRIBUTE} @@ -136,7 +136,19 @@ class ResolveReferencesInAggregate(val catalogManager: CatalogManager) extends S groupExprs } else { // This is a valid GROUP BY ALL aggregate. - expandedGroupExprs.get + expandedGroupExprs.get.zipWithIndex.map { case (expr, index) => + trimAliases(expr) match { + // HACK ALERT: If the expanded grouping expression is an integer literal, don't use it + // but use an integer literal of the index. The reason is we may repeatedly + // analyze the plan, and the original integer literal may cause failures + // with a later GROUP BY ordinal resolution. GROUP BY constant is + // meaningless so whatever value does not matter here. + case IntegerLiteral(_) => + // GROUP BY ordinal uses 1-based index. + Literal(index + 1) + case _ => expr + } + } } } else { groupExprs diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUpdateEventTimeWatermarkColumn.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUpdateEventTimeWatermarkColumn.scala new file mode 100644 index 0000000000000..31c4f068a83eb --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUpdateEventTimeWatermarkColumn.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan, UpdateEventTimeWatermarkColumn} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern.UPDATE_EVENT_TIME_WATERMARK_COLUMN +import org.apache.spark.sql.errors.QueryCompilationErrors + +/** + * Extracts the watermark delay and adds it to the UpdateEventTimeWatermarkColumn + * logical node (if such a node is present). [[UpdateEventTimeWatermarkColumn]] node updates + * the eventTimeColumn for upstream operators. + * + * If the logical plan contains a [[UpdateEventTimeWatermarkColumn]] node, but no watermark + * has been defined, the query will fail with a compilation error. + */ +object ResolveUpdateEventTimeWatermarkColumn extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUpWithPruning( + _.containsPattern(UPDATE_EVENT_TIME_WATERMARK_COLUMN), ruleId) { + case u: UpdateEventTimeWatermarkColumn if u.delay.isEmpty && u.childrenResolved => + val existingWatermarkDelay = u.child.collect { + case EventTimeWatermark(_, delay, _) => delay + } + + if (existingWatermarkDelay.isEmpty) { + // input dataset needs to have a event time column, we transfer the + // watermark delay from this column to user specified eventTimeColumnName + // in the output dataset. + throw QueryCompilationErrors.cannotAssignEventTimeColumn() + } + + val delay = existingWatermarkDelay.head + u.copy(delay = Some(delay)) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteCollationJoin.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteCollationJoin.scala new file mode 100644 index 0000000000000..ae29d21c7a71e --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteCollationJoin.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.util.UnsafeRowUtils +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.StringType +import org.apache.spark.util.ArrayImplicits.SparkArrayOps + +/** + * This rule rewrites Join conditions to ensure that all types containing non-binary collated + * strings are compared correctly. This is necessary because join conditions are evaluated using + * binary equality, which does not work correctly for non-binary collated strings. However, by + * injecting CollationKey expressions into the join condition, we can ensure that the comparison + * is done correctly, which then allows HashJoin to work properly on this type of data. + */ +object RewriteCollationJoin extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case j @ Join(_, _, _, Some(condition), _) => + val newCondition = condition transform { + case e @ Equality(l: AttributeReference, r: AttributeReference) => + e.withNewChildren(Seq(processExpression(l, l.dataType), processExpression(r, r.dataType))) + } + if (!newCondition.fastEquals(condition)) { + j.copy(condition = Some(newCondition)) + } else { + j + } + } + + /** + * Recursively process the expression in order to replace non-binary collated strings with their + * associated collation keys. This is necessary to ensure that the join condition is evaluated + * correctly for all types containing non-binary collated strings, including structs and arrays. + */ + private def processExpression(expr: Expression, dt: DataType): Expression = { + dt match { + // For binary stable expressions, no special handling is needed. + case _ if UnsafeRowUtils.isBinaryStable(dt) => + expr + + // Inject CollationKey for non-binary collated strings. + case _: StringType => + CollationKey(expr) + + // Recursively process struct fields for non-binary structs. + case StructType(fields) => + processStruct(expr, fields) + + // Recursively process array elements for non-binary arrays. + case ArrayType(et, containsNull) => + processArray(expr, et, containsNull) + + // Joins are not supported on maps, so there's no special handling for MapType. 
+ case _ => + expr + } + } + + private def processStruct(str: Expression, fields: Array[StructField]): Expression = { + val struct = CreateNamedStruct(fields.zipWithIndex.flatMap { case (f, i) => + Seq(Literal(f.name), processExpression(GetStructField(str, i, Some(f.name)), f.dataType)) + }.toImmutableArraySeq) + if (str.nullable) { + If(IsNull(str), Literal(null, struct.dataType), struct) + } else { + struct + } + } + + private def processArray(arr: Expression, et: DataType, containsNull: Boolean): Expression = { + val param: NamedExpression = NamedLambdaVariable("a", et, containsNull) + val funcBody: Expression = processExpression(param, et) + if (!funcBody.fastEquals(param)) { + ArrayTransform(arr, LambdaFunction(funcBody, Seq(param))) + } else { + arr + } + } + +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteMergeIntoTable.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteMergeIntoTable.scala index 9e020cb55ed56..dacee70cf1286 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteMergeIntoTable.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteMergeIntoTable.scala @@ -45,7 +45,7 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case m @ MergeIntoTable(aliasedTable, source, cond, matchedActions, notMatchedActions, - notMatchedBySourceActions) if m.resolved && m.rewritable && m.aligned && + notMatchedBySourceActions, _) if m.resolved && m.rewritable && m.aligned && matchedActions.isEmpty && notMatchedActions.size == 1 && notMatchedBySourceActions.isEmpty => @@ -79,7 +79,7 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper } case m @ MergeIntoTable(aliasedTable, source, cond, matchedActions, notMatchedActions, - notMatchedBySourceActions) if m.resolved && m.rewritable && m.aligned && + notMatchedBySourceActions, _) if m.resolved && m.rewritable && m.aligned && matchedActions.isEmpty && notMatchedBySourceActions.isEmpty => EliminateSubqueryAliases(aliasedTable) match { @@ -120,7 +120,7 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper } case m @ MergeIntoTable(aliasedTable, source, cond, matchedActions, notMatchedActions, - notMatchedBySourceActions) if m.resolved && m.rewritable && m.aligned => + notMatchedBySourceActions, _) if m.resolved && m.rewritable && m.aligned => EliminateSubqueryAliases(aliasedTable) match { case r @ DataSourceV2Relation(tbl: SupportsRowLevelOperations, _, _, _, _) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala index e9c4dd0be7d92..ef425be42f981 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import scala.util.control.NonFatal import org.apache.spark.internal.Logging -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys @@ -168,7 +168,7 @@ object StreamingJoinHelper extends PredicateHelper 
with Logging { if (constraintTerms.size > 1) { logWarning( log"Failed to extract state constraint terms: multiple time terms in condition\n\t" + - log"${MDC(EXPRESSION_TERMS, terms.mkString("\n\t"))}") + log"${MDC(EXPR_TERMS, terms.mkString("\n\t"))}") return None } if (constraintTerms.isEmpty) { @@ -289,7 +289,7 @@ object StreamingJoinHelper extends PredicateHelper with Logging { logWarning( log"Failed to extract state value watermark from condition " + log"${MDC(JOIN_CONDITION, exprToCollectFrom)} due to " + - log"${MDC(JOIN_CONDITION_SUB_EXPRESSION, a)}") + log"${MDC(JOIN_CONDITION_SUB_EXPR, a)}") invalid = true Seq.empty } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 259e28b62bca7..08c5b3531b4c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.trees.AlwaysProcess import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.types.{AbstractArrayType, AbstractStringType, StringTypeAnyCollation} +import org.apache.spark.sql.internal.types.{AbstractArrayType, AbstractMapType, AbstractStringType, StringTypeAnyCollation} import org.apache.spark.sql.types._ import org.apache.spark.sql.types.UpCastRule.numericPrecedence @@ -998,9 +998,10 @@ object TypeCoercion extends TypeCoercionBase { case (_: StringType, AnyTimestampType) => AnyTimestampType.defaultConcreteType case (_: StringType, BinaryType) => BinaryType // Cast any atomic type to string. - case (any: AtomicType, _: StringType) if !any.isInstanceOf[StringType] => StringType + case (any: AtomicType, st: StringType) if !any.isInstanceOf[StringType] => st case (any: AtomicType, st: AbstractStringType) - if !any.isInstanceOf[StringType] => st.defaultConcreteType + if !any.isInstanceOf[StringType] => + st.defaultConcreteType // When we reach here, input type is not acceptable for any types in this type collection, // try to find the first one we can implicitly cast. @@ -1047,6 +1048,15 @@ object TypeCoercion extends TypeCoercionBase { } } + case (MapType(fromKeyType, fromValueType, fn), AbstractMapType(toKeyType, toValueType)) => + val newKeyType = implicitCast(fromKeyType, toKeyType).orNull + val newValueType = implicitCast(fromValueType, toValueType).orNull + if (newKeyType != null && newValueType != null) { + MapType(newKeyType, newValueType, fn) + } else { + null + } + case _ => null } Option(ret) @@ -1110,22 +1120,22 @@ object TypeCoercion extends TypeCoercionBase { case a @ BinaryArithmetic(left @ StringTypeExpression(), right) if !isIntervalType(right.dataType) => - a.makeCopy(Array(Cast(left, DoubleType), right)) + a.withNewChildren(Seq(Cast(left, DoubleType), right)) case a @ BinaryArithmetic(left, right @ StringTypeExpression()) if !isIntervalType(left.dataType) => - a.makeCopy(Array(left, Cast(right, DoubleType))) + a.withNewChildren(Seq(left, Cast(right, DoubleType))) // For equality between string and timestamp we cast the string to a timestamp // so that things like rounding of subsecond precision does not affect the comparison. 
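// [Editor's sketch, not part of the patch] Example of the comment above: the string side of a
// string/timestamp equality is cast to TIMESTAMP, so sub-second precision is compared exactly.
// Assumes a local SparkSession with the default session time zone.
val spark = org.apache.spark.sql.SparkSession.builder().master("local[1]").getOrCreate()
spark.sql("SELECT timestamp'2024-01-01 10:00:00.123' = '2024-01-01 10:00:00.123' AS eq").show()
// expected: eq = true, because the string literal is cast to TIMESTAMP (not the timestamp to string)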
case p @ Equality(left @ StringTypeExpression(), right @ TimestampTypeExpression()) => - p.makeCopy(Array(Cast(left, TimestampType), right)) + p.withNewChildren(Seq(Cast(left, TimestampType), right)) case p @ Equality(left @ TimestampTypeExpression(), right @ StringTypeExpression()) => - p.makeCopy(Array(left, Cast(right, TimestampType))) + p.withNewChildren(Seq(left, Cast(right, TimestampType))) case p @ BinaryComparison(left, right) if findCommonTypeForBinaryComparison(left.dataType, right.dataType, conf).isDefined => val commonType = findCommonTypeForBinaryComparison(left.dataType, right.dataType, conf).get - p.makeCopy(Array(castExpr(left, commonType), castExpr(right, commonType))) + p.withNewChildren(Seq(castExpr(left, commonType), castExpr(right, commonType))) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index e39ec267fa612..2366dc4c0eb86 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ANALYSIS_ERROR, QUERY_PLAN} +import org.apache.spark.internal.LogKeys.{ANALYSIS_ERROR, QUERY_PLAN} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, CurrentDate, CurrentTimestampLike, Expression, GroupingSets, LocalTimestamp, MonotonicallyIncreasingID, SessionWindow, WindowExpression} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala index f1cc44b270bc5..5b365a0d49aea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/parameters.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.expressions.{Alias, CreateArray, CreateMap, CreateNamedStruct, Expression, LeafExpression, Literal, MapFromArrays, MapFromEntries, SubqueryExpression, Unevaluable, VariableReference} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SupervisingCommand} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.trees.TreePattern.{PARAMETER, PARAMETERIZED_QUERY, TreePattern, UNRESOLVED_WITH} +import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMAND, PARAMETER, PARAMETERIZED_QUERY, TreePattern, UNRESOLVED_WITH} import org.apache.spark.sql.errors.QueryErrorsBase import org.apache.spark.sql.types.DataType @@ -104,12 +104,64 @@ case class PosParameterizedQuery(child: LogicalPlan, args: Seq[Expression]) copy(child = newChild) } +/** + * Base class for rules that process parameterized queries. 
+ */ +abstract class ParameterizedQueryProcessor extends Rule[LogicalPlan] { + def assertUnresolvedPlanHasSingleParameterizedQuery(plan: LogicalPlan): Unit = { + if (plan.containsPattern(PARAMETERIZED_QUERY)) { + val parameterizedQueries = plan.collect { case p: ParameterizedQuery => p } + assert(parameterizedQueries.length == 1) + } + } +} + +/** + * Moves `ParameterizedQuery` inside `SupervisingCommand` for their supervised plans to be + * resolved later by the analyzer. + * + * - Basic case: + * `PosParameterizedQuery(ExplainCommand(SomeQuery(...)))` => + * `ExplainCommand(PosParameterizedQuery(SomeQuery(...)))` + * - Nested `SupervisedCommand`s are handled recursively: + * `PosParameterizedQuery(ExplainCommand(ExplainCommand(SomeQuery(...))))` => + * `ExplainCommand(ExplainCommand(PosParameterizedQuery(SomeQuery(...))))` + */ +object MoveParameterizedQueriesDown extends ParameterizedQueryProcessor { + override def apply(plan: LogicalPlan): LogicalPlan = { + assertUnresolvedPlanHasSingleParameterizedQuery(plan) + + plan.resolveOperatorsWithPruning(_.containsPattern(PARAMETERIZED_QUERY)) { + case pq: ParameterizedQuery if pq.exists(isSupervisingCommand) => + moveParameterizedQueryIntoSupervisingCommand(pq) + } + } + + private def moveParameterizedQueryIntoSupervisingCommand(pq: ParameterizedQuery): LogicalPlan = { + // Moves parameterized query down recursively to handle nested `SupervisingCommand`s + def transformSupervisedPlan: PartialFunction[LogicalPlan, LogicalPlan] = { + case command: SupervisingCommand => + command.withTransformedSupervisedPlan { + transformSupervisedPlan(_) + } + case plan => pq.withNewChildren(Seq(plan)) + } + + pq.child.resolveOperatorsWithPruning(_.containsPattern(COMMAND)) { + case command: SupervisingCommand => transformSupervisedPlan(command) + } + } + + private def isSupervisingCommand(plan: LogicalPlan): Boolean = + plan.containsPattern(COMMAND) && plan.isInstanceOf[SupervisingCommand] +} + /** * Finds all named parameters in `ParameterizedQuery` and substitutes them by literals or * by collection constructor functions such as `map()`, `array()`, `struct()` * from the user-specified arguments. */ -object BindParameters extends Rule[LogicalPlan] with QueryErrorsBase { +object BindParameters extends ParameterizedQueryProcessor with QueryErrorsBase { private def checkArgs(args: Iterable[(String, Expression)]): Unit = { def isNotAllowed(expr: Expression): Boolean = expr.exists { case _: Literal | _: CreateArray | _: CreateNamedStruct | @@ -131,11 +183,7 @@ object BindParameters extends Rule[LogicalPlan] with QueryErrorsBase { } override def apply(plan: LogicalPlan): LogicalPlan = { - if (plan.containsPattern(PARAMETERIZED_QUERY)) { - // One unresolved plan can have at most one ParameterizedQuery. 
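// [Editor's sketch, not part of the patch] As described in the MoveParameterizedQueriesDown
// scaladoc above, the ParameterizedQuery node is pushed below supervising commands such as
// EXPLAIN before parameters are bound, so a parameterized EXPLAIN is expected to work. Assumes
// a local SparkSession and the positional-argument overload of spark.sql (Spark 3.5+).
val spark = org.apache.spark.sql.SparkSession.builder().master("local[1]").getOrCreate()
spark.sql("EXPLAIN SELECT ?", Array(42)).show(truncate = false)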
- val parameterizedQueries = plan.collect { case p: ParameterizedQuery => p } - assert(parameterizedQueries.length == 1) - } + assertUnresolvedPlanHasSingleParameterizedQuery(plan) plan.resolveOperatorsWithPruning(_.containsPattern(PARAMETERIZED_QUERY)) { // We should wait for `CTESubstitution` to resolve CTE before binding parameters, as CTE diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 7a3cc4bc8e83e..a2cab60b392b6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -98,7 +98,7 @@ case class ExpressionWithUnresolvedIdentifier( /** * Holds the name of a relation that has yet to be looked up in a catalog. * - * @param multipartIdentifier table name + * @param multipartIdentifier table name, the location of files or Kafka topic name, etc. * @param options options to scan this relation. */ case class UnresolvedRelation( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala index 66a5052d86f0a..7015ee568290b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala @@ -35,6 +35,54 @@ object EliminateView extends Rule[LogicalPlan] with CastSupport { } } +/** + * ViewBindingMode is used to specify the expected schema binding mode when we want to create or + * replace a view in [[CreateViewStatement]]. + */ +sealed trait ViewSchemaMode + +/** + * SchemaBinding means the view only tolerates minimal changes to the underlying schema. + * It can tolerate extra columns in SELECT * and upcast to more generic types. + */ +object SchemaBinding extends ViewSchemaMode { + override val toString: String = "BINDING" +} + +/** + * SchemaCompensation means the view only tolerates moderate changes to the underlying schema. + * It can tolerate extra columns in SELECT * and explicit casts between view body and view columns. + */ +object SchemaCompensation extends ViewSchemaMode { + override val toString: String = "COMPENSATION" +} + +/** + * SchemaTypeEvolution means the view will adopt changed column types. + * In this mode the view will refresh its metastore data on reference to keep it up to day. + */ +object SchemaTypeEvolution extends ViewSchemaMode { + override val toString: String = "TYPE EVOLUTION" +} + +/** + * SchemaUnsupported means the feature is not enabled. + * This mode is only transient and not persisted + */ +object SchemaUnsupported extends ViewSchemaMode { + override val toString: String = "UNSUPPORTED" +} + +/** + * SchemaEvolution means the view will adopt changed column types and number of columns. + * This is a result of not having a column list and WITH EVOLUTION. + * Without an explicit column list the will also adopt changes to column names. + * In this mode the view will refresh its metastore data on reference to keep it up to day. + */ +object SchemaEvolution extends ViewSchemaMode { + override val toString: String = "EVOLUTION" +} + /** * ViewType is used to specify the expected view type when we want to create or replace a view in * [[CreateViewStatement]]. 
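// [Editor's sketch, not part of the patch] The ViewSchemaMode objects above expose the SQL
// keyword of each mode via toString (BINDING, COMPENSATION, TYPE EVOLUTION, EVOLUTION,
// UNSUPPORTED). A hypothetical helper rendering the clause a mode corresponds to, assuming the
// WITH SCHEMA ... view syntax these modes back:
def schemaClause(mode: ViewSchemaMode): Option[String] = mode match {
  case SchemaUnsupported => None                        // transient only, never persisted
  case other             => Some(s"WITH SCHEMA $other") // e.g. Some("WITH SCHEMA COMPENSATION")
}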
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala index eb649c4d4796a..749c9df40f14f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.util.Shell import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BasePredicate, BoundReference, Expression, Predicate} +import org.apache.spark.sql.catalyst.expressions.Hex.unhexDigits import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf @@ -40,7 +41,7 @@ object ExternalCatalogUtils { // The following string escaping code is mainly copied from Hive (o.a.h.h.common.FileUtils). ////////////////////////////////////////////////////////////////////////////////////////////////// - val charToEscape = { + final val (charToEscape, sizeOfCharToEscape) = { val bitSet = new java.util.BitSet(128) /** @@ -60,54 +61,77 @@ object ExternalCatalogUtils { Array(' ', '<', '>', '|').foreach(bitSet.set(_)) } - bitSet + (bitSet, bitSet.size) } - def needsEscaping(c: Char): Boolean = { - c < charToEscape.size() && charToEscape.get(c) + private final val HEX_CHARS = "0123456789ABCDEF".toCharArray + + @inline final def needsEscaping(c: Char): Boolean = { + c < sizeOfCharToEscape && charToEscape.get(c) } def escapePathName(path: String): String = { - val builder = new StringBuilder() - path.foreach { c => - if (needsEscaping(c)) { - builder.append('%') - builder.append(f"${c.asInstanceOf[Int]}%02X") - } else { - builder.append(c) + if (path == null || path.isEmpty) { + return path + } + val length = path.length + var firstIndex = 0 + while (firstIndex < length && !needsEscaping(path.charAt(firstIndex))) { + firstIndex += 1 + } + if (firstIndex == length) { + path + } else { + val sb = new java.lang.StringBuilder(length + 16) + if (firstIndex != 0) sb.append(path, 0, firstIndex) + while(firstIndex < length) { + val c = path.charAt(firstIndex) + if (needsEscaping(c)) { + sb.append('%').append(HEX_CHARS((c & 0xF0) >> 4)).append(HEX_CHARS(c & 0x0F)) + } else { + sb.append(c) + } + firstIndex += 1 } + sb.toString } - - builder.toString() } - def unescapePathName(path: String): String = { - val sb = new StringBuilder - var i = 0 - - while (i < path.length) { - val c = path.charAt(i) - if (c == '%' && i + 2 < path.length) { - val code: Int = try { - Integer.parseInt(path.substring(i + 1, i + 3), 16) - } catch { - case _: Exception => -1 - } - if (code >= 0) { - sb.append(code.asInstanceOf[Char]) - i += 3 + if (path == null || path.isEmpty) { + return path + } + var plaintextEndIdx = path.indexOf('%') + val length = path.length + if (plaintextEndIdx == -1 || plaintextEndIdx + 2 >= length) { + // fast path, no %xx encoding found then return the string identity + path + } else { + val sb = new java.lang.StringBuilder(length) + var plaintextStartIdx = 0 + while(plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) { + if (plaintextEndIdx > plaintextStartIdx) sb.append(path, plaintextStartIdx, plaintextEndIdx) + val high = path.charAt(plaintextEndIdx + 1) + if ((high >>> 8) == 0 && 
unhexDigits(high) != -1) { + val low = path.charAt(plaintextEndIdx + 2) + if ((low >>> 8) == 0 && unhexDigits(low) != -1) { + sb.append((unhexDigits(high) << 4 | unhexDigits(low)).asInstanceOf[Char]) + plaintextStartIdx = plaintextEndIdx + 3 + } else { + sb.append('%') + plaintextStartIdx = plaintextEndIdx + 1 + } } else { - sb.append(c) - i += 1 + sb.append('%') + plaintextStartIdx = plaintextEndIdx + 1 } - } else { - sb.append(c) - i += 1 + plaintextEndIdx = path.indexOf('%', plaintextStartIdx) + } + if (plaintextStartIdx < length) { + sb.append(path, plaintextStartIdx, length) } + sb.toString } - - sb.toString() } def generatePartitionPath( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/GlobalTempViewManager.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/GlobalTempViewManager.scala index f351993eb1b7a..aeeedebe330dd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/GlobalTempViewManager.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/GlobalTempViewManager.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.errors.QueryCompilationErrors * * @param database The system preserved virtual database that keeps all the global temporary views. */ -class GlobalTempViewManager(val database: String) { +class GlobalTempViewManager(database: String) { /** List of view definitions, mapping from view name to logical plan. */ @GuardedBy("this") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/RoutineLanguage.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/RoutineLanguage.scala new file mode 100644 index 0000000000000..fc02bf0c606db --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/RoutineLanguage.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +/** + * Supported routine languages for UDFs created via SQL. 
+ */ +sealed trait RoutineLanguage { + def name: String +} + +case object LanguageSQL extends RoutineLanguage { + override def name: String = "SQL" +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index d17d0a97387d9..0e0852d0a550d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -35,7 +35,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder -import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, ExpressionInfo, UpCast} +import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Expression, ExpressionInfo, NamedExpression, UpCast} import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException, ParserInterface} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, SubqueryAlias, View} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} @@ -44,7 +44,7 @@ import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.{CaseInsensitiveStringMap, PartitioningUtils} import org.apache.spark.util.ArrayImplicits._ import org.apache.spark.util.Utils @@ -123,6 +123,7 @@ class SessionCatalog( lazy val externalCatalog = externalCatalogBuilder() lazy val globalTempViewManager = globalTempViewManagerBuilder() + val globalTempDatabase: String = SQLConf.get.globalTempDatabase /** List of temporary views, mapping from table name to their logical plan. 
*/ @GuardedBy("this") @@ -273,9 +274,9 @@ class SessionCatalog( def createDatabase(dbDefinition: CatalogDatabase, ignoreIfExists: Boolean): Unit = { val dbName = format(dbDefinition.name) - if (dbName == globalTempViewManager.database) { + if (dbName == globalTempDatabase) { throw QueryCompilationErrors.cannotCreateDatabaseWithSameNameAsPreservedDatabaseError( - globalTempViewManager.database) + globalTempDatabase) } validateName(dbName) externalCatalog.createDatabase( @@ -333,9 +334,9 @@ class SessionCatalog( def setCurrentDatabase(db: String): Unit = { val dbName = format(db) - if (dbName == globalTempViewManager.database) { + if (dbName == globalTempDatabase) { throw QueryCompilationErrors.cannotUsePreservedDatabaseAsCurrentDatabaseError( - globalTempViewManager.database) + globalTempDatabase) } requireDbExists(dbName) synchronized { currentDb = dbName } @@ -479,8 +480,9 @@ class SessionCatalog( val catalogTable = externalCatalog.getTable(db, table) val oldDataSchema = catalogTable.dataSchema // not supporting dropping columns yet + val resolver = conf.resolver val nonExistentColumnNames = - oldDataSchema.map(_.name).filterNot(columnNameResolved(newDataSchema, _)) + oldDataSchema.map(_.name).filterNot(columnNameResolved(resolver, newDataSchema, _)) if (nonExistentColumnNames.nonEmpty) { throw QueryCompilationErrors.dropNonExistentColumnsNotSupportedError(nonExistentColumnNames) } @@ -488,8 +490,11 @@ class SessionCatalog( externalCatalog.alterTableDataSchema(db, table, newDataSchema) } - private def columnNameResolved(schema: StructType, colName: String): Boolean = { - schema.fields.map(_.name).exists(conf.resolver(_, colName)) + private def columnNameResolved( + resolver: Resolver, + schema: StructType, + colName: String): Boolean = { + schema.fields.exists(f => resolver(f.name, colName)) } /** @@ -659,7 +664,7 @@ class SessionCatalog( } else { false } - } else if (format(name.database.get) == globalTempViewManager.database) { + } else if (format(name.database.get) == globalTempDatabase) { globalTempViewManager.update(viewName, viewDefinition) } else { false @@ -767,9 +772,9 @@ class SessionCatalog( val table = format(name.table) if (name.database.isEmpty) { tempViews.get(table).map(_.tableMeta).getOrElse(getTableMetadata(name)) - } else if (format(name.database.get) == globalTempViewManager.database) { + } else if (format(name.database.get) == globalTempDatabase) { globalTempViewManager.get(table).map(_.tableMeta) - .getOrElse(throw new NoSuchTableException(globalTempViewManager.database, table)) + .getOrElse(throw new NoSuchTableException(globalTempDatabase, table)) } else { getTableMetadata(name) } @@ -795,7 +800,7 @@ class SessionCatalog( val oldTableName = qualifiedIdent.table val newTableName = format(newName.table) - if (db == globalTempViewManager.database) { + if (db == globalTempDatabase) { globalTempViewManager.rename(oldTableName, newTableName) } else { requireDbExists(db) @@ -832,10 +837,10 @@ class SessionCatalog( val qualifiedIdent = qualifyIdentifier(name) val db = qualifiedIdent.database.get val table = qualifiedIdent.table - if (db == globalTempViewManager.database) { + if (db == globalTempDatabase) { val viewExists = globalTempViewManager.remove(table) if (!viewExists && !ignoreIfNotExists) { - throw new NoSuchTableException(globalTempViewManager.database, table) + throw new NoSuchTableException(globalTempDatabase, table) } } else { if (name.database.isDefined || !tempViews.contains(table)) { @@ -873,7 +878,7 @@ class SessionCatalog( val qualifiedIdent = 
qualifyIdentifier(name) val db = qualifiedIdent.database.get val table = qualifiedIdent.table - if (db == globalTempViewManager.database) { + if (db == globalTempDatabase) { globalTempViewManager.get(table).map { viewDef => SubqueryAlias(table, db, getTempViewPlan(viewDef)) }.getOrElse(throw new NoSuchTableException(db, table)) @@ -926,6 +931,31 @@ class SessionCatalog( metadata.schema.fieldNames.exists(_.matches("_c[0-9]+")) } + + private def castColToType( + col: Expression, + toField: StructField, + schemaMode: ViewSchemaMode): NamedExpression = { + val cast = schemaMode match { + /* + ** For schema binding, we cast the column to the expected type using safe cast only. + ** For legacy behavior, we cast the column to the expected type using safe cast only. + ** For schema compensation, we cast the column to the expected type using any cast + * in ansi mode. + ** For schema (type) evolution, we take the column as is. + */ + case SchemaBinding => UpCast(col, toField.dataType) + case SchemaUnsupported => if (conf.viewSchemaCompensation) { + Cast(col, toField.dataType, ansiEnabled = true) + } else { + UpCast(col, toField.dataType) + } + case SchemaCompensation => Cast(col, toField.dataType, ansiEnabled = true) + case SchemaTypeEvolution => col + case other => throw SparkException.internalError("Unexpected ViewSchemaMode") + } + Alias(cast, toField.name)(explicitMetadata = Some(toField.metadata)) + } private def fromCatalogTable(metadata: CatalogTable, isTempView: Boolean): View = { val viewText = metadata.viewText.getOrElse { throw SparkException.internalError("Invalid view without text.") @@ -945,58 +975,63 @@ class SessionCatalog( throw QueryCompilationErrors.invalidViewText(viewText, metadata.qualifiedName) } } - val projectList = if (!isHiveCreatedView(metadata)) { - val viewColumnNames = if (metadata.viewQueryColumnNames.isEmpty) { - // For view created before Spark 2.2.0, the view text is already fully qualified, the plan - // output is the same with the view output. - metadata.schema.fieldNames.toImmutableArraySeq - } else { - assert(metadata.viewQueryColumnNames.length == metadata.schema.length) - metadata.viewQueryColumnNames - } + val schemaMode = metadata.viewSchemaMode + if (schemaMode == SchemaEvolution) { + View(desc = metadata, isTempView = isTempView, child = parsedPlan) + } else { + val projectList = if (!isHiveCreatedView(metadata)) { + val viewColumnNames = if (metadata.viewQueryColumnNames.isEmpty) { + // For view created before Spark 2.2.0, the view text is already fully qualified, the plan + // output is the same with the view output. + metadata.schema.fieldNames.toImmutableArraySeq + } else { + assert(metadata.viewQueryColumnNames.length == metadata.schema.length) + metadata.viewQueryColumnNames + } - // For view queries like `SELECT * FROM t`, the schema of the referenced table/view may - // change after the view has been created. We need to add an extra SELECT to pick the columns - // according to the recorded column names (to get the correct view column ordering and omit - // the extra columns that we don't require), with UpCast (to make sure the type change is - // safe) and Alias (to respect user-specified view column names) according to the view schema - // in the catalog. - // Note that, the column names may have duplication, e.g. `CREATE VIEW v(x, y) AS - // SELECT 1 col, 2 col`. We need to make sure that the matching attributes have the same - // number of duplications, and pick the corresponding attribute by ordinal. 
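To make the duplicate-column handling described in the comment above concrete, here is a minimal standalone sketch of the nameToCounts / nameToCurrentOrdinal bookkeeping. It is plain Scala for illustration, not the Catalyst code itself; `normalize` stands in for the case-sensitivity handling driven by the view's captured SQL configs.

object ViewColumnOrdinalSketch {
  // For each recorded view query column name, compute (name, ordinal among duplicates, duplicate
  // count), which is the information GetViewColumnByNameAndOrdinal needs to pick the attribute.
  def ordinals(viewColumnNames: Seq[String], caseSensitive: Boolean): Seq[(String, Int, Int)] = {
    val normalize: String => String =
      if (caseSensitive) identity else _.toLowerCase(java.util.Locale.ROOT)
    val nameToCounts = viewColumnNames.groupBy(normalize).transform((_, v) => v.length)
    val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int]
    viewColumnNames.map { name =>
      val key = normalize(name)
      val ordinal = nameToCurrentOrdinal.getOrElse(key, 0)
      nameToCurrentOrdinal(key) = ordinal + 1
      (name, ordinal, nameToCounts(key))
    }
  }

  def main(args: Array[String]): Unit = {
    // e.g. CREATE VIEW v(x, y) AS SELECT 1 col, 2 col
    println(ordinals(Seq("col", "col"), caseSensitive = false)) // List((col,0,2), (col,1,2))
  }
}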
- val viewConf = View.effectiveSQLConf(metadata.viewSQLConfigs, isTempView) - val normalizeColName: String => String = if (viewConf.caseSensitiveAnalysis) { - identity + // For view queries like `SELECT * FROM t`, the schema of the referenced table/view may + // change after the view has been created. We need to add an extra SELECT to pick the + // columns according to the recorded column names (to get the correct view column ordering + // and omit the extra columns that we don't require), with UpCast (to make sure the type + // change is safe) and Alias (to respect user-specified view column names) according to the + // view schema in the catalog. + // Note that, the column names may have duplication, e.g. `CREATE VIEW v(x, y) AS + // SELECT 1 col, 2 col`. We need to make sure that the matching attributes have the same + // number of duplications, and pick the corresponding attribute by ordinal. + val viewConf = View.effectiveSQLConf(metadata.viewSQLConfigs, isTempView) + val normalizeColName: String => String = if (viewConf.caseSensitiveAnalysis) { + identity + } else { + _.toLowerCase(Locale.ROOT) + } + val nameToCounts = viewColumnNames.groupBy(normalizeColName).transform((_, v) => v.length) + val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int] + val viewDDL = buildViewDDL(metadata, isTempView) + + viewColumnNames.zip(metadata.schema).map { case (name, field) => + val normalizedName = normalizeColName(name) + val count = nameToCounts(normalizedName) + val ordinal = nameToCurrentOrdinal.getOrElse(normalizedName, 0) + nameToCurrentOrdinal(normalizedName) = ordinal + 1 + val col = GetViewColumnByNameAndOrdinal( + metadata.identifier.toString, name, ordinal, count, viewDDL) + castColToType(col, field, schemaMode) + } } else { - _.toLowerCase(Locale.ROOT) - } - val nameToCounts = viewColumnNames.groupBy(normalizeColName).transform((_, v) => v.length) - val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int] - val viewDDL = buildViewDDL(metadata, isTempView) - - viewColumnNames.zip(metadata.schema).map { case (name, field) => - val normalizedName = normalizeColName(name) - val count = nameToCounts(normalizedName) - val ordinal = nameToCurrentOrdinal.getOrElse(normalizedName, 0) - nameToCurrentOrdinal(normalizedName) = ordinal + 1 - val col = GetViewColumnByNameAndOrdinal( - metadata.identifier.toString, name, ordinal, count, viewDDL) - Alias(UpCast(col, field.dataType), field.name)(explicitMetadata = Some(field.metadata)) - } - } else { - // For view created by hive, the parsed view plan may have different output columns with - // the schema stored in metadata. For example: `CREATE VIEW v AS SELECT 1 FROM t` - // the schema in metadata will be `_c0` while the parsed view plan has column named `1` - metadata.schema.zipWithIndex.map { case (field, index) => - val col = GetColumnByOrdinal(index, field.dataType) - Alias(UpCast(col, field.dataType), field.name)(explicitMetadata = Some(field.metadata)) + // For view created by hive, the parsed view plan may have different output columns with + // the schema stored in metadata. 
For example: `CREATE VIEW v AS SELECT 1 FROM t` + // the schema in metadata will be `_c0` while the parsed view plan has column named `1` + metadata.schema.zipWithIndex.map { case (field, index) => + val col = GetColumnByOrdinal(index, field.dataType) + castColToType(col, field, schemaMode) + } } + View(desc = metadata, isTempView = isTempView, child = Project(projectList, parsedPlan)) } - View(desc = metadata, isTempView = isTempView, child = Project(projectList, parsedPlan)) } def isGlobalTempViewDB(dbName: String): Boolean = { - globalTempViewManager.database.equalsIgnoreCase(dbName) + globalTempDatabase.equalsIgnoreCase(dbName) } /** @@ -1055,9 +1090,9 @@ class SessionCatalog( pattern: String, includeLocalTempViews: Boolean): Seq[TableIdentifier] = { val dbName = format(db) - val dbTables = if (dbName == globalTempViewManager.database) { + val dbTables = if (dbName == globalTempDatabase) { globalTempViewManager.listViewNames(pattern).map { name => - TableIdentifier(name, Some(globalTempViewManager.database)) + TableIdentifier(name, Some(globalTempDatabase)) } } else { requireDbExists(dbName) @@ -1078,9 +1113,9 @@ class SessionCatalog( */ def listViews(db: String, pattern: String): Seq[TableIdentifier] = { val dbName = format(db) - val dbViews = if (dbName == globalTempViewManager.database) { + val dbViews = if (dbName == globalTempDatabase) { globalTempViewManager.listViewNames(pattern).map { name => - TableIdentifier(name, Some(globalTempViewManager.database)) + TableIdentifier(name, Some(globalTempDatabase)) } } else { requireDbExists(dbName) @@ -1096,7 +1131,7 @@ class SessionCatalog( * List all matching temp views in the specified database, including global/local temporary views. */ def listTempViews(db: String, pattern: String): Seq[CatalogTable] = { - val globalTempViews = if (format(db) == globalTempViewManager.database) { + val globalTempViews = if (format(db) == globalTempDatabase) { globalTempViewManager.listViewNames(pattern).flatMap { viewName => globalTempViewManager.get(viewName).map(_.tableMeta) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala new file mode 100644 index 0000000000000..a5381669caea8 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunctionErrors.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import org.apache.spark.SparkException +import org.apache.spark.sql.errors.QueryErrorsBase + +/** + * Errors during registering and executing [[UserDefinedFunction]]s. 
+ */ +object UserDefinedFunctionErrors extends QueryErrorsBase { + def unsupportedUserDefinedFunction(language: RoutineLanguage): Throwable = { + unsupportedUserDefinedFunction(language.name) + } + + def unsupportedUserDefinedFunction(language: String): Throwable = { + SparkException.internalError(s"Unsupported user defined function type: $language") + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 4807d886c9f96..d55b9c972697e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -33,10 +33,10 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{CurrentUserContext, FunctionIdentifier, InternalRow, SQLConfHelper, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, Resolver, UnresolvedLeafNode} +import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, Resolver, SchemaBinding, SchemaCompensation, SchemaEvolution, SchemaTypeEvolution, SchemaUnsupported, UnresolvedLeafNode, ViewSchemaMode} import org.apache.spark.sql.catalyst.catalog.CatalogTable.VIEW_STORING_ANALYZED_PLAN import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Cast, ExprId, Literal} import org.apache.spark.sql.catalyst.plans.logical._ @@ -401,6 +401,25 @@ case class CatalogTable( ) } + /** + * Return the view schema mode. Defaults to SchemaBinding when the property is missing (e.g. views + * created by older versions); returns SchemaUnsupported when the viewSchemaBindingEnabled conf is disabled. + */ + def viewSchemaMode: ViewSchemaMode = { + if (!SQLConf.get.viewSchemaBindingEnabled) { + SchemaUnsupported + } else { + val schemaMode = properties.getOrElse(VIEW_SCHEMA_MODE, SchemaBinding.toString) + schemaMode match { + case SchemaBinding.toString => SchemaBinding + case SchemaEvolution.toString => SchemaEvolution + case SchemaTypeEvolution.toString => SchemaTypeEvolution + case SchemaCompensation.toString => SchemaCompensation + case other => throw SparkException.internalError(s"Unexpected ViewSchemaMode: $other") + } + } + } + + /** + * Return temporary view names the current view was referred. should be empty if the * CatalogTable is not a Temporary View or created by older versions of Spark(before 3.1.0). 
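A rough standalone model of the viewSchemaMode property lookup above may help; this is plain Scala for illustration, not the CatalogTable code. The boolean stands in for the viewSchemaBindingEnabled conf, and the key parameter stands in for VIEW_SCHEMA_MODE (whose concrete value is not shown in this excerpt).

object ViewSchemaModeSketch {
  sealed trait Mode
  case object SchemaBinding extends Mode
  case object SchemaCompensation extends Mode
  case object SchemaTypeEvolution extends Mode
  case object SchemaEvolution extends Mode
  case object SchemaUnsupported extends Mode

  private val byName: Map[String, Mode] =
    Seq(SchemaBinding, SchemaCompensation, SchemaTypeEvolution, SchemaEvolution)
      .map(m => m.toString -> m).toMap

  // A missing property means the view predates the feature and is treated as SchemaBinding.
  def resolve(
      properties: Map[String, String],
      schemaModeKey: String,
      bindingEnabled: Boolean): Mode = {
    if (!bindingEnabled) {
      SchemaUnsupported
    } else {
      val name = properties.getOrElse(schemaModeKey, SchemaBinding.toString)
      byName.getOrElse(name, throw new IllegalStateException(s"Unexpected ViewSchemaMode: $name"))
    }
  }
}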
@@ -491,6 +510,9 @@ case class CatalogTable( if (tableType == CatalogTableType.VIEW) { viewText.foreach(map.put("View Text", _)) viewOriginalText.foreach(map.put("View Original Text", _)) + if (SQLConf.get.viewSchemaBindingEnabled) { + map.put("View Schema Mode", viewSchemaMode.toString) + } if (viewCatalogAndNamespace.nonEmpty) { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ map.put("View Catalog and Namespace", viewCatalogAndNamespace.quoted) @@ -563,6 +585,8 @@ object CatalogTable { val VIEW_REFERRED_TEMP_FUNCTION_NAMES = VIEW_PREFIX + "referredTempFunctionsNames" val VIEW_REFERRED_TEMP_VARIABLE_NAMES = VIEW_PREFIX + "referredTempVariablesNames" + val VIEW_SCHEMA_MODE = VIEW_PREFIX + "schemaMode" + val VIEW_STORING_ANALYZED_PLAN = VIEW_PREFIX + "storingAnalyzedPlan" val PROP_CLUSTERING_COLUMNS: String = "clusteringColumns" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala index 62638d70dd904..7b6664a4117a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala @@ -120,6 +120,11 @@ object CSVExprUtils { * @throws SparkIllegalArgumentException if any of the individual input chunks are illegal */ def toDelimiterStr(str: String): String = { + if (str == null) { + throw new SparkIllegalArgumentException( + errorClass = "INVALID_DELIMITER_VALUE.NULL_VALUE") + } + var idx = 0 var delimiter = "" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVHeaderChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVHeaderChecker.scala index e69a4552ebff2..47e2e288357e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVHeaderChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVHeaderChecker.scala @@ -22,7 +22,7 @@ import com.univocity.parsers.csv.{CsvParser, CsvParserSettings} import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.internal.{Logging, MDC, MessageWithContext} -import org.apache.spark.internal.LogKey.{CSV_HEADER_COLUMN_NAME, CSV_HEADER_COLUMN_NAMES, CSV_HEADER_LENGTH, CSV_SCHEMA_FIELD_NAME, CSV_SCHEMA_FIELD_NAMES, CSV_SOURCE, NUM_COLUMNS} +import org.apache.spark.internal.LogKeys.{CSV_HEADER_COLUMN_NAME, CSV_HEADER_COLUMN_NAMES, CSV_HEADER_LENGTH, CSV_SCHEMA_FIELD_NAME, CSV_SCHEMA_FIELD_NAMES, CSV_SOURCE, NUM_COLUMNS} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala index f10a53bde5ddd..e6e964ac90b38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala @@ -22,8 +22,8 @@ import java.io.Writer import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.SpecializedGetters -import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, IntervalStringStyles, IntervalUtils, SparkStringUtils, TimestampFormatter} +import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, ToStringBase} +import 
org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, IntervalStringStyles, IntervalUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -65,9 +65,11 @@ class UnivocityGenerator( private val nullAsQuotedEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV) + private val binaryFormatter = ToStringBase.getBinaryFormatter + private def makeConverter(dataType: DataType): ValueConverter = dataType match { case BinaryType => - (getter, ordinal) => SparkStringUtils.getHexString(getter.getBinary(ordinal)) + (getter, ordinal) => binaryFormatter(getter.getBinary(ordinal)).toString case DateType => (getter, ordinal) => dateFormatter.format(getter.getInt(ordinal)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index a5158d8a22c6b..61c2f7a5926b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -63,8 +63,7 @@ class UnivocityParser( private type ValueConverter = String => Any // This index is used to reorder parsed tokens - private val tokenIndexArr = - requiredSchema.map(f => java.lang.Integer.valueOf(dataSchema.indexOf(f))).toArray + private val tokenIndexArr = requiredSchema.map(f => dataSchema.indexOf(f)).toArray // True if we should inform the Univocity CSV parser to select which fields to read by their // positions. Generally assigned by input configuration options, except when input column(s) have @@ -81,7 +80,8 @@ class UnivocityParser( // When to-be-parsed schema is shorter than the to-be-read data schema, we let Univocity CSV // parser select a sequence of fields for reading by their positions. if (parsedSchema.length < dataSchema.length) { - parserSetting.selectIndexes(tokenIndexArr: _*) + // Box into Integer here to avoid unboxing where `tokenIndexArr` is used during parsing + parserSetting.selectIndexes(tokenIndexArr.map(java.lang.Integer.valueOf(_)): _*) } new CsvParser(parserSetting) } @@ -316,7 +316,7 @@ class UnivocityParser( throw BadRecordException( () => getCurrentInput, () => Array.empty, - QueryExecutionErrors.malformedCSVRecordError("")) + LazyBadRecordCauseWrapper(() => QueryExecutionErrors.malformedCSVRecordError(""))) } val currentInput = getCurrentInput @@ -326,7 +326,8 @@ class UnivocityParser( // However, we still have chance to parse some of the tokens. It continues to parses the // tokens normally and sets null when `ArrayIndexOutOfBoundsException` occurs for missing // tokens. 
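The LazyBadRecordCauseWrapper used in this hunk is not defined in this excerpt; the following is only a guess at its shape, shown to illustrate the point of the change: the malformed-record error, including its interpolated message, is now built lazily, so it is never constructed for records that end up being handled without reporting the cause.

object LazyCauseSketch {
  // Hypothetical stand-in for LazyBadRecordCauseWrapper: the real cause is only materialized
  // if something actually asks for it (e.g. when the bad record is surfaced in FAILFAST mode).
  final case class LazyCause(buildCause: () => Throwable) {
    lazy val cause: Throwable = buildCause()
  }

  def main(args: Array[String]): Unit = {
    var built = 0
    val wrapped = LazyCause { () => built += 1; new RuntimeException("Malformed CSV record") }
    println(built)                    // 0: nothing has been constructed yet
    println(wrapped.cause.getMessage) // forces construction of the underlying exception
    println(built)                    // 1
  }
}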
- Some(QueryExecutionErrors.malformedCSVRecordError(currentInput.toString)) + Some(LazyBadRecordCauseWrapper( + () => QueryExecutionErrors.malformedCSVRecordError(currentInput.toString))) } else None // When the length of the returned tokens is identical to the length of the parsed schema, // we just need to: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala index 20f86a32c1a1d..81743251bada9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/EncoderUtils.scala @@ -77,6 +77,7 @@ object EncoderUtils { case _: DecimalType => classOf[Decimal] case _: DayTimeIntervalType => classOf[PhysicalLongType.InternalType] case _: YearMonthIntervalType => classOf[PhysicalIntegerType.InternalType] + case _: StringType => classOf[UTF8String] case _: StructType => classOf[InternalRow] case _: ArrayType => classOf[ArrayData] case _: MapType => classOf[MapData] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index aa893ba8110ed..0b5ce65fed6df 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.encoders import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.{Encoder, Row} import org.apache.spark.sql.catalyst.{DeserializerBuildHelper, InternalRow, JavaTypeInference, ScalaReflection, SerializerBuildHelper} import org.apache.spark.sql.catalyst.analysis.{Analyzer, GetColumnByOrdinal, SimpleAnalyzer, UnresolvedAttribute, UnresolvedExtractValue} @@ -187,6 +188,8 @@ object ExpressionEncoder { } constructProjection(row).get(0, anyObjectType).asInstanceOf[T] } catch { + case e: SparkRuntimeException if e.getErrorClass == "NOT_NULL_ASSERT_VIOLATION" => + throw e case e: Exception => throw QueryExecutionErrors.expressionDecodingError(e, expressions) } @@ -213,6 +216,8 @@ object ExpressionEncoder { inputRow(0) = t extractProjection(inputRow) } catch { + case e: SparkRuntimeException if e.getErrorClass == "NOT_NULL_ASSERT_VIOLATION" => + throw e case e: Exception => throw QueryExecutionErrors.expressionEncodingError(e, expressions) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala index c42b54222f171..13ea8c77c41b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala @@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} +import org.apache.spark.sql.internal.SQLConf +import 
org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.ArrayImplicits._ @@ -77,12 +79,12 @@ case class CallMethodViaReflection( ) } else { val unexpectedParameter = children.zipWithIndex.collectFirst { - case (e, 0) if !(e.dataType == StringType && e.foldable) => + case (e, 0) if !(e.dataType.isInstanceOf[StringType] && e.foldable) => DataTypeMismatch( errorSubClass = "NON_FOLDABLE_INPUT", messageParameters = Map( "inputName" -> toSQLId("class"), - "inputType" -> toSQLType(StringType), + "inputType" -> toSQLType(StringTypeAnyCollation), "inputExpr" -> toSQLExpr(children.head) ) ) @@ -90,12 +92,12 @@ case class CallMethodViaReflection( DataTypeMismatch( errorSubClass = "UNEXPECTED_NULL", messageParameters = Map("exprName" -> toSQLId("class"))) - case (e, 1) if !(e.dataType == StringType && e.foldable) => + case (e, 1) if !(e.dataType.isInstanceOf[StringType] && e.foldable) => DataTypeMismatch( errorSubClass = "NON_FOLDABLE_INPUT", messageParameters = Map( "inputName" -> toSQLId("method"), - "inputType" -> toSQLType(StringType), + "inputType" -> toSQLType(StringTypeAnyCollation), "inputExpr" -> toSQLExpr(children(1)) ) ) @@ -103,14 +105,16 @@ case class CallMethodViaReflection( DataTypeMismatch( errorSubClass = "UNEXPECTED_NULL", messageParameters = Map("exprName" -> toSQLId("method"))) - case (e, idx) if idx > 1 && !CallMethodViaReflection.typeMapping.contains(e.dataType) => + case (e, idx) if idx > 1 && + (!CallMethodViaReflection.typeMapping.contains(e.dataType) + && !e.dataType.isInstanceOf[StringType]) => DataTypeMismatch( errorSubClass = "UNEXPECTED_INPUT_TYPE", messageParameters = Map( "paramIndex" -> ordinalNumber(idx), "requiredType" -> toSQLType( TypeCollection(BooleanType, ByteType, ShortType, - IntegerType, LongType, FloatType, DoubleType, StringType)), + IntegerType, LongType, FloatType, DoubleType, StringTypeAnyCollation)), "inputSql" -> toSQLExpr(e), "inputType" -> toSQLType(e.dataType)) ) @@ -134,7 +138,7 @@ case class CallMethodViaReflection( } override def nullable: Boolean = true - override val dataType: DataType = StringType + override val dataType: DataType = SQLConf.get.defaultStringType override protected def initializeInternal(partitionIndex: Int): Unit = {} override protected def evalInternal(input: InternalRow): Any = { @@ -230,7 +234,10 @@ object CallMethodViaReflection { // Argument type must match. That is, either the method's argument type matches one of the // acceptable types defined in typeMapping, or it is a super type of the acceptable types. candidateTypes.zip(argTypes).forall { case (candidateType, argType) => - typeMapping(argType).exists(candidateType.isAssignableFrom) + if (!argType.isInstanceOf[StringType]) { + typeMapping(argType).exists(candidateType.isAssignableFrom) + } + else candidateType.isAssignableFrom(classOf[String]) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index e252075c9c1c4..4a2b4b28e690e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -575,8 +575,6 @@ case class Cast( // notation if an exponent is needed. 
override protected def useDecimalPlainString: Boolean = ansiEnabled - override protected def useHexFormatForBinary: Boolean = false - // The class name of `DateTimeUtils` protected def dateTimeUtilsCls: String = DateTimeUtils.getClass.getName.stripSuffix("$") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CollationKey.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CollationKey.scala new file mode 100644 index 0000000000000..6e400d026e0ee --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CollationKey.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.internal.types.StringTypeAnyCollation +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +case class CollationKey(expr: Expression) extends UnaryExpression with ExpectsInputTypes { + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + override def dataType: DataType = BinaryType + + final lazy val collationId: Int = expr.dataType match { + case st: StringType => + st.collationId + } + + override def nullSafeEval(input: Any): Any = + CollationFactory.getCollationKeyBytes(input.asInstanceOf[UTF8String], collationId) + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + defineCodeGen(ctx, ev, c => s"CollationFactory.getCollationKeyBytes($c, $collationId)") + } + + override protected def withNewChildInternal(newChild: Expression): Expression = { + copy(expr = newChild) + } + + override def child: Expression = expr +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala index 258bc0ed8fe73..fde2093460876 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CharVarcharUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase, QueryExecutionErrors} +import org.apache.spark.sql.internal.types.{AbstractMapType, StringTypeAnyCollation} import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType, VariantType} import 
org.apache.spark.unsafe.types.UTF8String @@ -57,7 +58,7 @@ object ExprUtils extends QueryErrorsBase { def convertToMapData(exp: Expression): Map[String, String] = exp match { case m: CreateMap - if m.dataType.acceptsType(MapType(StringType, StringType, valueContainsNull = false)) => + if AbstractMapType(StringTypeAnyCollation, StringTypeAnyCollation).acceptsType(m.dataType) => val arrayMap = m.eval().asInstanceOf[ArrayBasedMapData] ArrayBasedMapData.toScalaMap(arrayMap).map { case (key, value) => key.toString -> value.toString @@ -77,7 +78,7 @@ object ExprUtils extends QueryErrorsBase { columnNameOfCorruptRecord: String): Unit = { schema.getFieldIndex(columnNameOfCorruptRecord).foreach { corruptFieldIndex => val f = schema(corruptFieldIndex) - if (f.dataType != StringType || !f.nullable) { + if (!f.dataType.isInstanceOf[StringType] || !f.nullable) { throw QueryCompilationErrors.invalidFieldTypeForCorruptRecordError() } } @@ -110,7 +111,7 @@ object ExprUtils extends QueryErrorsBase { */ def checkJsonSchema(schema: DataType): TypeCheckResult = { val isInvalid = schema.existsRecursively { - case MapType(keyType, _, _) if keyType != StringType => true + case MapType(keyType, _, _) if !keyType.isInstanceOf[StringType] => true case _ => false } if (isInvalid) { @@ -133,7 +134,7 @@ object ExprUtils extends QueryErrorsBase { def checkXmlSchema(schema: DataType): TypeCheckResult = { val isInvalid = schema.existsRecursively { // XML field names must be StringType - case MapType(keyType, _, _) if keyType != StringType => true + case MapType(keyType, _, _) if !keyType.isInstanceOf[StringType] => true case _ => false } if (isInvalid) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index fe7d5a4b782b0..de15ec43c4f31 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -376,16 +376,12 @@ abstract class Expression extends TreeNode[Expression] { } } - /** - * An expression that cannot be evaluated. These expressions don't live past analysis or - * optimization time (e.g. Star) and should not be evaluated during query planning and - * execution. + * An expression that cannot be evaluated but is guaranteed to be replaced with a foldable value + * by query optimizer (e.g. CurrentDate). */ -trait Unevaluable extends Expression { - - /** Unevaluable is not foldable because we don't have an eval for it. */ - final override def foldable: Boolean = false +trait FoldableUnevaluable extends Expression { + override def foldable: Boolean = true final override def eval(input: InternalRow = null): Any = throw QueryExecutionErrors.cannotEvaluateExpressionError(this) @@ -394,6 +390,19 @@ trait Unevaluable extends Expression { throw QueryExecutionErrors.cannotGenerateCodeForExpressionError(this) } +/** + * An expression that cannot be evaluated. These expressions don't live past analysis or + * optimization time (e.g. Star) and should not be evaluated during query planning and + * execution. + */ +trait Unevaluable extends Expression with FoldableUnevaluable { + + /** Unevaluable is not foldable by default because we don't have an eval for it. * Exceptions are expressions that will be replaced by a literal by the Optimizer (e.g. CurrentDate). * Hence we allow overriding of this field in special cases. 
+ */ + final override def foldable: Boolean = false +} /** * An expression that gets replaced at runtime (currently by the optimizer) into a different diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala index 94465ccff796e..bfd3bc8051dff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala @@ -172,9 +172,12 @@ case class FunctionTableSubqueryArgumentExpression( } } - private lazy val extraProjectedPartitioningExpressions: Seq[Alias] = { + lazy val extraProjectedPartitioningExpressions: Seq[Alias] = { partitionByExpressions.filter { e => - !subqueryOutputs.contains(e) + !subqueryOutputs.contains(e) || + // Skip deduplicating the 'partitionBy' expression(s) against the attributes of the input + // table if the UDTF also specified 'select' expression(s). + selectedInputExpressions.nonEmpty }.zipWithIndex.map { case (expr, index) => Alias(expr, s"partition_by_$index")() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala index fd2e302deb997..673f9397bb03f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala @@ -212,7 +212,7 @@ object TimeWindow { * that we can use `window` in SQL. */ def parseExpression(expr: Expression): Long = expr match { - case NonNullLiteral(s, StringType) => getIntervalInMicroSeconds(s.toString) + case NonNullLiteral(s, _: StringType) => getIntervalInMicroSeconds(s.toString) case IntegerLiteral(i) => i.toLong case NonNullLiteral(l, LongType) => l.toString.toLong case _ => throw QueryCompilationErrors.invalidLiteralForWindowDurationError() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyString.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyString.scala index 8db08dbbcb813..f6fc9b3abd65b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyString.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToPrettyString.scala @@ -49,7 +49,7 @@ case class ToPrettyString(child: Expression, timeZoneId: Option[String] = None) override protected def useDecimalPlainString: Boolean = true - override protected def useHexFormatForBinary: Boolean = true + override protected val binaryFormatter: BinaryFormatter = ToStringBase.getBinaryFormatter private[this] lazy val castFunc: Any => UTF8String = castToString(child.dataType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala index 4f35072c4fc7f..130b4ee4c8cac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala @@ -24,6 +24,8 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ 
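The binaryFormatter introduced in the hunks above, and the ToStringBase.getBinaryFormatter factory that follows, pick one rendering per BINARY_OUTPUT_STYLE setting. As a rough illustration only, plain-Scala approximations of the four named styles for the bytes of "Spark"; the exact strings produced by Spark's Hex.hex and the legacy getHexString path may differ.

object BinaryOutputStylesSketch {
  def main(args: Array[String]): Unit = {
    val bytes = "Spark".getBytes("UTF-8")
    // UTF8: pass the bytes through as a UTF-8 string.
    println(new String(bytes, "UTF-8"))                                         // Spark
    // BASIC: list of signed byte values.
    println(bytes.mkString("[", ", ", "]"))                                     // [83, 112, 97, 114, 107]
    // BASE64: unpadded Base64, matching the encoder call used in getBinaryFormatter.
    println(java.util.Base64.getEncoder.withoutPadding().encodeToString(bytes)) // U3Bhcms
    // HEX: two upper-case hex digits per byte.
    println(bytes.map("%02X".format(_)).mkString)                               // 537061726B
  }
}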
import org.apache.spark.sql.catalyst.util.{ArrayData, DateFormatter, IntervalStringStyles, IntervalUtils, MapData, SparkStringUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.IntervalStringStyles.ANSI_STYLE +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.BinaryOutputStyle import org.apache.spark.sql.types._ import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -44,7 +46,7 @@ trait ToStringBase { self: UnaryExpression with TimeZoneAwareExpression => protected def useDecimalPlainString: Boolean - protected def useHexFormatForBinary: Boolean + protected val binaryFormatter: BinaryFormatter = UTF8String.fromBytes // Makes the function accept Any type input by doing `asInstanceOf[T]`. @inline private def acceptAny[T](func: T => UTF8String): Any => UTF8String = @@ -54,10 +56,7 @@ trait ToStringBase { self: UnaryExpression with TimeZoneAwareExpression => protected final def castToString(from: DataType): Any => UTF8String = from match { case CalendarIntervalType => acceptAny[CalendarInterval](i => UTF8String.fromString(i.toString)) - case BinaryType if useHexFormatForBinary => - acceptAny[Array[Byte]](binary => UTF8String.fromString(SparkStringUtils.getHexString(binary))) - case BinaryType => - acceptAny[Array[Byte]](UTF8String.fromBytes) + case BinaryType => acceptAny[Array[Byte]](binaryFormatter.apply) case DateType => acceptAny[Int](d => UTF8String.fromString(dateFormatter.format(d))) case TimestampType => @@ -172,12 +171,11 @@ trait ToStringBase { self: UnaryExpression with TimeZoneAwareExpression => protected final def castToStringCode( from: DataType, ctx: CodegenContext): (ExprValue, ExprValue) => Block = { from match { - case BinaryType if useHexFormatForBinary => - (c, evPrim) => - val utilCls = SparkStringUtils.getClass.getName.stripSuffix("$") - code"$evPrim = UTF8String.fromString($utilCls.getHexString($c));" case BinaryType => - (c, evPrim) => code"$evPrim = UTF8String.fromBytes($c);" + val bf = JavaCode.global( + ctx.addReferenceObj("binaryFormatter", binaryFormatter), + classOf[BinaryFormatter]) + (c, evPrim) => code"$evPrim = $bf.apply($c);" case DateType => val df = JavaCode.global( ctx.addReferenceObj("dateFormatter", dateFormatter), @@ -414,3 +412,25 @@ trait ToStringBase { self: UnaryExpression with TimeZoneAwareExpression => """.stripMargin } } + +object ToStringBase { + def getBinaryFormatter: BinaryFormatter = { + val style = SQLConf.get.getConf(SQLConf.BINARY_OUTPUT_STYLE) + style.map(BinaryOutputStyle.withName) match { + case Some(BinaryOutputStyle.UTF8) => + (array: Array[Byte]) => UTF8String.fromBytes(array) + case Some(BinaryOutputStyle.BASIC) => + (array: Array[Byte]) => UTF8String.fromString(array.mkString("[", ", ", "]")) + case Some(BinaryOutputStyle.BASE64) => + (array: Array[Byte]) => + UTF8String.fromString(java.util.Base64.getEncoder.withoutPadding().encodeToString(array)) + case Some(BinaryOutputStyle.HEX) => + (array: Array[Byte]) => Hex.hex(array) + case _ => + (array: Array[Byte]) => UTF8String.fromString(SparkStringUtils.getHexString(array)) + } + } +} + +trait BinaryFormatter extends (Array[Byte] => UTF8String) with Serializable + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TransformExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TransformExpression.scala index d37c9d9f6452a..9041ed15fc501 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TransformExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TransformExpression.scala @@ -17,7 +17,10 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, Reducer, ReducibleFunction} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, Reducer, ReducibleFunction, ScalarFunction} +import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.types.DataType /** @@ -30,7 +33,7 @@ import org.apache.spark.sql.types.DataType case class TransformExpression( function: BoundFunction, children: Seq[Expression], - numBucketsOpt: Option[Int] = None) extends Expression with Unevaluable { + numBucketsOpt: Option[Int] = None) extends Expression { override def nullable: Boolean = true @@ -113,4 +116,23 @@ case class TransformExpression( override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(children = newChildren) + + private lazy val resolvedFunction: Option[Expression] = this match { + case TransformExpression(scalarFunc: ScalarFunction[_], arguments, Some(numBuckets)) => + Some(V2ExpressionUtils.resolveScalarFunction(scalarFunc, + Seq(Literal(numBuckets)) ++ arguments)) + case TransformExpression(scalarFunc: ScalarFunction[_], arguments, None) => + Some(V2ExpressionUtils.resolveScalarFunction(scalarFunc, arguments)) + case _ => None + } + + override def eval(input: InternalRow): Any = { + resolvedFunction match { + case Some(fn) => fn.eval(input) + case None => throw QueryExecutionErrors.cannotEvaluateExpressionError(this) + } + } + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + throw QueryExecutionErrors.cannotGenerateCodeForExpressionError(this) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala index 4eacd3442ed5f..05eafe01906a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala @@ -132,6 +132,43 @@ case class TryDivide(left: Expression, right: Expression, replacement: Expressio } } +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(dividend, divisor) - Returns the remainder after `expr1`/`expr2`. " + + "`dividend` must be a numeric. `divisor` must be a numeric.", + examples = """ + Examples: + > SELECT _FUNC_(3, 2); + 1 + > SELECT _FUNC_(2L, 2L); + 0 + > SELECT _FUNC_(3.0, 2.0); + 1.0 + > SELECT _FUNC_(1, 0); + NULL + """, + since = "4.0.0", + group = "math_funcs") +// scalastyle:on line.size.limit +case class TryRemainder(left: Expression, right: Expression, replacement: Expression) + extends RuntimeReplaceable with InheritAnalysisRules { + def this(left: Expression, right: Expression) = this(left, right, + (left.dataType, right.dataType) match { + case (_: NumericType, _: NumericType) => Remainder(left, right, EvalMode.TRY) + // TODO: support TRY eval mode on datetime arithmetic expressions. 
+ case _ => TryEval(Remainder(left, right, EvalMode.ANSI)) + } + ) + + override def prettyName: String = "try_remainder" + + override def parameters: Seq[Expression] = Seq(left, right) + + override protected def withNewChildInternal(newChild: Expression): Expression = { + copy(replacement = newChild) + } +} + @ExpressionDescription( usage = "_FUNC_(expr1, expr2) - Returns `expr1`-`expr2` and the result is null on overflow. " + "The acceptable input types are the same with the `-` operator.", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/V2ExpressionUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/V2ExpressionUtils.scala index c6cfccb74c161..220920a5a3198 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/V2ExpressionUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/V2ExpressionUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import java.lang.reflect.{Method, Modifier} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{FUNCTION_NAME, FUNCTION_PARAMETER} +import org.apache.spark.internal.LogKeys.{FUNCTION_NAME, FUNCTION_PARAM} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.analysis.NoSuchFunctionException @@ -136,7 +136,7 @@ object V2ExpressionUtils extends SQLConfHelper with Logging { case _: NoSuchFunctionException => val parameterString = args.map(_.dataType.typeName).mkString("(", ", ", ")") logWarning(log"V2 function ${MDC(FUNCTION_NAME, name)} " + - log"with parameter types ${MDC(FUNCTION_PARAMETER, parameterString)} is used in " + + log"with parameter types ${MDC(FUNCTION_PARAM, parameterString)} is used in " + log"partition transforms, but its definition couldn't be found in the function catalog " + log"provided") None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala index 2745b663639f8..5f6f9afa5797a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/With.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, TreePattern, WITH_EXPRESSION} import org.apache.spark.sql.types.DataType @@ -27,13 +28,35 @@ import org.apache.spark.sql.types.DataType */ case class With(child: Expression, defs: Seq[CommonExpressionDef]) extends Expression with Unevaluable { + // We do not allow creating a With expression with an AggregateExpression that contains a + // reference to a common expression defined in that scope (note that it can contain another With + // expression with a common expression ref of the inner With). This is to prevent the creation of + // a dangling CommonExpressionRef after rewriting it in RewriteWithExpression. 
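The assertion added just below relies on childContainsUnsupportedAggExpr, defined later in this hunk. A simplified standalone model of the invariant (tiny ad-hoc tree classes, not Catalyst expressions) may help: an aggregate under a With must not reference a common expression defined by that same With, while references that are not in the enclosing scope remain allowed.

object WithInvariantSketch {
  sealed trait Node {
    def children: Seq[Node]
    def exists(p: Node => Boolean): Boolean = p(this) || children.exists(_.exists(p))
  }
  final case class Ref(id: Long) extends Node { val children: Seq[Node] = Nil }
  final case class Agg(child: Node) extends Node { val children: Seq[Node] = Seq(child) }
  final case class Plus(left: Node, right: Node) extends Node { val children: Seq[Node] = Seq(left, right) }

  // Mirrors childContainsUnsupportedAggExpr: reject an aggregate that references a common
  // expression id defined by the enclosing With, since rewriting would leave the ref dangling.
  def containsUnsupportedAgg(child: Node, definedIds: Set[Long]): Boolean =
    child.exists {
      case Agg(c) =>
        c.exists {
          case Ref(id) => definedIds(id)
          case _ => false
        }
      case _ => false
    }

  def main(args: Array[String]): Unit = {
    println(containsUnsupportedAgg(Agg(Plus(Ref(1), Ref(2))), Set(1L))) // true: rejected
    println(containsUnsupportedAgg(Plus(Agg(Ref(3)), Ref(1)), Set(1L))) // false: allowed
  }
}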
+ assert(!With.childContainsUnsupportedAggExpr(this)) + override val nodePatterns: Seq[TreePattern] = Seq(WITH_EXPRESSION) override def dataType: DataType = child.dataType override def nullable: Boolean = child.nullable override def children: Seq[Expression] = child +: defs override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): Expression = { - copy(child = newChildren.head, defs = newChildren.tail.map(_.asInstanceOf[CommonExpressionDef])) + val newDefs = newChildren.tail.map(_.asInstanceOf[CommonExpressionDef]) + // If any `CommonExpressionDef` has been updated (data type or nullability), also update its + // `CommonExpressionRef` in the `child`. + val newChild = newDefs.filter(_.resolved).foldLeft(newChildren.head) { (result, newDef) => + defs.find(_.id == newDef.id).map { oldDef => + if (newDef.dataType != oldDef.dataType || newDef.nullable != oldDef.nullable) { + val newRef = new CommonExpressionRef(newDef) + result.transform { + case oldRef: CommonExpressionRef if oldRef.id == newRef.id => + newRef + } + } else { + result + } + }.getOrElse(result) + } + copy(child = newChild, defs = newDefs) } /** @@ -88,6 +111,21 @@ object With { val commonExprRefs = commonExprDefs.map(new CommonExpressionRef(_)) With(replaced(commonExprRefs), commonExprDefs) } + + private[sql] def childContainsUnsupportedAggExpr(withExpr: With): Boolean = { + lazy val commonExprIds = withExpr.defs.map(_.id).toSet + withExpr.child.exists { + case agg: AggregateExpression => + // Check that the aggregate expression does not contain a reference to a common expression + // in the outer With expression (it is ok if it contains a reference to a common expression + // for a nested With expression). + agg.exists { + case r: CommonExpressionRef => commonExprIds.contains(r.id) + case _ => false + } + case _ => false + } + } } case class CommonExpressionId(id: Long = CommonExpressionId.newId, canonicalized: Boolean = false) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index d1a9cafdf61fa..5977eff4526da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -18,13 +18,14 @@ package org.apache.spark.sql.catalyst.expressions.aggregate import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, UnresolvedWithinGroup} +import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, TypeCheckResult, UnresolvedWithinGroup} import org.apache.spark.sql.catalyst.expressions.{Ascending, Descending, Expression, ExpressionDescription, ImplicitCastInputTypes, SortOrder} import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.types.PhysicalDataType -import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.catalyst.util.{CollationFactory, GenericArrayData, UnsafeRowUtils} import org.apache.spark.sql.errors.QueryCompilationErrors -import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, ArrayType, BooleanType, DataType} +import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, ArrayType, BooleanType, DataType, StringType} +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.collection.OpenHashMap case class Mode( @@ -48,6 +49,21 @@ case class Mode( 
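Before the collation-aware evaluation added below, a plain-Scala sketch of the groupMapReduce step it performs; lower-casing is only a stand-in for CollationFactory.getCollationKey under a case-insensitive collation, and the buffer literal is made up for the example.

object CollationAwareModeSketch {
  def main(args: Array[String]): Unit = {
    // Value -> count pairs, as Mode's OpenHashMap buffer would hold them.
    val buffer = Map("Seattle" -> 2L, "SEATTLE" -> 3L, "Tokyo" -> 4L)
    val collationAware = buffer.toSeq
      .groupMapReduce { case (k, _) => k.toLowerCase }(identity)((x, y) => (x._1, x._2 + y._2))
      .values
    // "Seattle" and "SEATTLE" collapse into one group with count 5, which now wins over "Tokyo".
    println(collationAware.maxBy(_._2)) // (Seattle,5): one representative key of the merged group
  }
}

Grouping by the collation key rather than the raw string is what lets Mode treat values that compare equal under the collation as a single candidate.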
override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType) + override def checkInputDataTypes(): TypeCheckResult = { + if (UnsafeRowUtils.isBinaryStable(child.dataType) || child.dataType.isInstanceOf[StringType]) { + /* + * The Mode class uses collation awareness logic to handle string data. + * Complex types with collated fields are not yet supported. + */ + // TODO: SPARK-48700: Mode expression for complex types (all collations) + super.checkInputDataTypes() + } else { + TypeCheckResult.TypeCheckFailure("The input to the function 'mode' was" + + " a type of binary-unstable type that is " + + s"not currently supported by ${prettyName}.") + } + } + override def prettyName: String = "mode" override def update( @@ -74,7 +90,29 @@ case class Mode( if (buffer.isEmpty) { return null } - + /* + * The Mode class uses special collation awareness logic + * to handle string data types with various collations. + * + * For string types that don't support binary equality, + * we create a new map where the keys are the collation keys of the original strings. + * + * Keys from the original map are aggregated based on the corresponding collation keys. + * The groupMapReduce method groups the entries by collation key and maps each group + * to a single value (the sum of the counts), and finally reduces the groups to a single map. + * + * The new map is then used in the rest of the Mode evaluation logic. + */ + val collationAwareBuffer = child.dataType match { + case c: StringType if + !CollationFactory.fetchCollation(c.collationId).supportsBinaryEquality => + val collationId = c.collationId + val modeMap = buffer.toSeq.groupMapReduce { + case (k, _) => CollationFactory.getCollationKey(k.asInstanceOf[UTF8String], collationId) + }(x => x)((x, y) => (x._1, x._2 + y._2)).values + modeMap + case _ => buffer + } reverseOpt.map { reverse => val defaultKeyOrdering = if (reverse) { PhysicalDataType.ordering(child.dataType).asInstanceOf[Ordering[AnyRef]].reverse @@ -82,8 +120,8 @@ case class Mode( PhysicalDataType.ordering(child.dataType).asInstanceOf[Ordering[AnyRef]] } val ordering = Ordering.Tuple2(Ordering.Long, defaultKeyOrdering) - buffer.maxBy { case (key, count) => (count, key) }(ordering) - }.getOrElse(buffer.maxBy(_._2))._1 + collationAwareBuffer.maxBy { case (key, count) => (count, key) }(ordering) + }.getOrElse(collationAwareBuffer.maxBy(_._2))._1 } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): Mode = @@ -128,6 +166,7 @@ case class Mode( copy(child = newChild) } +// TODO: SPARK-48701: PandasMode (all collations) // scalastyle:off line.size.limit @ExpressionDescription( usage = """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/V2Aggregator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/V2Aggregator.scala index bb94421bc7d40..49ba2ec8b904e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/V2Aggregator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/V2Aggregator.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.catalyst.expressions.aggregate -import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} - import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, UnsafeProjection} import org.apache.spark.sql.connector.catalog.functions.{AggregateFunction => V2AggregateFunction} 
import org.apache.spark.sql.types.{AbstractDataType, DataType} import org.apache.spark.util.ArrayImplicits._ +import org.apache.spark.util.Utils case class V2Aggregator[BUF <: java.io.Serializable, OUT]( aggrFunc: V2AggregateFunction[BUF, OUT], @@ -50,16 +49,11 @@ case class V2Aggregator[BUF <: java.io.Serializable, OUT]( } override def serialize(buffer: BUF): Array[Byte] = { - val bos = new ByteArrayOutputStream() - val out = new ObjectOutputStream(bos) - out.writeObject(buffer) - out.close() - bos.toByteArray + Utils.serialize(buffer) } override def deserialize(bytes: Array[Byte]): BUF = { - val in = new ObjectInputStream(new ByteArrayInputStream(bytes)) - in.readObject().asInstanceOf[BUF] + Utils.deserialize(bytes) } def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): V2Aggregator[BUF, OUT] = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala index 02925f3625d2e..2102428131f64 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala @@ -25,7 +25,9 @@ import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, Literal} import org.apache.spark.sql.catalyst.trees.BinaryLike +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types.{AbstractDataType, BinaryType, BooleanType, DataType, IntegerType, LongType, StringType, TypeCollection} import org.apache.spark.unsafe.types.UTF8String @@ -103,7 +105,7 @@ case class HllSketchAgg( override def prettyName: String = "hll_sketch_agg" override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(IntegerType, LongType, StringType, BinaryType), IntegerType) + Seq(TypeCollection(IntegerType, LongType, StringTypeAnyCollation, BinaryType), IntegerType) override def dataType: DataType = BinaryType @@ -137,7 +139,9 @@ case class HllSketchAgg( // TODO: implement support for decimal/datetime/interval types case IntegerType => sketch.update(v.asInstanceOf[Int]) case LongType => sketch.update(v.asInstanceOf[Long]) - case StringType => sketch.update(v.asInstanceOf[UTF8String].toString) + case st: StringType => + val cKey = CollationFactory.getCollationKey(v.asInstanceOf[UTF8String], st.collationId) + sketch.update(cKey.toString) case BinaryType => sketch.update(v.asInstanceOf[Array[Byte]]) case dataType => throw new SparkUnsupportedOperationException( errorClass = "_LEGACY_ERROR_TEMP_3121", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 9eecf81684cea..f1b192a3e21f9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions.Cast.{toSQLId, toSQLType} import 
org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.trees.TreePattern.{BINARY_ARITHMETIC, TreePattern, UNARY_POSITIVE} +import org.apache.spark.sql.catalyst.trees.TreePattern.{BINARY_ARITHMETIC, TreePattern} import org.apache.spark.sql.catalyst.types.{PhysicalDecimalType, PhysicalFractionalType, PhysicalIntegerType, PhysicalIntegralType, PhysicalLongType} import org.apache.spark.sql.catalyst.util.{IntervalMathUtils, IntervalUtils, MathUtils, TypeUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} @@ -114,7 +114,7 @@ case class UnaryMinus( since = "1.5.0", group = "math_funcs") case class UnaryPositive(child: Expression) - extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { + extends RuntimeReplaceable with ImplicitCastInputTypes with NullIntolerant { override def prettyName: String = "positive" @@ -122,17 +122,15 @@ case class UnaryPositive(child: Expression) override def dataType: DataType = child.dataType - final override val nodePatterns: Seq[TreePattern] = Seq(UNARY_POSITIVE) - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = - defineCodeGen(ctx, ev, c => c) + override def sql: String = s"(+ ${child.sql})" - protected override def nullSafeEval(input: Any): Any = input + override lazy val replacement: Expression = child - override def sql: String = s"(+ ${child.sql})" + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression]): UnaryPositive = + copy(newChildren.head) - override protected def withNewChildInternal(newChild: Expression): UnaryPositive = - copy(child = newChild) + override def children: Seq[Expression] = child :: Nil } /** @@ -452,9 +450,8 @@ case class Add( copy(left = newLeft, right = newRight) override lazy val canonicalized: Expression = { - // TODO: do not reorder consecutive `Add`s with different `evalMode` val reorderResult = buildCanonicalizedPlan( - { case Add(l, r, _) => Seq(l, r) }, + { case Add(l, r, em) if em == evalMode => Seq(l, r) }, { case (l: Expression, r: Expression) => Add(l, r, evalMode)}, Some(evalMode) ) @@ -608,10 +605,9 @@ case class Multiply( newLeft: Expression, newRight: Expression): Multiply = copy(left = newLeft, right = newRight) override lazy val canonicalized: Expression = { - // TODO: do not reorder consecutive `Multiply`s with different `evalMode` buildCanonicalizedPlan( - { case Multiply(l, r, _) => Seq(l, r) }, - { case (l: Expression, r: Expression) => Multiply(l, r, evalMode)}, + { case Multiply(l, r, em) if em == evalMode => Seq(l, r) }, + { case (l: Expression, r: Expression) => Multiply(l, r, evalMode) }, Some(evalMode) ) } @@ -888,7 +884,7 @@ case class IntegralDivide( } @ExpressionDescription( - usage = "expr1 _FUNC_ expr2 - Returns the remainder after `expr1`/`expr2`.", + usage = "expr1 % expr2, or mod(expr1, expr2) - Returns the remainder after `expr1`/`expr2`.", examples = """ Examples: > SELECT 2 % 1.8; @@ -908,6 +904,10 @@ case class Remainder( override def inputType: AbstractDataType = NumericType + // `try_remainder` has exactly the same behavior as the legacy divide, so here it only executes + // the error code path when `evalMode` is `ANSI`. 
+ protected override def failOnError: Boolean = evalMode == EvalMode.ANSI + override def symbol: String = "%" override def decimalMethod: String = "remainder" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala index 89890ea086414..88085636a5ff1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala @@ -229,7 +229,7 @@ case class BitwiseCount(child: Expression) override def prettyName: String = "bit_count" override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = child.dataType match { - case BooleanType => defineCodeGen(ctx, ev, c => s"if ($c) 1 else 0") + case BooleanType => defineCodeGen(ctx, ev, c => s"($c) ? 1 : 0") case _ => defineCodeGen(ctx, ev, c => s"java.lang.Long.bitCount($c)") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 5aa766a60c106..a39c10866984e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -32,9 +32,8 @@ import org.codehaus.janino.util.ClassFile import org.apache.spark.{SparkException, SparkIllegalArgumentException, TaskContext, TaskKilledException} import org.apache.spark.executor.InputMetrics -import org.apache.spark.internal.Logging -import org.apache.spark.internal.LogKey._ -import org.apache.spark.internal.MDC +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.metrics.source.CodegenMetrics import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.HashableWeakReference @@ -1547,13 +1546,11 @@ object CodeGenerator extends Logging { updateAndGetCompilationStats(evaluator) } catch { case e: InternalCompilerException => - val msg = QueryExecutionErrors.failedToCompileMsg(e) - logError(msg, e) + logError("Failed to compile the generated Java code.", e) logGeneratedCode(code) throw QueryExecutionErrors.internalCompilerError(e) case e: CompileException => - val msg = QueryExecutionErrors.failedToCompileMsg(e) - logError(msg, e) + logError("Failed to compile the generated Java code.", e) logGeneratedCode(code) throw QueryExecutionErrors.compilerError(e) } @@ -1595,9 +1592,10 @@ object CodeGenerator extends Logging { CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update(byteCodeSize) if (byteCodeSize > DEFAULT_JVM_HUGE_METHOD_LIMIT) { - logInfo("Generated method too long to be JIT compiled: " + - log"${MDC(CLASS_NAME, cf.getThisClassName)}.${MDC(METHOD_NAME, method.getName)} " + - log"is ${MDC(BYTECODE_SIZE, byteCodeSize)} bytes") + logInfo(log"Generated method too long to be JIT compiled: " + + log"${MDC(LogKeys.CLASS_NAME, cf.getThisClassName)}." 
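A minimal sketch of the error-handling split noted above (illustrative only; `Option` stands in for a nullable SQL result): legacy `%` and `try_remainder` yield NULL on a zero divisor, so only ANSI mode needs the error path, which is why `failOnError` is tied to `EvalMode.ANSI`.

    object RemainderModeSketch extends App {
      sealed trait EvalMode
      case object Legacy extends EvalMode
      case object Ansi extends EvalMode
      case object Try extends EvalMode

      def remainder(a: Long, b: Long, mode: EvalMode): Option[Long] =
        if (b == 0) {
          if (mode == Ansi) throw new ArithmeticException("Division by zero") // error path, ANSI only
          else None                                                           // SQL NULL
        } else {
          Some(a % b)
        }

      assert(remainder(7, 2, Legacy).contains(1L))
      assert(remainder(7, 0, Try).isEmpty)    // try_remainder: NULL instead of an error
      assert(remainder(7, 0, Legacy).isEmpty) // legacy remainder: NULL as well
    }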
+ + log"${MDC(LogKeys.METHOD_NAME, method.getName)} is " + + log"${MDC(LogKeys.BYTECODE_SIZE, byteCodeSize)} bytes") } byteCodeSize diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index 6af00e193d94d..c528b523c5e7f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -38,8 +38,8 @@ import org.apache.spark.sql.types._ Examples: > SET spark.sql.collation.enabled=true; spark.sql.collation.enabled true - > SELECT COLLATION('Spark SQL' _FUNC_ UTF8_BINARY_LCASE); - UTF8_BINARY_LCASE + > SELECT COLLATION('Spark SQL' _FUNC_ UTF8_LCASE); + UTF8_LCASE > SET spark.sql.collation.enabled=false; spark.sql.collation.enabled false """, @@ -57,14 +57,14 @@ object CollateExpressionBuilder extends ExpressionBuilder { expressions match { case Seq(e: Expression, collationExpr: Expression) => (collationExpr.dataType, collationExpr.foldable) match { - case (StringType, true) => + case (_: StringType, true) => val evalCollation = collationExpr.eval() if (evalCollation == null) { throw QueryCompilationErrors.unexpectedNullError("collation", collationExpr) } else { Collate(e, evalCollation.toString) } - case (StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError( + case (_: StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError( funcName, "collationName", StringType) case (_, _) => throw QueryCompilationErrors.unexpectedInputDataTypeError( funcName, 1, StringType, collationExpr) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 39bf6734eb27b..ea117f876550e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -96,9 +96,9 @@ trait BinaryArrayExpressionWithImplicitCast @ExpressionDescription( usage = """ _FUNC_(expr) - Returns the size of an array or a map. - The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or - spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. - With the default settings, the function returns -1 for null input. + This function returns -1 for null input only if spark.sql.ansi.enabled is false and + spark.sql.legacy.sizeOfNull is true. Otherwise, it returns null for null input. + With the default settings, the function returns null for null input. 
""", examples = """ Examples: @@ -713,6 +713,7 @@ case class MapConcat(children: Seq[Expression]) } } + override def stateful: Boolean = true override def nullable: Boolean = children.exists(_.nullable) private lazy val mapBuilder = new ArrayBasedMapBuilder(dataType.keyType, dataType.valueType) @@ -828,6 +829,8 @@ case class MapFromEntries(child: Expression) override def nullable: Boolean = child.nullable || nullEntries + override def stateful: Boolean = true + @transient override lazy val dataType: MapType = dataTypeDetails.get._1 override def checkInputDataTypes(): TypeCheckResult = dataTypeDetails match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 3eb6225b5426e..1bfa11d67af6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.ArrayImplicits._ @@ -195,7 +196,7 @@ case class CreateMap(children: Seq[Expression], useStringTypeWhenEmpty: Boolean) private val defaultElementType: DataType = { if (useStringTypeWhenEmpty) { - StringType + SQLConf.get.defaultStringType } else { NullType } @@ -244,6 +245,8 @@ case class CreateMap(children: Seq[Expression], useStringTypeWhenEmpty: Boolean) private lazy val mapBuilder = new ArrayBasedMapBuilder(dataType.keyType, dataType.valueType) + override def stateful: Boolean = true + override def eval(input: InternalRow): Any = { var i = 0 while (i < keys.length) { @@ -319,6 +322,8 @@ case class MapFromArrays(left: Expression, right: Expression) valueContainsNull = right.dataType.asInstanceOf[ArrayType].containsNull) } + override def stateful: Boolean = true + private lazy val mapBuilder = new ArrayBasedMapBuilder(dataType.keyType, dataType.valueType) override def nullSafeEval(keyArray: Any, valueArray: Any): Any = { @@ -349,7 +354,7 @@ case class MapFromArrays(left: Expression, right: Expression) case object NamePlaceholder extends LeafExpression with Unevaluable { override lazy val resolved: Boolean = false override def nullable: Boolean = false - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = "NamePlaceholder" override def toString: String = prettyName } @@ -373,7 +378,8 @@ object CreateStruct { // We should always use the last part of the column name (`c` in the above example) as the // alias name inside CreateNamedStruct. 
case (u: UnresolvedAttribute, _) => Seq(Literal(u.nameParts.last), u) - case (u @ UnresolvedExtractValue(_, e: Literal), _) if e.dataType == StringType => Seq(e, u) + case (u @ UnresolvedExtractValue(_, e: Literal), _) if e.dataType.isInstanceOf[StringType] => + Seq(e, u) case (a: Alias, _) => Seq(Literal(a.name), a) case (e: NamedExpression, _) if e.resolved => Seq(Literal(e.name), e) case (e: NamedExpression, _) => Seq(NamePlaceholder, e) @@ -464,7 +470,7 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression with toSQLId(prettyName), Seq("2n (n > 0)"), children.length ) } else { - val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType) + val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType.isInstanceOf[StringType]) if (invalidNames.nonEmpty) { DataTypeMismatch( errorSubClass = "CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING", @@ -566,15 +572,18 @@ case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: E this(child, Literal(","), Literal(":")) } + override def stateful: Boolean = true + override def first: Expression = text override def second: Expression = pairDelim override def third: Expression = keyValueDelim - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation, StringTypeAnyCollation) - override def dataType: DataType = MapType(StringType, StringType) + override def dataType: DataType = MapType(first.dataType, first.dataType) - private lazy val mapBuilder = new ArrayBasedMapBuilder(StringType, StringType) + private lazy val mapBuilder = new ArrayBasedMapBuilder(first.dataType, first.dataType) override def nullSafeEval( inputString: Any, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index a801d0367080d..ff94322efdaa4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -51,12 +51,12 @@ object ExtractValue { resolver: Resolver): Expression = { (child.dataType, extraction) match { - case (StructType(fields), NonNullLiteral(v, StringType)) => + case (StructType(fields), NonNullLiteral(v, _: StringType)) => val fieldName = v.toString val ordinal = findField(fields, fieldName, resolver) GetStructField(child, ordinal, Some(fieldName)) - case (ArrayType(StructType(fields), containsNull), NonNullLiteral(v, StringType)) => + case (ArrayType(StructType(fields), containsNull), NonNullLiteral(v, _: StringType)) => val fieldName = v.toString val ordinal = findField(fields, fieldName, resolver) GetArrayStructFields(child, fields(ordinal).copy(name = fieldName), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala index 4714fc1ded9cd..cb10440c48328 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.TypeUtils._ import 
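The repeated switch above from `dataType == StringType` and `case StringType` to `_: StringType` follows from collated string types being a family of instances rather than a single object. A toy model of the difference, with a hypothetical `StrType(collation)` standing in for Spark's collated `StringType`:

    object StringTypeMatchSketch extends App {
      sealed trait DType
      case class StrType(collation: String = "UTF8_BINARY") extends DType // stand-in, not Spark's class
      case object IntType extends DType

      val collated: DType = StrType("UTF8_LCASE")

      // Equality against the default instance misses collated strings...
      assert(collated != StrType())

      // ...while a type-based match accepts every collation.
      val isString = collated match {
        case _: StrType => true
        case _          => false
      }
      assert(isString)
    }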
org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -146,7 +147,7 @@ case class CsvToStructs( converter(parser.parse(csv)) } - override def inputTypes: Seq[AbstractDataType] = StringType :: Nil + override def inputTypes: Seq[AbstractDataType] = StringTypeAnyCollation :: Nil override def prettyName: String = "from_csv" @@ -177,7 +178,7 @@ case class SchemaOfCsv( child = child, options = ExprUtils.convertToMapData(options)) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = false @@ -300,7 +301,7 @@ case class StructsToCsv( (row: Any) => UTF8String.fromString(gen.writeToString(row.asInstanceOf[InternalRow])) } - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index a9155e8daf101..808ad54f8ecad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.catalyst.util.LegacyDateFormats.SIMPLE_DATE_FORMAT import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.sql.types.DayTimeIntervalType.DAY import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -104,7 +105,7 @@ trait TimestampFormatterHelper extends TimeZoneAwareExpression { since = "3.1.0") case class CurrentTimeZone() extends LeafExpression with Unevaluable { override def nullable: Boolean = false - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = "current_timezone" final override val nodePatterns: Seq[TreePattern] = Seq(CURRENT_LIKE) } @@ -134,7 +135,7 @@ case class CurrentTimeZone() extends LeafExpression with Unevaluable { since = "1.5.0") // scalastyle:on line.size.limit case class CurrentDate(timeZoneId: Option[String] = None) - extends LeafExpression with TimeZoneAwareExpression with Unevaluable { + extends LeafExpression with TimeZoneAwareExpression with FoldableUnevaluable { def this() = this(None) override def nullable: Boolean = false override def dataType: DataType = DateType @@ -169,7 +170,7 @@ object CurDateExpressionBuilder extends ExpressionBuilder { } } -abstract class CurrentTimestampLike() extends LeafExpression with Unevaluable { +abstract class CurrentTimestampLike() extends LeafExpression with FoldableUnevaluable { override def nullable: Boolean = false override def dataType: DataType = TimestampType final override val nodePatterns: Seq[TreePattern] = Seq(CURRENT_LIKE) @@ -235,7 +236,7 @@ case class Now() extends CurrentTimestampLike { group = "datetime_funcs", since = "3.4.0") case 
class LocalTimestamp(timeZoneId: Option[String] = None) extends LeafExpression - with TimeZoneAwareExpression with Unevaluable { + with TimeZoneAwareExpression with FoldableUnevaluable { def this() = this(None) override def nullable: Boolean = false override def dataType: DataType = TimestampNTZType @@ -923,7 +924,7 @@ case class DayName(child: Expression) extends GetDateField { override val funcName = "getDayName" override def inputTypes: Seq[AbstractDataType] = Seq(DateType) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override protected def withNewChildInternal(newChild: Expression): DayName = copy(child = newChild) } @@ -951,9 +952,9 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti def this(left: Expression, right: Expression) = this(left, right, None) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType - override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringTypeAnyCollation) override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) @@ -1261,7 +1262,8 @@ abstract class ToTimestamp override def forTimestampNTZ: Boolean = left.dataType == TimestampNTZType override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(StringType, DateType, TimestampType, TimestampNTZType), StringType) + Seq(TypeCollection(StringTypeAnyCollation, DateType, TimestampType, TimestampNTZType), + StringTypeAnyCollation) override def dataType: DataType = LongType override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true @@ -1283,7 +1285,7 @@ abstract class ToTimestamp daysToMicros(t.asInstanceOf[Int], zoneId) / downScaleFactor case TimestampType | TimestampNTZType => t.asInstanceOf[Long] / downScaleFactor - case StringType => + case _: StringType => val fmt = right.eval(input) if (fmt == null) { null @@ -1326,7 +1328,7 @@ abstract class ToTimestamp } left.dataType match { - case StringType => formatterOption.map { fmt => + case _: StringType => formatterOption.map { fmt => val df = classOf[TimestampFormatter].getName val formatterName = ctx.addReferenceObj("formatter", fmt, df) nullSafeCodeGen(ctx, ev, (datetimeStr, _) => @@ -1429,10 +1431,10 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ this(unix, Literal(TimestampFormatter.defaultPattern())) } - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = true - override def inputTypes: Seq[AbstractDataType] = Seq(LongType, StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(LongType, StringTypeAnyCollation) override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) @@ -1540,7 +1542,7 @@ case class NextDay( def this(left: Expression, right: Expression) = this(left, right, SQLConf.get.ansiEnabled) - override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringTypeAnyCollation) override def dataType: DataType = DateType override def nullable: Boolean = true @@ -1751,7 +1753,7 @@ sealed trait UTCTimestamp extends BinaryExpression with ImplicitCastInputTypes w val func: (Long, String) => Long val funcName: String - override def inputTypes: 
Seq[AbstractDataType] = Seq(TimestampType, StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringTypeAnyCollation) override def dataType: DataType = TimestampType override def nullSafeEval(time: Any, timezone: Any): Any = { @@ -2091,8 +2093,8 @@ case class ParseToDate( override def inputTypes: Seq[AbstractDataType] = { // Note: ideally this function should only take string input, but we allow more types here to // be backward compatible. - TypeCollection(StringType, DateType, TimestampType, TimestampNTZType) +: - format.map(_ => StringType).toSeq + TypeCollection(StringTypeAnyCollation, DateType, TimestampType, TimestampNTZType) +: + format.map(_ => StringTypeAnyCollation).toSeq } override protected def withNewChildrenInternal( @@ -2163,10 +2165,10 @@ case class ParseToTimestamp( override def inputTypes: Seq[AbstractDataType] = { // Note: ideally this function should only take string input, but we allow more types here to // be backward compatible. - val types = Seq(StringType, DateType, TimestampType, TimestampNTZType) + val types = Seq(StringTypeAnyCollation, DateType, TimestampType, TimestampNTZType) TypeCollection( (if (dataType.isInstanceOf[TimestampType]) types :+ NumericType else types): _* - ) +: format.map(_ => StringType).toSeq + ) +: format.map(_ => StringTypeAnyCollation).toSeq } override protected def withNewChildrenInternal( @@ -2296,7 +2298,7 @@ case class TruncDate(date: Expression, format: Expression) override def left: Expression = date override def right: Expression = format - override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringTypeAnyCollation) override def dataType: DataType = DateType override def prettyName: String = "trunc" override val instant = date @@ -2365,7 +2367,7 @@ case class TruncTimestamp( override def left: Expression = format override def right: Expression = timestamp - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, TimestampType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, TimestampType) override def dataType: TimestampType = TimestampType override def prettyName: String = "date_trunc" override val instant = timestamp @@ -2666,7 +2668,7 @@ case class MakeTimestamp( // casted into decimal safely, we use DecimalType(16, 6) which is wider than DecimalType(10, 0). override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType, IntegerType, IntegerType, IntegerType, IntegerType, DecimalType(16, 6)) ++ - timezone.map(_ => StringType) + timezone.map(_ => StringTypeAnyCollation) override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = @@ -2938,7 +2940,7 @@ case class Extract(field: Expression, source: Expression, replacement: Expressio object Extract { def createExpr(funcName: String, field: Expression, source: Expression): Expression = { // both string and null literals are allowed. 
- if ((field.dataType == StringType || field.dataType == NullType) && field.foldable) { + if ((field.dataType.isInstanceOf[StringType] || field.dataType == NullType) && field.foldable) { val fieldStr = field.eval().asInstanceOf[UTF8String] if (fieldStr == null) { Literal(null, DoubleType) @@ -3113,7 +3115,8 @@ case class ConvertTimezone( override def second: Expression = targetTz override def third: Expression = sourceTs - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, TimestampNTZType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, + StringTypeAnyCollation, TimestampNTZType) override def dataType: DataType = TimestampNTZType override def nullSafeEval(srcTz: Any, tgtTz: Any, micros: Any): Any = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 436efa8924165..fa342f6415097 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -63,7 +63,7 @@ import org.apache.spark.util.ArrayImplicits._ case class Md5(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(BinaryType) @@ -103,7 +103,7 @@ case class Md5(child: Expression) case class Sha2(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant with Serializable { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = true override def inputTypes: Seq[DataType] = Seq(BinaryType, IntegerType) @@ -169,7 +169,7 @@ case class Sha2(left: Expression, right: Expression) case class Sha1(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(BinaryType) @@ -271,6 +271,10 @@ abstract class HashExpression[E] extends Expression { dt.existsRecursively(_.isInstanceOf[MapType]) } + private def hasVariantType(dt: DataType): Boolean = { + dt.existsRecursively(_.isInstanceOf[VariantType]) + } + override def checkInputDataTypes(): TypeCheckResult = { if (children.length < 1) { throw QueryCompilationErrors.wrongNumArgsError( @@ -281,6 +285,10 @@ abstract class HashExpression[E] extends Expression { DataTypeMismatch( errorSubClass = "HASH_MAP_TYPE", messageParameters = Map("functionName" -> toSQLId(prettyName))) + } else if (children.exists(child => hasVariantType(child.dataType))) { + DataTypeMismatch( + errorSubClass = "HASH_VARIANT_TYPE", + messageParameters = Map("functionName" -> toSQLId(prettyName))) } else { TypeCheckResult.TypeCheckSuccess } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index 896f3e9774f37..80bcf156133ed 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -920,6 +920,8 @@ 
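The new `hasVariantType` guard above is a plain recursive type walk; the same idea over a self-contained toy type algebra:

    object ContainsTypeSketch extends App {
      sealed trait DT
      case object IntT extends DT
      case object VariantT extends DT
      case class ArrayT(element: DT) extends DT
      case class StructT(fields: Seq[DT]) extends DT

      // Mirrors existsRecursively: true if the predicate holds anywhere in the nested type.
      def existsRecursively(dt: DT)(p: DT => Boolean): Boolean = p(dt) || (dt match {
        case ArrayT(e)       => existsRecursively(e)(p)
        case StructT(fields) => fields.exists(existsRecursively(_)(p))
        case _               => false
      })

      assert(existsRecursively(StructT(Seq(IntT, ArrayT(VariantT))))(_ == VariantT))
      assert(!existsRecursively(ArrayT(IntT))(_ == VariantT))
    }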
case class TransformKeys( override def dataType: MapType = MapType(function.dataType, valueType, valueContainsNull) + override def stateful: Boolean = true + override def checkInputDataTypes(): TypeCheckResult = { TypeUtils.checkForMapKeyType(function.dataType) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala index 6cd88367aa9a0..65eb995ff32ff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/inputFileBlock.scala @@ -21,7 +21,8 @@ import org.apache.spark.rdd.InputFileBlockHolder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.types.{DataType, LongType, StringType} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, LongType} import org.apache.spark.unsafe.types.UTF8String // scalastyle:off whitespace.end.of.line @@ -39,7 +40,7 @@ case class InputFileName() extends LeafExpression with Nondeterministic { override def nullable: Boolean = false - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = "input_file_name" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index f35c6da4f8af9..7005d663a3f96 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -31,11 +31,13 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper +import org.apache.spark.sql.catalyst.expressions.variant.VariantExpressionEvalUtils import org.apache.spark.sql.catalyst.json._ import org.apache.spark.sql.catalyst.trees.TreePattern.{JSON_TO_STRUCT, TreePattern} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{UTF8String, VariantVal} import org.apache.spark.util.Utils @@ -131,8 +133,9 @@ case class GetJsonObject(json: Expression, path: Expression) override def left: Expression = json override def right: Expression = path - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation) + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = true override def prettyName: String = "get_json_object" @@ -476,7 +479,7 @@ case class JsonTuple(children: Seq[Expression]) @transient private lazy val 
constantFields: Int = foldableFieldNames.count(_ != null) override def elementSchema: StructType = StructType(fieldExpressions.zipWithIndex.map { - case (_, idx) => StructField(s"c$idx", StringType, nullable = true) + case (_, idx) => StructField(s"c$idx", children.head.dataType, nullable = true) }) override def prettyName: String = "json_tuple" @@ -486,7 +489,7 @@ case class JsonTuple(children: Seq[Expression]) throw QueryCompilationErrors.wrongNumArgsError( toSQLId(prettyName), Seq("> 1"), children.length ) - } else if (children.forall(child => StringType.acceptsType(child.dataType))) { + } else if (children.forall(child => StringTypeAnyCollation.acceptsType(child.dataType))) { TypeCheckResult.TypeCheckSuccess } else { DataTypeMismatch( @@ -664,7 +667,7 @@ case class JsonToStructs( timeZoneId = None) override def checkInputDataTypes(): TypeCheckResult = nullableSchema match { - case _: StructType | _: ArrayType | _: MapType => + case _: StructType | _: ArrayType | _: MapType | _: VariantType => val checkResult = ExprUtils.checkJsonSchema(nullableSchema) if (checkResult.isFailure) checkResult else super.checkInputDataTypes() case _ => @@ -714,11 +717,14 @@ case class JsonToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) - override def nullSafeEval(json: Any): Any = { - converter(parser.parse(json.asInstanceOf[UTF8String])) + override def nullSafeEval(json: Any): Any = nullableSchema match { + case _: VariantType => + VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String]) + case _ => + converter(parser.parse(json.asInstanceOf[UTF8String])) } - override def inputTypes: Seq[AbstractDataType] = StringType :: Nil + override def inputTypes: Seq[AbstractDataType] = StringTypeAnyCollation :: Nil override def sql: String = schema match { case _: MapType => "entries" @@ -820,7 +826,7 @@ case class StructsToJson( } } - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def checkInputDataTypes(): TypeCheckResult = inputSchema match { case dt @ (_: StructType | _: MapType | _: ArrayType | _: VariantType) => @@ -869,7 +875,7 @@ case class SchemaOfJson( child = child, options = ExprUtils.convertToMapData(options)) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = false @@ -915,7 +921,8 @@ case class SchemaOfJson( .map(ArrayType(_, containsNull = at.containsNull)) .getOrElse(ArrayType(StructType(Nil), containsNull = at.containsNull)) case other: DataType => - jsonInferSchema.canonicalizeType(other, jsonOptions).getOrElse(StringType) + jsonInferSchema.canonicalizeType(other, jsonOptions).getOrElse( + SQLConf.get.defaultStringType) } } @@ -953,7 +960,7 @@ case class SchemaOfJson( case class LengthOfJsonArray(child: Expression) extends UnaryExpression with CodegenFallback with ExpectsInputTypes { - override def inputTypes: Seq[DataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def dataType: DataType = IntegerType override def nullable: Boolean = true override def prettyName: String = "json_array_length" @@ -1026,8 +1033,8 @@ case class LengthOfJsonArray(child: Expression) extends UnaryExpression case class JsonObjectKeys(child: Expression) extends UnaryExpression with CodegenFallback with ExpectsInputTypes { - override def inputTypes: Seq[DataType] = Seq(StringType) - override def dataType: 
DataType = ArrayType(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + override def dataType: DataType = ArrayType(SQLConf.get.defaultStringType) override def nullable: Boolean = true override def prettyName: String = "json_object_keys" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 0fad3eff2da52..4cffc7f0b53a3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -42,6 +42,7 @@ import org.json4s.JsonAST._ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, ScalaReflection} import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.variant.VariantExpressionEvalUtils import org.apache.spark.sql.catalyst.trees.TreePattern import org.apache.spark.sql.catalyst.trees.TreePattern.{LITERAL, NULL_LITERAL, TRUE_OR_FALSE_LITERAL} import org.apache.spark.sql.catalyst.types._ @@ -204,6 +205,8 @@ object Literal { create(new GenericInternalRow( struct.fields.map(f => default(f.dataType).value)), struct) case udt: UserDefinedType[_] => Literal(default(udt.sqlType).value, udt) + case VariantType => + create(VariantExpressionEvalUtils.castToVariant(0, IntegerType), VariantType) case other => throw QueryExecutionErrors.noDefaultForDataTypeError(dataType) } @@ -549,6 +552,7 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { s"${Literal(kv._1, mapType.keyType).sql}, ${Literal(kv._2, mapType.valueType).sql}" } s"MAP(${keysAndValues.mkString(", ")})" + case (v: VariantVal, variantType: VariantType) => s"PARSE_JSON('${v.toJson(timeZoneId)}')" case _ => value.toString } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala index e5157685a9a6d..c11357352c79a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala @@ -24,7 +24,9 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans.logical.{FunctionSignature, InputParameter} import org.apache.spark.sql.errors.QueryErrorsBase -import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation +import org.apache.spark.sql.types.{AbstractDataType, DataType} import org.apache.spark.unsafe.types.UTF8String // scalastyle:off line.size.limit @@ -79,12 +81,14 @@ import org.apache.spark.unsafe.types.UTF8String object MaskExpressionBuilder extends ExpressionBuilder { override def functionSignature: Option[FunctionSignature] = { val strArg = InputParameter("str") - val upperCharArg = InputParameter("upperChar", Some(Literal(Mask.MASKED_UPPERCASE))) - val lowerCharArg = InputParameter("lowerChar", Some(Literal(Mask.MASKED_LOWERCASE))) - val digitCharArg = InputParameter("digitChar", Some(Literal(Mask.MASKED_DIGIT))) - val otherCharArg = InputParameter( - "otherChar", - Some(Literal(Mask.MASKED_IGNORE, StringType))) + val upperCharArg = 
InputParameter("upperChar", + Some(Literal.create(Mask.MASKED_UPPERCASE, SQLConf.get.defaultStringType))) + val lowerCharArg = InputParameter("lowerChar", + Some(Literal.create(Mask.MASKED_LOWERCASE, SQLConf.get.defaultStringType))) + val digitCharArg = InputParameter("digitChar", + Some(Literal.create(Mask.MASKED_DIGIT, SQLConf.get.defaultStringType))) + val otherCharArg = InputParameter("otherChar", + Some(Literal.create(Mask.MASKED_IGNORE, SQLConf.get.defaultStringType))) val functionSignature: FunctionSignature = FunctionSignature(Seq( strArg, upperCharArg, lowerCharArg, digitCharArg, otherCharArg)) Some(functionSignature) @@ -109,33 +113,34 @@ case class Mask( def this(input: Expression) = this( input, - Literal(Mask.MASKED_UPPERCASE), - Literal(Mask.MASKED_LOWERCASE), - Literal(Mask.MASKED_DIGIT), - Literal(Mask.MASKED_IGNORE, StringType)) + Literal.create(Mask.MASKED_UPPERCASE, SQLConf.get.defaultStringType), + Literal.create(Mask.MASKED_LOWERCASE, SQLConf.get.defaultStringType), + Literal.create(Mask.MASKED_DIGIT, SQLConf.get.defaultStringType), + Literal.create(Mask.MASKED_IGNORE, input.dataType)) def this(input: Expression, upperChar: Expression) = this( input, upperChar, - Literal(Mask.MASKED_LOWERCASE), - Literal(Mask.MASKED_DIGIT), - Literal(Mask.MASKED_IGNORE, StringType)) + Literal.create(Mask.MASKED_LOWERCASE, SQLConf.get.defaultStringType), + Literal.create(Mask.MASKED_DIGIT, SQLConf.get.defaultStringType), + Literal.create(Mask.MASKED_IGNORE, input.dataType)) def this(input: Expression, upperChar: Expression, lowerChar: Expression) = this( input, upperChar, lowerChar, - Literal(Mask.MASKED_DIGIT), - Literal(Mask.MASKED_IGNORE, StringType)) + Literal.create(Mask.MASKED_DIGIT, SQLConf.get.defaultStringType), + Literal.create(Mask.MASKED_IGNORE, input.dataType)) def this( input: Expression, upperChar: Expression, lowerChar: Expression, digitChar: Expression) = - this(input, upperChar, lowerChar, digitChar, Literal(Mask.MASKED_IGNORE, StringType)) + this(input, upperChar, lowerChar, digitChar, + Literal.create(Mask.MASKED_IGNORE, input.dataType)) override def checkInputDataTypes(): TypeCheckResult = { @@ -187,7 +192,8 @@ case class Mask( * NumericType, IntegralType, FractionalType. */ override def inputTypes: Seq[AbstractDataType] = - Seq(StringType, StringType, StringType, StringType, StringType) + Seq(StringTypeAnyCollation, StringTypeAnyCollation, StringTypeAnyCollation, + StringTypeAnyCollation, StringTypeAnyCollation) override def nullable: Boolean = true @@ -276,7 +282,7 @@ case class Mask( * Returns the [[DataType]] of the result of evaluating this expression. It is invalid to query * the dataType of an unresolved expression (i.e., when `resolved` == false). */ - override def dataType: DataType = StringType + override def dataType: DataType = input.dataType /** * Returns a Seq of the children of this node. Children should not change. 
Immutability required diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 0c09e9be12e94..00274a16b888b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import java.{lang => jl} +import java.util.HexFormat.fromHexDigit import java.util.Locale import org.apache.spark.QueryContext @@ -30,6 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util.{MathUtils, NumberConverter, TypeUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -450,8 +452,9 @@ case class Conv( override def first: Expression = numExpr override def second: Expression = fromBaseExpr override def third: Expression = toBaseExpr - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, IntegerType) - override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, IntegerType, IntegerType) + override def dataType: DataType = first.dataType override def nullable: Boolean = true override def nullSafeEval(num: Any, fromBase: Any, toBase: Any): Any = { @@ -1002,23 +1005,21 @@ case class Bin(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant with Serializable { override def inputTypes: Seq[DataType] = Seq(LongType) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType protected override def nullSafeEval(input: Any): Any = - UTF8String.fromString(jl.Long.toBinaryString(input.asInstanceOf[Long])) + UTF8String.toBinaryString(input.asInstanceOf[Long]) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, (c) => - s"UTF8String.fromString(java.lang.Long.toBinaryString($c))") + defineCodeGen(ctx, ev, c => s"UTF8String.toBinaryString($c)") } override protected def withNewChildInternal(newChild: Expression): Bin = copy(child = newChild) } object Hex { - val hexDigits = Array[Char]( - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' - ).map(_.toByte) + private final val hexDigits = + Array[Byte]('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F') // lookup table to translate '0' -> 0 ... 
'F'/'f' -> 15 val unhexDigits = { @@ -1031,61 +1032,66 @@ object Hex { def hex(bytes: Array[Byte]): UTF8String = { val length = bytes.length - val value = new Array[Byte](length * 2) + if (length == 0) { + return UTF8String.EMPTY_UTF8 + } + val targetLength = length * 2L + if (targetLength > Int.MaxValue) { + throw QueryExecutionErrors.tooManyArrayElementsError(targetLength, Int.MaxValue) + } + val value = new Array[Byte](targetLength.toInt) var i = 0 while (i < length) { - value(i * 2) = Hex.hexDigits((bytes(i) & 0xF0) >> 4) - value(i * 2 + 1) = Hex.hexDigits(bytes(i) & 0x0F) + value(i * 2) = hexDigits((bytes(i) & 0xF0) >> 4) + value(i * 2 + 1) = hexDigits(bytes(i) & 0x0F) i += 1 } UTF8String.fromBytes(value) } def hex(num: Long): UTF8String = { - // Extract the hex digits of num into value[] from right to left - val value = new Array[Byte](16) + val zeros = jl.Long.numberOfLeadingZeros(num) + if (zeros == jl.Long.SIZE) return UTF8String.ZERO_UTF8 + val len = (jl.Long.SIZE - zeros + 3) / 4 var numBuf = num - var len = 0 - do { - len += 1 - value(value.length - len) = Hex.hexDigits((numBuf & 0xF).toInt) + val value = new Array[Byte](len) + var i = len - 1 + while (i >= 0) { + value(i) = hexDigits((numBuf & 0xF).toInt) numBuf >>>= 4 - } while (numBuf != 0) - UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, value.length - len, value.length)) + i -= 1 + } + UTF8String.fromBytes(value) } def unhex(bytes: Array[Byte]): Array[Byte] = { - val out = new Array[Byte]((bytes.length + 1) >> 1) - var i = 0 - var oddShift = 0 - if ((bytes.length & 0x01) != 0) { - // padding with '0' - if (bytes(0) < 0) { - return null - } - val v = Hex.unhexDigits(bytes(0)) - if (v == -1) { - return null - } - out(0) = v - i += 1 - oddShift = 1 + val length = bytes.length + if (length == 0) { + return Array.emptyByteArray } - // two characters form the hex value. 
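The rewritten `hex(num)` above sizes its output from the count of leading zero bits (four bits per hex digit) and then fills digits right to left; the same arithmetic as a standalone sketch on plain strings:

    object HexNumSketch extends App {
      private val hexDigits = "0123456789ABCDEF".toCharArray

      def hex(num: Long): String = {
        val zeros = java.lang.Long.numberOfLeadingZeros(num)
        if (zeros == java.lang.Long.SIZE) return "0"    // all 64 bits are zero
        val len = (java.lang.Long.SIZE - zeros + 3) / 4 // ceil(significant bits / 4)
        val out = new Array[Char](len)
        var numBuf = num
        var i = len - 1
        while (i >= 0) {                                // lowest nibble goes rightmost
          out(i) = hexDigits((numBuf & 0xF).toInt)
          numBuf >>>= 4
          i -= 1
        }
        new String(out)
      }

      assert(hex(0L) == "0")
      assert(hex(255L) == "FF")
      assert(hex(-1L) == "FFFFFFFFFFFFFFFF")
      println(hex(291L)) // prints 123
    }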
- while (i < bytes.length) { - if (bytes(i) < 0 || bytes(i + 1) < 0) { - return null + if ((length & 0x1) != 0) { + // while length of bytes is odd, loop from the end to beginning w/o the head + val result = new Array[Byte](length / 2 + 1) + var i = result.length - 1 + while (i > 0) { + result(i) = ((fromHexDigit(bytes(i * 2 - 1)) << 4) | fromHexDigit(bytes(i * 2))).toByte + i -= 1 } - val first = Hex.unhexDigits(bytes(i)) - val second = Hex.unhexDigits(bytes(i + 1)) - if (first == -1 || second == -1) { - return null + // add it 'tailing' head + result(0) = fromHexDigit(bytes(0)).toByte + result + } else { + val result = new Array[Byte](length / 2) + var i = 0 + while (i < result.length) { + result(i) = ((fromHexDigit(bytes(2 * i)) << 4) | fromHexDigit(bytes(2 * i + 1))).toByte + i += 1 } - out(i / 2 + oddShift) = (((first << 4) | second) & 0xFF).toByte - i += 2 + result } - out } + + def unhex(str: String): Array[Byte] = unhex(str.getBytes()) } /** @@ -1108,21 +1114,24 @@ case class Hex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(LongType, BinaryType, StringType)) + Seq(TypeCollection(LongType, BinaryType, StringTypeAnyCollation)) - override def dataType: DataType = StringType + override def dataType: DataType = child.dataType match { + case st: StringType => st + case _ => SQLConf.get.defaultStringType + } protected override def nullSafeEval(num: Any): Any = child.dataType match { case LongType => Hex.hex(num.asInstanceOf[Long]) case BinaryType => Hex.hex(num.asInstanceOf[Array[Byte]]) - case StringType => Hex.hex(num.asInstanceOf[UTF8String].getBytes) + case _: StringType => Hex.hex(num.asInstanceOf[UTF8String].getBytes) } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, (c) => { val hex = Hex.getClass.getName.stripSuffix("$") s"${ev.value} = " + (child.dataType match { - case StringType => s"""$hex.hex($c.getBytes());""" + case _: StringType => s"""$hex.hex($c.getBytes());""" case _ => s"""$hex.hex($c);""" }) }) @@ -1149,47 +1158,32 @@ case class Unhex(child: Expression, failOnError: Boolean = false) def this(expr: Expression) = this(expr, false) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def nullable: Boolean = true override def dataType: DataType = BinaryType protected override def nullSafeEval(num: Any): Any = { - val result = Hex.unhex(num.asInstanceOf[UTF8String].getBytes) - if (failOnError && result == null) { - // The failOnError is set only from `ToBinary` function - hence we might safely set `hint` - // parameter to `try_to_binary`. 
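In the rewritten `Hex.unhex`, an odd-length input decodes its lone leading digit by itself and every remaining pair forms one byte; invalid digits make `java.util.HexFormat.fromHexDigit` throw (a subclass of `IllegalArgumentException`), which the `Unhex` expression maps to null unless `failOnError` is set. A standalone sketch of the same decoding:

    object UnhexSketch extends App {
      import java.util.HexFormat.fromHexDigit

      def unhex(s: String): Array[Byte] = {
        val b = s.getBytes
        if (b.isEmpty) return Array.emptyByteArray
        if ((b.length & 0x1) != 0) {
          val out = new Array[Byte](b.length / 2 + 1)
          out(0) = fromHexDigit(b(0)).toByte // lone leading digit, as if left-padded with '0'
          var i = 1
          while (i < out.length) {
            out(i) = ((fromHexDigit(b(i * 2 - 1)) << 4) | fromHexDigit(b(i * 2))).toByte
            i += 1
          }
          out
        } else {
          val out = new Array[Byte](b.length / 2)
          var i = 0
          while (i < out.length) {
            out(i) = ((fromHexDigit(b(2 * i)) << 4) | fromHexDigit(b(2 * i + 1))).toByte
            i += 1
          }
          out
        }
      }

      assert(unhex("414243").sameElements("ABC".getBytes))
      assert(unhex("F").sameElements(Array(15.toByte)))
      assert(scala.util.Try(unhex("GG")).isFailure) // invalid digit -> IllegalArgumentException subclass
    }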
- throw QueryExecutionErrors.invalidInputInConversionError( - BinaryType, - num.asInstanceOf[UTF8String], - UTF8String.fromString("HEX"), - "try_to_binary") + try { + Hex.unhex(num.asInstanceOf[UTF8String].getBytes) + } catch { + case _: IllegalArgumentException if !failOnError => null + case _: IllegalArgumentException => + throw QueryExecutionErrors.invalidInputInConversionError( + BinaryType, + num.asInstanceOf[UTF8String], + UTF8String.fromString("HEX"), + "try_to_binary") } - result } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, c => { - val hex = Hex.getClass.getName.stripSuffix("$") - val maybeFailOnErrorCode = if (failOnError) { - val binaryType = ctx.addReferenceObj("to", BinaryType, BinaryType.getClass.getName) - s""" - |if (${ev.value} == null) { - | throw QueryExecutionErrors.invalidInputInConversionError( - | $binaryType, - | $c, - | UTF8String.fromString("HEX"), - | "try_to_binary"); - |} - |""".stripMargin - } else { - s"${ev.isNull} = ${ev.value} == null;" - } - + val expr = ctx.addReferenceObj("this", this) + nullSafeCodeGen(ctx, ev, input => { s""" - ${ev.value} = $hex.unhex($c.getBytes()); - $maybeFailOnErrorCode - """ + ${ev.value} = (byte[]) $expr.nullSafeEval($input); + ${ev.isNull} = ${ev.value} == null; + """ }) } @@ -1256,6 +1250,41 @@ case class Pow(left: Expression, right: Expression) newLeft: Expression, newRight: Expression): Expression = copy(left = newLeft, right = newRight) } +sealed trait BitShiftOperation + extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + + def symbol: String + def shiftInt: (Int, Int) => Int + def shiftLong: (Long, Int) => Long + + override def inputTypes: Seq[AbstractDataType] = + Seq(TypeCollection(IntegerType, LongType), IntegerType) + + override def dataType: DataType = left.dataType + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + defineCodeGen(ctx, ev, (left, right) => s"$left $symbol $right") + } + + override protected def nullSafeEval(input1: Any, input2: Any): Any = input1 match { + case l: jl.Long => shiftLong(l, input2.asInstanceOf[Int]) + case i: jl.Integer => shiftInt(i, input2.asInstanceOf[Int]) + } + + override def toString: String = { + getTagValue(FunctionRegistry.FUNC_ALIAS) match { + case Some(alias) if alias == symbol => s"($left $symbol $right)" + case _ => super.toString + } + } + + override def sql: String = { + getTagValue(FunctionRegistry.FUNC_ALIAS) match { + case Some(alias) if alias == symbol => s"(${left.sql} $symbol ${right.sql})" + case _ => super.sql + } + } +} /** * Bitwise left shift. @@ -1264,38 +1293,28 @@ case class Pow(left: Expression, right: Expression) * @param right number of bits to left shift. */ @ExpressionDescription( - usage = "_FUNC_(base, expr) - Bitwise left shift.", + usage = "base << exp - Bitwise left shift.", examples = """ Examples: - > SELECT _FUNC_(2, 1); + > SELECT shiftleft(2, 1); + 4 + > SELECT 2 << 1; 4 """, + note = """ + `<<` operator is added in Spark 4.0.0 as an alias for `shiftleft`. 
+ """, since = "1.5.0", group = "bitwise_funcs") -case class ShiftLeft(left: Expression, right: Expression) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - - override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(IntegerType, LongType), IntegerType) - - override def dataType: DataType = left.dataType - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - input1 match { - case l: jl.Long => l << input2.asInstanceOf[jl.Integer] - case i: jl.Integer => i << input2.asInstanceOf[jl.Integer] - } - } - - override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, (left, right) => s"$left << $right") - } - +case class ShiftLeft(left: Expression, right: Expression) extends BitShiftOperation { + override def symbol: String = "<<" + override def shiftInt: (Int, Int) => Int = (x: Int, y: Int) => x << y + override def shiftLong: (Long, Int) => Long = (x: Long, y: Int) => x << y + val shift: (Number, Int) => Any = (x: Number, y: Int) => x.longValue() << y override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): ShiftLeft = copy(left = newLeft, right = newRight) } - /** * Bitwise (signed) right shift. * @@ -1303,38 +1322,27 @@ case class ShiftLeft(left: Expression, right: Expression) * @param right number of bits to right shift. */ @ExpressionDescription( - usage = "_FUNC_(base, expr) - Bitwise (signed) right shift.", + usage = "base >> expr - Bitwise (signed) right shift.", examples = """ Examples: - > SELECT _FUNC_(4, 1); + > SELECT shiftright(4, 1); + 2 + > SELECT 4 >> 1; 2 """, + note = """ + `>>` operator is added in Spark 4.0.0 as an alias for `shiftright`. + """, since = "1.5.0", group = "bitwise_funcs") -case class ShiftRight(left: Expression, right: Expression) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - - override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(IntegerType, LongType), IntegerType) - - override def dataType: DataType = left.dataType - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - input1 match { - case l: jl.Long => l >> input2.asInstanceOf[jl.Integer] - case i: jl.Integer => i >> input2.asInstanceOf[jl.Integer] - } - } - - override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, (left, right) => s"$left >> $right") - } - +case class ShiftRight(left: Expression, right: Expression) extends BitShiftOperation { + override def symbol: String = ">>" + override def shiftInt: (Int, Int) => Int = (x: Int, y: Int) => x >> y + override def shiftLong: (Long, Int) => Long = (x: Long, y: Int) => x >> y override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): ShiftRight = copy(left = newLeft, right = newRight) } - /** * Bitwise unsigned right shift, for integer and long data type. * @@ -1342,33 +1350,23 @@ case class ShiftRight(left: Expression, right: Expression) * @param right the number of bits to right shift. */ @ExpressionDescription( - usage = "_FUNC_(base, expr) - Bitwise unsigned right shift.", + usage = "base >>> expr - Bitwise unsigned right shift.", examples = """ Examples: - > SELECT _FUNC_(4, 1); + > SELECT shiftrightunsigned(4, 1); 2 + > SELECT 4 >>> 1; + 2 + """, + note = """ + `>>>` operator is added in Spark 4.0.0 as an alias for `shiftrightunsigned`. 
""", since = "1.5.0", group = "bitwise_funcs") -case class ShiftRightUnsigned(left: Expression, right: Expression) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - - override def inputTypes: Seq[AbstractDataType] = - Seq(TypeCollection(IntegerType, LongType), IntegerType) - - override def dataType: DataType = left.dataType - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - input1 match { - case l: jl.Long => l >>> input2.asInstanceOf[jl.Integer] - case i: jl.Integer => i >>> input2.asInstanceOf[jl.Integer] - } - } - - override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, (left, right) => s"$left >>> $right") - } - +case class ShiftRightUnsigned(left: Expression, right: Expression) extends BitShiftOperation { + override def symbol: String = ">>>" + override def shiftInt: (Int, Int) => Int = (x: Int, y: Int) => x >>> y + override def shiftLong: (Long, Int) => Long = (x: Long, y: Int) => x >>> y override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): ShiftRightUnsigned = copy(left = newLeft, right = newRight) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index c7281e4e87378..e9fa362de14cd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.util.{MapData, RandomUUIDGenerator} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.errors.QueryExecutionErrors.raiseError import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -84,7 +85,7 @@ case class RaiseError(errorClass: Expression, errorParms: Expression, dataType: override def foldable: Boolean = false override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = - Seq(StringType, MapType(StringType, StringType)) + Seq(StringTypeAnyCollation, MapType(StringType, StringType)) override def left: Expression = errorClass override def right: Expression = errorParms @@ -199,7 +200,7 @@ object AssertTrue { since = "1.6.0", group = "misc_funcs") case class CurrentDatabase() extends LeafExpression with Unevaluable { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = false override def prettyName: String = "current_schema" final override val nodePatterns: Seq[TreePattern] = Seq(CURRENT_LIKE) @@ -218,7 +219,7 @@ case class CurrentDatabase() extends LeafExpression with Unevaluable { since = "3.1.0", group = "misc_funcs") case class CurrentCatalog() extends LeafExpression with Unevaluable { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = false override def prettyName: String = "current_catalog" final override val nodePatterns: Seq[TreePattern] = Seq(CURRENT_LIKE) @@ -251,7 +252,7 @@ case class Uuid(randomSeed: Option[Long] = None) extends LeafExpression with Non override def nullable: Boolean = false - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType 
override def stateful: Boolean = true @@ -292,7 +293,7 @@ case class SparkVersion() extends LeafExpression with RuntimeReplaceable { override lazy val replacement: Expression = StaticInvoke( classOf[ExpressionImplUtils], - StringType, + SQLConf.get.defaultStringType, "getSparkVersion", returnNullable = false) } @@ -311,7 +312,7 @@ case class SparkVersion() extends LeafExpression with RuntimeReplaceable { case class TypeOf(child: Expression) extends UnaryExpression { override def nullable: Boolean = false override def foldable: Boolean = true - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def eval(input: InternalRow): Any = UTF8String.fromString(child.dataType.catalogString) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -334,7 +335,7 @@ case class TypeOf(child: Expression) extends UnaryExpression { // scalastyle:on line.size.limit case class CurrentUser() extends LeafExpression with Unevaluable { override def nullable: Boolean = false - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("current_user") final override val nodePatterns: Seq[TreePattern] = Seq(CURRENT_LIKE) @@ -412,7 +413,8 @@ case class AesEncrypt( override def prettyName: String = "aes_encrypt" override def inputTypes: Seq[AbstractDataType] = - Seq(BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType) + Seq(BinaryType, BinaryType, StringTypeAnyCollation, StringTypeAnyCollation, + BinaryType, BinaryType) override def children: Seq[Expression] = Seq(input, key, mode, padding, iv, aad) @@ -485,7 +487,7 @@ case class AesDecrypt( this(input, key, Literal("GCM")) override def inputTypes: Seq[AbstractDataType] = { - Seq(BinaryType, BinaryType, StringType, StringType, BinaryType) + Seq(BinaryType, BinaryType, StringTypeAnyCollation, StringTypeAnyCollation, BinaryType) } override def prettyName: String = "aes_decrypt" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 6bbeba4d2969e..3258a57bb1236 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -205,10 +205,18 @@ case class Alias(child: Expression, name: String)( "" } + /** + * This function is performance-sensitive, so we should avoid `MetadataBuilder` manipulation, + * because it performs heavy operations on maps + */ private def removeNonInheritableMetadata(metadata: Metadata): Metadata = { - val builder = new MetadataBuilder().withMetadata(metadata) - nonInheritableMetadataKeys.foreach(builder.remove) - builder.build() + if (metadata.isEmpty || nonInheritableMetadataKeys.forall(!metadata.contains(_))) { + metadata + } else { + val builder = new MetadataBuilder().withMetadata(metadata) + nonInheritableMetadataKeys.foreach(builder.remove) + builder.build() + } } override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix$delaySuffix" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index 6d95d7e620a2e..e914190c06456 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper import org.apache.spark.sql.catalyst.util.ToNumberParser import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -47,7 +49,8 @@ abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Bo DecimalType.USER_DEFAULT } - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() @@ -247,8 +250,9 @@ object ToCharacterBuilder extends ExpressionBuilder { inputExpr.dataType match { case _: DatetimeType => DateFormatClass(inputExpr, format) case _: BinaryType => - if (!(format.dataType == StringType && format.foldable)) { - throw QueryCompilationErrors.nonFoldableArgumentError(funcName, "format", StringType) + if (!(format.dataType.isInstanceOf[StringType] && format.foldable)) { + throw QueryCompilationErrors.nonFoldableArgumentError(funcName, "format", + format.dataType) } val fmt = format.eval() if (fmt == null) { @@ -279,8 +283,8 @@ case class ToCharacter(left: Expression, right: Expression) } } - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringType) + override def dataType: DataType = SQLConf.get.defaultStringType + override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringTypeAnyCollation) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() if (inputTypeCheck.isSuccess) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 462facd180c4e..09d024feccfa6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -360,6 +360,15 @@ case class StaticInvoke( super.stringArgs.toSeq.dropRight(1).iterator } } + + override def toString: String = + s"static_invoke(${ + if (objectName.startsWith("org.apache.spark.")) { + cls.getSimpleName + } else { + objectName + } + }.$functionName(${arguments.mkString(", ")}))" } /** @@ -509,7 +518,8 @@ case class Invoke( ev.copy(code = code) } - override def toString: String = s"$targetObject.$functionName" + override def toString: String = + s"invoke($targetObject.$functionName(${arguments.mkString(", ")}))" override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Invoke = copy(targetObject = newChildren.head, arguments = newChildren.tail) @@ -1917,16 +1927,12 @@ case class AssertNotNull(child: Expression, walkedTypePath: Seq[String] = Nil) override def flatArguments: Iterator[Any] = Iterator(child) - private val 
errMsg = "Null value appeared in non-nullable field:" + - walkedTypePath.mkString("\n", "\n", "\n") + - "If the schema is inferred from a Scala tuple/case class, or a Java bean, " + - "please try to use scala.Option[_] or other nullable types " + - "(e.g. java.lang.Integer instead of int/scala.Int)." + private val errMsg = walkedTypePath.mkString("\n", "\n", "\n") override def eval(input: InternalRow): Any = { val result = child.eval(input) if (result == null) { - throw new NullPointerException(errMsg) + throw QueryExecutionErrors.notNullAssertViolation(errMsg) } result } @@ -1940,7 +1946,7 @@ case class AssertNotNull(child: Expression, walkedTypePath: Seq[String] = Nil) val code = childGen.code + code""" if (${childGen.isNull}) { - throw new NullPointerException($errMsgField); + throw QueryExecutionErrors.notNullAssertViolation($errMsgField); } """ ev.copy(code = code, isNull = FalseLiteral, value = childGen.value) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index b33de303b5d55..297c709c6d7d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -33,8 +33,9 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.trees.BinaryLike import org.apache.spark.sql.catalyst.trees.TreePattern.{LIKE_FAMLIY, REGEXP_EXTRACT_FAMILY, REGEXP_REPLACE, TreePattern} -import org.apache.spark.sql.catalyst.util.{GenericArrayData, StringUtils} +import org.apache.spark.sql.catalyst.util.{CollationSupport, GenericArrayData, StringUtils} import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.types.{StringTypeAnyCollation, StringTypeBinaryLcase} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -44,7 +45,11 @@ abstract class StringRegexExpression extends BinaryExpression def escape(v: String): String def matches(regex: Pattern, str: String): Boolean - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) + + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + final lazy val collationRegexFlags: Int = CollationSupport.collationAwareRegexFlags(collationId) // try cache foldable pattern private lazy val cache: Pattern = right match { @@ -58,7 +63,7 @@ abstract class StringRegexExpression extends BinaryExpression } else { // Let it raise exception if couldn't compile the regex string try { - Pattern.compile(escape(str)) + Pattern.compile(escape(str), collationRegexFlags) } catch { case e: PatternSyntaxException => throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e) @@ -158,7 +163,8 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) val regexStr = StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) val pattern = ctx.addMutableState(patternClass, "patternLike", - v => s"""$v = $patternClass.compile("$regexStr");""") + v => + s"""$v = $patternClass.compile("$regexStr", $collationRegexFlags);""".stripMargin) // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
val eval = left.genCode(ctx) @@ -186,7 +192,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) s""" String $rightStr = $eval2.toString(); $patternClass $pattern = $patternClass.compile( - $escapeFunc($rightStr, '$escapedEscapeChar')); + $escapeFunc($rightStr, '$escapedEscapeChar'), $collationRegexFlags); ${ev.value} = $pattern.matcher($eval1.toString()).matches(); """ }) @@ -258,7 +264,8 @@ case class ILike( def this(left: Expression, right: Expression) = this(left, right, '\\') - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): Expression = { @@ -273,7 +280,9 @@ sealed abstract class MultiLikeBase protected def isNotSpecified: Boolean - override def inputTypes: Seq[DataType] = StringType :: Nil + override def inputTypes: Seq[AbstractDataType] = StringTypeBinaryLcase :: Nil + final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId + final lazy val collationRegexFlags: Int = CollationSupport.collationAwareRegexFlags(collationId) override def nullable: Boolean = true @@ -281,8 +290,8 @@ sealed abstract class MultiLikeBase protected lazy val hasNull: Boolean = patterns.contains(null) - protected lazy val cache = patterns.filterNot(_ == null) - .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) + protected lazy val cache = patterns.filterNot(_ == null).map(s => + Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'), collationRegexFlags)) protected lazy val matchFunc = if (isNotSpecified) { (p: Pattern, inputValue: String) => !p.matcher(inputValue).matches() @@ -475,7 +484,7 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress val regexStr = StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) val pattern = ctx.addMutableState(patternClass, "patternRLike", - v => s"""$v = $patternClass.compile("$regexStr");""") + v => s"""$v = $patternClass.compile("$regexStr", $collationRegexFlags);""".stripMargin) // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
val eval = left.genCode(ctx) @@ -499,7 +508,7 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" String $rightStr = $eval2.toString(); - $patternClass $pattern = $patternClass.compile($rightStr); + $patternClass $pattern = $patternClass.compile($rightStr, $collationRegexFlags); ${ev.value} = $pattern.matcher($eval1.toString()).find(0); """ }) @@ -543,17 +552,20 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress case class StringSplit(str: Expression, regex: Expression, limit: Expression) extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = ArrayType(StringType, containsNull = false) - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) + override def dataType: DataType = ArrayType(str.dataType, containsNull = false) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType) override def first: Expression = str override def second: Expression = regex override def third: Expression = limit + final lazy val collationId: Int = str.dataType.asInstanceOf[StringType].collationId + def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)) override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { - val strings = string.asInstanceOf[UTF8String].split( - regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int]) + val pattern = CollationSupport.collationAwareRegex(regex.asInstanceOf[UTF8String], collationId) + val strings = string.asInstanceOf[UTF8String].split(pattern, limit.asInstanceOf[Int]) new GenericArrayData(strings.asInstanceOf[Array[Any]]) } @@ -561,7 +573,8 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, regex, limit) => { // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. 
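For `split`, the pattern string itself is rewritten by `CollationSupport.collationAwareRegex` before `UTF8String.split` runs. A plausible reading (an assumption, since the helper's body is not part of this diff) is that it folds the collation into the pattern, for example via inline flags such as `(?ui)` for lowercase-equality collations; the stand-alone sketch below shows why that alone changes the split points.

// Illustrative only; not the CollationSupport implementation.
object SplitSketch {
  def main(args: Array[String]): Unit = {
    val input = "oneXtwoxthree"
    println(input.split("x", -1).mkString(" | "))       // oneXtwo | three
    println(input.split("(?ui)x", -1).mkString(" | "))  // one | two | three
  }
}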
- s"""${ev.value} = new $arrayClass($str.split($regex,$limit));""".stripMargin + s"""${ev.value} = new $arrayClass($str.split( + |CollationSupport.collationAwareRegex($regex, $collationId),$limit));""".stripMargin }) } @@ -658,7 +671,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio override def nullSafeEval(s: Any, p: Any, r: Any, i: Any): Any = { if (!p.equals(lastRegex)) { - val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName) + val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName, collationId) pattern = patternAndRegex._1 lastRegex = patternAndRegex._2 } @@ -683,9 +696,10 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } } - override def dataType: DataType = StringType + override def dataType: DataType = subject.dataType override def inputTypes: Seq[AbstractDataType] = - Seq(StringType, StringType, StringType, IntegerType) + Seq(StringTypeBinaryLcase, StringTypeAnyCollation, StringTypeBinaryLcase, IntegerType) + final lazy val collationId: Int = subject.dataType.asInstanceOf[StringType].collationId override def prettyName: String = "regexp_replace" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -708,7 +722,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio nullSafeCodeGen(ctx, ev, (subject, regexp, rep, pos) => { s""" - ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, collationId)} if (!$rep.equals($termLastReplacementInUTF8)) { // replacement string changed $termLastReplacementInUTF8 = $rep.clone(); @@ -771,15 +785,18 @@ abstract class RegExpExtractBase final override val nodePatterns: Seq[TreePattern] = Seq(REGEXP_EXTRACT_FAMILY) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType) override def first: Expression = subject override def second: Expression = regexp override def third: Expression = idx + final lazy val collationId: Int = subject.dataType.asInstanceOf[StringType].collationId + protected def getLastMatcher(s: Any, p: Any): Matcher = { if (p != lastRegex) { // regex value changed - val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName) + val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName, collationId) pattern = patternAndRegex._1 lastRegex = patternAndRegex._2 } @@ -848,7 +865,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio } } - override def dataType: DataType = StringType + override def dataType: DataType = subject.dataType override def prettyName: String = "regexp_extract" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -863,7 +880,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { s""" - ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, collationId)} if ($matcher.find()) { java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); $classNameRegExpExtractBase.checkGroupIndex("$prettyName", $matchResult.groupCount(), $idx); @@ -947,7 +964,7 @@ case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expres 
new GenericArrayData(matchResults.toArray.asInstanceOf[Array[Any]]) } - override def dataType: DataType = ArrayType(StringType) + override def dataType: DataType = ArrayType(subject.dataType) override def prettyName: String = "regexp_extract_all" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -963,7 +980,8 @@ case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expres } nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { s""" - | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, + collationId)} | java.util.ArrayList $matchResults = new java.util.ArrayList(); | while ($matcher.find()) { | java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); @@ -1020,7 +1038,8 @@ case class RegExpCount(left: Expression, right: Expression) override def children: Seq[Expression] = Seq(left, right) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): RegExpCount = @@ -1053,13 +1072,14 @@ case class RegExpSubStr(left: Expression, right: Expression) override lazy val replacement: Expression = new NullIf( RegExpExtract(subject = left, regexp = right, idx = Literal(0)), - Literal("")) + Literal.create("", left.dataType)) override def prettyName: String = "regexp_substr" override def children: Seq[Expression] = Seq(left, right) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): RegExpSubStr = @@ -1127,7 +1147,8 @@ case class RegExpInStr(subject: Expression, regexp: Expression, idx: Expression) s""" |try { | $setEvNotNull - | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, + collationId)} | if ($matcher.find()) { | ${ev.value} = $matcher.toMatchResult().start() + 1; | } else { @@ -1151,17 +1172,19 @@ object RegExpUtils { subject: String, regexp: String, matcher: String, - prettyName: String): String = { + prettyName: String, + collationId: Int): String = { val classNamePattern = classOf[Pattern].getCanonicalName val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex") val termPattern = ctx.addMutableState(classNamePattern, "pattern") + val collationRegexFlags = CollationSupport.collationAwareRegexFlags(collationId) s""" |if (!$regexp.equals($termLastRegex)) { | // regex value changed | try { | UTF8String r = $regexp.clone(); - | $termPattern = $classNamePattern.compile(r.toString()); + | $termPattern = $classNamePattern.compile(r.toString(), $collationRegexFlags); | $termLastRegex = r; | } catch (java.util.regex.PatternSyntaxException e) { | throw QueryExecutionErrors.invalidPatternError("$prettyName", e.getPattern(), e); @@ -1171,10 +1194,11 @@ object RegExpUtils { |""".stripMargin } - def getPatternAndLastRegex(p: Any, prettyName: String): (Pattern, UTF8String) = { + def getPatternAndLastRegex(p: Any, prettyName: String, collationId: Int): (Pattern, UTF8String) = + { val r = p.asInstanceOf[UTF8String].clone() val pattern = try { - Pattern.compile(r.toString) + Pattern.compile(r.toString, 
CollationSupport.collationAwareRegexFlags(collationId)) } catch { case e: PatternSyntaxException => throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b3029302c03df..a0c796274f761 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import java.io.UnsupportedEncodingException +import java.nio.{ByteBuffer, CharBuffer} +import java.nio.charset.{CharacterCodingException, Charset, CodingErrorAction, IllegalCharsetNameException, UnsupportedCharsetException} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64} import java.util.{HashMap, Locale, Map => JMap} @@ -25,19 +26,20 @@ import java.util.{HashMap, Locale, Map => JMap} import scala.collection.mutable.ArrayBuffer import org.apache.spark.QueryContext +import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, FunctionRegistry, TypeCheckResult} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke -import org.apache.spark.sql.catalyst.trees.BinaryLike +import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke} +import org.apache.spark.sql.catalyst.trees.{BinaryLike, UnaryLike} import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LOWER} -import org.apache.spark.sql.catalyst.util.{ArrayData, CollationSupport, GenericArrayData, TypeUtils} +import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, CollationSupport, GenericArrayData, TypeUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation} +import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation, StringTypeBinaryLcase} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.array.ByteArrayMethods @@ -453,14 +455,18 @@ trait String2StringExpression extends ImplicitCastInputTypes { case class Upper(child: Expression) extends UnaryExpression with String2StringExpression with NullIntolerant { - // scalastyle:off caselocale - override def convert(v: UTF8String): UTF8String = v.toUpperCase - // scalastyle:on caselocale + final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId + + // Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation. 
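The `useICU` flag referenced in the comment above exists because JVM case mapping is locale sensitive and not always one-to-one, so even `UTF8_BINARY` may want to delegate to ICU for stable results. A small JDK-only sketch of the corner cases involved (illustrative, not Spark code):

import java.util.Locale

object CaseMappingSketch {
  def main(args: Array[String]): Unit = {
    println("ß".toUpperCase(Locale.ROOT))                   // SS: one character can map to two
    println("İ".toLowerCase(Locale.ROOT).length)            // 2: lowercases to 'i' plus a combining dot
    println("i".toUpperCase(Locale.forLanguageTag("tr")))   // İ: the mapping is locale sensitive
  }
}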
+ private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED) + + override def convert(v: UTF8String): UTF8String = + CollationSupport.Upper.exec(v, collationId, useICU) final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, c => s"($c).toUpperCase()") + defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId, useICU)) } override protected def withNewChildInternal(newChild: Expression): Upper = copy(child = newChild) @@ -481,14 +487,18 @@ case class Upper(child: Expression) case class Lower(child: Expression) extends UnaryExpression with String2StringExpression with NullIntolerant { - // scalastyle:off caselocale - override def convert(v: UTF8String): UTF8String = v.toLowerCase - // scalastyle:on caselocale + final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId + + // Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation. + private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED) + + override def convert(v: UTF8String): UTF8String = + CollationSupport.Lower.exec(v, collationId, useICU) final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, c => s"($c).toLowerCase()") + defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId, useICU)) } override def prettyName: String = @@ -686,6 +696,188 @@ case class EndsWith(left: Expression, right: Expression) extends StringPredicate newLeft: Expression, newRight: Expression): EndsWith = copy(left = newLeft, right = newRight) } +/** + * A function that checks if a UTF8 string is valid. + */ +@ExpressionDescription( + usage = "_FUNC_(str) - Returns true if `str` is a valid UTF-8 string, otherwise returns false.", + arguments = """ + Arguments: + * str - a string expression + """, + examples = """ + Examples: + > SELECT _FUNC_('Spark'); + true + > SELECT _FUNC_(x'61'); + true + > SELECT _FUNC_(x'80'); + false + > SELECT _FUNC_(x'61C262'); + false + """, + since = "4.0.0", + group = "string_funcs") +case class IsValidUTF8(input: Expression) extends RuntimeReplaceable with ImplicitCastInputTypes + with UnaryLike[Expression] with NullIntolerant { + + override lazy val replacement: Expression = Invoke(input, "isValid", BooleanType) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + + override def nodeName: String = "is_valid_utf8" + + override def nullable: Boolean = true + + override def child: Expression = input + + override protected def withNewChildInternal(newChild: Expression): IsValidUTF8 = { + copy(input = newChild) + } + +} + +/** + * A function that converts an invalid UTF8 string to a valid UTF8 string by replacing invalid + * UTF-8 byte sequences with the Unicode replacement character (U+FFFD), according to the UNICODE + * standard rules (Section 3.9, Paragraph D86, Table 3-7). Valid strings remain unchanged. 
+ */ +// scalastyle:off +@ExpressionDescription( + usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 string, " + + "otherwise returns a new string whose invalid UTF8 byte sequences are replaced using the " + + "UNICODE replacement character U+FFFD.", + arguments = """ + Arguments: + * str - a string expression + """, + examples = """ + Examples: + > SELECT _FUNC_('Spark'); + Spark + > SELECT _FUNC_(x'61'); + a + > SELECT _FUNC_(x'80'); + � + > SELECT _FUNC_(x'61C262'); + a�b + """, + since = "4.0.0", + group = "string_funcs") +// scalastyle:on +case class MakeValidUTF8(input: Expression) extends RuntimeReplaceable with ImplicitCastInputTypes + with UnaryLike[Expression] with NullIntolerant { + + override lazy val replacement: Expression = Invoke(input, "makeValid", input.dataType) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + + override def nodeName: String = "make_valid_utf8" + + override def nullable: Boolean = true + + override def child: Expression = input + + override protected def withNewChildInternal(newChild: Expression): MakeValidUTF8 = { + copy(input = newChild) + } + +} + +/** + * A function that validates a UTF8 string, throwing an exception if the string is invalid. + */ +// scalastyle:off +@ExpressionDescription( + usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 string, " + + "otherwise throws an exception.", + arguments = """ + Arguments: + * str - a string expression + """, + examples = """ + Examples: + > SELECT _FUNC_('Spark'); + Spark + > SELECT _FUNC_(x'61'); + a + """, + since = "4.0.0", + group = "string_funcs") +// scalastyle:on +case class ValidateUTF8(input: Expression) extends RuntimeReplaceable with ImplicitCastInputTypes + with UnaryLike[Expression] with NullIntolerant { + + override lazy val replacement: Expression = StaticInvoke( + classOf[ExpressionImplUtils], + input.dataType, + "validateUTF8String", + Seq(input), + inputTypes) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + + override def nodeName: String = "validate_utf8" + + override def nullable: Boolean = true + + override def child: Expression = input + + override protected def withNewChildInternal(newChild: Expression): ValidateUTF8 = { + copy(input = newChild) + } + +} + +/** + * A function that tries to validate a UTF8 string, returning NULL if the string is invalid. 
+ */ +// scalastyle:off +@ExpressionDescription( + usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 string, " + + "otherwise returns NULL.", + arguments = """ + Arguments: + * str - a string expression + """, + examples = """ + Examples: + > SELECT _FUNC_('Spark'); + Spark + > SELECT _FUNC_(x'61'); + a + > SELECT _FUNC_(x'80'); + NULL + > SELECT _FUNC_(x'61C262'); + NULL + """, + since = "4.0.0", + group = "string_funcs") +// scalastyle:on +case class TryValidateUTF8(input: Expression) extends RuntimeReplaceable with ImplicitCastInputTypes + with UnaryLike[Expression] with NullIntolerant { + + override lazy val replacement: Expression = StaticInvoke( + classOf[ExpressionImplUtils], + input.dataType, + "tryValidateUTF8String", + Seq(input), + inputTypes) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + + override def nodeName: String = "try_validate_utf8" + + override def nullable: Boolean = true + + override def child: Expression = input + + override protected def withNewChildInternal(newChild: Expression): TryValidateUTF8 = { + copy(input = newChild) + } + +} + /** * Replace all occurrences with string. */ @@ -710,23 +902,25 @@ case class EndsWith(left: Expression, right: Expression) extends StringPredicate case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExpr: Expression) extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant { + final lazy val collationId: Int = first.dataType.asInstanceOf[StringType].collationId + def this(srcExpr: Expression, searchExpr: Expression) = { this(srcExpr, searchExpr, Literal("")) } override def nullSafeEval(srcEval: Any, searchEval: Any, replaceEval: Any): Any = { - srcEval.asInstanceOf[UTF8String].replace( - searchEval.asInstanceOf[UTF8String], replaceEval.asInstanceOf[UTF8String]) + CollationSupport.StringReplace.exec(srcEval.asInstanceOf[UTF8String], + searchEval.asInstanceOf[UTF8String], replaceEval.asInstanceOf[UTF8String], collationId); } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (src, search, replace) => { - s"""${ev.value} = $src.replace($search, $replace);""" - }) + defineCodeGen(ctx, ev, (src, search, replace) => + CollationSupport.StringReplace.genCode(src, search, replace, collationId)) } - override def dataType: DataType = StringType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, StringType) + override def dataType: DataType = srcExpr.dataType + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation, StringTypeAnyCollation) override def first: Expression = srcExpr override def second: Expression = searchExpr override def third: Expression = replaceExpr @@ -804,8 +998,9 @@ case class Overlay(input: Expression, replace: Expression, pos: Expression, len: override def dataType: DataType = input.dataType - override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType), - TypeCollection(StringType, BinaryType), IntegerType, IntegerType) + override def inputTypes: Seq[AbstractDataType] = Seq( + TypeCollection(StringTypeAnyCollation, BinaryType), + TypeCollection(StringTypeAnyCollation, BinaryType), IntegerType, IntegerType) override def checkInputDataTypes(): TypeCheckResult = { val inputTypeCheck = super.checkInputDataTypes() @@ -818,7 +1013,7 @@ case class Overlay(input: Expression, replace: Expression, pos: Expression, len: } private lazy val replaceFunc = input.dataType match { - case 
StringType => + case _: StringType => (inputEval: Any, replaceEval: Any, posEval: Int, lenEval: Int) => { Overlay.calculate( inputEval.asInstanceOf[UTF8String], @@ -856,9 +1051,14 @@ case class Overlay(input: Expression, replace: Expression, pos: Expression, len: object StringTranslate { - def buildDict(matchingString: UTF8String, replaceString: UTF8String) + def buildDict(matchingString: UTF8String, replaceString: UTF8String, collationId: Int) : JMap[String, String] = { - val matching = matchingString.toString() + val matching = if (CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) { + matchingString.toString().toLowerCase() + } else { + matchingString.toString() + } + val replace = replaceString.toString() val dict = new HashMap[String, String]() var i = 0 @@ -909,13 +1109,16 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac @transient private var lastReplace: UTF8String = _ @transient private var dict: JMap[String, String] = _ + final lazy val collationId: Int = first.dataType.asInstanceOf[StringType].collationId + override def nullSafeEval(srcEval: Any, matchingEval: Any, replaceEval: Any): Any = { if (matchingEval != lastMatching || replaceEval != lastReplace) { lastMatching = matchingEval.asInstanceOf[UTF8String].clone() lastReplace = replaceEval.asInstanceOf[UTF8String].clone() - dict = StringTranslate.buildDict(lastMatching, lastReplace) + dict = StringTranslate.buildDict(lastMatching, lastReplace, collationId) } - srcEval.asInstanceOf[UTF8String].translate(dict) + + CollationSupport.StringTranslate.exec(srcEval.asInstanceOf[UTF8String], dict, collationId) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -936,15 +1139,17 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac $termLastMatching = $matching.clone(); $termLastReplace = $replace.clone(); $termDict = org.apache.spark.sql.catalyst.expressions.StringTranslate - .buildDict($termLastMatching, $termLastReplace); + .buildDict($termLastMatching, $termLastReplace, $collationId); } - ${ev.value} = $src.translate($termDict); + ${ev.value} = CollationSupport.StringTranslate. + exec($src, $termDict, $collationId); """ }) } - override def dataType: DataType = StringType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, StringType) + override def dataType: DataType = srcExpr.dataType + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation, StringTypeAnyCollation) override def first: Expression = srcExpr override def second: Expression = matchingExpr override def third: Expression = replaceExpr @@ -977,15 +1182,19 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac case class FindInSet(left: Expression, right: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId - override protected def nullSafeEval(word: Any, set: Any): Any = - set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String]) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation) + + override protected def nullSafeEval(word: Any, set: Any): Any = { + CollationSupport.FindInSet. 
+ exec(word.asInstanceOf[UTF8String], set.asInstanceOf[UTF8String], collationId) + } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (word, set) => - s"${ev.value} = $set.findInSet($word);" - ) + defineCodeGen(ctx, ev, (word, set) => CollationSupport.FindInSet. + genCode(word, set, collationId)) } override def dataType: DataType = IntegerType @@ -1003,8 +1212,10 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { protected def direction: String override def children: Seq[Expression] = srcStr +: trimStr.toSeq - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringType) + override def dataType: DataType = srcStr.dataType + override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeBinaryLcase) + + final lazy val collationId: Int = srcStr.dataType.asInstanceOf[StringType].collationId override def nullable: Boolean = children.exists(_.nullable) override def foldable: Boolean = children.forall(_.foldable) @@ -1023,13 +1234,19 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { } } - protected val trimMethod: String - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evals = children.map(_.genCode(ctx)) - val srcString = evals(0) + val srcString = evals.head if (evals.length == 1) { + val stringTrimCode: String = this match { + case _: StringTrim => + CollationSupport.StringTrim.genCode(srcString.value, collationId) + case _: StringTrimLeft => + CollationSupport.StringTrimLeft.genCode(srcString.value, collationId) + case _: StringTrimRight => + CollationSupport.StringTrimRight.genCode(srcString.value, collationId) + } ev.copy(code = code""" |${srcString.code} |boolean ${ev.isNull} = false; @@ -1037,10 +1254,18 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { |if (${srcString.isNull}) { | ${ev.isNull} = true; |} else { - | ${ev.value} = ${srcString.value}.$trimMethod(); + | ${ev.value} = $stringTrimCode; |}""".stripMargin) } else { val trimString = evals(1) + val stringTrimCode: String = this match { + case _: StringTrim => + CollationSupport.StringTrim.genCode(srcString.value, trimString.value, collationId) + case _: StringTrimLeft => + CollationSupport.StringTrimLeft.genCode(srcString.value, trimString.value, collationId) + case _: StringTrimRight => + CollationSupport.StringTrimRight.genCode(srcString.value, trimString.value, collationId) + } ev.copy(code = code""" |${srcString.code} |boolean ${ev.isNull} = false; @@ -1052,7 +1277,7 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { | if (${trimString.isNull}) { | ${ev.isNull} = true; | } else { - | ${ev.value} = ${srcString.value}.$trimMethod(${trimString.value}); + | ${ev.value} = $stringTrimCode; | } |}""".stripMargin) } @@ -1145,12 +1370,11 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None) override protected def direction: String = "BOTH" - override def doEval(srcString: UTF8String): UTF8String = srcString.trim() + override def doEval(srcString: UTF8String): UTF8String = + CollationSupport.StringTrim.exec(srcString, collationId) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = - srcString.trim(trimString) - - override val trimMethod: String = "trim" + CollationSupport.StringTrim.exec(srcString, trimString, collationId) override protected def withNewChildrenInternal(newChildren: 
IndexedSeq[Expression]): Expression = copy( @@ -1253,12 +1477,11 @@ case class StringTrimLeft(srcStr: Expression, trimStr: Option[Expression] = None override protected def direction: String = "LEADING" - override def doEval(srcString: UTF8String): UTF8String = srcString.trimLeft() + override def doEval(srcString: UTF8String): UTF8String = + CollationSupport.StringTrimLeft.exec(srcString, collationId) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = - srcString.trimLeft(trimString) - - override val trimMethod: String = "trimLeft" + CollationSupport.StringTrimLeft.exec(srcString, trimString, collationId) override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): StringTrimLeft = @@ -1314,12 +1537,11 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non override protected def direction: String = "TRAILING" - override def doEval(srcString: UTF8String): UTF8String = srcString.trimRight() + override def doEval(srcString: UTF8String): UTF8String = + CollationSupport.StringTrimRight.exec(srcString, collationId) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = - srcString.trimRight(trimString) - - override val trimMethod: String = "trimRight" + CollationSupport.StringTrimRight.exec(srcString, trimString, collationId) override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): StringTrimRight = @@ -1349,20 +1571,24 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non case class StringInstr(str: Expression, substr: Expression) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + override def left: Expression = str override def right: Expression = substr override def dataType: DataType = IntegerType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation) override def nullSafeEval(string: Any, sub: Any): Any = { - string.asInstanceOf[UTF8String].indexOf(sub.asInstanceOf[UTF8String], 0) + 1 + CollationSupport.StringInstr. 
+ exec(string.asInstanceOf[UTF8String], sub.asInstanceOf[UTF8String], collationId) + 1 } override def prettyName: String = "instr" override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, (l, r) => - s"($l).indexOf($r, 0) + 1") + defineCodeGen(ctx, ev, (string, substring) => + CollationSupport.StringInstr.genCode(string, substring, collationId) + " + 1") } override protected def withNewChildrenInternal( @@ -1395,21 +1621,24 @@ case class StringInstr(str: Expression, substr: Expression) case class SubstringIndex(strExpr: Expression, delimExpr: Expression, countExpr: Expression) extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) + final lazy val collationId: Int = first.dataType.asInstanceOf[StringType].collationId + + override def dataType: DataType = strExpr.dataType + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType) override def first: Expression = strExpr override def second: Expression = delimExpr override def third: Expression = countExpr override def prettyName: String = "substring_index" override def nullSafeEval(str: Any, delim: Any, count: Any): Any = { - str.asInstanceOf[UTF8String].subStringIndex( - delim.asInstanceOf[UTF8String], - count.asInstanceOf[Int]) + CollationSupport.SubstringIndex.exec(str.asInstanceOf[UTF8String], + delim.asInstanceOf[UTF8String], count.asInstanceOf[Int], collationId); } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, (str, delim, count) => s"$str.subStringIndex($delim, $count)") + defineCodeGen(ctx, ev, (str, delim, count) => + CollationSupport.SubstringIndex.genCode(str, delim, Integer.parseInt(count, 10), collationId)) } override protected def withNewChildrenInternal( @@ -1446,12 +1675,15 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression) this(substr, str, Literal(1)) } + final lazy val collationId: Int = first.dataType.asInstanceOf[StringType].collationId + override def first: Expression = substr override def second: Expression = str override def third: Expression = start override def nullable: Boolean = substr.nullable || str.nullable override def dataType: DataType = IntegerType - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType) override def eval(input: InternalRow): Any = { val s = start.eval(input) @@ -1471,9 +1703,8 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression) if (sVal < 1) { 0 } else { - l.asInstanceOf[UTF8String].indexOf( - r.asInstanceOf[UTF8String], - s.asInstanceOf[Int] - 1) + 1 + CollationSupport.StringLocate.exec(l.asInstanceOf[UTF8String], + r.asInstanceOf[UTF8String], s.asInstanceOf[Int] - 1, collationId) + 1; } } } @@ -1494,8 +1725,8 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression) ${strGen.code} if (!${strGen.isNull}) { if (${startGen.value} > 0) { - ${ev.value} = ${strGen.value}.indexOf(${substrGen.value}, - ${startGen.value} - 1) + 1; + ${ev.value} = CollationSupport.StringLocate.exec(${strGen.value}, + ${substrGen.value}, ${startGen.value} - 1, $collationId) + 1; } } else { ${ev.isNull} = true; @@ -1577,7 +1808,8 @@ case class StringLPad(str: Expression, len: Expression, 
pad: Expression) override def third: Expression = pad override def dataType: DataType = str.dataType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, IntegerType, StringTypeAnyCollation) override def nullSafeEval(string: Any, len: Any, pad: Any): Any = { string.asInstanceOf[UTF8String].lpad(len.asInstanceOf[Int], pad.asInstanceOf[UTF8String]) @@ -1656,7 +1888,8 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera override def third: Expression = pad override def dataType: DataType = str.dataType - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, IntegerType, StringTypeAnyCollation) override def nullSafeEval(string: Any, len: Any, pad: Any): Any = { string.asInstanceOf[UTF8String].rpad(len.asInstanceOf[Int], pad.asInstanceOf[UTF8String]) @@ -1698,10 +1931,10 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC override def foldable: Boolean = children.forall(_.foldable) override def nullable: Boolean = children(0).nullable - override def dataType: DataType = StringType + override def dataType: DataType = children(0).dataType override def inputTypes: Seq[AbstractDataType] = - StringType :: List.fill(children.size - 1)(AnyDataType) + StringTypeAnyCollation :: List.fill(children.size - 1)(AnyDataType) override def checkInputDataTypes(): TypeCheckResult = { if (children.isEmpty) { @@ -1813,16 +2046,19 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC case class InitCap(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { + final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId + + // Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation. 
+ private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def dataType: DataType = child.dataType override def nullSafeEval(string: Any): Any = { - // scalastyle:off caselocale - string.asInstanceOf[UTF8String].toLowerCase.toTitleCase - // scalastyle:on caselocale + CollationSupport.InitCap.exec(string.asInstanceOf[UTF8String], collationId, useICU) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - defineCodeGen(ctx, ev, str => s"$str.toLowerCase().toTitleCase()") + defineCodeGen(ctx, ev, str => CollationSupport.InitCap.genCode(str, collationId, useICU)) } override protected def withNewChildInternal(newChild: Expression): InitCap = @@ -1878,7 +2114,7 @@ case class StringRepeat(str: Expression, times: Expression) case class StringSpace(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[DataType] = Seq(IntegerType) override def nullSafeEval(s: Any): Any = { @@ -1990,15 +2226,15 @@ case class Right(str: Expression, len: Expression) extends RuntimeReplaceable override lazy val replacement: Expression = If( IsNull(str), - Literal(null, StringType), + Literal(null, str.dataType), If( LessThanOrEqual(len, Literal(0)), - Literal(UTF8String.EMPTY_UTF8, StringType), + Literal(UTF8String.EMPTY_UTF8, str.dataType), new Substring(str, UnaryMinus(len)) ) ) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, IntegerType) override def left: Expression = str override def right: Expression = len override protected def withNewChildrenInternal( @@ -2029,7 +2265,7 @@ case class Left(str: Expression, len: Expression) extends RuntimeReplaceable override lazy val replacement: Expression = Substring(str, Literal(1), len) override def inputTypes: Seq[AbstractDataType] = { - Seq(TypeCollection(StringType, BinaryType), IntegerType) + Seq(TypeCollection(StringTypeAnyCollation, BinaryType), IntegerType) } override def left: Expression = str @@ -2064,16 +2300,17 @@ case class Left(str: Expression, len: Expression) extends RuntimeReplaceable case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { override def dataType: DataType = IntegerType - override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + override def inputTypes: Seq[AbstractDataType] = + Seq(TypeCollection(StringTypeAnyCollation, BinaryType)) protected override def nullSafeEval(value: Any): Any = child.dataType match { - case StringType => value.asInstanceOf[UTF8String].numChars + case _: StringType => value.asInstanceOf[UTF8String].numChars case BinaryType => value.asInstanceOf[Array[Byte]].length } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { child.dataType match { - case StringType => defineCodeGen(ctx, ev, c => s"($c).numChars()") + case _: StringType => defineCodeGen(ctx, ev, c => s"($c).numChars()") case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length") } } @@ -2098,16 +2335,17 @@ case class Length(child: Expression) case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { override def dataType: DataType = IntegerType - override def inputTypes: 
Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + override def inputTypes: Seq[AbstractDataType] = + Seq(TypeCollection(StringTypeAnyCollation, BinaryType)) protected override def nullSafeEval(value: Any): Any = child.dataType match { - case StringType => value.asInstanceOf[UTF8String].numBytes * 8 + case _: StringType => value.asInstanceOf[UTF8String].numBytes * 8 case BinaryType => value.asInstanceOf[Array[Byte]].length * 8 } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { child.dataType match { - case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8") + case _: StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8") case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8") } } @@ -2136,16 +2374,17 @@ case class BitLength(child: Expression) case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { override def dataType: DataType = IntegerType - override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + override def inputTypes: Seq[AbstractDataType] = + Seq(TypeCollection(StringTypeAnyCollation, BinaryType)) protected override def nullSafeEval(value: Any): Any = child.dataType match { - case StringType => value.asInstanceOf[UTF8String].numBytes + case _: StringType => value.asInstanceOf[UTF8String].numBytes case BinaryType => value.asInstanceOf[Array[Byte]].length } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { child.dataType match { - case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()") + case _: StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()") case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length") } } @@ -2196,8 +2435,8 @@ case class Levenshtein( } override def inputTypes: Seq[AbstractDataType] = threshold match { - case Some(_) => Seq(StringType, StringType, IntegerType) - case _ => Seq(StringType, StringType) + case Some(_) => Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType) + case _ => Seq(StringTypeAnyCollation, StringTypeAnyCollation) } override def children: Seq[Expression] = threshold match { @@ -2320,9 +2559,9 @@ case class Levenshtein( case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes with NullIntolerant { - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType - override def inputTypes: Seq[DataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def nullSafeEval(input: Any): Any = input.asInstanceOf[UTF8String].soundex() @@ -2586,7 +2825,7 @@ object Decode { val input = params.head val other = params.tail val itr = other.iterator - var default: Expression = Literal.create(null, StringType) + var default: Expression = Literal.create(null, SQLConf.get.defaultStringType) val branches = ArrayBuffer.empty[(Expression, Expression)] while (itr.hasNext) { val search = itr.next() @@ -2605,7 +2844,7 @@ object Decode { // scalastyle:off line.size.limit @ExpressionDescription( usage = """ - _FUNC_(bin, charset) - Decodes the first argument using the second argument character set. + _FUNC_(bin, charset) - Decodes the first argument using the second argument character set. If either argument is null, the result will also be null. _FUNC_(expr, search, result [, search, result ] ... [, default]) - Compares expr to each search value in order. 
If expr is equal to a search value, _FUNC_ returns @@ -2615,7 +2854,7 @@ object Decode { arguments = """ Arguments: * bin - a binary expression to decode - * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive. + * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive. """, examples = """ Examples: @@ -2630,7 +2869,10 @@ object Decode { > SELECT _FUNC_(null, 6, 'Spark', NULL, 'SQL', 4, 'rocks'); SQL """, - since = "3.2.0", + since = "1.5.0", + note = """ + _FUNC_(expr, search, result [, search, result ] ... [, default]) is supported since 3.2.0 + """, group = "string_funcs") // scalastyle:on line.size.limit case class Decode(params: Seq[Expression], replacement: Expression) @@ -2645,81 +2887,69 @@ case class Decode(params: Seq[Expression], replacement: Expression) } } -/** - * Decodes the first argument into a String using the provided character set. - */ -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(bin, charset) - Decodes the first argument using the second argument character set. If either argument is null, the result will also be null.", - examples = """ - Examples: - > SELECT _FUNC_(encode('abc', 'utf-8'), 'utf-8'); - abc - """, - arguments = """ - Arguments: - * bin - a binary expression to decode - * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive. - """, - since = "1.5.0", - group = "string_funcs") -// scalastyle:on line.size.limit -case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Boolean) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { +case class StringDecode( + bin: Expression, + charset: Expression, + legacyCharsets: Boolean, + legacyErrorAction: Boolean) + extends RuntimeReplaceable with ImplicitCastInputTypes { def this(bin: Expression, charset: Expression) = - this(bin, charset, SQLConf.get.legacyJavaCharsets) + this(bin, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) - override def left: Expression = bin - override def right: Expression = charset override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation) + override def prettyName: String = "decode" + override def toString: String = s"$prettyName($bin, $charset)" - private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val fromCharset = input2.asInstanceOf[UTF8String].toString - try { - if (legacyCharsets || supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) { - UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) - } else throw new UnsupportedEncodingException - } catch { - case _: UnsupportedEncodingException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) - } - } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (bytes, charset) => { - val fromCharset = ctx.freshName("fromCharset") - val sc = JavaCode.global( - ctx.addReferenceObj("supportedCharsets", supportedCharsets), - supportedCharsets.getClass) - s""" - String $fromCharset = $charset.toString(); - 
try { - if ($legacyCharsets || $sc.contains($fromCharset.toUpperCase(java.util.Locale.ROOT))) { - ${ev.value} = UTF8String.fromString(new String($bytes, $fromCharset)); - } else { - throw new java.io.UnsupportedEncodingException(); - } - } catch (java.io.UnsupportedEncodingException e) { - throw QueryExecutionErrors.invalidCharsetError("$prettyName", $fromCharset); - } - """ - }) - } - - override protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): StringDecode = - copy(bin = newLeft, charset = newRight) + override def replacement: Expression = StaticInvoke( + classOf[StringDecode], + SQLConf.get.defaultStringType, + "decode", + Seq(bin, charset, Literal(legacyCharsets), Literal(legacyErrorAction)), + Seq(BinaryType, StringTypeAnyCollation, BooleanType, BooleanType)) - override def prettyName: String = "decode" + override def children: Seq[Expression] = Seq(bin, charset) + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(bin = newChildren(0), charset = newChildren(1)) } object StringDecode { def apply(bin: Expression, charset: Expression): StringDecode = new StringDecode(bin, charset) + def decode( + input: Array[Byte], + charset: UTF8String, + legacyCharsets: Boolean, + legacyErrorAction: Boolean): UTF8String = { + val fromCharset = charset.toString + if (legacyCharsets || Encode.VALID_CHARSETS.contains(fromCharset.toUpperCase(Locale.ROOT))) { + val decoder = try { + val codingErrorAction = if (legacyErrorAction) { + CodingErrorAction.REPLACE + } else { + CodingErrorAction.REPORT + } + Charset.forName(fromCharset) + .newDecoder() + .onMalformedInput(codingErrorAction) + .onUnmappableCharacter(codingErrorAction) + } catch { + case _: IllegalCharsetNameException | + _: UnsupportedCharsetException | + _: IllegalArgumentException => + throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset) + } + try { + val cb = decoder.decode(ByteBuffer.wrap(input)) + UTF8String.fromString(cb.toString) + } catch { + case _: CharacterCodingException => + throw QueryExecutionErrors.malformedCharacterCoding("decode", fromCharset) + } + } else { + throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset) + } + } } /** @@ -2731,7 +2961,7 @@ object StringDecode { arguments = """ Arguments: * str - a string expression - * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to encode `str` into a BINARY. It is case insensitive. + * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to encode `str` into a BINARY. It is case insensitive. 
""", examples = """ Examples: @@ -2741,59 +2971,79 @@ object StringDecode { since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { +case class Encode( + str: Expression, + charset: Expression, + legacyCharsets: Boolean, + legacyErrorAction: Boolean) + extends RuntimeReplaceable with ImplicitCastInputTypes { def this(value: Expression, charset: Expression) = - this(value, charset, SQLConf.get.legacyJavaCharsets) + this(value, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) - override def left: Expression = str - override def right: Expression = charset override def dataType: DataType = BinaryType override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, StringTypeAnyCollation) - private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") - - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val toCharset = input2.asInstanceOf[UTF8String].toString - try { - if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { - input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) - } else throw new UnsupportedEncodingException - } catch { - case _: UnsupportedEncodingException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) - } - } + override val replacement: Expression = StaticInvoke( + classOf[Encode], + BinaryType, + "encode", + Seq( + str, charset, Literal(legacyCharsets, BooleanType), Literal(legacyErrorAction, BooleanType)), + Seq(StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType)) - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (string, charset) => { - val toCharset = ctx.freshName("toCharset") - val sc = JavaCode.global( - ctx.addReferenceObj("supportedCharsets", supportedCharsets), - supportedCharsets.getClass) - s""" - String $toCharset = $charset.toString(); - try { - if ($legacyCharsets || $sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) { - ${ev.value} = $string.toString().getBytes($toCharset); - } else { - throw new java.io.UnsupportedEncodingException(); - } - } catch (java.io.UnsupportedEncodingException e) { - throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset); - }""" - }) - } + override def toString: String = s"$prettyName($str, $charset)" - override protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): Encode = copy(str = newLeft, charset = newRight) + override def children: Seq[Expression] = Seq(str, charset) + + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(str = newChildren.head, charset = newChildren(1)) } object Encode { def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) + + private[expressions] final lazy val VALID_CHARSETS = + Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + + def encode( + input: UTF8String, + charset: UTF8String, + legacyCharsets: Boolean, + legacyErrorAction: Boolean): Array[Byte] = { + val toCharset = charset.toString + if (input.numBytes == 0 || "UTF-8".equalsIgnoreCase(toCharset)) { + return input.getBytes + } + if (legacyCharsets || VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) { + val encoder = try { + val 
codingErrorAction = if (legacyErrorAction) { + CodingErrorAction.REPLACE + } else { + CodingErrorAction.REPORT + } + Charset.forName(toCharset) + .newEncoder() + .onMalformedInput(codingErrorAction) + .onUnmappableCharacter(codingErrorAction) + } catch { + case _: IllegalCharsetNameException | + _: UnsupportedCharsetException | + _: IllegalArgumentException => + throw QueryExecutionErrors.invalidCharsetError("encode", toCharset) + } + try { + val bb = encoder.encode(CharBuffer.wrap(input.toString)) + JavaUtils.bufferToArray(bb) + } catch { + case _: CharacterCodingException => + throw QueryExecutionErrors.malformedCharacterCoding("encode", toCharset) + } + } else { + throw QueryExecutionErrors.invalidCharsetError("encode", toCharset) + } + } } /** @@ -3173,13 +3423,14 @@ case class Sentences( case class StringSplitSQL( str: Expression, delimiter: Expression) extends BinaryExpression with NullIntolerant { - override def dataType: DataType = ArrayType(StringType, containsNull = false) + override def dataType: DataType = ArrayType(str.dataType, containsNull = false) + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId override def left: Expression = str override def right: Expression = delimiter override def nullSafeEval(string: Any, delimiter: Any): Any = { - val strings = string.asInstanceOf[UTF8String].splitSQL( - delimiter.asInstanceOf[UTF8String], -1); + val strings = CollationSupport.StringSplitSQL.exec(string.asInstanceOf[UTF8String], + delimiter.asInstanceOf[UTF8String], collationId) new GenericArrayData(strings.asInstanceOf[Array[Any]]) } @@ -3187,7 +3438,8 @@ case class StringSplitSQL( val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, delimiter) => { // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. 
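The new Encode.encode / StringDecode.decode helpers above switch between CodingErrorAction.REPLACE (the legacy behavior) and CodingErrorAction.REPORT. As a minimal, self-contained Scala sketch of that distinction (the sample bytes and helper name are illustrative, not Spark APIs): REPORT surfaces malformed input as a CharacterCodingException, while REPLACE silently substitutes U+FFFD.

import java.nio.ByteBuffer
import java.nio.charset.{Charset, CodingErrorAction}

// 0xFF can never appear in well-formed UTF-8 input.
val malformed: Array[Byte] = Array(0x61.toByte, 0xFF.toByte, 0x62.toByte)

def decodeUtf8(action: CodingErrorAction): String = {
  val decoder = Charset.forName("UTF-8")
    .newDecoder()
    .onMalformedInput(action)
    .onUnmappableCharacter(action)
  // decode() throws a CharacterCodingException when the action is REPORT.
  decoder.decode(ByteBuffer.wrap(malformed)).toString
}

decodeUtf8(CodingErrorAction.REPLACE) // "a\uFFFDb" -- what legacyCodingErrorAction preserves
// decodeUtf8(CodingErrorAction.REPORT) would throw MalformedInputException instead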
- s"${ev.value} = new $arrayClass($str.splitSQL($delimiter,-1));" + s"${ev.value} = new $arrayClass(" + + s"${CollationSupport.StringSplitSQL.genCode(str, delimiter, collationId)});" }) } @@ -3225,10 +3477,11 @@ case class SplitPart ( partNum: Expression) extends RuntimeReplaceable with ImplicitCastInputTypes { override lazy val replacement: Expression = - ElementAt(StringSplitSQL(str, delimiter), partNum, Some(Literal.create("", StringType)), + ElementAt(StringSplitSQL(str, delimiter), partNum, Some(Literal.create("", str.dataType)), false) override def nodeName: String = "split_part" - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType) def children: Seq[Expression] = Seq(str, delimiter, partNum) protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = { copy(str = newChildren.apply(0), delimiter = newChildren.apply(1), @@ -3290,7 +3543,7 @@ case class Luhncheck(input: Expression) extends RuntimeReplaceable with Implicit Seq(input), inputTypes) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def prettyName: String = "luhn_check" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala index b8d00074bfba3..75ca4930cf8c1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala @@ -20,8 +20,9 @@ package org.apache.spark.sql.catalyst.expressions import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression -import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.plans.logical.{Filter, HintInfo, LogicalPlan} +import org.apache.spark.sql.catalyst.optimizer.DecorrelateInnerQuery +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf @@ -249,6 +250,79 @@ object SubExprUtils extends PredicateHelper { } } } + + /** + * Returns the inner query attributes that are guaranteed to have a single value for each + * outer row. Therefore, a scalar subquery is allowed to group-by on these attributes. + * We can derive these from correlated equality predicates, though we need to take care about + * propagating this through operators like OUTER JOIN or UNION. + * + * Positive examples: + * - x = outer(a) AND y = outer(b) + * - x = 1 + * - x = outer(a) + 1 + * + * Negative examples: + * - x <= outer(a) + * - x + y = outer(a) + * - x = outer(a) OR y = outer(b) + * - y + outer(b) = 1 (this and similar expressions could be supported, but very carefully) + * - An equality under the right side of a LEFT OUTER JOIN, e.g. + * select *, (select count(*) from y left join + * (select * from z where z1 = x1) sub on y2 = z2 group by z1) from x; + * - An equality under UNION e.g. 
+ * select *, (select count(*) from + * (select * from y where y1 = x1 union all select * from y) group by y1) from x; + */ + def getCorrelatedEquivalentInnerColumns(plan: LogicalPlan): AttributeSet = { + plan match { + case Filter(cond, child) => + val correlated = AttributeSet(splitConjunctivePredicates(cond) + .filter( + SQLConf.get.getConf(SQLConf.SCALAR_SUBQUERY_ALLOW_GROUP_BY_COLUMN_EQUAL_TO_CONSTANT) + || containsOuter(_)) + .filter(DecorrelateInnerQuery.canPullUpOverAgg) + .flatMap(_.references)) + correlated ++ getCorrelatedEquivalentInnerColumns(child) + + case Join(left, right, joinType, _, _) => + joinType match { + case _: InnerLike => + AttributeSet(plan.children.flatMap(child => getCorrelatedEquivalentInnerColumns(child))) + case LeftOuter => getCorrelatedEquivalentInnerColumns(left) + case RightOuter => getCorrelatedEquivalentInnerColumns(right) + case FullOuter => AttributeSet.empty + case LeftSemi => getCorrelatedEquivalentInnerColumns(left) + case LeftAnti => getCorrelatedEquivalentInnerColumns(left) + case _ => AttributeSet.empty + } + + case _: Union => AttributeSet.empty + case Except(left, right, _) => getCorrelatedEquivalentInnerColumns(left) + + case + _: Aggregate | + _: Distinct | + _: Intersect | + _: GlobalLimit | + _: LocalLimit | + _: Offset | + _: Project | + _: Repartition | + _: RepartitionByExpression | + _: RebalancePartitions | + _: Sample | + _: Sort | + _: Window | + _: Tail | + _: WithCTE | + _: Range | + _: SubqueryAlias => + AttributeSet(plan.children.flatMap(child => getCorrelatedEquivalentInnerColumns(child))) + + case _ => AttributeSet.empty + } + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/toFromAvroSqlFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/toFromAvroSqlFunctions.scala new file mode 100644 index 0000000000000..ca53058230fb8 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/toFromAvroSqlFunctions.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.util.ArrayBasedMapData +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.types.{MapType, NullType, StringType} +import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.Utils + +/** + * Converts a binary column of Avro format into its corresponding Catalyst value. + * This is a thin wrapper over the [[AvroDataToCatalyst]] class to create a SQL function. + * + * @param child the Catalyst binary input column. + * @param jsonFormatSchema the Avro schema in JSON string format. 
+ * @param options the options to use when performing the conversion. + * + * @since 4.0.0 + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(child, jsonFormatSchema, options) - Converts a binary Avro value into a Catalyst value. + """, + examples = """ + Examples: + > SELECT _FUNC_(s, '{"type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }]}', map()) IS NULL AS result FROM (SELECT NAMED_STRUCT('u', NAMED_STRUCT('member0', member0, 'member1', member1)) AS s FROM VALUES (1, NULL), (NULL, 'a') tab(member0, member1)); + [false] + """, + note = """ + The specified schema must match actual schema of the read data, otherwise the behavior + is undefined: it may fail or return arbitrary result. + To deserialize the data with a compatible and evolved schema, the expected Avro schema can be + set via the corresponding option. + """, + group = "misc_funcs", + since = "4.0.0" +) +// scalastyle:on line.size.limit +case class FromAvro(child: Expression, jsonFormatSchema: Expression, options: Expression) + extends TernaryExpression with RuntimeReplaceable { + override def first: Expression = child + override def second: Expression = jsonFormatSchema + override def third: Expression = options + + override def withNewChildrenInternal( + newFirst: Expression, newSecond: Expression, newThird: Expression): Expression = { + copy(child = newFirst, jsonFormatSchema = newSecond, options = newThird) + } + + override def checkInputDataTypes(): TypeCheckResult = { + val schemaCheck = jsonFormatSchema.dataType match { + case _: StringType | + _: NullType + if jsonFormatSchema.foldable => + None + case _ => + Some(TypeCheckResult.TypeCheckFailure( + "The second argument of the FROM_AVRO SQL function must be a constant string " + + "containing the JSON representation of the schema to use for converting the value " + + "from AVRO format")) + } + val optionsCheck = options.dataType match { + case MapType(StringType, StringType, _) | + MapType(NullType, NullType, _) | + _: NullType + if options.foldable => + None + case _ => + Some(TypeCheckResult.TypeCheckFailure( + "The third argument of the FROM_AVRO SQL function must be a constant map of strings to " + + "strings containing the options to use for converting the value from AVRO format")) + } + schemaCheck.getOrElse( + optionsCheck.getOrElse( + TypeCheckResult.TypeCheckSuccess)) + } + + override def replacement: Expression = { + val schemaValue: String = jsonFormatSchema.eval() match { + case s: UTF8String => + s.toString + case null => + "" + } + val optionsValue: Map[String, String] = options.eval() match { + case a: ArrayBasedMapData if a.keyArray.array.nonEmpty => + val keys: Array[String] = a.keyArray.array.map(_.toString) + val values: Array[String] = a.valueArray.array.map(_.toString) + keys.zip(values).toMap + case _ => + Map.empty + } + val constructor = try { + Utils.classForName("org.apache.spark.sql.avro.AvroDataToCatalyst").getConstructors().head + } catch { + case _: java.lang.ClassNotFoundException => + throw QueryCompilationErrors.avroNotLoadedSqlFunctionsUnusable(functionName = "FROM_AVRO") + } + val expr = constructor.newInstance(child, schemaValue, optionsValue) + expr.asInstanceOf[Expression] + } +} + +/** + * Converts a Catalyst binary input value into its corresponding Avro format result. + * This is a thin wrapper over the [[CatalystDataToAvro]] class to create a SQL function. + * + * @param child the Catalyst binary input column.
+ * @param jsonFormatSchema the Avro schema in JSON string format. + * + * @since 4.0.0 + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(child, jsonFormatSchema) - Converts a Catalyst binary input value into its corresponding + Avro format result. + """, + examples = """ + Examples: + > SELECT _FUNC_(s, '{"type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }]}', MAP()) IS NULL FROM (SELECT NULL AS s); + [true] + """, + group = "misc_funcs", + since = "4.0.0" +) +// scalastyle:on line.size.limit +case class ToAvro(child: Expression, jsonFormatSchema: Expression) + extends BinaryExpression with RuntimeReplaceable { + override def left: Expression = child + + override def right: Expression = jsonFormatSchema + + override def withNewChildrenInternal(newLeft: Expression, newRight: Expression): Expression = { + copy(child = newLeft, jsonFormatSchema = newRight) + } + + override def checkInputDataTypes(): TypeCheckResult = { + jsonFormatSchema.dataType match { + case _: StringType if jsonFormatSchema.foldable => + TypeCheckResult.TypeCheckSuccess + case _ => + TypeCheckResult.TypeCheckFailure( + "The second argument of the TO_AVRO SQL function must be a constant string " + + "containing the JSON representation of the schema to use for converting the value " + + "to AVRO format") + } + } + + override def replacement: Expression = { + val schemaValue: Option[String] = jsonFormatSchema.eval() match { + case null => + None + case s: UTF8String => + Some(s.toString) + } + val constructor = try { + Utils.classForName("org.apache.spark.sql.avro.CatalystDataToAvro").getConstructors().head + } catch { + case _: java.lang.ClassNotFoundException => + throw QueryCompilationErrors.avroNotLoadedSqlFunctionsUnusable(functionName = "TO_AVRO") + } + val expr = constructor.newInstance(child, schemaValue) + expr.asInstanceOf[Expression] + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala index 47b37a5edeba8..ef8f2ea96eb0b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala @@ -28,7 +28,8 @@ import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType} +import org.apache.spark.sql.internal.types.StringTypeAnyCollation +import org.apache.spark.sql.types.{AbstractDataType, DataType} import org.apache.spark.unsafe.types.UTF8String // scalastyle:off line.size.limit @@ -54,16 +55,16 @@ case class UrlEncode(child: Expression) override def replacement: Expression = StaticInvoke( UrlCodec.getClass, - StringType, + SQLConf.get.defaultStringType, "encode", Seq(child, Literal("UTF-8")), - Seq(StringType, StringType)) + Seq(StringTypeAnyCollation, StringTypeAnyCollation)) override protected def withNewChildInternal(newChild: Expression): Expression = { copy(child = newChild) } - override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def prettyName: String = "url_encode" } @@ -91,16 +92,16 @@ case class 
UrlDecode(child: Expression) override def replacement: Expression = StaticInvoke( UrlCodec.getClass, - StringType, + SQLConf.get.defaultStringType, "decode", Seq(child, Literal("UTF-8")), - Seq(StringType, StringType)) + Seq(StringTypeAnyCollation, StringTypeAnyCollation)) override protected def withNewChildInternal(newChild: Expression): Expression = { copy(child = newChild) } - override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def prettyName: String = "url_decode" } @@ -154,8 +155,8 @@ case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.ge def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled) override def nullable: Boolean = true - override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType) - override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeAnyCollation) + override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = "parse_url" // If the url is a constant, cache the URL object so that we don't need to convert url diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala index d55edcd662b9d..f7f7097173bb4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions.variant import scala.util.control.NonFatal import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.util.{ArrayData, BadRecordException, MapData} +import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.types._ import org.apache.spark.types.variant.{Variant, VariantBuilder, VariantSizeLimitException, VariantUtil} @@ -31,16 +31,39 @@ import org.apache.spark.unsafe.types.{UTF8String, VariantVal} */ object VariantExpressionEvalUtils { - def parseJson(input: UTF8String): VariantVal = { + def parseJson(input: UTF8String, failOnError: Boolean = true): VariantVal = { + def parseJsonFailure(exception: Throwable): VariantVal = { + if (failOnError) { + throw exception + } else { + null + } + } try { val v = VariantBuilder.parseJson(input.toString) new VariantVal(v.getValue, v.getMetadata) } catch { case _: VariantSizeLimitException => - throw QueryExecutionErrors.variantSizeLimitError(VariantUtil.SIZE_LIMIT, "parse_json") + parseJsonFailure(QueryExecutionErrors + .variantSizeLimitError(VariantUtil.SIZE_LIMIT, "parse_json")) case NonFatal(e) => - throw QueryExecutionErrors.malformedRecordsDetectedInRecordParsingError( - input.toString, BadRecordException(() => input, cause = e)) + parseJsonFailure(QueryExecutionErrors.malformedRecordsDetectedInRecordParsingError( + input.toString, e)) + } + } + + def isVariantNull(input: VariantVal): Boolean = { + if (input == null) { + // This is a SQL NULL, not a Variant NULL + false + } else { + val variantValue = input.getValue + if (variantValue.isEmpty) { + throw QueryExecutionErrors.malformedVariant() + } else { + // Variant NULL is denoted by basic_type == 0 and val_header == 0 
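A brief usage sketch of the isVariantNull contract implemented above, assuming a SparkSession in scope as `spark` (the expected results restate the documented examples rather than new behavior):

// A variant null (JSON 'null') is reported as true.
spark.sql("SELECT is_variant_null(parse_json('null'))").show()   // true
// A SQL NULL input is not a variant null.
spark.sql("SELECT is_variant_null(parse_json(null))").show()     // false
// A missing path yields SQL NULL, which is likewise not a variant null.
spark.sql("""SELECT is_variant_null(variant_get(parse_json('{"a":null}'), "$.c"))""").show() // false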
+ variantValue(0) == 0 + } } } @@ -65,7 +88,8 @@ object VariantExpressionEvalUtils { case LongType => builder.appendLong(input.asInstanceOf[Long]) case FloatType => builder.appendFloat(input.asInstanceOf[Float]) case DoubleType => builder.appendDouble(input.asInstanceOf[Double]) - case StringType => builder.appendString(input.asInstanceOf[UTF8String].toString) + case _: DecimalType => builder.appendDecimal(input.asInstanceOf[Decimal].toJavaBigDecimal) + case _: StringType => builder.appendString(input.asInstanceOf[UTF8String].toString) case BinaryType => builder.appendBinary(input.asInstanceOf[Array[Byte]]) case DateType => builder.appendDate(input.asInstanceOf[Int]) case TimestampType => builder.appendTimestamp(input.asInstanceOf[Long]) @@ -79,7 +103,8 @@ object VariantExpressionEvalUtils { val offsets = new java.util.ArrayList[java.lang.Integer](data.numElements()) for (i <- 0 until data.numElements()) { offsets.add(builder.getWritePos - start) - buildVariant(builder, data.get(i, elementType), elementType) + val element = if (data.isNullAt(i)) null else data.get(i, elementType) + buildVariant(builder, element, elementType) } builder.finishWritingArray(start, offsets) case MapType(StringType, valueType, _) => @@ -92,7 +117,8 @@ object VariantExpressionEvalUtils { val key = keys.getUTF8String(i).toString val id = builder.addKey(key) fields.add(new VariantBuilder.FieldEntry(key, id, builder.getWritePos - start)) - buildVariant(builder, values.get(i, valueType), valueType) + val value = if (values.isNullAt(i)) null else values.get(i, valueType) + buildVariant(builder, value, valueType) } builder.finishWritingObject(start, fields) case StructType(structFields) => @@ -103,7 +129,8 @@ object VariantExpressionEvalUtils { val key = structFields(i).name val id = builder.addKey(key) fields.add(new VariantBuilder.FieldEntry(key, id, builder.getWritePos - start)) - buildVariant(builder, data.get(i, structFields(i).dataType), structFields(i).dataType) + val value = if (data.isNullAt(i)) null else data.get(i, structFields(i).dataType) + buildVariant(builder, value, structFields(i).dataType) } builder.finishWritingObject(start, fields) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala index cab75e1996abc..b80fb11b6813b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala @@ -37,41 +37,80 @@ import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase, QueryExecutionErrors} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.types.variant._ import org.apache.spark.types.variant.VariantUtil.Type import org.apache.spark.unsafe.types._ + +/** + * The implementation for `parse_json` and `try_parse_json` expressions. Parse a JSON string as a + * Variant value. + * @param child The string value to parse as a variant. 
+ * @param failOnError Controls whether the expression should throw an exception or return null if + * the string does not represent a valid JSON value. + */ +case class ParseJson(child: Expression, failOnError: Boolean = true) + extends UnaryExpression with ExpectsInputTypes with RuntimeReplaceable { + + override lazy val replacement: Expression = StaticInvoke( + VariantExpressionEvalUtils.getClass, + VariantType, + "parseJson", + Seq(child, Literal(failOnError, BooleanType)), + inputTypes :+ BooleanType, + returnNullable = !failOnError) + + override def inputTypes: Seq[AbstractDataType] = StringTypeAnyCollation :: Nil + + override def dataType: DataType = VariantType + + override def prettyName: String = if (failOnError) "parse_json" else "try_parse_json" + + override protected def withNewChildInternal(newChild: Expression): ParseJson = + copy(child = newChild) +} + // scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(jsonStr) - Parse a JSON string as an Variant value. Throw an exception when the string is not valid JSON value.", + usage = "_FUNC_(expr) - Check if a variant value is a variant null. Returns true if and only if the input is a variant null and false otherwise (including in the case of SQL NULL).", examples = """ Examples: - > SELECT _FUNC_('{"a":1,"b":0.8}'); - {"a":1,"b":0.8} + > SELECT _FUNC_(parse_json('null')); + true + > SELECT _FUNC_(parse_json('"null"')); + false + > SELECT _FUNC_(parse_json('13')); + false + > SELECT _FUNC_(parse_json(null)); + false + > SELECT _FUNC_(variant_get(parse_json('{"a":null, "b":"spark"}'), "$.c")); + false + > SELECT _FUNC_(variant_get(parse_json('{"a":null, "b":"spark"}'), "$.a")); + true """, since = "4.0.0", - group = "variant_funcs" -) + group = "variant_funcs") // scalastyle:on line.size.limit -case class ParseJson(child: Expression) - extends UnaryExpression with ExpectsInputTypes with RuntimeReplaceable { +case class IsVariantNull(child: Expression) extends UnaryExpression + with Predicate with ExpectsInputTypes with RuntimeReplaceable { override lazy val replacement: Expression = StaticInvoke( VariantExpressionEvalUtils.getClass, - VariantType, - "parseJson", + BooleanType, + "isVariantNull", Seq(child), inputTypes, + propagateNull = false, returnNullable = false) - override def inputTypes: Seq[AbstractDataType] = StringType :: Nil - - override def dataType: DataType = VariantType + override def inputTypes: Seq[AbstractDataType] = Seq(VariantType) - override def prettyName: String = "parse_json" + override def prettyName: String = "is_variant_null" - override protected def withNewChildInternal(newChild: Expression): ParseJson = + override protected def withNewChildInternal(newChild: Expression): IsVariantNull = copy(child = newChild) } @@ -162,7 +201,7 @@ case class VariantGet( final override def nodePatternsInternal(): Seq[TreePattern] = Seq(VARIANT_GET) - override def inputTypes: Seq[AbstractDataType] = Seq(VariantType, StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(VariantType, StringTypeAnyCollation) override def prettyName: String = if (failOnError) "variant_get" else "try_variant_get" @@ -223,7 +262,7 @@ case object VariantGet { VariantType => true case ArrayType(elementType, _) => checkDataType(elementType) - case MapType(StringType, valueType, _) => checkDataType(valueType) + case MapType(_: StringType, valueType, _) => checkDataType(valueType) case StructType(fields) => fields.forall(f => checkDataType(f.dataType)) case _ => false } @@ -283,7 +322,13 @@ case object VariantGet { 
} } - if (dataType == VariantType) return new VariantVal(v.getValue, v.getMetadata) + if (dataType == VariantType) { + // Build a new variant, in order to strip off any unnecessary metadata. + val builder = new VariantBuilder + builder.appendVariant(v) + val result = builder.result() + return new VariantVal(result.getValue, result.getMetadata) + } val variantType = v.getType if (variantType == Type.NULL) return null dataType match { @@ -297,11 +342,12 @@ case object VariantGet { } case Type.BOOLEAN => Literal(v.getBoolean, BooleanType) case Type.LONG => Literal(v.getLong, LongType) - case Type.STRING => Literal(UTF8String.fromString(v.getString), StringType) + case Type.STRING => Literal(UTF8String.fromString(v.getString), + SQLConf.get.defaultStringType) case Type.DOUBLE => Literal(v.getDouble, DoubleType) case Type.DECIMAL => val d = Decimal(v.getDecimal) - Literal(Decimal(v.getDecimal), DecimalType(d.precision, d.scale)) + Literal(d, DecimalType(d.precision, d.scale)) case Type.DATE => Literal(v.getLong.toInt, DateType) case Type.TIMESTAMP => Literal(v.getLong, TimestampType) case Type.TIMESTAMP_NTZ => Literal(v.getLong, TimestampNTZType) @@ -350,7 +396,7 @@ case object VariantGet { } else { invalidCast() } - case MapType(StringType, valueType, _) => + case MapType(_: StringType, valueType, _) => if (variantType == Type.OBJECT) { val size = v.objectSize() val keyArray = new Array[Any](size) @@ -384,6 +430,47 @@ case object VariantGet { } } +abstract class ParseJsonExpressionBuilderBase(failOnError: Boolean) extends ExpressionBuilder { + override def build(funcName: String, expressions: Seq[Expression]): Expression = { + val numArgs = expressions.length + if (numArgs == 1) { + ParseJson(expressions.head, failOnError) + } else { + throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(1), numArgs) + } + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(jsonStr) - Parse a JSON string as a Variant value. Throw an exception when the string is not valid JSON value.", + examples = """ + Examples: + > SELECT _FUNC_('{"a":1,"b":0.8}'); + {"a":1,"b":0.8} + """, + since = "4.0.0", + group = "variant_funcs" +) +// scalastyle:on line.size.limit +object ParseJsonExpressionBuilder extends ParseJsonExpressionBuilderBase(true) + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(jsonStr) - Parse a JSON string as a Variant value. 
Return NULL when the string is not valid JSON value.", + examples = """ + Examples: + > SELECT _FUNC_('{"a":1,"b":0.8}'); + {"a":1,"b":0.8} + > SELECT _FUNC_('{"a":1,'); + NULL + """, + since = "4.0.0", + group = "variant_funcs" +) +// scalastyle:on line.size.limit +object TryParseJsonExpressionBuilder extends ParseJsonExpressionBuilderBase(false) + abstract class VariantGetExpressionBuilderBase(failOnError: Boolean) extends ExpressionBuilder { override def build(funcName: String, expressions: Seq[Expression]): Expression = { val numArgs = expressions.length @@ -490,7 +577,7 @@ case class VariantExplode(child: Expression) extends UnaryExpression with Genera override def elementSchema: StructType = { new StructType() .add("pos", IntegerType, nullable = false) - .add("key", StringType, nullable = true) + .add("key", SQLConf.get.defaultStringType, nullable = true) .add("value", VariantType, nullable = false) } } @@ -547,7 +634,7 @@ case class SchemaOfVariant(child: Expression) with ExpectsInputTypes { override lazy val replacement: Expression = StaticInvoke( SchemaOfVariant.getClass, - StringType, + SQLConf.get.defaultStringType, "schemaOfVariant", Seq(child), inputTypes, @@ -555,7 +642,7 @@ case class SchemaOfVariant(child: Expression) override def inputTypes: Seq[AbstractDataType] = Seq(VariantType) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def prettyName: String = "schema_of_variant" @@ -598,11 +685,11 @@ object SchemaOfVariant { case Type.NULL => NullType case Type.BOOLEAN => BooleanType case Type.LONG => LongType - case Type.STRING => StringType + case Type.STRING => SQLConf.get.defaultStringType case Type.DOUBLE => DoubleType case Type.DECIMAL => - val d = v.getDecimal - DecimalType(d.precision(), d.scale()) + val d = Decimal(v.getDecimal) + DecimalType(d.precision, d.scale) case Type.DATE => DateType case Type.TIMESTAMP => TimestampType case Type.TIMESTAMP_NTZ => TimestampNTZType @@ -643,7 +730,7 @@ case class SchemaOfVariantAgg( override def inputTypes: Seq[AbstractDataType] = Seq(VariantType) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index 00711332350cf..5881c456f6e86 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -853,7 +853,7 @@ case class NTile(buckets: Expression) extends RowNumberLike with SizeBasedWindow // for each partition. 
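Returning to the ParseJson builders defined earlier in this hunk, a usage sketch (assuming a SparkSession `spark`): parse_json raises an error on malformed input, while try_parse_json returns NULL, matching the documented examples.

spark.sql("""SELECT parse_json('{"a":1,"b":0.8}')""").show() // {"a":1,"b":0.8}
spark.sql("""SELECT try_parse_json('{"a":1,')""").show()     // NULL: malformed JSON
// spark.sql("""SELECT parse_json('{"a":1,')""").show()      // would fail instead of returning NULL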
override def checkInputDataTypes(): TypeCheckResult = { if (!buckets.foldable) { - DataTypeMismatch( + return DataTypeMismatch( errorSubClass = "NON_FOLDABLE_INPUT", messageParameters = Map( "inputName" -> toSQLId("buckets"), @@ -864,7 +864,7 @@ case class NTile(buckets: Expression) extends RowNumberLike with SizeBasedWindow } if (buckets.dataType != IntegerType) { - DataTypeMismatch( + return DataTypeMismatch( errorSubClass = "UNEXPECTED_INPUT_TYPE", messageParameters = Map( "paramIndex" -> ordinalNumber(0), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index c3a285178c110..f65061e8d0ea9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -23,6 +23,8 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Cast._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -39,7 +41,8 @@ abstract class XPathExtract /** XPath expressions are always nullable, e.g. if the xml string is empty. */ override def nullable: Boolean = true - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation) override def checkInputDataTypes(): TypeCheckResult = { if (!path.foldable) { @@ -47,7 +50,7 @@ abstract class XPathExtract errorSubClass = "NON_FOLDABLE_INPUT", messageParameters = Map( "inputName" -> toSQLId("path"), - "inputType" -> toSQLType(StringType), + "inputType" -> toSQLType(StringTypeAnyCollation), "inputExpr" -> toSQLExpr(path) ) ) @@ -221,7 +224,7 @@ case class XPathDouble(xml: Expression, path: Expression) extends XPathExtract { // scalastyle:on line.size.limit case class XPathString(xml: Expression, path: Expression) extends XPathExtract { override def prettyName: String = "xpath_string" - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullSafeEval(xml: Any, path: Any): Any = { val ret = xpathUtil.evalString(xml.asInstanceOf[UTF8String].toString, pathString) @@ -245,7 +248,7 @@ case class XPathString(xml: Expression, path: Expression) extends XPathExtract { // scalastyle:on line.size.limit case class XPathList(xml: Expression, path: Expression) extends XPathExtract { override def prettyName: String = "xpath" - override def dataType: DataType = ArrayType(StringType, containsNull = false) + override def dataType: DataType = ArrayType(SQLConf.get.defaultStringType, containsNull = false) override def nullSafeEval(xml: Any, path: Any): Any = { val nodeList = xpathUtil.evalNodeList(xml.asInstanceOf[UTF8String].toString, pathString) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala index 415d55d19ded2..48a87db291a8d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala @@ -21,12 +21,13 @@ import java.io.CharArrayWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.util.{ArrayData, DropMalformedMode, FailFastMode, FailureSafeParser, GenericArrayData, PermissiveMode} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} +import org.apache.spark.sql.catalyst.util.{DropMalformedMode, FailFastMode, FailureSafeParser, PermissiveMode} import org.apache.spark.sql.catalyst.util.TypeUtils._ import org.apache.spark.sql.catalyst.xml.{StaxXmlGenerator, StaxXmlParser, ValidatorUtil, XmlInferSchema, XmlOptions} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -51,13 +52,12 @@ import org.apache.spark.unsafe.types.UTF8String since = "4.0.0") // scalastyle:on line.size.limit case class XmlToStructs( - schema: DataType, + schema: StructType, options: Map[String, String], child: Expression, timeZoneId: Option[String] = None) extends UnaryExpression with TimeZoneAwareExpression - with CodegenFallback with ExpectsInputTypes with NullIntolerant with QueryErrorsBase { @@ -73,7 +73,7 @@ case class XmlToStructs( // The XML input data might be missing certain fields. We force the nullability // of the user-provided schema to avoid data corruptions. - val nullableSchema = schema.asNullable + private val nullableSchema = schema.asNullable def this(child: Expression, schema: Expression) = this(child, schema, Map.empty[String, String]) @@ -86,42 +86,27 @@ case class XmlToStructs( // This converts parsed rows to the desired output by the given schema. 
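Given that XmlToStructs now takes a StructType rather than an arbitrary DataType, a usage sketch of the SQL surface (assuming a SparkSession `spark`, and assuming from_xml accepts a DDL-formatted schema string the same way from_json does):

// Parses one XML record into a struct<a:int> column.
spark.sql("SELECT from_xml('<p><a>1</a></p>', 'a INT') AS parsed").show()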
@transient - lazy val converter = nullableSchema match { - case _: StructType => - (rows: Iterator[InternalRow]) => if (rows.hasNext) rows.next() else null - case _: ArrayType => - (rows: Iterator[InternalRow]) => if (rows.hasNext) rows.next().getArray(0) else null - case _: MapType => - (rows: Iterator[InternalRow]) => if (rows.hasNext) rows.next().getMap(0) else null - } + private lazy val converter = + (rows: Iterator[InternalRow]) => if (rows.hasNext) rows.next() else null - val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD) + private val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD) - @transient lazy val parser = { + @transient + private lazy val parser = { val parsedOptions = new XmlOptions(options, timeZoneId.get, nameOfCorruptRecord) val mode = parsedOptions.parseMode if (mode != PermissiveMode && mode != FailFastMode) { throw QueryCompilationErrors.parseModeUnsupportedError("from_xml", mode) } - val (parserSchema, actualSchema) = nullableSchema match { - case s: StructType => - ExprUtils.verifyColumnNameOfCorruptRecord(s, parsedOptions.columnNameOfCorruptRecord) - (s, StructType(s.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))) - case other => - (StructType(Array(StructField("value", other))), other) - } - - val rowSchema: StructType = schema match { - case st: StructType => st - case ArrayType(st: StructType, _) => st - } - val rawParser = new StaxXmlParser(rowSchema, parsedOptions) + ExprUtils.verifyColumnNameOfCorruptRecord( + nullableSchema, parsedOptions.columnNameOfCorruptRecord) + val rawParser = new StaxXmlParser(schema, parsedOptions) val xsdSchema = Option(parsedOptions.rowValidationXSDPath).map(ValidatorUtil.getSchema) new FailureSafeParser[String]( input => rawParser.doParseColumn(input, mode, xsdSchema), mode, - parserSchema, + nullableSchema, parsedOptions.columnNameOfCorruptRecord) } @@ -130,23 +115,17 @@ case class XmlToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { copy(timeZoneId = Option(timeZoneId)) } - override def nullSafeEval(xml: Any): Any = xml match { - case arr: GenericArrayData => - new GenericArrayData(arr.array.map(s => converter(parser.parse(s.toString)))) - case arr: ArrayData => - new GenericArrayData(arr.array.map(s => converter(parser.parse(s.toString)))) - case _ => - val str = xml.asInstanceOf[UTF8String].toString - converter(parser.parse(str)) - } - override def inputTypes: Seq[AbstractDataType] = StringType :: Nil + override def nullSafeEval(xml: Any): Any = + converter(parser.parse(xml.asInstanceOf[UTF8String].toString)) - override def sql: String = schema match { - case _: MapType => "entries" - case _ => super.sql + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val expr = ctx.addReferenceObj("this", this) + defineCodeGen(ctx, ev, input => s"(InternalRow) $expr.nullSafeEval($input)") } + override def inputTypes: Seq[AbstractDataType] = StringTypeAnyCollation :: Nil + override def prettyName: String = "from_xml" protected def withNewChildInternal(newChild: Expression): XmlToStructs = @@ -178,16 +157,13 @@ case class SchemaOfXml( child = child, options = ExprUtils.convertToMapData(options)) - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = false @transient private lazy val xmlOptions = new XmlOptions(options, "UTC") - @transient - private lazy val xmlFactory = xmlOptions.buildXmlFactory() - 
@transient private lazy val xmlInferSchema = { if (xmlOptions.parseMode == DropMalformedMode) { @@ -226,7 +202,7 @@ case class SchemaOfXml( .map(ArrayType(_, containsNull = at.containsNull)) .getOrElse(ArrayType(StructType(Nil), containsNull = at.containsNull)) case other: DataType => - xmlInferSchema.canonicalizeType(other).getOrElse(StringType) + xmlInferSchema.canonicalizeType(other).getOrElse(SQLConf.get.defaultStringType) } UTF8String.fromString(dataType.sql) @@ -265,7 +241,6 @@ case class StructsToXml( timeZoneId: Option[String] = None) extends UnaryExpression with TimeZoneAwareExpression - with CodegenFallback with ExpectsInputTypes with NullIntolerant { override def nullable: Boolean = true @@ -320,13 +295,18 @@ case class StructsToXml( getAndReset() } - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) override def nullSafeEval(value: Any): Any = converter(value) + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val expr = ctx.addReferenceObj("this", this) + defineCodeGen(ctx, ev, input => s"(UTF8String) $expr.nullSafeEval($input)") + } + override def inputTypes: Seq[AbstractDataType] = StructType :: Nil override def prettyName: String = "to_xml" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index e5aa0bb6d2c06..945b6e7de8b7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -188,6 +188,11 @@ class JSONOptions( val writeNonAsciiCharacterAsCodePoint: Boolean = parameters.get(WRITE_NON_ASCII_CHARACTER_AS_CODEPOINT).map(_.toBoolean).getOrElse(false) + // This option takes in a column name and specifies that the entire JSON record should be stored + // as a single VARIANT type column in the table with the given column name. + // E.g. spark.read.format("json").option("singleVariantColumn", "colName") + val singleVariantColumn: Option[String] = parameters.get(SINGLE_VARIANT_COLUMN) + /** Build a Jackson [[JsonFactory]] using JSON options. 
*/ def buildJsonFactory(): JsonFactory = { val streamReadConstraints = StreamReadConstraints @@ -282,6 +287,7 @@ object JSONOptions extends DataSourceOptions { val COLUMN_NAME_OF_CORRUPTED_RECORD = newOption("columnNameOfCorruptRecord") val TIME_ZONE = newOption("timeZone") val WRITE_NON_ASCII_CHARACTER_AS_CODEPOINT = newOption("writeNonAsciiCharacterAsCodePoint") + val SINGLE_VARIANT_COLUMN = newOption("singleVariantColumn") // Options with alternative val ENCODING = "encoding" val CHARSET = "charset" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index d3f33a70323fc..b2c302fbbbe31 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -24,6 +24,7 @@ import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal import com.fasterxml.jackson.core._ +import org.apache.hadoop.fs.PositionedReadable import org.apache.spark.SparkUpgradeException import org.apache.spark.internal.Logging @@ -36,7 +37,8 @@ import org.apache.spark.sql.errors.{ExecutionErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.types.variant._ +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String, VariantVal} import org.apache.spark.util.Utils /** @@ -105,12 +107,31 @@ class JacksonParser( */ private def makeRootConverter(dt: DataType): JsonParser => Iterable[InternalRow] = { dt match { + case _: StructType if options.singleVariantColumn.isDefined => (parser: JsonParser) => { + Some(InternalRow(parseVariant(parser))) + } case st: StructType => makeStructRootConverter(st) case mt: MapType => makeMapRootConverter(mt) case at: ArrayType => makeArrayRootConverter(at) } } + protected final def parseVariant(parser: JsonParser): VariantVal = { + // Skips `FIELD_NAME` at the beginning. This check is adapted from `parseJsonToken`, but we + // cannot directly use the function here because it also handles the `VALUE_NULL` token and + // returns null (representing a SQL NULL). Instead, we want to return a variant null. 
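A usage sketch of the new singleVariantColumn option described above (assuming a SparkSession `spark`; the input path and column name are placeholders): each JSON record is loaded as one VARIANT column instead of being schema-inferred.

val df = spark.read
  .format("json")
  .option("singleVariantColumn", "record") // caller-chosen column name
  .load("/path/to/input.json")             // placeholder path

df.printSchema() // expected: a single `record` column of VARIANT type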
+ if (parser.getCurrentToken == FIELD_NAME) { + parser.nextToken() + } + try { + val v = VariantBuilder.parseJson(parser) + new VariantVal(v.getValue, v.getMetadata) + } catch { + case _: VariantSizeLimitException => + throw QueryExecutionErrors.variantSizeLimitError(VariantUtil.SIZE_LIMIT, "JacksonParser") + } + } + private def makeStructRootConverter(st: StructType): JsonParser => Iterable[InternalRow] = { val elementConverter = makeConverter(st) val fieldConverters = st.map(_.dataType).map(makeConverter).toArray @@ -255,19 +276,63 @@ class JacksonParser( } } - case StringType => - (parser: JsonParser) => parseJsonToken[UTF8String](parser, dataType) { + case _: StringType => (parser: JsonParser) => { + // This must be enabled if we will retrieve the bytes directly from the raw content: + val includeSourceInLocation = JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION + val originalMask = if (includeSourceInLocation.enabledIn(parser.getFeatureMask)) { + 1 + } else { + 0 + } + parser.overrideStdFeatures(includeSourceInLocation.getMask, includeSourceInLocation.getMask) + val result = parseJsonToken[UTF8String](parser, dataType) { case VALUE_STRING => UTF8String.fromString(parser.getText) - case _ => + case other => // Note that it always tries to convert the data as string without the case of failure. - val writer = new ByteArrayOutputStream() - Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) { - generator => generator.copyCurrentStructure(parser) + val startLocation = parser.currentTokenLocation() + def skipAhead(): Unit = { + other match { + case START_OBJECT => + parser.skipChildren() + case START_ARRAY => + parser.skipChildren() + case _ => + // Do nothing in this case; we've already read the token + } } - UTF8String.fromBytes(writer.toByteArray) - } + + // PositionedReadable + startLocation.contentReference().getRawContent match { + case byteArray: Array[Byte] if exactStringParsing => + skipAhead() + val endLocation = parser.currentLocation.getByteOffset + + UTF8String.fromBytes( + byteArray, + startLocation.getByteOffset.toInt, + endLocation.toInt - (startLocation.getByteOffset.toInt)) + case positionedReadable: PositionedReadable if exactStringParsing => + skipAhead() + val endLocation = parser.currentLocation.getByteOffset + + val size = endLocation.toInt - (startLocation.getByteOffset.toInt) + val buffer = new Array[Byte](size) + positionedReadable.read(startLocation.getByteOffset, buffer, 0, size) + UTF8String.fromBytes(buffer, 0, size) + case _ => + val writer = new ByteArrayOutputStream() + Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) { + generator => generator.copyCurrentStructure(parser) + } + UTF8String.fromBytes(writer.toByteArray) + } + } + // Reset back to the original configuration: + parser.overrideStdFeatures(includeSourceInLocation.getMask, originalMask) + result + } case TimestampType => (parser: JsonParser) => parseJsonToken[java.lang.Long](parser, dataType) { @@ -380,6 +445,8 @@ class JacksonParser( case _ => null } + case _: VariantType => parseVariant + // We don't actually hit this exception though, we keep it for understandability case _ => throw ExecutionErrors.unsupportedDataTypeError(dataType) } @@ -407,6 +474,8 @@ class JacksonParser( private val allowEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON) + private val exactStringParsing = SQLConf.get.getConf(SQLConf.JSON_EXACT_STRING_PARSING) + /** * This function throws an exception for failed conversion. 
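A rough sketch of the string-preservation behavior this parser change affects, assuming exact string parsing is enabled (the schema and sample record below are invented): a field declared as `StringType` whose JSON value is an object or array keeps its raw text, sliced directly from the underlying bytes when they are available rather than re-serialized through a Jackson generator.

    import org.apache.spark.sql.{Encoders, SparkSession}
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    val spark = SparkSession.builder().getOrCreate()

    val schema = StructType(Seq(StructField("payload", StringType)))
    val records =
      spark.createDataset(Seq("""{"payload": {"a": 1, "b": [2, 3]}}"""))(Encoders.STRING)

    // The payload column holds the nested object as text instead of failing the parse.
    spark.read.schema(schema).json(records).show(truncate = false)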
For empty string on data types * except for string and binary types, this also throws an exception. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 7ee522226e3ec..d982e1f19da0c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -372,6 +372,11 @@ object JsonInferSchema { case (DoubleType, _: DecimalType) | (_: DecimalType, DoubleType) => DoubleType + // This branch is only used by `SchemaOfVariant.mergeSchema` because `JsonInferSchema` never + // produces `FloatType`. + case (FloatType, _: DecimalType) | (_: DecimalType, FloatType) => + DoubleType + case (t1: DecimalType, t2: DecimalType) => val scale = math.max(t1.scale, t2.scale) val range = math.max(t1.precision - t1.scale, t2.precision - t2.scale) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala index 9c150f1f3308f..176e927b2d212 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala @@ -120,33 +120,52 @@ object InjectRuntimeFilter extends Rule[LogicalPlan] with PredicateHelper with J hasHitSelectiveFilter = hasHitSelectiveFilter || isLikelySelective(condition), currentPlan, targetKey) - case ExtractEquiJoinKeys(_, lkeys, rkeys, _, _, left, right, _) => + case ExtractEquiJoinKeys(joinType, lkeys, rkeys, _, _, left, right, _) => // Runtime filters use one side of the [[Join]] to build a set of join key values and prune // the other side of the [[Join]]. It's also OK to use a superset of the join key values - // (ignore null values) to do the pruning. + // (ignore null values) to do the pruning. We can also extract from the other side if the + // join keys are transitive, and the other side always produces a superset output of join + // key values. Any join side always produce a superset output of its corresponding + // join keys, but for transitive join keys we need to check the join type. // We assume other rules have already pushed predicates through join if possible. // So the predicate references won't pass on anymore. if (left.output.exists(_.semanticEquals(targetKey))) { extract(left, AttributeSet.empty, hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = left, targetKey = targetKey).orElse { - // We can also extract from the right side if the join keys are transitive. - lkeys.zip(rkeys).find(_._1.semanticEquals(targetKey)).map(_._2) - .flatMap { newTargetKey => - extract(right, AttributeSet.empty, - hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = right, - targetKey = newTargetKey) - } + // An example that extract from the right side if the join keys are transitive. 
+ // left table: 1, 2, 3 + // right table, 3, 4 + // right outer join output: (3, 3), (null, 4) + // right key output: 3, 4 + if (canPruneLeft(joinType)) { + lkeys.zip(rkeys).find(_._1.semanticEquals(targetKey)).map(_._2) + .flatMap { newTargetKey => + extract(right, AttributeSet.empty, + hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = right, + targetKey = newTargetKey) + } + } else { + None + } } } else if (right.output.exists(_.semanticEquals(targetKey))) { extract(right, AttributeSet.empty, hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = right, targetKey = targetKey).orElse { - // We can also extract from the left side if the join keys are transitive. - rkeys.zip(lkeys).find(_._1.semanticEquals(targetKey)).map(_._2) - .flatMap { newTargetKey => - extract(left, AttributeSet.empty, - hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = left, - targetKey = newTargetKey) - } + // An example that extract from the left side if the join keys are transitive. + // left table: 1, 2, 3 + // right table, 3, 4 + // left outer join output: (1, null), (2, null), (3, 3) + // left key output: 1, 2, 3 + if (canPruneRight(joinType)) { + rkeys.zip(lkeys).find(_._1.semanticEquals(targetKey)).map(_._2) + .flatMap { newTargetKey => + extract(left, AttributeSet.empty, + hasHitFilter = false, hasHitSelectiveFilter = false, currentPlan = left, + targetKey = newTargetKey) + } + } else { + None + } } } else { None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala index 8d7ff4cbf163d..8cc25328ce70b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTE.scala @@ -37,23 +37,19 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{CTE, PLAN_EXPRESSION} * query level. * * @param alwaysInline if true, inline all CTEs in the query plan. + * @param keepDanglingRelations if true, dangling CTE relations will be kept in the original + * `WithCTE` node. */ -case class InlineCTE(alwaysInline: Boolean = false) extends Rule[LogicalPlan] { +case class InlineCTE( + alwaysInline: Boolean = false, + keepDanglingRelations: Boolean = false) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { if (!plan.isInstanceOf[Subquery] && plan.containsPattern(CTE)) { - val cteMap = mutable.SortedMap.empty[Long, (CTERelationDef, Int, mutable.Map[Long, Int])] + val cteMap = mutable.SortedMap.empty[Long, CTEReferenceInfo] buildCTEMap(plan, cteMap) cleanCTEMap(cteMap) - val notInlined = mutable.ArrayBuffer.empty[CTERelationDef] - val inlined = inlineCTE(plan, cteMap, notInlined) - // CTEs in SQL Commands have been inlined by `CTESubstitution` already, so it is safe to add - // WithCTE as top node here. - if (notInlined.isEmpty) { - inlined - } else { - WithCTE(inlined, notInlined.toSeq) - } + inlineCTE(plan, cteMap) } else { plan } @@ -74,22 +70,23 @@ case class InlineCTE(alwaysInline: Boolean = false) extends Rule[LogicalPlan] { * * @param plan The plan to collect the CTEs from * @param cteMap A mutable map that accumulates the CTEs and their reference information by CTE - * ids. The value of the map is tuple whose elements are: - * - The CTE definition - * - The number of incoming references to the CTE. This includes references from - * other CTEs and regular places. 
- * - A mutable inner map that tracks outgoing references (counts) to other CTEs. + * ids. * @param outerCTEId While collecting the map we use this optional CTE id to identify the * current outer CTE. */ - def buildCTEMap( + private def buildCTEMap( plan: LogicalPlan, - cteMap: mutable.Map[Long, (CTERelationDef, Int, mutable.Map[Long, Int])], + cteMap: mutable.Map[Long, CTEReferenceInfo], outerCTEId: Option[Long] = None): Unit = { plan match { case WithCTE(child, cteDefs) => cteDefs.foreach { cteDef => - cteMap(cteDef.id) = (cteDef, 0, mutable.Map.empty.withDefaultValue(0)) + cteMap(cteDef.id) = CTEReferenceInfo( + cteDef = cteDef, + refCount = 0, + outgoingRefs = mutable.Map.empty.withDefaultValue(0), + shouldInline = true + ) } cteDefs.foreach { cteDef => buildCTEMap(cteDef, cteMap, Some(cteDef.id)) @@ -97,11 +94,9 @@ case class InlineCTE(alwaysInline: Boolean = false) extends Rule[LogicalPlan] { buildCTEMap(child, cteMap, outerCTEId) case ref: CTERelationRef => - val (cteDef, refCount, refMap) = cteMap(ref.cteId) - cteMap(ref.cteId) = (cteDef, refCount + 1, refMap) + cteMap(ref.cteId) = cteMap(ref.cteId).withRefCountIncreased(1) outerCTEId.foreach { cteId => - val (_, _, outerRefMap) = cteMap(cteId) - outerRefMap(ref.cteId) += 1 + cteMap(cteId).increaseOutgoingRefCount(ref.cteId, 1) } case _ => @@ -129,15 +124,12 @@ case class InlineCTE(alwaysInline: Boolean = false) extends Rule[LogicalPlan] { * @param cteMap A mutable map that accumulates the CTEs and their reference information by CTE * ids. Needs to be sorted to speed up cleaning. */ - private def cleanCTEMap( - cteMap: mutable.SortedMap[Long, (CTERelationDef, Int, mutable.Map[Long, Int])] - ) = { + private def cleanCTEMap(cteMap: mutable.SortedMap[Long, CTEReferenceInfo]): Unit = { cteMap.keys.toSeq.reverse.foreach { currentCTEId => - val (_, currentRefCount, refMap) = cteMap(currentCTEId) - if (currentRefCount == 0) { - refMap.foreach { case (referencedCTEId, uselessRefCount) => - val (cteDef, refCount, refMap) = cteMap(referencedCTEId) - cteMap(referencedCTEId) = (cteDef, refCount - uselessRefCount, refMap) + val refInfo = cteMap(currentCTEId) + if (refInfo.refCount == 0) { + refInfo.outgoingRefs.foreach { case (referencedCTEId, uselessRefCount) => + cteMap(referencedCTEId) = cteMap(referencedCTEId).withRefCountDecreased(uselessRefCount) } } } @@ -145,30 +137,46 @@ case class InlineCTE(alwaysInline: Boolean = false) extends Rule[LogicalPlan] { private def inlineCTE( plan: LogicalPlan, - cteMap: mutable.Map[Long, (CTERelationDef, Int, mutable.Map[Long, Int])], - notInlined: mutable.ArrayBuffer[CTERelationDef]): LogicalPlan = { + cteMap: mutable.Map[Long, CTEReferenceInfo]): LogicalPlan = { plan match { case WithCTE(child, cteDefs) => + val notInlined = mutable.ArrayBuffer.empty[CTERelationDef] cteDefs.foreach { cteDef => - val (cte, refCount, refMap) = cteMap(cteDef.id) - if (refCount > 0) { - val inlined = cte.copy(child = inlineCTE(cte.child, cteMap, notInlined)) - cteMap(cteDef.id) = (inlined, refCount, refMap) - if (!shouldInline(inlined, refCount)) { - notInlined.append(inlined) - } + val refInfo = cteMap(cteDef.id) + if (refInfo.refCount > 0) { + val newDef = refInfo.cteDef.copy(child = inlineCTE(refInfo.cteDef.child, cteMap)) + val inlineDecision = shouldInline(newDef, refInfo.refCount) + cteMap(cteDef.id) = cteMap(cteDef.id).copy( + cteDef = newDef, shouldInline = inlineDecision + ) + if (!inlineDecision) notInlined += newDef + } else if (keepDanglingRelations) { + notInlined += refInfo.cteDef } } - inlineCTE(child, 
cteMap, notInlined) + val inlined = inlineCTE(child, cteMap) + if (notInlined.isEmpty) { + inlined + } else { + // Retain the not-inlined CTE relations in place. + WithCTE(inlined, notInlined.toSeq) + } case ref: CTERelationRef => - val (cteDef, refCount, _) = cteMap(ref.cteId) - if (shouldInline(cteDef, refCount)) { - if (ref.outputSet == cteDef.outputSet) { - cteDef.child + val refInfo = cteMap(ref.cteId) + if (refInfo.shouldInline) { + if (ref.outputSet == refInfo.cteDef.outputSet) { + refInfo.cteDef.child } else { val ctePlan = DeduplicateRelations( - Join(cteDef.child, cteDef.child, Inner, None, JoinHint(None, None))).children(1) + Join( + refInfo.cteDef.child, + refInfo.cteDef.child, + Inner, + None, + JoinHint(None, None) + ) + ).children(1) val projectList = ref.output.zip(ctePlan.output).map { case (tgtAttr, srcAttr) => if (srcAttr.semanticEquals(tgtAttr)) { tgtAttr @@ -184,13 +192,41 @@ case class InlineCTE(alwaysInline: Boolean = false) extends Rule[LogicalPlan] { case _ if plan.containsPattern(CTE) => plan - .withNewChildren(plan.children.map(child => inlineCTE(child, cteMap, notInlined))) + .withNewChildren(plan.children.map(child => inlineCTE(child, cteMap))) .transformExpressionsWithPruning(_.containsAllPatterns(PLAN_EXPRESSION, CTE)) { case e: SubqueryExpression => - e.withNewPlan(inlineCTE(e.plan, cteMap, notInlined)) + e.withNewPlan(inlineCTE(e.plan, cteMap)) } case _ => plan } } } + +/** + * The bookkeeping information for tracking CTE relation references. + * + * @param cteDef The CTE relation definition + * @param refCount The number of incoming references to this CTE relation. This includes references + * from other CTE relations and regular places. + * @param outgoingRefs A mutable map that tracks outgoing reference counts to other CTE relations. + * @param shouldInline If true, this CTE relation should be inlined in the places that reference it. 
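A simplified, self-contained mirror of this bookkeeping, to make the cleaning step concrete; `RefInfo` and the two CTE ids are stand-ins invented for the sketch, not the classes in the patch:

    import scala.collection.mutable

    final case class RefInfo(refCount: Int, outgoingRefs: mutable.Map[Long, Int])

    // CTE 1 is dangling (never referenced) but references CTE 2 once.
    val cteMap = mutable.SortedMap(
      1L -> RefInfo(0, mutable.Map(2L -> 1)),
      2L -> RefInfo(1, mutable.Map.empty[Long, Int]))

    // Mirror of cleanCTEMap: walk ids in reverse and discount references held by dangling CTEs.
    cteMap.keys.toSeq.reverse.foreach { id =>
      val info = cteMap(id)
      if (info.refCount == 0) {
        info.outgoingRefs.foreach { case (refId, n) =>
          cteMap(refId) = cteMap(refId).copy(refCount = cteMap(refId).refCount - n)
        }
      }
    }
    // CTE 2 now also has refCount 0; it is dropped, or kept in the original WithCTE node
    // when keepDanglingRelations is set.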
+ */ +case class CTEReferenceInfo( + cteDef: CTERelationDef, + refCount: Int, + outgoingRefs: mutable.Map[Long, Int], + shouldInline: Boolean) { + + def withRefCountIncreased(count: Int): CTEReferenceInfo = { + copy(refCount = refCount + count) + } + + def withRefCountDecreased(count: Int): CTEReferenceInfo = { + copy(refCount = refCount - count) + } + + def increaseOutgoingRefCount(cteDefId: Long, count: Int): Unit = { + outgoingRefs(cteDefId) += count + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index ca3c14177e6bd..8de2663a98094 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -218,6 +218,11 @@ object NestedColumnAliasing { case _ => false } + private def canAlias(ev: Expression): Boolean = { + // we can not alias the attr from lambda variable whose expr id is not available + !ev.exists(_.isInstanceOf[NamedLambdaVariable]) && ev.references.size == 1 + } + /** * Returns two types of expressions: * - Root references that are individually accessed @@ -226,11 +231,11 @@ object NestedColumnAliasing { */ private def collectRootReferenceAndExtractValue(e: Expression): Seq[Expression] = e match { case _: AttributeReference => Seq(e) - case GetStructField(_: ExtractValue | _: AttributeReference, _, _) => Seq(e) + case GetStructField(_: ExtractValue | _: AttributeReference, _, _) if canAlias(e) => Seq(e) case GetArrayStructFields(_: MapValues | _: MapKeys | _: ExtractValue | - _: AttributeReference, _, _, _, _) => Seq(e) + _: AttributeReference, _, _, _, _) if canAlias(e) => Seq(e) case es if es.children.nonEmpty => es.children.flatMap(collectRootReferenceAndExtractValue) case _ => Seq.empty } @@ -249,13 +254,8 @@ object NestedColumnAliasing { val otherRootReferences = new mutable.ArrayBuffer[AttributeReference]() exprList.foreach { e => extractor(e).foreach { - // we can not alias the attr from lambda variable whose expr id is not available - case ev: ExtractValue if !ev.exists(_.isInstanceOf[NamedLambdaVariable]) => - if (ev.references.size == 1) { - nestedFieldReferences.append(ev) - } + case ev: ExtractValue => nestedFieldReferences.append(ev) case ar: AttributeReference => otherRootReferences.append(ar) - case _ => // ignore } } val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinCondition.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinCondition.scala new file mode 100644 index 0000000000000..7c41ebea050be --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinCondition.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.expressions.{And, EqualNullSafe, EqualTo, IsNull, Or, PredicateHelper} +import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern.{JOIN, OR} + +/** + * Replaces `t1.id is null and t2.id is null or t1.id = t2.id` to `t1.id <=> t2.id` + * in join condition for better performance. + */ +object OptimizeJoinCondition extends Rule[LogicalPlan] with PredicateHelper { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformWithPruning( + _.containsPattern(JOIN), ruleId) { + case j @ Join(_, _, _, condition, _) if condition.nonEmpty => + val newCondition = condition.map(_.transformWithPruning(_.containsPattern(OR), ruleId) { + case Or(EqualTo(l, r), And(IsNull(c1), IsNull(c2))) + if (l.semanticEquals(c1) && r.semanticEquals(c2)) + || (l.semanticEquals(c2) && r.semanticEquals(c1)) => + EqualNullSafe(l, r) + case Or(And(IsNull(c1), IsNull(c2)), EqualTo(l, r)) + if (l.semanticEquals(c1) && r.semanticEquals(c2)) + || (l.semanticEquals(c2) && r.semanticEquals(c1)) => + EqualNullSafe(l, r) + }) + j.copy(condition = newCondition) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeOneRowPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeOneRowPlan.scala index 83646611578cb..61c08eb8f8b6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeOneRowPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeOneRowPlan.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.trees.TreePattern._ +import org.apache.spark.sql.internal.SQLConf /** * The rule is applied both normal and AQE Optimizer. It optimizes plan using max rows: @@ -31,19 +32,37 @@ import org.apache.spark.sql.catalyst.trees.TreePattern._ * it's grouping only(include the rewritten distinct plan), convert aggregate to project * - if the max rows of the child of aggregate is less than or equal to 1, * set distinct to false in all aggregate expression + * + * Note: the rule should not be applied to streaming source, since the number of rows it sees is + * just for current microbatch. It does not mean the streaming source will ever produce max 1 + * rows during lifetime of the query. Suppose the case: the streaming query has a case where + * batch 0 runs with empty data in streaming source A which triggers the rule with Aggregate, + * and batch 1 runs with several data in streaming source A which no longer trigger the rule. + * In the above scenario, this could fail the query as stateful operator is expected to be planned + * for every batches whereas here it is planned "selectively". 
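The streaming caveat above boils down to a single guard, restated here as a standalone predicate; the parameter names are invented, and the escape hatch is the `SQLConf.STREAMING_OPTIMIZE_ONE_ROW_PLAN_ENABLED` flag read below:

    // Only rewrite when the child is not a streaming source, unless explicitly allowed.
    def isEligible(childIsStreaming: Boolean, enableForStreaming: Boolean): Boolean =
      enableForStreaming || !childIsStreaming

    // Batch 0 of a streaming query may satisfy maxRows <= 1 while batch 1 does not, so a
    // per-microbatch rewrite would plan the stateful operator only for some batches.
    assert(isEligible(childIsStreaming = false, enableForStreaming = false))  // batch query
    assert(!isEligible(childIsStreaming = true, enableForStreaming = false))  // streaming source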
*/ object OptimizeOneRowPlan extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { + val enableForStreaming = conf.getConf(SQLConf.STREAMING_OPTIMIZE_ONE_ROW_PLAN_ENABLED) + plan.transformUpWithPruning(_.containsAnyPattern(SORT, AGGREGATE), ruleId) { - case Sort(_, _, child) if child.maxRows.exists(_ <= 1L) => child - case Sort(_, false, child) if child.maxRowsPerPartition.exists(_ <= 1L) => child - case agg @ Aggregate(_, _, child) if agg.groupOnly && child.maxRows.exists(_ <= 1L) => + case Sort(_, _, child) if child.maxRows.exists(_ <= 1L) && + isChildEligible(child, enableForStreaming) => child + case Sort(_, false, child) if child.maxRowsPerPartition.exists(_ <= 1L) && + isChildEligible(child, enableForStreaming) => child + case agg @ Aggregate(_, _, child) if agg.groupOnly && child.maxRows.exists(_ <= 1L) && + isChildEligible(child, enableForStreaming) => Project(agg.aggregateExpressions, child) - case agg: Aggregate if agg.child.maxRows.exists(_ <= 1L) => + case agg: Aggregate if agg.child.maxRows.exists(_ <= 1L) && + isChildEligible(agg.child, enableForStreaming) => agg.transformExpressions { case aggExpr: AggregateExpression if aggExpr.isDistinct => aggExpr.copy(isDistinct = false) } } } + + private def isChildEligible(child: LogicalPlan, enableForStreaming: Boolean): Boolean = { + enableForStreaming || !child.isStreaming + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index cacde9f5a7122..95923a1419513 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -20,15 +20,14 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable import org.apache.spark.SparkException -import org.apache.spark.internal.LogKey._ -import org.apache.spark.internal.MDC +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{RepartitionOperation, _} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.trees.AlwaysProcess import org.apache.spark.sql.catalyst.trees.TreePattern._ @@ -84,6 +83,7 @@ abstract class Optimizer(catalogManager: CatalogManager) PushDownPredicates, PushDownLeftSemiAntiJoin, PushLeftSemiLeftAntiThroughJoin, + OptimizeJoinCondition, LimitPushDown, LimitPushDownThroughWindow, ColumnPruning, @@ -116,7 +116,6 @@ abstract class Optimizer(catalogManager: CatalogManager) BooleanSimplification, SimplifyConditionals, PushFoldableIntoBranches, - RemoveDispensableExpressions, SimplifyBinaryComparison, ReplaceNullWithFalseInPredicate, PruneFilters, @@ -445,7 +444,7 @@ abstract class Optimizer(catalogManager: CatalogManager) val excludedRules = excludedRulesConf.filter { ruleName => val nonExcludable = nonExcludableRules.contains(ruleName) if (nonExcludable) { - logWarning(log"Optimization rule '${MDC(RULE_NAME, ruleName)}' " + + logWarning(log"Optimization rule '${MDC(LogKeys.RULE_NAME, ruleName)}' " + log"was not 
excluded from the optimizer because this rule is a non-excludable rule.") } !nonExcludable @@ -457,7 +456,7 @@ abstract class Optimizer(catalogManager: CatalogManager) val filteredRules = batch.rules.filter { rule => val exclude = excludedRules.contains(rule.ruleName) if (exclude) { - logInfo(log"Optimization rule '${MDC(RULE_NAME, rule.ruleName)}' " + + logInfo(log"Optimization rule '${MDC(LogKeys.RULE_NAME, rule.ruleName)}' " + log"is excluded from the optimizer.") } !exclude @@ -467,7 +466,7 @@ abstract class Optimizer(catalogManager: CatalogManager) } else if (filteredRules.nonEmpty) { Some(Batch(batch.name, batch.strategy, filteredRules: _*)) } else { - logInfo(log"Optimization batch '${MDC(RULE_BATCH_NAME, batch.name)}' " + + logInfo(log"Optimization batch '${MDC(LogKeys.BATCH_NAME, batch.name)}' " + log"is excluded from the optimizer as all enclosed rules have been excluded.") None } @@ -1238,7 +1237,12 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper { case _: Attribute | _: OuterReference => true case _ if e.foldable => true // PythonUDF is handled by the rule ExtractPythonUDFs - case _: PythonUDF => true + case _: PythonUDF => + if (conf.getConf(SQLConf.AVOID_COLLAPSE_UDF_WITH_EXPENSIVE_EXPR)) { + e.children.forall(isCheap) + } else { + true + } // Alias and ExtractValue are very cheap. case _: Alias | _: ExtractValue => e.children.forall(isCheap) case _ => false @@ -1768,6 +1772,10 @@ object PushPredicateThroughNonJoin extends Rule[LogicalPlan] with PredicateHelpe val aliasMap = getAliasMap(project) project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild)) + // We can push down deterministic predicate through Aggregate, including throwable predicate. + // If we can push down a filter through Aggregate, it means the filter only references the + // grouping keys or constants. The Aggregate operator can't reduce distinct values of grouping + // keys so the filter won't see any new data after push down. case filter @ Filter(condition, aggregate: Aggregate) if aggregate.aggregateExpressions.forall(_.deterministic) && aggregate.groupingExpressions.nonEmpty => @@ -1777,8 +1785,8 @@ object PushPredicateThroughNonJoin extends Rule[LogicalPlan] with PredicateHelpe // attributes produced by the aggregate operator's child operator. val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { cond => val replaced = replaceAlias(cond, aliasMap) - cond.deterministic && !cond.throwable && - cond.references.nonEmpty && replaced.references.subsetOf(aggregate.child.outputSet) + cond.deterministic && cond.references.nonEmpty && + replaced.references.subsetOf(aggregate.child.outputSet) } if (pushDown.nonEmpty) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala index fd7a87087ddd2..832af340c3397 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala @@ -58,13 +58,15 @@ abstract class PropagateEmptyRelationBase extends Rule[LogicalPlan] with CastSup case _ => false } - protected def empty(plan: LogicalPlan): LocalRelation = + protected def empty(plan: LogicalPlan): LogicalPlan = LocalRelation(plan.output, data = Seq.empty, isStreaming = plan.isStreaming) // Construct a project list from plan's output, while the value is always NULL. 
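A small end-to-end illustration of the Aggregate push-down case documented above; the DataFrame and column names are made up for the sketch:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().getOrCreate()
    import spark.implicits._

    val sales = spark.range(100).selectExpr("id % 10 AS store", "id AS amount")

    // The predicate only references the grouping key, so it can be evaluated below the
    // Aggregate: grouping never produces key values that were absent from its input.
    val filteredAfter  = sales.groupBy("store").sum("amount").filter($"store" > 5)
    val filteredBefore = sales.filter($"store" > 5).groupBy("store").sum("amount")
    // After PushPredicateThroughNonJoin both queries optimize to the same plan shape.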
private def nullValueProjectList(plan: LogicalPlan): Seq[NamedExpression] = plan.output.map{ a => Alias(cast(Literal(null), a.dataType), a.name)(a.exprId) } + protected def canExecuteWithoutJoin(plan: LogicalPlan): Boolean = true + protected def commonApplyFunc: PartialFunction[LogicalPlan, LogicalPlan] = { case p: Union if p.children.exists(isEmpty) => val newChildren = p.children.filterNot(isEmpty) @@ -111,18 +113,19 @@ abstract class PropagateEmptyRelationBase extends Rule[LogicalPlan] with CastSup case LeftSemi if isRightEmpty | isFalseCondition => empty(p) case LeftAnti if isRightEmpty | isFalseCondition => p.left case FullOuter if isLeftEmpty && isRightEmpty => empty(p) - case LeftOuter | FullOuter if isRightEmpty => + case LeftOuter | FullOuter if isRightEmpty && canExecuteWithoutJoin(p.left) => Project(p.left.output ++ nullValueProjectList(p.right), p.left) case RightOuter if isRightEmpty => empty(p) - case RightOuter | FullOuter if isLeftEmpty => + case RightOuter | FullOuter if isLeftEmpty && canExecuteWithoutJoin(p.right) => Project(nullValueProjectList(p.left) ++ p.right.output, p.right) - case LeftOuter if isFalseCondition => + case LeftOuter if isFalseCondition && canExecuteWithoutJoin(p.left) => Project(p.left.output ++ nullValueProjectList(p.right), p.left) - case RightOuter if isFalseCondition => + case RightOuter if isFalseCondition && canExecuteWithoutJoin(p.right) => Project(nullValueProjectList(p.left) ++ p.right.output, p.right) case _ => p } - } else if (joinType == LeftSemi && conditionOpt.isEmpty && nonEmpty(p.right)) { + } else if (joinType == LeftSemi && conditionOpt.isEmpty && + nonEmpty(p.right) && canExecuteWithoutJoin(p.left)) { p.left } else if (joinType == LeftAnti && conditionOpt.isEmpty && nonEmpty(p.right)) { empty(p) @@ -130,8 +133,10 @@ abstract class PropagateEmptyRelationBase extends Rule[LogicalPlan] with CastSup p } - // the only case can be matched here is that LogicalQueryStage is empty - case p: LeafNode if !p.isInstanceOf[LocalRelation] && isEmpty(p) => empty(p) + // Only replace a query stage if it would lead to a reduction of operators. !p.isDirectStage + // means the physical node it contains is partial aggregate instead of QueryStageExec, which + // is exactly what we want to propagate empty relation. 
+ case p: LogicalQueryStage if isEmpty(p) && !p.isDirectStage => empty(p) case p: UnaryNode if p.children.nonEmpty && p.children.forall(isEmpty) => p match { case _: Project => empty(p) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala index 772382f5f1e12..2cda1142299ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.SparkIllegalArgumentException -import org.apache.spark.internal.LogKey.{SQL_TEXT, UNSUPPORTED_EXPRESSION} +import org.apache.spark.internal.LogKeys.{SQL_TEXT, UNSUPPORTED_EXPR} import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.expressions.{And, ArrayExists, ArrayFilter, CaseWhen, EqualNullSafe, Expression, If, In, InSet, LambdaFunction, Literal, MapFilter, Not, Or} import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} @@ -141,7 +141,7 @@ object ReplaceNullWithFalseInPredicate extends Rule[LogicalPlan] { "expr" -> e.sql)) } else { val message = log"Expected a Boolean type expression in replaceNullWithFalse, " + - log"but got the type `${MDC(UNSUPPORTED_EXPRESSION, e.dataType.catalogString)}` " + + log"but got the type `${MDC(UNSUPPORTED_EXPR, e.dataType.catalogString)}` " + log"in `${MDC(SQL_TEXT, e.sql)}`." logWarning(message) e diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala index 934eadbcee551..393a66f7c1e4f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala @@ -21,36 +21,65 @@ import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, PlanHelper, Project} +import org.apache.spark.sql.catalyst.planning.PhysicalAggregation +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, PlanHelper, Project} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, WITH_EXPRESSION} +import org.apache.spark.sql.internal.SQLConf /** * Rewrites the `With` expressions by adding a `Project` to pre-evaluate the common expressions, or * just inline them if they are cheap. * + * Since this rule can introduce new `Project` operators, it is advised to run [[CollapseProject]] + * after this rule. + * * Note: For now we only use `With` in a few `RuntimeReplaceable` expressions. If we expand its * usage, we should support aggregate/window functions as well. 
*/ object RewriteWithExpression extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { - plan.transformDownWithSubqueriesAndPruning(_.containsPattern(WITH_EXPRESSION)) { + plan.transformUpWithSubqueriesAndPruning(_.containsPattern(WITH_EXPRESSION)) { + // For aggregates, separate the computation of the aggregations themselves from the final + // result by moving the final result computation into a projection above it. This prevents + // this rule from producing an invalid Aggregate operator. + case p @ PhysicalAggregation( + groupingExpressions, aggregateExpressions, resultExpressions, child) + if p.expressions.exists(_.containsPattern(WITH_EXPRESSION)) => + // PhysicalAggregation returns aggregateExpressions as attribute references, which we change + // to aliases so that they can be referred to by resultExpressions. + val aggExprs = aggregateExpressions.map( + ae => Alias(ae, "_aggregateexpression")(ae.resultId)) + val aggExprIds = aggExprs.map(_.exprId).toSet + val resExprs = resultExpressions.map(_.transform { + case a: AttributeReference if aggExprIds.contains(a.exprId) => + a.withName("_aggregateexpression") + }.asInstanceOf[NamedExpression]) + // Rewrite the projection and the aggregate separately and then piece them together. + val agg = Aggregate(groupingExpressions, groupingExpressions ++ aggExprs, child) + val rewrittenAgg = applyInternal(agg) + val proj = Project(resExprs, rewrittenAgg) + applyInternal(proj) case p if p.expressions.exists(_.containsPattern(WITH_EXPRESSION)) => - val inputPlans = p.children.toArray - var newPlan: LogicalPlan = p.mapExpressions { expr => - rewriteWithExprAndInputPlans(expr, inputPlans) - } - newPlan = newPlan.withNewChildren(inputPlans.toIndexedSeq) - // Since we add extra Projects with extra columns to pre-evaluate the common expressions, - // the current operator may have extra columns if it inherits the output columns from its - // child, and we need to project away the extra columns to keep the plan schema unchanged. - assert(p.output.length <= newPlan.output.length) - if (p.output.length < newPlan.output.length) { - assert(p.outputSet.subsetOf(newPlan.outputSet)) - Project(p.output, newPlan) - } else { - newPlan - } + applyInternal(p) + } + } + + private def applyInternal(p: LogicalPlan): LogicalPlan = { + val inputPlans = p.children.toArray + var newPlan: LogicalPlan = p.mapExpressions { expr => + rewriteWithExprAndInputPlans(expr, inputPlans) + } + newPlan = newPlan.withNewChildren(inputPlans.toIndexedSeq) + // Since we add extra Projects with extra columns to pre-evaluate the common expressions, + // the current operator may have extra columns if it inherits the output columns from its + // child, and we need to project away the extra columns to keep the plan schema unchanged. + assert(p.output.length <= newPlan.output.length) + if (p.output.length < newPlan.output.length) { + assert(p.outputSet.subsetOf(newPlan.outputSet)) + Project(p.output, newPlan) + } else { + newPlan } } @@ -93,7 +122,12 @@ object RewriteWithExpression extends Rule[LogicalPlan] { // if it's ref count is 1. 
refToExpr(id) = child } else { - val alias = Alias(child, s"_common_expr_$index")() + val aliasName = if (SQLConf.get.getConf(SQLConf.USE_COMMON_EXPR_ID_FOR_ALIAS)) { + s"_common_expr_${id.id}" + } else { + s"_common_expr_$index" + } + val alias = Alias(child, aliasName)() val fakeProj = Project(Seq(alias), inputPlans(childProjectionIndex)) if (PlanHelper.specialExpressionsInUnsupportedOperator(fakeProj).nonEmpty) { // We have to inline the common expression if it cannot be put in a Project. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 32700f176f25a..1750a0e275732 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -738,18 +738,19 @@ object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper { } else { pattern match { case startsWith(prefix) => - Some(StartsWith(input, Literal(prefix))) + Some(StartsWith(input, Literal.create(prefix, input.dataType))) case endsWith(postfix) => - Some(EndsWith(input, Literal(postfix))) + Some(EndsWith(input, Literal.create(postfix, input.dataType))) // 'a%a' pattern is basically same with 'a%' && '%a'. // However, the additional `Length` condition is required to prevent 'a' match 'a%a'. - case startsAndEndsWith(prefix, postfix) => - Some(And(GreaterThanOrEqual(Length(input), Literal(prefix.length + postfix.length)), - And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix))))) + case startsAndEndsWith(prefix, postfix) => Some( + And(GreaterThanOrEqual(Length(input), Literal.create(prefix.length + postfix.length)), + And(StartsWith(input, Literal.create(prefix, input.dataType)), + EndsWith(input, Literal.create(postfix, input.dataType))))) case contains(infix) => - Some(Contains(input, Literal(infix))) + Some(Contains(input, Literal.create(infix, input.dataType))) case equalTo(str) => - Some(EqualTo(input, Literal(str))) + Some(EqualTo(input, Literal.create(str, input.dataType))) case _ => None } } @@ -785,7 +786,7 @@ object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper { def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressionsWithPruning( _.containsPattern(LIKE_FAMLIY), ruleId) { - case l @ Like(input, Literal(pattern, StringType), escapeChar) => + case l @ Like(input, Literal(pattern, _: StringType), escapeChar) => if (pattern == null) { // If pattern is null, return null value directly, since "col like null" == null. Literal(null, BooleanType) @@ -1023,7 +1024,7 @@ object FoldablePropagation extends Rule[LogicalPlan] { plan } else { plan transformExpressions { - case a: AttributeReference if foldableMap.contains(a) => foldableMap(a) + case a: AttributeReference if foldableMap.contains(a) => foldableMap(a).withName(a.name) } } } @@ -1088,17 +1089,6 @@ object SimplifyCasts extends Rule[LogicalPlan] { } -/** - * Removes nodes that are not necessary. - */ -object RemoveDispensableExpressions extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressionsWithPruning( - _.containsPattern(UNARY_POSITIVE), ruleId) { - case UnaryPositive(child) => child - } -} - - /** * Removes the inner case conversion expressions that are unnecessary because * the inner conversion is overwritten by the outer one. 
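To make the `LikeSimplification` change concrete, a minimal sketch using the catalyst classes touched by the patch; the helper and the attribute below are invented for illustration:

    import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, StartsWith}
    import org.apache.spark.sql.types.StringType

    // 'col LIKE "abc%"' simplifies to StartsWith, and the pattern literal now inherits the
    // input's exact string type (relevant for collated strings) instead of plain StringType.
    def simplifyPrefixLike(input: Expression, prefix: String): Expression =
      StartsWith(input, Literal.create(prefix, input.dataType))

    val c = AttributeReference("c", StringType)()
    val simplified = simplifyPrefixLike(c, "abc")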
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala index 92ac7599a8ff7..48753fbfe3267 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, convertSpecialTimestamp, convertSpecialTimestampNTZ, instantToMicros, localDateTimeToMicros} import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLExpr import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -151,11 +152,11 @@ case class ReplaceCurrentLike(catalogManager: CatalogManager) extends Rule[Logic plan.transformAllExpressionsWithPruning(_.containsPattern(CURRENT_LIKE)) { case CurrentDatabase() => - Literal.create(currentNamespace, StringType) + Literal.create(currentNamespace, SQLConf.get.defaultStringType) case CurrentCatalog() => - Literal.create(currentCatalog, StringType) + Literal.create(currentCatalog, SQLConf.get.defaultStringType) case CurrentUser() => - Literal.create(currentUser, StringType) + Literal.create(currentUser, SQLConf.get.defaultStringType) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index 655b7c3455b1e..9fc4873c248b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -20,15 +20,16 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec import scala.util.control.NonFatal -import org.apache.spark.internal.LogKey.JOIN_CONDITION -import org.apache.spark.internal.MDC +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{HASH_JOIN_KEYS, JOIN_CONDITION} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression -import org.apache.spark.sql.catalyst.planning.{ExtractEquiJoinKeys, ExtractFiltersAndInnerJoins} +import org.apache.spark.sql.catalyst.planning.{ExtractEquiJoinKeys, ExtractFiltersAndInnerJoins, ExtractSingleColumnNullAwareAntiJoin} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.trees.TreePattern._ +import org.apache.spark.sql.catalyst.util.UnsafeRowUtils import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.Utils @@ -286,61 +287,55 @@ case object BuildRight extends BuildSide case object BuildLeft extends BuildSide -trait JoinSelectionHelper { +trait JoinSelectionHelper extends Logging { def getBroadcastBuildSide( - left: LogicalPlan, - right: LogicalPlan, - joinType: JoinType, - hint: JoinHint, + join: Join, hintOnly: Boolean, conf: SQLConf): Option[BuildSide] = { val buildLeft = if (hintOnly) { - hintToBroadcastLeft(hint) + hintToBroadcastLeft(join.hint) } else { - canBroadcastBySize(left, conf) && !hintToNotBroadcastLeft(hint) + canBroadcastBySize(join.left, conf) && 
!hintToNotBroadcastLeft(join.hint) } val buildRight = if (hintOnly) { - hintToBroadcastRight(hint) + hintToBroadcastRight(join.hint) } else { - canBroadcastBySize(right, conf) && !hintToNotBroadcastRight(hint) + canBroadcastBySize(join.right, conf) && !hintToNotBroadcastRight(join.hint) } getBuildSide( - canBuildBroadcastLeft(joinType) && buildLeft, - canBuildBroadcastRight(joinType) && buildRight, - left, - right + canBuildBroadcastLeft(join.joinType) && buildLeft, + canBuildBroadcastRight(join.joinType) && buildRight, + join.left, + join.right ) } def getShuffleHashJoinBuildSide( - left: LogicalPlan, - right: LogicalPlan, - joinType: JoinType, - hint: JoinHint, + join: Join, hintOnly: Boolean, conf: SQLConf): Option[BuildSide] = { val buildLeft = if (hintOnly) { - hintToShuffleHashJoinLeft(hint) + hintToShuffleHashJoinLeft(join.hint) } else { - hintToPreferShuffleHashJoinLeft(hint) || - (!conf.preferSortMergeJoin && canBuildLocalHashMapBySize(left, conf) && - muchSmaller(left, right, conf)) || + hintToPreferShuffleHashJoinLeft(join.hint) || + (!conf.preferSortMergeJoin && canBuildLocalHashMapBySize(join.left, conf) && + muchSmaller(join.left, join.right, conf)) || forceApplyShuffledHashJoin(conf) } val buildRight = if (hintOnly) { - hintToShuffleHashJoinRight(hint) + hintToShuffleHashJoinRight(join.hint) } else { - hintToPreferShuffleHashJoinRight(hint) || - (!conf.preferSortMergeJoin && canBuildLocalHashMapBySize(right, conf) && - muchSmaller(right, left, conf)) || + hintToPreferShuffleHashJoinRight(join.hint) || + (!conf.preferSortMergeJoin && canBuildLocalHashMapBySize(join.right, conf) && + muchSmaller(join.right, join.left, conf)) || forceApplyShuffledHashJoin(conf) } getBuildSide( - canBuildShuffledHashJoinLeft(joinType) && buildLeft, - canBuildShuffledHashJoinRight(joinType) && buildRight, - left, - right + canBuildShuffledHashJoinLeft(join.joinType) && buildLeft, + canBuildShuffledHashJoinRight(join.joinType) && buildRight, + join.left, + join.right ) } @@ -400,11 +395,32 @@ trait JoinSelectionHelper { } } - def canPlanAsBroadcastHashJoin(join: Join, conf: SQLConf): Boolean = { - getBroadcastBuildSide(join.left, join.right, join.joinType, - join.hint, hintOnly = true, conf).isDefined || - getBroadcastBuildSide(join.left, join.right, join.joinType, - join.hint, hintOnly = false, conf).isDefined + protected def hashJoinSupported + (leftKeys: Seq[Expression], rightKeys: Seq[Expression]): Boolean = { + val result = leftKeys.concat(rightKeys).forall(e => UnsafeRowUtils.isBinaryStable(e.dataType)) + if (!result) { + val keysNotSupportingHashJoin = leftKeys.concat(rightKeys).filterNot( + e => UnsafeRowUtils.isBinaryStable(e.dataType)) + logWarning(log"Hash based joins are not supported due to joining on keys that don't " + + log"support binary equality. 
Keys not supporting hash joins: " + + log"${ + MDC(HASH_JOIN_KEYS, keysNotSupportingHashJoin.map( + e => e.toString + " due to DataType: " + e.dataType.typeName).mkString(", ")) + }") + } + result + } + + def canPlanAsBroadcastHashJoin(join: Join, conf: SQLConf): Boolean = join match { + case ExtractEquiJoinKeys(_, leftKeys, rightKeys, _, _, _, _, _) => + val hashJoinSupport = hashJoinSupported(leftKeys, rightKeys) + val noShufflePlannedBefore = + !hashJoinSupport || getShuffleHashJoinBuildSide(join, hintOnly = true, conf).isEmpty + getBroadcastBuildSide(join, hintOnly = true, conf).isDefined || + (noShufflePlannedBefore && + getBroadcastBuildSide(join, hintOnly = false, conf).isDefined) + case ExtractSingleColumnNullAwareAntiJoin(_, _) => true + case _ => false } def canPruneLeft(joinType: JoinType): Boolean = joinType match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSqlParser.scala index 2d6fabaaef68a..1c477964a6890 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AbstractSqlParser.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.sql.catalyst.parser +import org.antlr.v4.runtime.ParserRuleContext + import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.ParserUtils.withOrigin @@ -30,44 +32,56 @@ abstract class AbstractSqlParser extends AbstractParser with ParserInterface { override def astBuilder: AstBuilder /** Creates Expression for a given SQL string. */ - override def parseExpression(sqlText: String): Expression = parse(sqlText) { parser => - val ctx = parser.singleExpression() - withOrigin(ctx, Some(sqlText)) { - astBuilder.visitSingleExpression(ctx) + override def parseExpression(sqlText: String): Expression = + parse(sqlText) { parser => + val ctx = parser.singleExpression() + withErrorHandling(ctx, Some(sqlText)) { + astBuilder.visitSingleExpression(ctx) + } } - } /** Creates TableIdentifier for a given SQL string. */ - override def parseTableIdentifier(sqlText: String): TableIdentifier = parse(sqlText) { parser => - astBuilder.visitSingleTableIdentifier(parser.singleTableIdentifier()) - } + override def parseTableIdentifier(sqlText: String): TableIdentifier = + parse(sqlText) { parser => + val ctx = parser.singleTableIdentifier() + withErrorHandling(ctx, Some(sqlText)) { + astBuilder.visitSingleTableIdentifier(ctx) + } + } /** Creates FunctionIdentifier for a given SQL string. */ override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { parse(sqlText) { parser => - astBuilder.visitSingleFunctionIdentifier(parser.singleFunctionIdentifier()) + val ctx = parser.singleFunctionIdentifier() + withErrorHandling(ctx, Some(sqlText)) { + astBuilder.visitSingleFunctionIdentifier(ctx) + } } } /** Creates a multi-part identifier for a given SQL string */ override def parseMultipartIdentifier(sqlText: String): Seq[String] = { parse(sqlText) { parser => - astBuilder.visitSingleMultipartIdentifier(parser.singleMultipartIdentifier()) + val ctx = parser.singleMultipartIdentifier() + withErrorHandling(ctx, Some(sqlText)) { + astBuilder.visitSingleMultipartIdentifier(ctx) + } } } /** Creates LogicalPlan for a given SQL string of query. 
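Going back to the join-selection change above, the essence of the new guard can be checked in isolation; `UnsafeRowUtils.isBinaryStable` is the helper used in the patch, while the key types here are only examples:

    import org.apache.spark.sql.catalyst.util.UnsafeRowUtils
    import org.apache.spark.sql.types.{DataType, IntegerType, StringType}

    // Hash-based joins need join keys whose binary form is stable under equality; a
    // collation-aware string type, for instance, can compare equal with different bytes.
    val keyTypes: Seq[DataType] = Seq(IntegerType, StringType)
    val hashJoinOk = keyTypes.forall(UnsafeRowUtils.isBinaryStable)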
*/ - override def parseQuery(sqlText: String): LogicalPlan = parse(sqlText) { parser => - val ctx = parser.query() - withOrigin(ctx, Some(sqlText)) { - astBuilder.visitQuery(ctx) + override def parseQuery(sqlText: String): LogicalPlan = + parse(sqlText) { parser => + val ctx = parser.query() + withErrorHandling(ctx, Some(sqlText)) { + astBuilder.visitQuery(ctx) + } } - } /** Creates LogicalPlan for a given SQL string. */ override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser => val ctx = parser.singleStatement() - withOrigin(ctx, Some(sqlText)) { + withErrorHandling(ctx, Some(sqlText)) { astBuilder.visitSingleStatement(ctx) match { case plan: LogicalPlan => plan case _ => @@ -76,4 +90,28 @@ abstract class AbstractSqlParser extends AbstractParser with ParserInterface { } } } + + /** Creates [[CompoundBody]] for a given SQL script string. */ + override def parseScript(sqlScriptText: String): CompoundBody = parse(sqlScriptText) { parser => + val ctx = parser.compoundOrSingleStatement() + withErrorHandling(ctx, Some(sqlScriptText)) { + astBuilder.visitCompoundOrSingleStatement(ctx) match { + case body: CompoundBody => body + case _ => + val position = Origin(None, None) + throw QueryParsingErrors.sqlStatementUnsupportedError(sqlScriptText, position) + } + } + } + + def withErrorHandling[T](ctx: ParserRuleContext, sqlText: Option[String])(toResult: => T): T = { + withOrigin(ctx, sqlText) { + try { + toResult + } catch { + case so: StackOverflowError => + throw QueryParsingErrors.parserStackOverflow(ctx) + } + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 34672485ddc9b..dc43bd1636594 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -20,21 +20,20 @@ package org.apache.spark.sql.catalyst.parser import java.util.Locale import java.util.concurrent.TimeUnit -import scala.collection.mutable.{ArrayBuffer, Set} +import scala.collection.mutable.{ArrayBuffer, ListBuffer, Set} import scala.jdk.CollectionConverters._ import scala.util.{Left, Right} import org.antlr.v4.runtime.{ParserRuleContext, Token} import org.antlr.v4.runtime.misc.Interval import org.antlr.v4.runtime.tree.{ParseTree, RuleNode, TerminalNode} -import org.apache.commons.codec.DecoderException -import org.apache.commons.codec.binary.Hex import org.apache.spark.{SparkArithmeticException, SparkException, SparkIllegalArgumentException, SparkThrowable} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PARTITION_SPECIFICATION +import org.apache.spark.internal.LogKeys.PARTITION_SPECIFICATION import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableIdentifier} import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FUNC_ALIAS import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, ClusterBySpec} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AnyValue, First, Last} @@ -50,7 +49,9 @@ import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition import org.apache.spark.sql.connector.expressions.{ApplyTransform, BucketTransform, DaysTransform, Expression => 
V2Expression, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, Transform, YearsTransform} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryParsingErrors} +import org.apache.spark.sql.errors.DataTypeErrors.toSQLStmt import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LEGACY_BANG_EQUALS_NOT import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} import org.apache.spark.util.ArrayImplicits._ @@ -114,6 +115,45 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } } + override def visitCompoundOrSingleStatement( + ctx: CompoundOrSingleStatementContext): CompoundBody = withOrigin(ctx) { + Option(ctx.singleCompoundStatement()).map { s => + visit(s).asInstanceOf[CompoundBody] + }.getOrElse { + val logicalPlan = visitSingleStatement(ctx.singleStatement()) + CompoundBody(Seq(SingleStatement(parsedPlan = logicalPlan))) + } + } + + override def visitSingleCompoundStatement(ctx: SingleCompoundStatementContext): CompoundBody = { + visit(ctx.beginEndCompoundBlock()).asInstanceOf[CompoundBody] + } + + private def visitCompoundBodyImpl(ctx: CompoundBodyContext): CompoundBody = { + val buff = ListBuffer[CompoundPlanStatement]() + ctx.compoundStatements.forEach(compoundStatement => { + buff += visit(compoundStatement).asInstanceOf[CompoundPlanStatement] + }) + CompoundBody(buff.toSeq) + } + + override def visitBeginEndCompoundBlock(ctx: BeginEndCompoundBlockContext): CompoundBody = { + visitCompoundBodyImpl(ctx.compoundBody()) + } + + override def visitCompoundBody(ctx: CompoundBodyContext): CompoundBody = { + visitCompoundBodyImpl(ctx) + } + + override def visitCompoundStatement(ctx: CompoundStatementContext): CompoundPlanStatement = + withOrigin(ctx) { + Option(ctx.statement()).map {s => + SingleStatement(parsedPlan = visit(s).asInstanceOf[LogicalPlan]) + }.getOrElse { + visit(ctx.beginEndCompoundBlock()).asInstanceOf[CompoundPlanStatement] + } + } + override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) { visit(ctx.statement).asInstanceOf[LogicalPlan] } @@ -365,6 +405,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { val cols = Option(ctx.identifierList()).map(visitIdentifierList).getOrElse(Nil) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) + blockBang(ctx.errorCapturingNot()) + if (ctx.EXISTS != null) { invalidStatement("INSERT INTO ... 
IF NOT EXISTS", ctx) } @@ -381,6 +423,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { val cols = Option(ctx.identifierList()).map(visitIdentifierList).getOrElse(Nil) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) + blockBang(ctx.errorCapturingNot()) + val dynamicPartitionKeys: Map[String, Option[String]] = partitionKeys.filter(_._2.isEmpty) if (ctx.EXISTS != null && dynamicPartitionKeys.nonEmpty) { operationNotAllowed("IF NOT EXISTS with dynamic partitions: " + @@ -455,6 +499,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } override def visitMergeIntoTable(ctx: MergeIntoTableContext): LogicalPlan = withOrigin(ctx) { + val withSchemaEvolution = ctx.EVOLUTION() != null val targetTable = createUnresolvedRelation(ctx.target) val targetTableAlias = getTableAliasWithoutColumnAlias(ctx.targetAlias, "MERGE") val aliasedTarget = targetTableAlias.map(SubqueryAlias(_, targetTable)).getOrElse(targetTable) @@ -549,7 +594,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { mergeCondition, matchedActions.toSeq, notMatchedActions.toSeq, - notMatchedBySourceActions.toSeq) + notMatchedBySourceActions.toSeq, + withSchemaEvolution) } /** @@ -843,7 +889,9 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { // Create the attributes. val (attributes, schemaLess) = if (transformClause.colTypeList != null) { // Typed return columns. - (DataTypeUtils.toAttributes(createSchema(transformClause.colTypeList)), false) + val schema = createSchema(transformClause.colTypeList) + val replacedSchema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(schema) + (DataTypeUtils.toAttributes(replacedSchema), false) } else if (transformClause.identifierSeq != null) { // Untyped return columns. val attrs = visitIdentifierSeq(transformClause.identifierSeq).map { name => @@ -1630,6 +1678,20 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } partitionByExpressions = p.partition.asScala.map(expression).toSeq orderByExpressions = p.sortItem.asScala.map(visitSortItem).toSeq + def invalidPartitionOrOrderingExpression(clause: String): String = { + "The table function call includes a table argument with an invalid " + + s"partitioning/ordering specification: the $clause clause included multiple " + + "expressions without parentheses surrounding them; please add parentheses around " + + "these expressions and then retry the query again" + } + validate( + Option(p.invalidMultiPartitionExpression).isEmpty, + message = invalidPartitionOrOrderingExpression("PARTITION BY"), + ctx = p.invalidMultiPartitionExpression) + validate( + Option(p.invalidMultiSortItem).isEmpty, + message = invalidPartitionOrOrderingExpression("ORDER BY"), + ctx = p.invalidMultiSortItem) } validate( !(withSinglePartition && partitionByExpressions.nonEmpty), @@ -1864,6 +1926,25 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { exceptCols.toSeq) } + /** + * Check for the inappropriate usage of the '!' token. + * '!' used to be a synonym for 'NOT' in the lexer, but that was too general. + * '!' should only be a synonym for 'NOT' when used as a prefix in a logical operation. + * We do that now explicitly. 
+ */ + def blockBang(ctx: ErrorCapturingNotContext): ErrorCapturingNotContext = { + val tolerateBang = conf.getConf(LEGACY_BANG_EQUALS_NOT) + if (ctx != null && ctx.BANG() != null && !tolerateBang) { + withOrigin(ctx) { + throw new ParseException( + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + messageParameters = Map("clause" -> toSQLStmt("!")), + ctx) + } + } + ctx + } + /** * Create an aliased expression if an alias is specified. Both single and multi-aliases are * supported. @@ -2003,9 +2084,12 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { */ private def withPredicate(e: Expression, ctx: PredicateContext): Expression = withOrigin(ctx) { // Invert a predicate if it has a valid NOT clause. - def invertIfNotDefined(e: Expression): Expression = ctx.NOT match { - case null => e - case not => Not(e) + def invertIfNotDefined(e: Expression): Expression = { + val withNot = blockBang(ctx.errorCapturingNot) + withNot match { + case null => e + case _ => Not(e) + } } def getValueExpressions(e: Expression): Seq[Expression] = e match { @@ -2027,6 +2111,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { case _ => new Like(expr, pattern) } + val withNot = blockBang(ctx.errorCapturingNot) + // Create the predicate. ctx.kind.getType match { case SqlBaseParser.BETWEEN => @@ -2046,7 +2132,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { // So we use LikeAny or NotLikeAny instead. val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) val (expr, pat) = lowerLikeArgsIfNeeded(e, patterns) - ctx.NOT match { + withNot match { case null => LikeAny(expr, pat) case _ => NotLikeAny(expr, pat) } @@ -2062,7 +2148,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { // So we use LikeAll or NotLikeAll instead. 
val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) val (expr, pat) = lowerLikeArgsIfNeeded(e, patterns) - ctx.NOT match { + withNot match { case null => LikeAll(expr, pat) case _ => NotLikeAll(expr, pat) } @@ -2086,23 +2172,23 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } case SqlBaseParser.RLIKE => invertIfNotDefined(RLike(e, expression(ctx.pattern))) - case SqlBaseParser.NULL if ctx.NOT != null => + case SqlBaseParser.NULL if withNot != null => IsNotNull(e) case SqlBaseParser.NULL => IsNull(e) - case SqlBaseParser.TRUE => ctx.NOT match { + case SqlBaseParser.TRUE => withNot match { case null => EqualNullSafe(e, Literal(true)) case _ => Not(EqualNullSafe(e, Literal(true))) } - case SqlBaseParser.FALSE => ctx.NOT match { + case SqlBaseParser.FALSE => withNot match { case null => EqualNullSafe(e, Literal(false)) case _ => Not(EqualNullSafe(e, Literal(false))) } - case SqlBaseParser.UNKNOWN => ctx.NOT match { + case SqlBaseParser.UNKNOWN => withNot match { case null => IsUnknown(e) case _ => IsNotUnknown(e) } - case SqlBaseParser.DISTINCT if ctx.NOT != null => + case SqlBaseParser.DISTINCT if withNot != null => EqualNullSafe(e, expression(ctx.right)) case SqlBaseParser.DISTINCT => Not(EqualNullSafe(e, expression(ctx.right))) @@ -2148,6 +2234,19 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } } + override def visitShiftExpression(ctx: ShiftExpressionContext): Expression = withOrigin(ctx) { + val left = expression(ctx.left) + val right = expression(ctx.right) + val operator = ctx.shiftOperator().getChild(0).asInstanceOf[TerminalNode] + val shift = operator.getSymbol.getType match { + case SqlBaseParser.SHIFT_LEFT => ShiftLeft(left, right) + case SqlBaseParser.SHIFT_RIGHT => ShiftRight(left, right) + case SqlBaseParser.SHIFT_RIGHT_UNSIGNED => ShiftRightUnsigned(left, right) + } + shift.setTagValue(FUNC_ALIAS, operator.getText) + shift + } + /** * Create a unary arithmetic expression. 
The following arithmetic operators are supported: * - Plus: '+' @@ -2203,6 +2302,20 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { */ override def visitCast(ctx: CastContext): Expression = withOrigin(ctx) { val rawDataType = typedVisit[DataType](ctx.dataType()) + ctx.dataType() match { + case context: PrimitiveDataTypeContext => + val typeCtx = context.`type`() + if (typeCtx.start.getType == STRING) { + typeCtx.children.asScala.toSeq match { + case Seq(_, cctx: CollateClauseContext) => + throw QueryParsingErrors.dataTypeUnsupportedError( + rawDataType.typeName, + ctx.dataType().asInstanceOf[PrimitiveDataTypeContext]) + case _ => + } + } + case _ => + } val dataType = CharVarcharUtils.replaceCharVarcharWithStringForCast(rawDataType) ctx.name.getType match { case SqlBaseParser.CAST => @@ -2222,6 +2335,20 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { */ override def visitCastByColon(ctx: CastByColonContext): Expression = withOrigin(ctx) { val rawDataType = typedVisit[DataType](ctx.dataType()) + ctx.dataType() match { + case context: PrimitiveDataTypeContext => + val typeCtx = context.`type`() + if (typeCtx.start.getType == STRING) { + typeCtx.children.asScala.toSeq match { + case Seq(_, cctx: CollateClauseContext) => + throw QueryParsingErrors.dataTypeUnsupportedError( + rawDataType.typeName, + ctx.dataType().asInstanceOf[PrimitiveDataTypeContext]) + case _ => + } + } + case _ => + } val dataType = CharVarcharUtils.replaceCharVarcharWithStringForCast(rawDataType) val cast = Cast(expression(ctx.primaryExpression), dataType) cast.setTagValue(Cast.USER_SPECIFIED_CAST, ()) @@ -2698,11 +2825,10 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { Literal(interval, CalendarIntervalType) } case BINARY_HEX => - val padding = if (value.length % 2 != 0) "0" else "" try { - Literal(Hex.decodeHex(padding + value)) + Literal(Hex.unhex(value), BinaryType) } catch { - case e: DecoderException => + case e: IllegalArgumentException => val ex = QueryParsingErrors.cannotParseValueTypeError("X", value, ctx) ex.setStackTrace(e.getStackTrace) throw ex @@ -3143,24 +3269,24 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { /** * Create top level table schema. */ - protected def createSchema(ctx: CreateOrReplaceTableColTypeListContext): StructType = { - val columns = Option(ctx).toArray.flatMap(visitCreateOrReplaceTableColTypeList) + protected def createSchema(ctx: ColDefinitionListContext): StructType = { + val columns = Option(ctx).toArray.flatMap(visitColDefinitionList) StructType(columns.map(_.toV1Column)) } /** * Get CREATE TABLE column definitions. */ - override def visitCreateOrReplaceTableColTypeList( - ctx: CreateOrReplaceTableColTypeListContext): Seq[ColumnDefinition] = withOrigin(ctx) { - ctx.createOrReplaceTableColType().asScala.map(visitCreateOrReplaceTableColType).toSeq + override def visitColDefinitionList( + ctx: ColDefinitionListContext): Seq[ColumnDefinition] = withOrigin(ctx) { + ctx.colDefinition().asScala.map(visitColDefinition).toSeq } /** * Get a CREATE TABLE column definition. 
*/ - override def visitCreateOrReplaceTableColType( - ctx: CreateOrReplaceTableColTypeContext): ColumnDefinition = withOrigin(ctx) { + override def visitColDefinition( + ctx: ColDefinitionContext): ColumnDefinition = withOrigin(ctx) { import ctx._ val name: String = colName.getText @@ -3171,6 +3297,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { var commentSpec: Option[CommentSpecContext] = None ctx.colDefinitionOption().asScala.foreach { option => if (option.NULL != null) { + blockBang(option.errorCapturingNot) if (!nullable) { throw QueryParsingErrors.duplicateTableColumnDescriptor( option, name, "NOT NULL") @@ -3424,6 +3551,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { */ override def visitCreateTableHeader( ctx: CreateTableHeaderContext): TableHeader = withOrigin(ctx) { + blockBang(ctx.errorCapturingNot) val temporary = ctx.TEMPORARY != null val ifNotExists = ctx.EXISTS != null if (temporary && ifNotExists) { @@ -3543,7 +3671,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } } - private def cleanNamespaceProperties( + protected def cleanNamespaceProperties( properties: Map[String, String], ctx: ParserRuleContext): Map[String, String] = withOrigin(ctx) { import SupportsNamespaces._ @@ -3600,6 +3728,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { properties += PROP_LOCATION -> _ } + blockBang(ctx.errorCapturingNot) + CreateNamespace( withIdentClause(ctx.identifierReference, UnresolvedNamespace(_)), ctx.EXISTS != null, @@ -3983,8 +4113,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { val (identifierContext, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader) - val columns = Option(ctx.createOrReplaceTableColTypeList()) - .map(visitCreateOrReplaceTableColTypeList).getOrElse(Nil) + val columns = Option(ctx.colDefinitionList()).map(visitColDefinitionList).getOrElse(Nil) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo, clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) @@ -4065,8 +4194,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { val orCreate = ctx.replaceTableHeader().CREATE() != null val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo, clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) - val columns = Option(ctx.createOrReplaceTableColTypeList()) - .map(visitCreateOrReplaceTableColTypeList).getOrElse(Nil) + val columns = Option(ctx.colDefinitionList()).map(visitColDefinitionList).getOrElse(Nil) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) if (provider.isDefined && serdeInfo.isDefined) { @@ -4207,7 +4335,10 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { var colPosition: Option[ColPositionContext] = None val columnName = name.last ctx.colDefinitionDescriptorWithPosition.asScala.foreach { option => + blockBang(option.errorCapturingNot) + if (option.NULL != null) { + blockBang(option.errorCapturingNot) if (!nullable) { throw QueryParsingErrors.duplicateTableColumnDescriptor( option, columnName, "NOT NULL", isCreate = false) @@ -4411,6 +4542,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { } var commentSpec: Option[CommentSpecContext] = None 
colType.colDefinitionDescriptorWithPosition.asScala.foreach { opt => + blockBang(opt.errorCapturingNot) + + if (opt.NULL != null) { throw QueryParsingErrors.operationInHiveStyleCommandUnsupportedError( "NOT NULL", "REPLACE COLUMNS", ctx) @@ -4862,6 +4995,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { val location = Option(splCtx.locationSpec).map(visitLocationSpec) UnresolvedPartitionSpec(spec, location) } + blockBang(ctx.errorCapturingNot) AddPartitions( createUnresolvedTable( ctx.identifierReference, @@ -4954,6 +5088,62 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { query = plan(ctx.query)) } + + /** + * Define the schema binding mode during CREATE or ALTER VIEW. + * The method also accepts a NULL context, in which case it returns the session default. + * + * {{{ + * WITH SCHEMA [ BINDING | COMPENSATION | TYPE EVOLUTION | EVOLUTION ] + * }}} + */ + override def visitSchemaBinding(ctx: SchemaBindingContext): ViewSchemaMode = { + if (ctx == null) { + // No schema binding specified, return the session default + if (conf.viewSchemaBindingEnabled) { + if (conf.viewSchemaCompensation) { + SchemaCompensation + } else { + SchemaBinding + } + } else { + SchemaUnsupported + } + } else if (!conf.viewSchemaBindingEnabled) { + // If the feature is disabled, throw an exception + withOrigin(ctx) { + throw new ParseException( + errorClass = "FEATURE_NOT_ENABLED", + messageParameters = Map("featureName" -> "VIEW ... WITH SCHEMA ...", + "configKey" -> "spark.sql.legacy.viewSchemaBindingMode", + "configValue" -> "true"), + ctx) + } + } else if (ctx.COMPENSATION != null) { + SchemaCompensation + } else if (ctx.TYPE != null) { + SchemaTypeEvolution + } else if (ctx.EVOLUTION != null) { + SchemaEvolution + } else { + SchemaBinding + } + } + + /** + * Alter the schema binding of a view. This creates an [[AlterViewSchemaBinding]]. + * + * For example: + * {{{ + * ALTER VIEW multi_part_name WITH SCHEMA ...; + * }}} + */ + override def visitAlterViewSchemaBinding(ctx: AlterViewSchemaBindingContext): LogicalPlan + = withOrigin(ctx) { + AlterViewSchemaBinding( + createUnresolvedView(ctx.identifierReference, "ALTER VIEW ... WITH SCHEMA ..."), + viewSchemaMode = visitSchemaBinding(ctx.schemaBinding)) + } + /** * Create a [[RenameTable]] command. * @@ -5106,6 +5296,8 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { .map(x => (Option(x.options).map(visitPropertyKeyValues).getOrElse(Map.empty))).toSeq val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) + blockBang(ctx.errorCapturingNot) + CreateIndex( createUnresolvedTable(ctx.identifierReference, "CREATE INDEX"), indexName, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala index 3aec1dd431138..04edb0f75c463 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala @@ -62,4 +62,10 @@ trait ParserInterface extends DataTypeParserInterface { */ @throws[ParseException]("Text cannot be parsed to a LogicalPlan") def parseQuery(sqlText: String): LogicalPlan + + /** + * Parse a SQL script string to a [[CompoundBody]].
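 * For example (an illustrative sketch, not part of this change; `parser` stands for any
 * implementation of this trait):
 * {{{
 *   val body: CompoundBody = parser.parseScript("BEGIN SELECT 1; SELECT 2; END")
 *   // body.collection holds one SingleStatement per parsed SELECT
 * }}}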
+ */ + @throws[ParseException]("Text cannot be parsed to a CompoundBody") + def parseScript(sqlScriptText: String): CompoundBody } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingLogicalOperators.scala new file mode 100644 index 0000000000000..816ef82a3d8e6 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingLogicalOperators.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.parser + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin, WithOrigin} + +/** + * Trait for all SQL Scripting logical operators that are product of parsing phase. + * These operators will be used by the SQL Scripting interpreter to generate execution nodes. + */ +sealed trait CompoundPlanStatement + +/** + * Logical operator representing result of parsing a single SQL statement + * that is supposed to be executed against Spark. + * @param parsedPlan Result of SQL statement parsing. + */ +case class SingleStatement(parsedPlan: LogicalPlan) + extends CompoundPlanStatement + with WithOrigin { + + override val origin: Origin = CurrentOrigin.get + + def getText(sqlScriptText: String): String = { + if (origin.startIndex.isEmpty || origin.stopIndex.isEmpty) { + return null + } + sqlScriptText.substring(origin.startIndex.get, origin.stopIndex.get + 1) + } +} + +/** + * Logical operator for a compound body. Contains all statements within the compound body. + * @param collection Collection of statements within the compound body. + */ +case class CompoundBody(collection: Seq[CompoundPlanStatement]) extends CompoundPlanStatement diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 0f049103542ec..3f417644082c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.plans +import java.util.IdentityHashMap + import scala.collection.mutable import org.apache.spark.sql.AnalysisException @@ -75,8 +77,13 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] /** * The set of all attributes that are input to this operator by its children. 
*/ - def inputSet: AttributeSet = - AttributeSet(children.flatMap(_.asInstanceOf[QueryPlan[PlanType]].output)) + def inputSet: AttributeSet = { + children match { + case Seq() => AttributeSet.empty + case Seq(c) => c.outputSet + case _ => AttributeSet.fromAttributeSets(children.map(_.outputSet)) + } + } /** * The set of all attributes that are produced by this node. @@ -221,12 +228,14 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] } } + @scala.annotation.nowarn("cat=deprecation") def recursiveTransform(arg: Any): AnyRef = arg match { case e: Expression => transformExpression(e) case Some(value) => Some(recursiveTransform(value)) case m: Map[_, _] => m case d: DataType => d // Avoid unpacking Structs - case stream: LazyList[_] => stream.map(recursiveTransform).force + case stream: Stream[_] => stream.map(recursiveTransform).force + case lazyList: LazyList[_] => lazyList.map(recursiveTransform).force case seq: Iterable[_] => seq.map(recursiveTransform) case other: AnyRef => other case null => null @@ -438,7 +447,8 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] override def verboseString(maxFields: Int): String = simpleString(maxFields) override def simpleStringWithNodeId(): String = { - val operatorId = getTagValue(QueryPlan.OP_ID_TAG).map(id => s"$id").getOrElse("unknown") + val operatorId = Option(QueryPlan.localIdMap.get().get(this)).map(id => s"$id") + .getOrElse("unknown") s"$nodeName ($operatorId)".trim } @@ -458,7 +468,8 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] } protected def formattedNodeName: String = { - val opId = getTagValue(QueryPlan.OP_ID_TAG).map(id => s"$id").getOrElse("unknown") + val opId = Option(QueryPlan.localIdMap.get().get(this)).map(id => s"$id") + .getOrElse("unknown") val codegenId = getTagValue(QueryPlan.CODEGEN_ID_TAG).map(id => s" [codegen id : $id]").getOrElse("") s"($opId) $nodeName$codegenId" @@ -517,6 +528,30 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] transformDownWithSubqueriesAndPruning(AlwaysProcess.fn, UnknownRuleId)(f) } + /** + * Same as `transformUpWithSubqueries` except allows for pruning opportunities. + */ + def transformUpWithSubqueriesAndPruning( + cond: TreePatternBits => Boolean, + ruleId: RuleId = UnknownRuleId) + (f: PartialFunction[PlanType, PlanType]): PlanType = { + val g: PartialFunction[PlanType, PlanType] = new PartialFunction[PlanType, PlanType] { + override def isDefinedAt(x: PlanType): Boolean = true + + override def apply(plan: PlanType): PlanType = { + val transformed = plan.transformExpressionsUpWithPruning(t => + t.containsPattern(PLAN_EXPRESSION) && cond(t)) { + case planExpression: PlanExpression[PlanType@unchecked] => + val newPlan = planExpression.plan.transformUpWithSubqueriesAndPruning(cond, ruleId)(f) + planExpression.withNewPlan(newPlan) + } + f.applyOrElse[PlanType, PlanType](transformed, identity) + } + } + + transformUpWithPruning(cond, ruleId)(g) + } + /** * This method is the top-down (pre-order) counterpart of transformUpWithSubqueries. * Returns a copy of this node where the given partial function has been recursively applied @@ -646,9 +681,17 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] } object QueryPlan extends PredicateHelper { - val OP_ID_TAG = TreeNodeTag[Int]("operatorId") val CODEGEN_ID_TAG = new TreeNodeTag[Int]("wholeStageCodegenId") + /** + * A thread local map to store the mapping between the query plan and the query plan id. + * The scope of this thread local is within ExplainUtils.processPlan. 
The reason we define it here + * is because [[ QueryPlan ]] also needs this, and it doesn't have access to `execution` package + * from `catalyst`. + */ + val localIdMap: ThreadLocal[java.util.Map[QueryPlan[_], Int]] = ThreadLocal.withInitial(() => + new IdentityHashMap[QueryPlan[_], Int]()) + /** * Normalize the exprIds in the given expression, by updating the exprId in `AttributeReference` * with its referenced ordinal from input attributes. It's similar to `BindReferences` but we diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala index f123258683ec3..d9da255eccc9d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala @@ -19,10 +19,22 @@ package org.apache.spark.sql.catalyst.plans import java.util.Locale -import org.apache.spark.{SparkIllegalArgumentException, SparkUnsupportedOperationException} +import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.Attribute object JoinType { + + val supported = Seq( + "inner", + "outer", "full", "fullouter", "full_outer", + "leftouter", "left", "left_outer", + "rightouter", "right", "right_outer", + "leftsemi", "left_semi", "semi", + "leftanti", "left_anti", "anti", + "cross" + ) + def apply(typ: String): JoinType = typ.toLowerCase(Locale.ROOT).replace("_", "") match { case "inner" => Inner case "outer" | "full" | "fullouter" => FullOuter @@ -32,20 +44,12 @@ object JoinType { case "leftanti" | "anti" => LeftAnti case "cross" => Cross case _ => - val supported = Seq( - "inner", - "outer", "full", "fullouter", "full_outer", - "leftouter", "left", "left_outer", - "rightouter", "right", "right_outer", - "leftsemi", "left_semi", "semi", - "leftanti", "left_anti", "anti", - "cross") - - throw new SparkIllegalArgumentException( - errorClass = "_LEGACY_ERROR_TEMP_3216", + throw new AnalysisException( + errorClass = "UNSUPPORTED_JOIN_TYPE", messageParameters = Map( "typ" -> typ, - "supported" -> supported.mkString("'", "', '", "'"))) + "supported" -> supported.mkString("'", "', '", "'")) + ) } } @@ -129,15 +133,16 @@ object LeftSemiOrAnti { object AsOfJoinDirection { + val supported = Seq("forward", "backward", "nearest") + def apply(direction: String): AsOfJoinDirection = { direction.toLowerCase(Locale.ROOT) match { case "forward" => Forward case "backward" => Backward case "nearest" => Nearest case _ => - val supported = Seq("forward", "backward", "nearest") - throw new SparkIllegalArgumentException( - errorClass = "_LEGACY_ERROR_TEMP_3217", + throw new AnalysisException( + errorClass = "AS_OF_JOIN.UNSUPPORTED_DIRECTION", messageParameters = Map( "direction" -> direction, "supported" -> supported.mkString("'", "', '", "'"))) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala index fc9eb5d03e49f..bd277e92d11d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala @@ -63,3 +63,14 @@ trait AnalysisOnlyCommand extends Command { // on the `AnalysisContext` def markAsAnalyzed(analysisContext: AnalysisContext): LogicalPlan } + +/** + * A logical node that 
does not expose its sub-nodes as children, but rather supervises them + * in an implementation-defined manner. + */ +trait SupervisingCommand extends LeafCommand { + /** + * Transforms its supervised plan using `transformer` and returns a copy of `SupervisingCommand` + */ + def withTransformedSupervisedPlan(transformer: LogicalPlan => LogicalPlan): LogicalPlan +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EmptyRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EmptyRelation.scala new file mode 100644 index 0000000000000..9e055ae7f3bd8 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EmptyRelation.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.SortOrder + +case class EmptyRelation(logical: LogicalPlan) extends LeafNode { + override val isStreaming: Boolean = logical.isStreaming + + override val outputOrdering: Seq[SortOrder] = logical.outputOrdering + + override def output: Seq[Attribute] = logical.output + + override def computeStats(): Statistics = Statistics(sizeInBytes = 0, rowCount = Some(0)) + + override def maxRows: Option[Long] = Some(0) + + override def maxRowsPerPartition: Option[Long] = Some(0) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala index 32a9030ff62b1..8cfc939755ef7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.plans.logical import java.util.concurrent.TimeUnit import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.trees.TreePattern.{EVENT_TIME_WATERMARK, TreePattern} +import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark.updateEventTimeColumn +import org.apache.spark.sql.catalyst.trees.TreePattern.{EVENT_TIME_WATERMARK, TreePattern, UPDATE_EVENT_TIME_WATERMARK_COLUMN} import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval @@ -32,6 +33,36 @@ object EventTimeWatermark { def getDelayMs(delay: CalendarInterval): Long = { IntervalUtils.getDuration(delay, TimeUnit.MILLISECONDS) } + + /** + * Adds watermark delay to the metadata for newEventTime in provided attributes. 
+ * + * If any other existing attributes have watermark delay present in their metadata, watermark + * delay will be removed from their metadata. + */ + def updateEventTimeColumn( + attributes: Seq[Attribute], + delayMs: Long, + newEventTime: Attribute): Seq[Attribute] = { + attributes.map { a => + if (a semanticEquals newEventTime) { + val updatedMetadata = new MetadataBuilder() + .withMetadata(a.metadata) + .putLong(EventTimeWatermark.delayKey, delayMs) + .build() + a.withMetadata(updatedMetadata) + } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { + // Remove existing columns tagged as eventTime for watermark + val updatedMetadata = new MetadataBuilder() + .withMetadata(a.metadata) + .remove(EventTimeWatermark.delayKey) + .build() + a.withMetadata(updatedMetadata) + } else { + a + } + } + } } /** @@ -49,26 +80,38 @@ case class EventTimeWatermark( // logic here because we also maintain the compatibility flag. (See // SQLConf.STATEFUL_OPERATOR_ALLOW_MULTIPLE for details.) // TODO: Disallow updating the metadata once we remove the compatibility flag. - override val output: Seq[Attribute] = child.output.map { a => - if (a semanticEquals eventTime) { - val delayMs = EventTimeWatermark.getDelayMs(delay) - val updatedMetadata = new MetadataBuilder() - .withMetadata(a.metadata) - .putLong(EventTimeWatermark.delayKey, delayMs) - .build() - a.withMetadata(updatedMetadata) - } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { - // Remove existing watermark - val updatedMetadata = new MetadataBuilder() - .withMetadata(a.metadata) - .remove(EventTimeWatermark.delayKey) - .build() - a.withMetadata(updatedMetadata) + override val output: Seq[Attribute] = { + val delayMs = EventTimeWatermark.getDelayMs(delay) + updateEventTimeColumn(child.output, delayMs, eventTime) + } + + override protected def withNewChildInternal(newChild: LogicalPlan): EventTimeWatermark = + copy(child = newChild) +} + +/** + * Updates the event time column to [[eventTime]] in the child output. + * + * Any watermark calculations performed after this node will use the + * updated eventTimeColumn. 
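 * For example (an illustrative sketch, not part of this change), if the child's output is
 * `[oldEventTime, newEventTime]` and `oldEventTime` currently carries the watermark delay
 * metadata:
 * {{{
 *   UpdateEventTimeWatermarkColumn(newEventTime, Some(delay), child)
 *   // output: the delay metadata is attached to newEventTime and removed from oldEventTime
 * }}}
 * When `delay` is None the child output is passed through unchanged here; the delay is
 * presumably filled in later during analysis (see ResolveUpdateEventTimeWatermarkColumn in
 * RuleIdCollection further below).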
+ */ +case class UpdateEventTimeWatermarkColumn( + eventTime: Attribute, + delay: Option[CalendarInterval], + child: LogicalPlan) extends UnaryNode { + + final override val nodePatterns: Seq[TreePattern] = Seq(UPDATE_EVENT_TIME_WATERMARK_COLUMN) + + override def output: Seq[Attribute] = { + if (delay.isDefined) { + val delayMs = EventTimeWatermark.getDelayMs(delay.get) + updateEventTimeColumn(child.output, delayMs, eventTime) } else { - a + child.output } } - override protected def withNewChildInternal(newChild: LogicalPlan): EventTimeWatermark = + override protected def withNewChildInternal( + newChild: LogicalPlan): UpdateEventTimeWatermarkColumn = copy(child = newChild) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala index 7e04af190e4aa..0aa73f1939e10 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala @@ -70,6 +70,8 @@ trait FunctionBuilderBase[T] { } def build(funcName: String, expressions: Seq[Expression]): T + + def supportsLambda: Boolean = false } object NamedParametersSupport { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index b989233da6740..938a8ffe9e446 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{AliasAwareQueryOutputOrdering, QueryPlan} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats import org.apache.spark.sql.catalyst.trees.{BinaryLike, LeafLike, TreeNodeTag, UnaryLike} +import org.apache.spark.sql.catalyst.trees.TreePattern.{LOGICAL_QUERY_STAGE, TreePattern} import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.MetadataColumnHelper import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} @@ -103,7 +104,20 @@ abstract class LogicalPlan */ lazy val resolved: Boolean = expressions.forall(_.resolved) && childrenResolved - override protected def statePrefix = if (!resolved) "'" else super.statePrefix + override protected def statePrefix = { + if (!resolved) { + "'" + } else { + val prefixFromSuper = super.statePrefix + // Ancestor class could mark something on the prefix, including 'invalid'. Add a marker for + // `streaming` only when there is no marker from ancestor class. + if (prefixFromSuper.isEmpty && isStreaming) { + "~" + } else { + prefixFromSuper + } + } + } /** * Returns true if all its children of this query plan have been resolved. @@ -118,7 +132,9 @@ abstract class LogicalPlan def resolve(schema: StructType, resolver: Resolver): Seq[Attribute] = { schema.map { field => resolve(field.name :: Nil, resolver).map { - case a: AttributeReference => a + case a: AttributeReference => + // Keep the metadata in given schema. 
+ a.withMetadata(field.metadata) case _ => throw QueryExecutionErrors.resolveCannotHandleNestedSchema(this) }.getOrElse { throw QueryCompilationErrors.cannotResolveAttributeError( @@ -212,6 +228,33 @@ trait LeafNode extends LogicalPlan with LeafLike[LogicalPlan] { throw new SparkUnsupportedOperationException("_LEGACY_ERROR_TEMP_3114") } +/** + * An abstract class for LogicalQueryStage that is visible in logical rewrites. + */ +abstract class LogicalQueryStage extends LeafNode { + override protected val nodePatterns: Seq[TreePattern] = Seq(LOGICAL_QUERY_STAGE) + + /** + * Returns the logical plan that is included in this query stage. + */ + def logicalPlan: LogicalPlan + + /** + * Returns the physical plan. + */ + def physicalPlan: QueryPlan[_] + + /** + * Returns true if the physical stage is materialized. + */ + def isMaterialized: Boolean + + /** + * Returns true if the physical plan corresponds directly to a stage. + */ + def isDirectStage: Boolean +} + /** * A logical plan node with single child. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 1c8f7a97dd7fe..0135fcfb3cc8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -783,6 +783,7 @@ object View { "spark.sql.hive.convertMetastoreParquet", "spark.sql.hive.convertMetastoreOrc", "spark.sql.hive.convertInsertingPartitionedTable", + "spark.sql.hive.convertInsertingUnpartitionedTable", "spark.sql.hive.convertMetastoreCtas" ).contains(key) || key.startsWith("spark.sql.catalog.") } @@ -910,6 +911,10 @@ case class WithCTE(plan: LogicalPlan, cteDefs: Seq[CTERelationDef]) extends Logi def withNewPlan(newPlan: LogicalPlan): WithCTE = { withNewChildren(children.init :+ newPlan).asInstanceOf[WithCTE] } + + override def maxRows: Option[Long] = plan.maxRows + + override def maxRowsPerPartition: Option[Long] = plan.maxRowsPerPartition } /** @@ -1070,7 +1075,8 @@ case class Range( override def newInstance(): Range = copy(output = output.map(_.newInstance())) override def simpleString(maxFields: Int): String = { - s"Range ($start, $end, step=$step, splits=$numSlices)" + val splits = if (numSlices.isDefined) { s", splits=$numSlices" } else { "" } + s"Range ($start, $end, step=$step$splits)" } override def maxRows: Option[Long] = { @@ -2056,6 +2062,8 @@ case class LateralJoin( joinType: JoinType, condition: Option[Expression]) extends UnaryNode { + override lazy val allAttributes: AttributeSeq = left.output ++ right.plan.output + require(Seq(Inner, LeftOuter, Cross).contains(joinType), s"Unsupported lateral join type $joinType") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index 28d52d39093b3..07423b612c301 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -579,7 +579,7 @@ object TransformWithState { child: LogicalPlan): LogicalPlan = { val keyEncoder = encoderFor[K] val mapped = new TransformWithState( - UnresolvedDeserializer(encoderFor[K].deserializer, groupingAttributes), + UnresolvedDeserializer(keyEncoder.deserializer,
groupingAttributes), UnresolvedDeserializer(encoderFor[V].deserializer, dataAttributes), groupingAttributes, dataAttributes, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 37e751ea9884b..6339a18796fa0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.{SparkIllegalArgumentException, SparkUnsupportedOperationException} import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, AssignmentUtils, EliminateSubqueryAliases, FieldName, NamedRelation, PartitionSpec, ResolvedIdentifier, UnresolvedException} +import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, AssignmentUtils, EliminateSubqueryAliases, FieldName, NamedRelation, PartitionSpec, ResolvedIdentifier, UnresolvedException, ViewSchemaMode} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.catalog.FunctionResource import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, Expression, MetadataAttribute, NamedExpression, UnaryExpression, Unevaluable, V2ExpressionUtils} @@ -64,10 +64,11 @@ trait V2WriteCommand extends UnaryCommand with KeepAnalyzedQuery with CTEInChild table.skipSchemaResolution || (query.output.size == table.output.size && query.output.zip(table.output).forall { case (inAttr, outAttr) => + val inType = CharVarcharUtils.getRawType(inAttr.metadata).getOrElse(inAttr.dataType) val outType = CharVarcharUtils.getRawType(outAttr.metadata).getOrElse(outAttr.dataType) // names and types must match, nullability must be compatible inAttr.name == outAttr.name && - DataType.equalsIgnoreCompatibleNullability(inAttr.dataType, outType) && + DataType.equalsIgnoreCompatibleNullability(inType, outType) && (outAttr.nullable || !inAttr.nullable) }) } @@ -754,7 +755,8 @@ case class MergeIntoTable( mergeCondition: Expression, matchedActions: Seq[MergeAction], notMatchedActions: Seq[MergeAction], - notMatchedBySourceActions: Seq[MergeAction]) extends BinaryCommand with SupportsSubquery { + notMatchedBySourceActions: Seq[MergeAction], + withSchemaEvolution: Boolean) extends BinaryCommand with SupportsSubquery { lazy val aligned: Boolean = { val actions = matchedActions ++ notMatchedActions ++ notMatchedBySourceActions @@ -1292,6 +1294,17 @@ case class AlterViewAs( } } +/** + * The logical plan of the ALTER VIEW ... WITH SCHEMA command. + */ +case class AlterViewSchemaBinding( + child: LogicalPlan, + viewSchemaMode: ViewSchemaMode) + extends UnaryCommand { + override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = + copy(child = newChild) +} + /** * The logical plan of the CREATE VIEW ... command. 
*/ @@ -1303,7 +1316,8 @@ case class CreateView( originalText: Option[String], query: LogicalPlan, allowExisting: Boolean, - replace: Boolean) extends BinaryCommand with CTEInChildren { + replace: Boolean, + viewSchemaMode: ViewSchemaMode) extends BinaryCommand with CTEInChildren { override def left: LogicalPlan = child override def right: LogicalPlan = query override protected def withNewChildrenInternal( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index 2364130f79e4c..19595eef10b34 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -385,8 +385,9 @@ case class KeyGroupedPartitioning( val attributes = expressions.flatMap(_.collectLeaves()) if (SQLConf.get.v2BucketingAllowJoinKeysSubsetOfPartitionKeys) { - // check that all join keys (required clustering keys) contained in partitioning - requiredClustering.forall(x => attributes.exists(_.semanticEquals(x))) && + // check that join keys (required clustering keys) + // overlap with partition keys (KeyGroupedPartitioning attributes) + requiredClustering.exists(x => attributes.exists(_.semanticEquals(x))) && expressions.forall(_.collectLeaves().size == 1) } else { attributes.forall(x => requiredClustering.exists(_.semanticEquals(x))) @@ -870,12 +871,30 @@ case class KeyGroupedShuffleSpec( if (results.forall(p => p.isEmpty)) None else Some(results) } - override def canCreatePartitioning: Boolean = SQLConf.get.v2BucketingShuffleEnabled && - // Only support partition expressions are AttributeReference for now - partitioning.expressions.forall(_.isInstanceOf[AttributeReference]) + override def canCreatePartitioning: Boolean = { + // Allow one side shuffle for SPJ for now only if partially-clustered is not enabled + // and for join keys less than partition keys only if transforms are not enabled. 
+ val checkExprType = if (SQLConf.get.v2BucketingAllowJoinKeysSubsetOfPartitionKeys) { + e: Expression => e.isInstanceOf[AttributeReference] + } else { + e: Expression => e.isInstanceOf[AttributeReference] || e.isInstanceOf[TransformExpression] + } + SQLConf.get.v2BucketingShuffleEnabled && + !SQLConf.get.v2BucketingPartiallyClusteredDistributionEnabled && + partitioning.expressions.forall(checkExprType) + } + + override def createPartitioning(clustering: Seq[Expression]): Partitioning = { - KeyGroupedPartitioning(clustering, partitioning.numPartitions, partitioning.partitionValues) + val newExpressions: Seq[Expression] = clustering.zip(partitioning.expressions).map { + case (c, e: TransformExpression) => TransformExpression( + e.function, Seq(c), e.numBucketsOpt) + case (c, _) => c + } + KeyGroupedPartitioning(newExpressions, + partitioning.numPartitions, + partitioning.partitionValues) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 476ace2662f8b..c8b3f224a3129 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.rules import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MessageWithContext} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.QueryPlanningTracker import org.apache.spark.sql.catalyst.trees.TreeNode @@ -75,11 +75,11 @@ class PlanChangeLogger[TreeType <: TreeNode[_]] extends Logging { def message(): MessageWithContext = { if (!oldPlan.fastEquals(newPlan)) { log""" - |=== Result of Batch ${MDC(RULE_BATCH_NAME, batchName)} === + |=== Result of Batch ${MDC(BATCH_NAME, batchName)} === |${MDC(QUERY_PLAN, sideBySide(oldPlan.treeString, newPlan.treeString).mkString("\n"))} """.stripMargin } else { - log"Batch ${MDC(RULE_BATCH_NAME, batchName)} has no effect." + log"Batch ${MDC(BATCH_NAME, batchName)} has no effect." } } @@ -90,14 +90,16 @@ class PlanChangeLogger[TreeType <: TreeNode[_]] extends Logging { def logMetrics(metrics: QueryExecutionMetrics): Unit = { val totalTime = metrics.time / NANOS_PER_MILLIS.toDouble val totalTimeEffective = metrics.timeEffective / NANOS_PER_MILLIS.toDouble + // scalastyle:off line.size.limit val message: MessageWithContext = log""" |=== Metrics of Executed Rules === - |Total number of runs: ${MDC(RULE_NUMBER_OF_RUNS, metrics.numRuns)} + |Total number of runs: ${MDC(NUM_RULE_OF_RUNS, metrics.numRuns)} |Total time: ${MDC(TOTAL_TIME, totalTime)} ms - |Total number of effective runs: ${MDC(RULE_NUMBER_OF_RUNS, metrics.numEffectiveRuns)} + |Total number of effective runs: ${MDC(NUM_EFFECTIVE_RULE_OF_RUNS, metrics.numEffectiveRuns)} |Total time of effective runs: ${MDC(TOTAL_EFFECTIVE_TIME, totalTimeEffective)} ms """.stripMargin + // scalastyle:on line.size.limit logBasedOnLevel(message) } @@ -145,7 +147,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { override val maxIterationsSetting: String = null) extends Strategy /** A batch of rules. 
*/ - protected case class Batch(name: String, strategy: Strategy, rules: Rule[TreeType]*) + protected[catalyst] case class Batch(name: String, strategy: Strategy, rules: Rule[TreeType]*) /** Defines a sequence of rule batches, to be overridden by the implementation. */ protected def batches: Seq[Batch] @@ -263,7 +265,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { log"to a larger value." } val log = log"Max iterations (${MDC(NUM_ITERATIONS, iteration - 1)}) " + - log"reached for batch ${MDC(RULE_BATCH_NAME, batch.name)}" + + log"reached for batch ${MDC(BATCH_NAME, batch.name)}" + endingMsg if (Utils.isTesting || batch.strategy.errorOnExceed) { throw new RuntimeException(log.message) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala index 778d56788e89e..d36ce37406063 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala @@ -104,6 +104,7 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$CombinedTypeCoercionRule" :: "org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences" :: "org.apache.spark.sql.catalyst.analysis.UpdateAttributeNullability" :: + "org.apache.spark.sql.catalyst.analysis.ResolveUpdateEventTimeWatermarkColumn" :: // Catalyst Optimizer rules "org.apache.spark.sql.catalyst.optimizer.BooleanSimplification" :: "org.apache.spark.sql.catalyst.optimizer.CollapseProject" :: @@ -135,6 +136,7 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.optimizer.ObjectSerializerPruning" :: "org.apache.spark.sql.catalyst.optimizer.OptimizeCsvJsonExprs" :: "org.apache.spark.sql.catalyst.optimizer.OptimizeIn" :: + "org.apache.spark.sql.catalyst.optimizer.OptimizeJoinCondition" :: "org.apache.spark.sql.catalyst.optimizer.OptimizeRand" :: "org.apache.spark.sql.catalyst.optimizer.OptimizeOneRowPlan" :: "org.apache.spark.sql.catalyst.optimizer.Optimizer$OptimizeSubqueries" :: @@ -148,7 +150,6 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.optimizer.PushFoldableIntoBranches" :: "org.apache.spark.sql.catalyst.optimizer.PushLeftSemiLeftAntiThroughJoin" :: "org.apache.spark.sql.catalyst.optimizer.ReassignLambdaVariableID" :: - "org.apache.spark.sql.catalyst.optimizer.RemoveDispensableExpressions" :: "org.apache.spark.sql.catalyst.optimizer.RemoveLiteralFromGroupExpressions" :: "org.apache.spark.sql.catalyst.optimizer.GenerateOptimization" :: "org.apache.spark.sql.catalyst.optimizer.RemoveNoopOperators" :: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 94e893d468b39..6683f2dbfb392 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.trees import java.util.UUID +import scala.annotation.nowarn import scala.collection.{mutable, Map} import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag @@ -78,8 +79,16 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] /** * A mutable map for holding auxiliary information of this tree node. 
It will be carried over * when this node is copied via `makeCopy`, or transformed via `transformUp`/`transformDown`. + * We lazily evaluate the `tags` since the default size of a `mutable.Map` is nonzero. This + * will reduce unnecessary memory pressure. */ - private val tags: mutable.Map[TreeNodeTag[_], Any] = mutable.Map.empty + private[this] var _tags: mutable.Map[TreeNodeTag[_], Any] = null + private def tags: mutable.Map[TreeNodeTag[_], Any] = { + if (_tags eq null) { + _tags = mutable.Map.empty + } + _tags + } /** * Default tree pattern [[BitSet] for a [[TreeNode]]. @@ -112,7 +121,14 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] * ineffective for subsequent apply calls on this tree because query plan structures are * immutable. */ - private val ineffectiveRules: BitSet = new BitSet(RuleIdCollection.NumRules) + private[this] var _ineffectiveRules: BitSet = null + private def ineffectiveRules: BitSet = { + if (_ineffectiveRules eq null) { + _ineffectiveRules = new BitSet(RuleIdCollection.NumRules) + } + _ineffectiveRules + } + private def isIneffectiveRulesEmpty = _ineffectiveRules eq null /** * @return a sequence of tree pattern enums in a TreeNode T. It does not include propagated @@ -141,17 +157,19 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] * UnknownId, it returns false. */ protected def isRuleIneffective(ruleId : RuleId): Boolean = { - if (ruleId eq UnknownRuleId) { + if (isIneffectiveRulesEmpty || (ruleId eq UnknownRuleId)) { return false } ineffectiveRules.get(ruleId.id) } + def isTagsEmpty: Boolean = (_tags eq null) || _tags.isEmpty + def copyTagsFrom(other: BaseType): Unit = { // SPARK-32753: it only makes sense to copy tags to a new node // but it's too expensive to detect other cases likes node removal // so we make a compromise here to copy tags to node with no tags - if (tags.isEmpty) { + if (isTagsEmpty && !other.isTagsEmpty) { tags ++= other.tags } } @@ -161,11 +179,17 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] } def getTagValue[T](tag: TreeNodeTag[T]): Option[T] = { - tags.get(tag).map(_.asInstanceOf[T]) + if (isTagsEmpty) { + None + } else { + tags.get(tag).map(_.asInstanceOf[T]) + } } def unsetTagValue[T](tag: TreeNodeTag[T]): Unit = { - tags -= tag + if (!isTagsEmpty) { + tags -= tag + } } /** @@ -355,12 +379,16 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] case nonChild: AnyRef => nonChild case null => null } + @nowarn("cat=deprecation") val newArgs = mapProductIterator { case s: StructType => s // Don't convert struct types to some other type of Seq[StructField] // Handle Seq[TreeNode] in TreeNode parameters. 
- case s: LazyList[_] => - // LazyList is lazy so we need to force materialization + case s: Stream[_] => + // Stream is lazy so we need to force materialization s.map(mapChild).force + case l: LazyList[_] => + // LazyList is lazy so we need to force materialization + l.map(mapChild).force case s: Seq[_] => s.map(mapChild) case m: Map[_, _] => @@ -778,6 +806,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] case other => other } + @nowarn("cat=deprecation") val newArgs = mapProductIterator { case arg: TreeNode[_] if containsChild(arg) => arg.asInstanceOf[BaseType].clone() @@ -790,7 +819,8 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] case (_, other) => other } case d: DataType => d // Avoid unpacking Structs - case args: LazyList[_] => args.map(mapChild).force // Force materialization on stream + case args: Stream[_] => args.map(mapChild).force // Force materialization on stream + case args: LazyList[_] => args.map(mapChild).force // Force materialization on LazyList case args: Iterable[_] => args.map(mapChild) case nonChild: AnyRef => nonChild case null => null diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala index 4ab075db5709a..c5cc1eaf8f05d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala @@ -96,7 +96,6 @@ object TreePattern extends Enumeration { val VARIANT_GET: Value = Value val WINDOW_EXPRESSION: Value = Value val WINDOW_TIME: Value = Value - val UNARY_POSITIVE: Value = Value val UNPIVOT: Value = Value val UPDATE_FIELDS: Value = Value val UPPER_OR_LOWER: Value = Value @@ -133,6 +132,7 @@ object TreePattern extends Enumeration { val UNION: Value = Value val UNRESOLVED_RELATION: Value = Value val UNRESOLVED_WITH: Value = Value + val UPDATE_EVENT_TIME_WATERMARK_COLUMN: Value = Value val TEMP_RESOLVED_COLUMN: Value = Value val TYPED_FILTER: Value = Value val WINDOW: Value = Value diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala index cf8e903f03a34..f8bb1077a080f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/DataTypeUtils.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy.{ANSI, STRICT} -import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, Decimal, DecimalType, MapType, NullType, StructField, StructType, UserDefinedType} +import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, Decimal, DecimalType, MapType, NullType, StringType, StructField, StructType, UserDefinedType} import org.apache.spark.sql.types.DecimalType.{forType, fromDecimal} object DataTypeUtils { @@ -47,6 +47,31 @@ object DataTypeUtils { DataType.equalsIgnoreCaseAndNullability(from, to) } + /** + * Compares two types, ignoring nullability of ArrayType, MapType, StructType, ignoring case + * sensitivity of field names in StructType as well as differences in collation for String types. 
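 * For example (an illustrative sketch, not part of this change):
 * {{{
 *   equalsIgnoreCaseNullabilityAndCollation(
 *     new StructType().add("a", StringType, nullable = true),
 *     new StructType().add("A", StringType, nullable = false))   // true
 * }}}
 * Field-name case and nullability differences are ignored, and any two string types compare
 * equal even if their collations differ.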
+ */ + def equalsIgnoreCaseNullabilityAndCollation(from: DataType, to: DataType): Boolean = { + (from, to) match { + case (ArrayType(fromElement, _), ArrayType(toElement, _)) => + equalsIgnoreCaseNullabilityAndCollation(fromElement, toElement) + + case (MapType(fromKey, fromValue, _), MapType(toKey, toValue, _)) => + equalsIgnoreCaseNullabilityAndCollation(fromKey, toKey) && + equalsIgnoreCaseNullabilityAndCollation(fromValue, toValue) + + case (StructType(fromFields), StructType(toFields)) => + fromFields.length == toFields.length && + fromFields.zip(toFields).forall { case (l, r) => + l.name.equalsIgnoreCase(r.name) && + equalsIgnoreCaseNullabilityAndCollation(l.dataType, r.dataType) + } + + case (_: StringType, _: StringType) => true + case (fromDataType, toDataType) => fromDataType == toDataType + } + } + private val SparkGeneratedName = """col\d+""".r private def isSparkGeneratedName(name: String): Boolean = name match { case SparkGeneratedName(_*) => true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala index 65a56c1064e45..4fa6a2275e743 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/BadRecordException.scala @@ -67,16 +67,32 @@ case class PartialResultArrayException( extends Exception(cause) /** - * Exception thrown when the underlying parser meet a bad record and can't parse it. + * Exception thrown when the underlying parser meets a bad record and can't parse it. + * The stacktrace is not collected for better performance, and thus, this exception should + * not be used in a user-facing context. * @param record a function to return the record that cause the parser to fail * @param partialResults a function that returns an row array, which is the partial results of * parsing this bad record. - * @param cause the actual exception about why the record is bad and can't be parsed. + * @param cause the actual exception about why the record is bad and can't be parsed. It's better + * to use `LazyBadRecordCauseWrapper` here to delay heavy cause construction + * until it's needed. */ case class BadRecordException( @transient record: () => UTF8String, @transient partialResults: () => Array[InternalRow] = () => Array.empty[InternalRow], - cause: Throwable) extends Exception(cause) + cause: Throwable) extends Exception(cause) { + override def getStackTrace(): Array[StackTraceElement] = new Array[StackTraceElement](0) + override def fillInStackTrace(): Throwable = this +} + +/** + * Exception to use as `BadRecordException` cause to delay heavy user-facing exception construction. + * It does not contain a stacktrace and is used only for control flow. + */ +case class LazyBadRecordCauseWrapper(cause: () => Throwable) extends Exception() { + override def getStackTrace(): Array[StackTraceElement] = new Array[StackTraceElement](0) + override def fillInStackTrace(): Throwable = this +} /** * Exception thrown when the underlying parser parses a JSON array as a struct.
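For context, a minimal standalone sketch of the two ideas introduced above for BadRecordException: suppressing stack-trace collection for a control-flow exception, and deferring construction of the heavy user-facing cause behind a thunk. The class and member names below are illustrative, not the patch's API.

object LazyCauseSketch {
  // A control-flow exception that skips stack traces and defers its cause.
  class ControlFlowParseError(buildCause: () => Throwable) extends Exception {
    override def fillInStackTrace(): Throwable = this            // skip expensive stack capture
    override def getStackTrace(): Array[StackTraceElement] = Array.empty
    def userFacingCause: Throwable = buildCause()                // built only when actually needed
  }

  // The heavy cause is never constructed on paths that swallow the error (e.g. DROPMALFORMED).
  val err = new ControlFlowParseError(() => new IllegalArgumentException("expensive message"))
}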
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala index 06a88b5d7b51b..011e385043d30 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util import scala.collection.mutable import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke import org.apache.spark.sql.catalyst.parser.CatalystSqlParser diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index e31ccdb818259..f1c36f2f5c28f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -698,7 +698,7 @@ object DateTimeUtils extends SparkDateTimeUtils { } } catch { case _: scala.MatchError => - throw SparkException.internalError(s"Got the unexpected unit '$unit'.") + throw QueryExecutionErrors.invalidDatetimeUnitError("TIMESTAMPADD", unit) case _: ArithmeticException | _: DateTimeException => throw QueryExecutionErrors.timestampAddOverflowError(micros, quantity, unit) case e: Throwable => @@ -736,7 +736,7 @@ object DateTimeUtils extends SparkDateTimeUtils { val endLocalTs = getLocalDateTime(endTs, zoneId) timestampDiffMap(unitInUpperCase)(startLocalTs, endLocalTs) } else { - throw SparkException.internalError(s"Got the unexpected unit '$unit'.") + throw QueryExecutionErrors.invalidDatetimeUnitError("TIMESTAMPDIFF", unit) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/FailureSafeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/FailureSafeParser.scala index 10cd159c769b2..d9946d1b12ec3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/FailureSafeParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/FailureSafeParser.scala @@ -78,10 +78,17 @@ class FailureSafeParser[IN]( case StringAsDataTypeException(fieldName, fieldValue, dataType) => throw QueryExecutionErrors.cannotParseStringAsDataTypeError(e.record().toString, fieldName, fieldValue, dataType) - case other => throw QueryExecutionErrors.malformedRecordsDetectedInRecordParsingError( - toResultRow(e.partialResults().headOption, e.record).toString, other) + case causeWrapper: LazyBadRecordCauseWrapper => + throwMalformedRecordsDetectedInRecordParsingError(e, causeWrapper.cause()) + case cause => throwMalformedRecordsDetectedInRecordParsingError(e, cause) } } } } + + private def throwMalformedRecordsDetectedInRecordParsingError( + e: BadRecordException, cause: Throwable): Nothing = { + throw QueryExecutionErrors.malformedRecordsDetectedInRecordParsingError( + toResultRow(e.partialResults().headOption, e.record).toString, cause) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala index deb817a0cdb70..46f14876be363 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala @@ -163,9 +163,9 @@ object GeneratedColumn { s"generation expression data type ${analyzed.dataType.simpleString} " + s"is incompatible with column data type ${dataType.simpleString}") } - if (analyzed.exists(e => SchemaUtils.hasNonBinarySortableCollatedString(e.dataType))) { + if (analyzed.exists(e => SchemaUtils.hasNonUTF8BinaryCollation(e.dataType))) { throw unsupportedExpressionError( - "generation expression cannot contain non-binary orderable collated string type") + "generation expression cannot contain non utf8 binary collated string type") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapper.scala index 9a0bdc6bcfd11..90e3bdcd082cd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapper.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.{Expression, Murmur3HashFunctio import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition} import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.util.NonFateSharingCache /** * Wraps the [[InternalRow]] with the corresponding [[DataType]] to make it comparable with @@ -34,9 +35,10 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType} * @param dataTypes the data types for the row */ class InternalRowComparableWrapper(val row: InternalRow, val dataTypes: Seq[DataType]) { + import InternalRowComparableWrapper._ - private val structType = StructType(dataTypes.map(t => StructField("f", t))) - private val ordering = RowOrdering.createNaturalAscendingOrdering(dataTypes) + private val structType = structTypeCache.get(dataTypes) + private val ordering = orderingCache.get(dataTypes) override def hashCode(): Int = Murmur3HashFunction.hash(row, structType, 42L).toInt @@ -53,6 +55,21 @@ class InternalRowComparableWrapper(val row: InternalRow, val dataTypes: Seq[Data } object InternalRowComparableWrapper { + private final val MAX_CACHE_ENTRIES = 1024 + + private val orderingCache = { + val loadFunc = (dataTypes: Seq[DataType]) => { + RowOrdering.createNaturalAscendingOrdering(dataTypes) + } + NonFateSharingCache(loadFunc, MAX_CACHE_ENTRIES) + } + + private val structTypeCache = { + val loadFunc = (dataTypes: Seq[DataType]) => { + StructType(dataTypes.map(t => StructField("f", t))) + } + NonFateSharingCache(loadFunc, MAX_CACHE_ENTRIES) + } def apply( partition: InputPartition with HasPartitionKey, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ParseMode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ParseMode.scala index dd1e466d1b38f..cc1a01083af41 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ParseMode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ParseMode.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util import java.util.Locale import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PARSE_MODE +import org.apache.spark.internal.LogKeys.PARSE_MODE sealed 
trait ParseMode { /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala index db9adef8ef3b2..6b4f29bea7579 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala @@ -21,14 +21,14 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkThrowable, SparkUnsupportedOperationException} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.{Literal => ExprLiteral} -import org.apache.spark.sql.catalyst.optimizer.ConstantFolding +import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ReplaceExpressions} import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.TreePattern.PLAN_EXPRESSION @@ -169,7 +169,7 @@ object ResolveDefaultColumns extends QueryErrorsBase def resolveColumnDefaultInAssignmentValue( key: Expression, value: Expression, - invalidColumnDefaultException: Throwable): Expression = { + invalidColumnDefaultException: => Throwable): Expression = { key match { case attr: AttributeReference => value match { @@ -284,12 +284,13 @@ object ResolveDefaultColumns extends QueryErrorsBase throw QueryCompilationErrors.defaultValuesMayNotContainSubQueryExpressions( statementType, colName, defaultSQL) } + // Analyze the parse result. val plan = try { val analyzer: Analyzer = DefaultColumnAnalyzer val analyzed = analyzer.execute(Project(Seq(Alias(parsed, colName)()), OneRowRelation())) analyzer.checkAnalysis(analyzed) - ConstantFolding(analyzed) + ConstantFolding(ReplaceExpressions(analyzed)) } catch { case ex: AnalysisException => throw QueryCompilationErrors.defaultValuesUnresolvedExprError( @@ -298,6 +299,21 @@ object ResolveDefaultColumns extends QueryErrorsBase val analyzed: Expression = plan.collectFirst { case Project(Seq(a: Alias), OneRowRelation()) => a.child }.get + + if (!analyzed.foldable) { + throw QueryCompilationErrors.defaultValueNotConstantError(statementType, colName, defaultSQL) + } + + // Another extra check, expressions should already be resolved if AnalysisException is not + // thrown in the code block above + if (!analyzed.resolved) { + throw QueryCompilationErrors.defaultValuesUnresolvedExprError( + statementType, + colName, + defaultSQL, + cause = null) + } + // Perform implicit coercion from the provided expression type to the required column type. 
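// Illustration of the `invalidColumnDefaultException: => Throwable` signature change above:
// with a by-name parameter the exception (and its message) is only constructed when the check
// actually fails. Names below are hypothetical, not the patch's API.
object ByNameErrorSketch {
  def requireConstant(isFoldable: Boolean, error: => Throwable): Unit = {
    if (!isFoldable) throw error
  }

  def expensiveMessage(): String = {
    // imagine formatting a large plan or SQL text here
    "DEFAULT value must be a constant expression"
  }

  // Passing check: expensiveMessage() is never evaluated because `error` is by-name.
  requireConstant(isFoldable = true, new IllegalArgumentException(expensiveMessage()))
}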
coerceDefaultValue(analyzed, dataType, statementType, colName, defaultSQL) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index 04df3635d4754..e2a5319cbe1ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -22,7 +22,7 @@ import java.util.regex.{Pattern, PatternSyntaxException} import org.apache.commons.text.similarity.LevenshteinDistance import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf @@ -66,12 +66,6 @@ object StringUtils extends Logging { "(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines } - /** - * Returns a pretty string of the byte array which prints each byte as a hex digit and add spaces - * between them. For example, [1A C0]. - */ - def getHexString(bytes: Array[Byte]): String = bytes.map("%02X".format(_)).mkString("[", " ", "]") - private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index d2c708b380cf5..a0d578c66e736 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -58,7 +58,7 @@ object TypeUtils extends QueryErrorsBase { } def checkForMapKeyType(keyType: DataType): TypeCheckResult = { - if (keyType.existsRecursively(_.isInstanceOf[MapType])) { + if (keyType.existsRecursively(dt => dt.isInstanceOf[MapType] || dt.isInstanceOf[VariantType])) { DataTypeMismatch( errorSubClass = "INVALID_MAP_KEY_TYPE", messageParameters = Map( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala index 87f0b50b9af22..514138ab7508a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala @@ -169,8 +169,8 @@ class StaxXmlGenerator( def writeElement(dt: DataType, v: Any, options: XmlOptions): Unit = (dt, v) match { case (_, null) | (NullType, _) => gen.writeCharacters(options.nullValue) - case (StringType, v: UTF8String) => gen.writeCharacters(v.toString) - case (StringType, v: String) => gen.writeCharacters(v) + case (_: StringType, v: UTF8String) => gen.writeCharacters(v.toString) + case (_: StringType, v: String) => gen.writeCharacters(v) case (TimestampType, v: Timestamp) => gen.writeCharacters(timestampFormatter.format(v.toInstant())) case (TimestampType, v: Long) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala index 725ef8fe79f79..9a0528468842c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala @@ 
-397,8 +397,7 @@ class StaxXmlParser( row(anyIndex) = values :+ newValue } } else { - StaxXmlParserUtils.skipChildren(parser) - StaxXmlParserUtils.skipNextEndElement(parser, field, options) + StaxXmlParserUtils.skipChildren(parser, field, options) } } } catch { @@ -802,19 +801,6 @@ class XmlTokenizer( commentIdx = 0 } - if (c == cdataStart(cdataIdx)) { - if (cdataIdx >= cdataStart.length - 1) { - // If a CDATA beigns we must ignore everything until its end - buffer.setLength(buffer.length - cdataStart.length) - cdataIdx = 0 - readUntilMatch(cdataEnd) - } else { - cdataIdx += 1 - } - } else { - cdataIdx = 0 - } - if (c == '>' && prevC != '/') { canSelfClose = false } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala index a59ea6f460dee..5d267143b06c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala @@ -165,24 +165,27 @@ object StaxXmlParserUtils { /** * Skip the children of the current XML element. + * Before this function is called, the 'startElement' of the object has already been consumed. + * Upon completion, this function consumes the 'endElement' of the object, + * which effectively skipping the entire object enclosed within these elements. */ - def skipChildren(parser: XMLEventReader): Unit = { - var shouldStop = checkEndElement(parser) + def skipChildren( + parser: XMLEventReader, + expectedNextEndElementName: String, + options: XmlOptions): Unit = { + var shouldStop = false while (!shouldStop) { parser.nextEvent match { - case _: StartElement => - val e = parser.peek - if (e.isCharacters && e.asCharacters.isWhiteSpace) { - // There can be a `Characters` event between `StartElement`s. - // So, we need to check further to decide if this is a data or just - // a whitespace between them. 
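// A self-contained sketch of the recursive skip strategy that the new skipChildren above
// implements, using only javax.xml.stream. It assumes the StartElement of the element being
// skipped has already been consumed, mirroring the contract described in the new scaladoc;
// it is not the patch's code.
import javax.xml.stream.XMLEventReader
import javax.xml.stream.events.{EndElement, StartElement, XMLEvent}

object SkipSketch {
  def skipElement(reader: XMLEventReader): Unit = {
    var done = false
    while (!done) {
      reader.nextEvent() match {
        case _: StartElement => skipElement(reader) // recurse into nested children
        case _: EndElement   => done = true         // matching end tag: whole element consumed
        case _: XMLEvent     =>                     // characters, comments, etc. are ignored
      }
    }
  }
}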
- parser.next - } - if (parser.peek.isStartElement) { - skipChildren(parser) - } - case _: EndElement => - shouldStop = checkEndElement(parser) + case startElement: StartElement => + val childField = StaxXmlParserUtils.getName(startElement.asStartElement.getName, options) + skipChildren(parser, childField, options) + case endElement: EndElement => + val endElementName = getName(endElement.getName, options) + assert( + endElementName == expectedNextEndElementName, + s"Expected EndElement </$expectedNextEndElementName>, but found </$endElementName>" + ) + shouldStop = true case _: XMLEvent => // do nothing } } @@ -197,9 +200,10 @@ object StaxXmlParserUtils { case c: Characters if c.isWhiteSpace => skipNextEndElement(parser, expectedNextEndElementName, options) case endElement: EndElement => + val endElementName = getName(endElement.getName, options) assert( - getName(endElement.getName, options) == expectedNextEndElementName, - s"Expected EndElement </$expectedNextEndElementName>") + endElementName == expectedNextEndElementName, + s"Expected EndElement </$expectedNextEndElementName>, but found </$endElementName>") case _ => throw new IllegalStateException( s"Expected EndElement </$expectedNextEndElementName>") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala index 3d93c4e8742ab..8a5291d0bac74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkFiles import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.MDC /** @@ -42,7 +42,7 @@ object ValidatorUtil extends Logging { val in = openSchemaFile(new Path(key)) try { val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) - schemaFactory.newSchema(new StreamSource(in)) + schemaFactory.newSchema(new StreamSource(in, key)) } finally { in.close() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala index 5485f5255b6e7..f36310e8ad899 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala @@ -525,10 +525,15 @@ private[sql] object CatalogV2Util { } if (isDefaultColumn) { - val e = analyze(f, EXISTS_DEFAULT_COLUMN_METADATA_KEY) + val e = analyze( + f, + statementType = "Column analysis", + metadataKey = EXISTS_DEFAULT_COLUMN_METADATA_KEY) + assert(e.resolved && e.foldable, "The existence default value must be a simple SQL string that is resolved and foldable, " + "but got: " + f.getExistenceDefaultValue().get) + val defaultValue = new ColumnDefaultValue( f.getCurrentDefaultValue().get, LiteralValue(e.eval(), f.dataType)) val cleanedMetadata = metadataWithKeysRemoved( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala index fc41d5a98e4aa..b43e627c0eece 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala @@ -17,6 +17,8 @@ package
org.apache.spark.sql.connector.expressions +import org.apache.commons.lang3.StringUtils + import org.apache.spark.SparkException import org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -388,7 +390,7 @@ private[sql] object HoursTransform { private[sql] final case class LiteralValue[T](value: T, dataType: DataType) extends Literal[T] { override def toString: String = { if (dataType.isInstanceOf[StringType]) { - s"'$value'" + s"'${StringUtils.replace(s"$value", "'", "''")}'" } else { s"$value" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 6ad5624d4730a..d3bd265d0459e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -175,6 +175,13 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "functionName" -> toSQLId(funcName))) } + def nullDataSourceOption(option: String): Throwable = { + new AnalysisException( + errorClass = "NULL_DATA_SOURCE_OPTION", + messageParameters = Map("option" -> option) + ) + } + def unorderablePivotColError(pivotCol: Expression): Throwable = { new AnalysisException( errorClass = "INCOMPARABLE_PIVOT_COLUMN", @@ -2687,6 +2694,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat Map("tableName" -> toSQLId(tableName), "columnName" -> toSQLId(columnName)) ) } + + def cannotAlterCollationBucketColumn(tableName: String, columnName: String): Throwable = { + new AnalysisException( + errorClass = "CANNOT_ALTER_COLLATION_BUCKET_COLUMN", + messageParameters = + Map("tableName" -> toSQLId(tableName), "columnName" -> toSQLId(columnName)) + ) + } + def cannotFindColumnError(name: String, fieldNames: Array[String]): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_1246", @@ -2970,6 +2986,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "dataColumns" -> query.output.map(c => toSQLId(c.name)).mkString(", "))) } + def cannotAlterTempViewWithSchemaBindingError() : Throwable = { + new AnalysisException( + errorClass = "UNSUPPORTED_FEATURE.TEMPORARY_VIEW_WITH_SCHEMA_BINDING_MODE", + messageParameters = Map.empty) + } + def unsupportedCreateOrReplaceViewOnTableError( name: TableIdentifier, replace: Boolean): Throwable = { if (replace) { @@ -3227,6 +3249,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) } + def invalidSingleVariantColumn(): Throwable = { + new AnalysisException( + errorClass = "INVALID_SINGLE_VARIANT_COLUMN", + messageParameters = Map.empty) + } + def writeWithSaveModeUnsupportedBySourceError(source: String, createMode: String): Throwable = { new AnalysisException( errorClass = "UNSUPPORTED_DATA_SOURCE_SAVE_MODE", @@ -3295,7 +3323,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat def invalidJoinTypeInJoinWithError(joinType: JoinType): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_1319", + errorClass = "INVALID_JOIN_TYPE_FOR_JOINWITH", messageParameters = Map("joinType" -> joinType.sql)) } @@ -4055,4 +4083,18 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat callDeprecatedMethodError("createTable(..., StructType, ...)", "createTable(..., Array[Column], ...)") } + + def 
cannotAssignEventTimeColumn(): Throwable = { + new AnalysisException( + errorClass = "CANNOT_ASSIGN_EVENT_TIME_COLUMN_WITHOUT_WATERMARK", + messageParameters = Map() + ) + } + + def avroNotLoadedSqlFunctionsUnusable(functionName: String): Throwable = { + new AnalysisException( + errorClass = "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE", + messageParameters = Map("functionName" -> functionName) + ) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index ceb90fe6bea50..6fb09bdeffc51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -211,6 +211,15 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE summary = getSummary(context)) } + def invalidUTF8StringError(str: UTF8String): SparkIllegalArgumentException = { + new SparkIllegalArgumentException( + errorClass = "INVALID_UTF8_STRING", + messageParameters = Map( + "str" -> str.getBytes.map(byte => f"\\x$byte%02X").mkString + ) + ) + } + def invalidArrayIndexError( index: Int, numElements: Int, @@ -629,7 +638,7 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE } def failedToCompileMsg(e: Exception): String = { - s"failed to compile: $e" + s"Failed to compile: $e" } def internalCompilerError(e: InternalCompilerException): Throwable = { @@ -2727,6 +2736,11 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE messageParameters = Map("path" -> path, "functionName" -> toSQLId(functionName))) } + def malformedVariant(): Throwable = new SparkRuntimeException( + "MALFORMED_VARIANT", + Map.empty + ) + def invalidCharsetError(functionName: String, charset: String): RuntimeException = { new SparkIllegalArgumentException( errorClass = "INVALID_PARAMETER_VALUE.CHARSET", @@ -2736,6 +2750,14 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "charset" -> charset)) } + def malformedCharacterCoding(functionName: String, charset: String): RuntimeException = { + new SparkRuntimeException( + errorClass = "MALFORMED_CHARACTER_CODING", + messageParameters = Map( + "function" -> toSQLId(functionName), + "charset" -> charset)) + } + def invalidWriterCommitMessageError(details: String): Throwable = { new SparkRuntimeException( errorClass = "INVALID_WRITER_COMMIT_MESSAGE", @@ -2757,4 +2779,35 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "numFields" -> numFields.toString, "schemaLen" -> schemaLen.toString)) } + + def emittedRowsAreOlderThanWatermark( + currentWatermark: Long, emittedRowEventTime: Long): SparkRuntimeException = { + new SparkRuntimeException( + errorClass = "EMITTING_ROWS_OLDER_THAN_WATERMARK_NOT_ALLOWED", + messageParameters = Map( + "currentWatermark" -> currentWatermark.toString, + "emittedRowEventTime" -> emittedRowEventTime.toString + ) + ) + } + + def notNullAssertViolation(walkedTypePath: String): SparkRuntimeException = { + new SparkRuntimeException( + errorClass = "NOT_NULL_ASSERT_VIOLATION", + messageParameters = Map( + "walkedTypePath" -> walkedTypePath + ) + ) + } + + def invalidDatetimeUnitError( + functionName: String, + invalidValue: String): Throwable = { + new SparkIllegalArgumentException( + errorClass = "INVALID_PARAMETER_VALUE.DATETIME_UNIT", + messageParameters = Map( + "functionName" -> 
toSQLId(functionName), + "parameter" -> toSQLId("unit"), + "invalidValue" -> s"'$invalidValue'")) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e5ba1be0f5f4f..4e7c6c180e9a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -535,8 +535,7 @@ object SQLConf { val COLUMN_VECTOR_OFFHEAP_ENABLED = buildConf("spark.sql.columnVector.offheap.enabled") .internal() - .doc("When true, use OffHeapColumnVector in ColumnarBatch. " + - s"Defaults to $MEMORY_OFFHEAP_ENABLED.") + .doc("When true, use OffHeapColumnVector in ColumnarBatch.") .version("2.3.0") .fallbackConf(MEMORY_OFFHEAP_ENABLED) @@ -772,14 +771,28 @@ " produced by a builtin function such as to_char or CAST") .version("4.0.0") .stringConf - .checkValue(CollationFactory.isValidCollation, + .checkValue( + collationName => { + try { + CollationFactory.fetchCollation(collationName) + true + } catch { + case e: SparkException if e.getErrorClass == "COLLATION_INVALID_NAME" => false + } + }, "DEFAULT_COLLATION", - name => - Map( - "proposal" -> CollationFactory.getClosestCollation(name) - )) + collationName => Map( + "proposals" -> CollationFactory.getClosestSuggestionsOnInvalidName(collationName, 3))) .createWithDefault("UTF8_BINARY") + val ICU_CASE_MAPPINGS_ENABLED = + buildConf("spark.sql.icu.caseMappings.enabled") + .doc("When enabled we use the ICU library (instead of the JVM) to implement case mappings" + + " for strings under UTF8_BINARY collation.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val FETCH_SHUFFLE_BLOCKS_IN_BATCH = buildConf("spark.sql.adaptive.fetchShuffleBlocksInBatch") .internal() @@ -1495,6 +1508,48 @@ .booleanConf .createWithDefault(true) + /** + * Output style for binary data. + */ + object BinaryOutputStyle extends Enumeration { + type BinaryOutputStyle = Value + val + /** + * Output as UTF-8 string. + * [83, 112, 97, 114, 107] -> "Spark" + */ + UTF8, + /** + * Output as comma separated byte array string. + * [83, 112, 97, 114, 107] -> [83, 112, 97, 114, 107] + */ + BASIC, + /** + * Output as base64 encoded string. + * [83, 112, 97, 114, 107] -> U3Bhcms= + */ + BASE64, + /** + * Output as hex string. + * [83, 112, 97, 114, 107] -> 537061726b + */ + HEX, + /** + * Output as discrete hex string. + * [83, 112, 97, 114, 107] -> [53 70 61 72 6b] + */ + HEX_DISCRETE = Value + } + + val BINARY_OUTPUT_STYLE = buildConf("spark.sql.binaryOutputStyle") + .doc("The output style used to display binary data. Valid values are 'UTF8', " + + "'BASIC', 'BASE64', 'HEX', and 'HEX_DISCRETE'.") + .version("4.0.0") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValues(BinaryOutputStyle.values.map(_.toString)) + .createOptional + val PARTITION_COLUMN_TYPE_INFERENCE = buildConf("spark.sql.sources.partitionColumnTypeInference.enabled") .doc("When true, automatically infer the data types for partitioned columns.") @@ -1527,7 +1582,7 @@ "side.
This could help to eliminate unnecessary shuffles") .version("3.4.0") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED = buildConf("spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled") @@ -1658,6 +1713,22 @@ .booleanConf .createWithDefault(true) + val VIEW_SCHEMA_BINDING_ENABLED = buildConf("spark.sql.legacy.viewSchemaBindingMode") + .internal() + .doc("Set to false to disable the WITH SCHEMA clause for view DDL and suppress the line in " + + "DESCRIBE EXTENDED and SHOW CREATE TABLE.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + + val VIEW_SCHEMA_COMPENSATION = buildConf("spark.sql.legacy.viewSchemaCompensation") + .internal() + .doc("Set to false to revert default view schema binding mode from WITH SCHEMA COMPENSATION " + + "to WITH SCHEMA BINDING.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + // The output committer class used by data sources. The specified class needs to be a // subclass of org.apache.hadoop.mapreduce.OutputCommitter. val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass") @@ -1914,6 +1985,14 @@ .booleanConf .createWithDefault(false) + val IGNORE_INVALID_PARTITION_PATHS = buildConf("spark.sql.files.ignoreInvalidPartitionPaths") + .doc("Whether to ignore invalid partition paths that do not match <column>=<value>. When " + + "the option is enabled, table with two partition directories 'table/invalid' and " + + "'table/col=1' will only load the latter directory and ignore the invalid partition") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val MAX_RECORDS_PER_FILE = buildConf("spark.sql.files.maxRecordsPerFile") .doc("Maximum number of records to write out to a single file. " + "If this value is zero or negative, there is no limit.") @@ -2065,6 +2144,13 @@ .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(TimeUnit.MINUTES.toMillis(1)) // 1 minute + val STREAMING_TRANSFORM_WITH_STATE_OP_STATE_SCHEMA_VERSION = + buildConf("spark.sql.streaming.transformWithState.stateSchemaVersion") + .doc("The version of the state schema used by the transformWithState operator") + .version("4.0.0") + .intConf + .createWithDefault(3) + val STATE_STORE_COMPRESSION_CODEC = buildConf("spark.sql.streaming.stateStore.compression.codec") .internal() @@ -2238,7 +2324,9 @@ buildConf("spark.sql.streaming.stateStore.skipNullsForStreamStreamJoins.enabled") .internal() .doc("When true, this config will skip null values in hash based stream-stream joins. " + - "The number of skipped null values will be shown as custom metric of stream join operator.") + "The number of skipped null values will be shown as custom metric of stream join operator. " + + "If the streaming query was started with Spark 3.5 or above, please exercise caution " + + "before enabling this config since it may hide potential data loss/corruption issues.") .version("3.3.0") .booleanConf .createWithDefault(false) @@ -2271,6 +2359,17 @@ .booleanConf .createWithDefault(false) + val STREAMING_OPTIMIZE_ONE_ROW_PLAN_ENABLED = + buildConf("spark.sql.streaming.optimizeOneRowPlan.enabled") + .internal() + .doc("When true, enable OptimizeOneRowPlan rule for the case where the child is a " + + "streaming Dataset. This is a fallback flag to revert the 'incorrect' behavior, hence " + + "this configuration must not be used without understanding in depth.
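// Sketch of the directory-name check behind spark.sql.files.ignoreInvalidPartitionPaths above:
// only directories shaped like <column>=<value> are treated as partitions, so 'table/invalid'
// would be ignored while 'table/col=1' is loaded. The regex and helper are illustrative only,
// not Spark's actual partition-discovery code.
object PartitionPathSketch {
  private val partitionDir = "([^=/]+)=([^=/]*)".r

  def looksLikePartitionDir(name: String): Boolean = name match {
    case partitionDir(_, _) => true
    case _ => false
  }

  assert(looksLikePartitionDir("col=1"))
  assert(!looksLikePartitionDir("invalid"))
}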
Use this only to " + + "quickly recover failure in existing query!") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val VARIABLE_SUBSTITUTE_ENABLED = buildConf("spark.sql.variable.substitute") .doc("This enables substitution using syntax like `${var}`, `${system:var}`, " + @@ -2524,6 +2623,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val AVOID_COLLAPSE_UDF_WITH_EXPENSIVE_EXPR = + buildConf("spark.sql.optimizer.avoidCollapseUDFWithExpensiveExpr") + .doc("Whether to avoid collapsing projections that would duplicate expensive expressions " + + "in UDFs.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val FILE_SINK_LOG_DELETION = buildConf("spark.sql.streaming.fileSink.log.deletion") .internal() .doc("Whether to delete the expired log files in file stream sink.") @@ -2874,6 +2981,22 @@ object SQLConf { .intConf .createWithDefault(SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD.defaultValue.get) + val SHUFFLE_DEPENDENCY_SKIP_MIGRATION_ENABLED = + buildConf("spark.sql.shuffleDependency.skipMigration.enabled") + .doc("When enabled, shuffle dependencies for a Spark Connect SQL execution are marked at " + + "the end of the execution, and they will not be migrated during decommissions.") + .version("4.0.0") + .booleanConf + .createWithDefault(Utils.isTesting) + + val SHUFFLE_DEPENDENCY_FILE_CLEANUP_ENABLED = + buildConf("spark.sql.shuffleDependency.fileCleanup.enabled") + .doc("When enabled, shuffle files will be cleaned up at the end of Spark Connect " + + "SQL executions.") + .version("4.0.0") + .booleanConf + .createWithDefault(Utils.isTesting) + val SORT_MERGE_JOIN_EXEC_BUFFER_IN_MEMORY_THRESHOLD = buildConf("spark.sql.sortMergeJoinExec.buffer.in.memory.threshold") .internal() @@ -3371,7 +3494,7 @@ object SQLConf { "standard directly, but their behaviors align with ANSI SQL's style") .version("3.0.0") .booleanConf - .createWithDefault(sys.env.get("SPARK_ANSI_SQL_MODE").contains("true")) + .createWithDefault(!sys.env.get("SPARK_ANSI_SQL_MODE").contains("false")) val ENFORCE_RESERVED_KEYWORDS = buildConf("spark.sql.ansi.enforceReservedKeywords") .doc(s"When true and '${ANSI_ENABLED.key}' is true, the Spark SQL parser enforces the ANSI " + @@ -3443,6 +3566,17 @@ object SQLConf { .booleanConf .createWithDefault(false) + val USE_COMMON_EXPR_ID_FOR_ALIAS = + buildConf("spark.sql.useCommonExprIdForAlias") + .internal() + .doc("When true, use the common expression ID for the alias when rewriting With " + + "expressions. Otherwise, use the index of the common expression definition. When true " + + "this avoids duplicate alias names, but is helpful to set to false for testing to ensure" + + "that alias names are consistent.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val USE_NULLS_FOR_MISSING_DEFAULT_COLUMN_VALUES = buildConf("spark.sql.defaultColumn.useNullsForMissingDefaultValues") .internal() @@ -3736,7 +3870,7 @@ object SQLConf { .checkValues((1 to 9).toSet + Deflater.DEFAULT_COMPRESSION) .createOptional - val AVRO_XZ_LEVEL = buildConf("spark.sql.avro.zx.level") + val AVRO_XZ_LEVEL = buildConf("spark.sql.avro.xz.level") .doc("Compression level for the xz codec used in writing of AVRO files. 
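// The ANSI_ENABLED default change above flips the SPARK_ANSI_SQL_MODE environment variable from
// opt-in to opt-out. A small sketch of the new semantics (the helper is illustrative only):
object AnsiDefaultSketch {
  def ansiDefault(env: Map[String, String]): Boolean =
    !env.get("SPARK_ANSI_SQL_MODE").contains("false")

  assert(ansiDefault(Map.empty))                              // unset   -> ANSI on
  assert(ansiDefault(Map("SPARK_ANSI_SQL_MODE" -> "true")))   // "true"  -> ANSI on
  assert(!ansiDefault(Map("SPARK_ANSI_SQL_MODE" -> "false"))) // "false" -> explicit opt-out
}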
" + "Valid value must be in the range of from 1 to 9 inclusive " + "The default value is 6.") @@ -4122,11 +4256,20 @@ object SQLConf { val LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED = buildConf("spark.sql.legacy.mssqlserver.numericMapping.enabled") .internal() - .doc("When true, use legacy MsSqlServer SMALLINT and REAL type mapping.") + .doc("When true, use legacy MsSqlServer TINYINT, SMALLINT and REAL type mapping.") .version("2.4.5") .booleanConf .createWithDefault(false) + val LEGACY_MSSQLSERVER_DATETIMEOFFSET_MAPPING_ENABLED = + buildConf("spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled") + .internal() + .doc("When true, DATETIMEOFFSET is mapped to StringType; otherwise, it is mapped to " + + "TimestampType.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val LEGACY_MYSQL_BIT_ARRAY_MAPPING_ENABLED = buildConf("spark.sql.legacy.mysql.bitArrayMapping.enabled") .internal() @@ -4135,6 +4278,17 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_MYSQL_TIMESTAMPNTZ_MAPPING_ENABLED = + buildConf("spark.sql.legacy.mysql.timestampNTZMapping.enabled") + .internal() + .doc("When true, TimestampNTZType and MySQL TIMESTAMP can be converted bidirectionally. " + + "For reading, MySQL TIMESTAMP is converted to TimestampNTZType when JDBC read option " + + "preferTimestampNTZ is true. For writing, TimestampNTZType is converted to MySQL " + + "TIMESTAMP; otherwise, DATETIME") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val LEGACY_ORACLE_TIMESTAMP_MAPPING_ENABLED = buildConf("spark.sql.legacy.oracle.timestampMapping.enabled") .internal() @@ -4144,6 +4298,33 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED = + buildConf("spark.sql.legacy.db2.numericMapping.enabled") + .internal() + .doc("When true, SMALLINT maps to IntegerType in DB2; otherwise, ShortType" ) + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val LEGACY_DB2_BOOLEAN_MAPPING_ENABLED = + buildConf("spark.sql.legacy.db2.booleanMapping.enabled") + .internal() + .doc("When true, BooleanType maps to CHAR(1) in DB2; otherwise, BOOLEAN" ) + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val LEGACY_POSTGRES_DATETIME_MAPPING_ENABLED = + buildConf("spark.sql.legacy.postgres.datetimeMapping.enabled") + .internal() + .doc("When true, TimestampType maps to TIMESTAMP WITHOUT TIME ZONE in PostgreSQL for " + + "writing; otherwise, TIMESTAMP WITH TIME ZONE. 
When true, TIMESTAMP WITH TIME ZONE " + "can be converted to TimestampNTZType when JDBC read option preferTimestampNTZ is " + "true; otherwise, converted to TimestampType regardless of preferTimestampNTZ.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val CSV_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.csv.filterPushdown.enabled") .doc("When true, enable filter pushdown to CSV datasource.") .version("3.0.0") @@ -4171,6 +4352,15 @@ .booleanConf .createWithDefault(true) + val JSON_EXACT_STRING_PARSING = + buildConf("spark.sql.json.enableExactStringParsing") + .internal() + .doc("When set to true, string columns extracted from JSON objects will be extracted " + + "exactly as they appear in the input string, with no changes") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val LEGACY_CSV_ENABLE_DATE_TIME_PARSING_FALLBACK = buildConf("spark.sql.legacy.csv.enableDateTimeParsingFallback") .internal() @@ -4421,7 +4611,7 @@ s"instead of the value of ${DEFAULT_DATA_SOURCE_NAME.key} as the table provider.") .version("3.1.0") .booleanConf - .createWithDefault(true) + .createWithDefault(sys.env.get("SPARK_SQL_LEGACY_CREATE_HIVE_TABLE").contains("true")) val LEGACY_CHAR_VARCHAR_AS_STRING = buildConf("spark.sql.legacy.charVarcharAsString") @@ -4449,6 +4639,14 @@ .booleanConf .createWithDefault(true) + val LEGACY_NO_CHAR_PADDING_IN_PREDICATE = buildConf("spark.sql.legacy.noCharPaddingInPredicate") + .internal() + .doc("When true, Spark will not apply char type padding for CHAR type columns in string " + + s"comparison predicates, when '${READ_SIDE_CHAR_PADDING.key}' is false.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val CLI_PRINT_HEADER = buildConf("spark.sql.cli.print.header") .doc("When set to true, spark-sql CLI prints the names of the columns in query output.") @@ -4508,6 +4706,7 @@ val LEGACY_INFER_ARRAY_TYPE_FROM_FIRST_ELEMENT = buildConf("spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled") + .internal() .doc("PySpark's SparkSession.createDataFrame infers the element type of an array from all " + "values in the array by default. If this config is set to true, it restores the legacy " + "behavior of only inferring the type from the first array element.") @@ -4515,6 +4714,16 @@ .booleanConf .createWithDefault(false) + val LEGACY_INFER_MAP_STRUCT_TYPE_FROM_FIRST_ITEM = + buildConf("spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled") + .internal() + .doc("PySpark's SparkSession.createDataFrame infers the key/value types of a map from all " + + "pairs in the map by default. If this config is set to true, it restores the legacy " + + "behavior of only inferring the type from the first non-null pair.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val LEGACY_USE_V1_COMMAND = buildConf("spark.sql.legacy.useV1Command") .internal() @@ -4745,6 +4954,24 @@ .booleanConf .createWithDefault(false) + val LEGACY_SCALAR_SUBQUERY_ALLOW_GROUP_BY_NON_EQUALITY_CORRELATED_PREDICATE = + buildConf("spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate") + .internal() + .doc("When set to true, use incorrect legacy behavior for checking whether a scalar " + + "subquery with a group-by on correlated columns is allowed.
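// Sketch of the inference difference behind the inferMapTypeFromFirstPair legacy flag above:
// the default now merges over all key/value pairs instead of trusting only the first one.
// Using runtime classes as a stand-in for Spark types is purely illustrative.
object InferenceSketch {
  def inferFromFirst(values: Seq[Any]): Class[_] = values.head.getClass

  def inferFromAll(values: Seq[Any]): Class[_] = {
    val classes = values.map(_.getClass).distinct
    if (classes.size == 1) classes.head else classOf[AnyRef] // widen when the pairs disagree
  }

  private val mapValues: Seq[Any] = Seq(1, "a")
  assert(inferFromFirst(mapValues) == classOf[java.lang.Integer]) // legacy: first pair wins
  assert(inferFromAll(mapValues) == classOf[AnyRef])              // default: widened
}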
See SPARK-48503") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + + val SCALAR_SUBQUERY_ALLOW_GROUP_BY_COLUMN_EQUAL_TO_CONSTANT = + buildConf("spark.sql.analyzer.scalarSubqueryAllowGroupByColumnEqualToConstant") + .internal() + .doc("When set to true, allow scalar subqueries with group-by on a column that also " + + " has an equality filter with a constant (SPARK-48557).") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val ALLOW_SUBQUERY_EXPRESSIONS_IN_LAMBDAS_AND_HIGHER_ORDER_FUNCTIONS = buildConf("spark.sql.analyzer.allowSubqueryExpressionsInLambdasOrHigherOrderFunctions") .internal() @@ -4798,6 +5025,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_CODING_ERROR_ACTION = buildConf("spark.sql.legacy.codingErrorAction") + .internal() + .doc("When set to true, encode/decode functions replace unmappable characters with mojibake " + + "instead of reporting coding errors.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val LEGACY_EVAL_CURRENT_TIME = buildConf("spark.sql.legacy.earlyEvalCurrentTime") .internal() .doc("When set to true, evaluation and constant folding will happen for now() and " + @@ -4810,6 +5045,16 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_BANG_EQUALS_NOT = buildConf("spark.sql.legacy.bangEqualsNot") + .internal() + .doc("When set to true, '!' is a lexical equivalent for 'NOT'. That is '!' can be used " + + "outside of the documented prefix usage in a logical expression." + + "Examples are: `expr ! IN (1, 2)` and `expr ! BETWEEN 1 AND 2`, but also `IF ! EXISTS`." + ) + .version("4.0.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * @@ -5061,6 +5306,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def ignoreMissingFiles: Boolean = getConf(IGNORE_MISSING_FILES) + def ignoreInvalidPartitionPaths: Boolean = getConf(IGNORE_INVALID_PARTITION_PATHS) + def maxRecordsPerFile: Long = getConf(MAX_RECORDS_PER_FILE) def useCompression: Boolean = getConf(COMPRESS_CACHED) @@ -5241,12 +5488,27 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyMsSqlServerNumericMappingEnabled: Boolean = getConf(LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED) + def legacyMsSqlServerDatetimeOffsetMappingEnabled: Boolean = + getConf(LEGACY_MSSQLSERVER_DATETIMEOFFSET_MAPPING_ENABLED) + def legacyMySqlBitArrayMappingEnabled: Boolean = getConf(LEGACY_MYSQL_BIT_ARRAY_MAPPING_ENABLED) + def legacyMySqlTimestampNTZMappingEnabled: Boolean = + getConf(LEGACY_MYSQL_TIMESTAMPNTZ_MAPPING_ENABLED) + def legacyOracleTimestampMappingEnabled: Boolean = getConf(LEGACY_ORACLE_TIMESTAMP_MAPPING_ENABLED) + def legacyDB2numericMappingEnabled: Boolean = + getConf(LEGACY_DB2_TIMESTAMP_MAPPING_ENABLED) + + def legacyDB2BooleanMappingEnabled: Boolean = + getConf(LEGACY_DB2_BOOLEAN_MAPPING_ENABLED) + + def legacyPostgresDatetimeMappingEnabled: Boolean = + getConf(LEGACY_POSTGRES_DATETIME_MAPPING_ENABLED) + override def legacyTimeParserPolicy: LegacyBehaviorPolicy.Value = { LegacyBehaviorPolicy.withName(getConf(SQLConf.LEGACY_TIME_PARSER_POLICY)) } @@ -5392,6 +5654,10 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def groupByAliases: Boolean = getConf(GROUP_BY_ALIASES) + def viewSchemaBindingEnabled: Boolean = getConf(VIEW_SCHEMA_BINDING_ENABLED) + + def viewSchemaCompensation: Boolean = getConf(VIEW_SCHEMA_COMPENSATION) + def defaultCacheStorageLevel: StorageLevel = 
StorageLevel.fromString(getConf(DEFAULT_CACHE_STORAGE_LEVEL)) @@ -5702,6 +5968,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyInferArrayTypeFromFirstElement: Boolean = getConf( SQLConf.LEGACY_INFER_ARRAY_TYPE_FROM_FIRST_ELEMENT) + def legacyInferMapStructTypeFromFirstItem: Boolean = getConf( + SQLConf.LEGACY_INFER_MAP_STRUCT_TYPE_FROM_FIRST_ITEM) + def parquetFieldIdReadEnabled: Boolean = getConf(SQLConf.PARQUET_FIELD_ID_READ_ENABLED) def parquetFieldIdWriteEnabled: Boolean = getConf(SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED) @@ -5722,6 +5991,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def defaultDatabase: String = getConf(StaticSQLConf.CATALOG_DEFAULT_DATABASE) + def globalTempDatabase: String = getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + def allowsTempViewCreationWithMultipleNameparts: Boolean = getConf(SQLConf.ALLOW_TEMP_VIEW_CREATION_WITH_MULTIPLE_NAME_PARTS) @@ -5738,6 +6009,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS) + def legacyCodingErrorAction: Boolean = getConf(SQLConf.LEGACY_CODING_ERROR_ACTION) + def legacyEvalCurrentTime: Boolean = getConf(SQLConf.LEGACY_EVAL_CURRENT_TIME) /** ********************** SQLConf functionality methods ************ */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala index a52bca1066059..88f556130bfe6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.parseColumnPath import org.apache.spark.sql.connector.expressions.{FieldReference, LiteralValue, NamedReference} import org.apache.spark.sql.connector.expressions.filter.{AlwaysFalse => V2AlwaysFalse, AlwaysTrue => V2AlwaysTrue, And => V2And, Not => V2Not, Or => V2Or, Predicate} -import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.types.{DataType, StringType} import org.apache.spark.unsafe.types.UTF8String //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -381,3 +381,87 @@ case class AlwaysFalse() extends Filter { @Evolving object AlwaysFalse extends AlwaysFalse { } + +/** + * Base class for collation aware string filters. + */ +@Evolving +abstract class CollatedFilter() extends Filter { + + /** The corresponding non-collation aware filter. */ + def correspondingFilter: Filter + def dataType: DataType + + override def references: Array[String] = correspondingFilter.references + override def toV2: Predicate = correspondingFilter.toV2 +} + +/** Collation aware equivalent of [[EqualTo]]. */ +@Evolving +case class CollatedEqualTo(attribute: String, value: Any, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = EqualTo(attribute, value) +} + +/** Collation aware equivalent of [[EqualNullSafe]]. */ +@Evolving +case class CollatedEqualNullSafe(attribute: String, value: Any, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = EqualNullSafe(attribute, value) +} + +/** Collation aware equivalent of [[GreaterThan]]. 
*/ +@Evolving +case class CollatedGreaterThan(attribute: String, value: Any, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = GreaterThan(attribute, value) +} + +/** Collation aware equivalent of [[GreaterThanOrEqual]]. */ +@Evolving +case class CollatedGreaterThanOrEqual(attribute: String, value: Any, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = GreaterThanOrEqual(attribute, value) +} + +/** Collation aware equivalent of [[LessThan]]. */ +@Evolving +case class CollatedLessThan(attribute: String, value: Any, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = LessThan(attribute, value) +} + +/** Collation aware equivalent of [[LessThanOrEqual]]. */ +@Evolving +case class CollatedLessThanOrEqual(attribute: String, value: Any, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = LessThanOrEqual(attribute, value) +} + +/** Collation aware equivalent of [[In]]. */ +@Evolving +case class CollatedIn(attribute: String, values: Array[Any], dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = In(attribute, values) +} + +/** Collation aware equivalent of [[StringStartsWith]]. */ +@Evolving +case class CollatedStringStartsWith(attribute: String, value: String, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = StringStartsWith(attribute, value) +} + +/** Collation aware equivalent of [[StringEndsWith]]. */ +@Evolving +case class CollatedStringEndsWith(attribute: String, value: String, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = StringEndsWith(attribute, value) +} + +/** Collation aware equivalent of [[StringContains]]. */ +@Evolving +case class CollatedStringContains(attribute: String, value: String, dataType: DataType) + extends CollatedFilter { + override def correspondingFilter: Filter = StringContains(attribute, value) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeExpression.scala index 026272a0f2d85..fd942ba60de4b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeExpression.scala @@ -30,7 +30,18 @@ private[sql] abstract class DataTypeExpression(val dataType: DataType) { } private[sql] case object BooleanTypeExpression extends DataTypeExpression(BooleanType) -private[sql] case object StringTypeExpression extends DataTypeExpression(StringType) +private[sql] case object StringTypeExpression { + /** + * Enables matching against StringType for expressions: + * {{{ + * case Cast(child @ StringType(collationId), NumericType) => + * ... 
+ * }}} + */ + def unapply(e: Expression): Boolean = { + e.dataType.isInstanceOf[StringType] + } +} private[sql] case object TimestampTypeExpression extends DataTypeExpression(TimestampType) private[sql] case object DateTypeExpression extends DataTypeExpression(DateType) private[sql] case object ByteTypeExpression extends DataTypeExpression(ByteType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala index d459d2dd12272..1e0bac331dc75 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala @@ -295,12 +295,29 @@ private[spark] object SchemaUtils { def escapeMetaCharacters(str: String): String = SparkSchemaUtils.escapeMetaCharacters(str) /** - * Checks if a given data type has a non-default collation string type. + * Checks if a given data type has a non utf8 binary (implicit) collation type. */ - def hasNonBinarySortableCollatedString(dt: DataType): Boolean = { + def hasNonUTF8BinaryCollation(dt: DataType): Boolean = { dt.existsRecursively { - case st: StringType => !st.supportsBinaryOrdering + case st: StringType => !st.isUTF8BinaryCollation case _ => false } } + + /** + * Replaces any collated string type with non collated StringType + * recursively in the given data type. + */ + def replaceCollatedStringWithString(dt: DataType): DataType = dt match { + case ArrayType(et, nullable) => + ArrayType(replaceCollatedStringWithString(et), nullable) + case MapType(kt, vt, nullable) => + MapType(replaceCollatedStringWithString(kt), replaceCollatedStringWithString(vt), nullable) + case StructType(fields) => + StructType(fields.map { field => + field.copy(dataType = replaceCollatedStringWithString(field.dataType)) + }) + case _: StringType => StringType + case _ => dt + } } diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala index 4b33f9bc52785..40e6182e587b3 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.commons.lang3.{JavaVersion, SystemUtils} -import org.apache.spark.{SparkFunSuite, SparkRuntimeException} +import org.apache.spark.{SparkFunSuite, SparkIllegalArgumentException, SparkRuntimeException} import org.apache.spark.unsafe.types.UTF8String class ExpressionImplUtilsSuite extends SparkFunSuite { @@ -353,4 +353,67 @@ class ExpressionImplUtilsSuite extends SparkFunSuite { parameters = t.errorParamsMap ) } + + test("Validate UTF8 string") { + def validateUTF8(str: UTF8String, expected: UTF8String, except: Boolean): Unit = { + if (except) { + checkError( + exception = intercept[SparkIllegalArgumentException] { + ExpressionImplUtils.validateUTF8String(str) + }, + errorClass = "INVALID_UTF8_STRING", + parameters = Map( + "str" -> str.getBytes.map(byte => f"\\x$byte%02X").mkString + ) + ) + } else { + assert(ExpressionImplUtils.validateUTF8String(str)== expected) + } + } + validateUTF8(UTF8String.EMPTY_UTF8, + UTF8String.fromString(""), except = false) + validateUTF8(UTF8String.fromString(""), + UTF8String.fromString(""), except = false) + 
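// A JDK-only sketch of what "validate UTF8" means in the tests above: a strict CharsetDecoder
// rejects ill-formed byte sequences such as a lone 0x80 or 0xFF byte. This is an independent
// illustration, not the ExpressionImplUtils implementation under test.
import java.nio.ByteBuffer
import java.nio.charset.{CharacterCodingException, CodingErrorAction, StandardCharsets}

object Utf8ValidationSketch {
  def isValidUtf8(bytes: Array[Byte]): Boolean = {
    val decoder = StandardCharsets.UTF_8.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT)
    try {
      decoder.decode(ByteBuffer.wrap(bytes))
      true
    } catch {
      case _: CharacterCodingException => false
    }
  }

  assert(isValidUtf8("Spark".getBytes(StandardCharsets.UTF_8)))
  assert(!isValidUtf8(Array(0x80.toByte))) // stray continuation byte
  assert(!isValidUtf8(Array(0xFF.toByte))) // never valid in UTF-8
}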
validateUTF8(UTF8String.fromString("aa"), + UTF8String.fromString("aa"), except = false) + validateUTF8(UTF8String.fromString("\u0061"), + UTF8String.fromString("\u0061"), except = false) + validateUTF8(UTF8String.fromString(""), + UTF8String.fromString(""), except = false) + validateUTF8(UTF8String.fromString("abc"), + UTF8String.fromString("abc"), except = false) + validateUTF8(UTF8String.fromString("hello"), + UTF8String.fromString("hello"), except = false) + validateUTF8(UTF8String.fromBytes(Array.empty[Byte]), + UTF8String.fromString(""), except = false) + validateUTF8(UTF8String.fromBytes(Array[Byte](0x41)), + UTF8String.fromString("A"), except = false) + validateUTF8(UTF8String.fromBytes(Array[Byte](0x61)), + UTF8String.fromString("a"), except = false) + // scalastyle:off nonascii + validateUTF8(UTF8String.fromBytes(Array[Byte](0x80.toByte)), + UTF8String.fromString("\uFFFD"), except = true) + validateUTF8(UTF8String.fromBytes(Array[Byte](0xFF.toByte)), + UTF8String.fromString("\uFFFD"), except = true) + // scalastyle:on nonascii + } + + test("TryValidate UTF8 string") { + def tryValidateUTF8(str: UTF8String, expected: UTF8String): Unit = { + assert(ExpressionImplUtils.tryValidateUTF8String(str) == expected) + } + tryValidateUTF8(UTF8String.fromString(""), UTF8String.fromString("")) + tryValidateUTF8(UTF8String.fromString("aa"), UTF8String.fromString("aa")) + tryValidateUTF8(UTF8String.fromString("\u0061"), UTF8String.fromString("\u0061")) + tryValidateUTF8(UTF8String.EMPTY_UTF8, UTF8String.fromString("")) + tryValidateUTF8(UTF8String.fromString(""), UTF8String.fromString("")) + tryValidateUTF8(UTF8String.fromString("abc"), UTF8String.fromString("abc")) + tryValidateUTF8(UTF8String.fromString("hello"), UTF8String.fromString("hello")) + tryValidateUTF8(UTF8String.fromBytes(Array.empty[Byte]), UTF8String.fromString("")) + tryValidateUTF8(UTF8String.fromBytes(Array[Byte](0x41)), UTF8String.fromString("A")) + tryValidateUTF8(UTF8String.fromBytes(Array[Byte](0x61)), UTF8String.fromString("a")) + tryValidateUTF8(UTF8String.fromBytes(Array[Byte](0x80.toByte)), null) + tryValidateUTF8(UTF8String.fromBytes(Array[Byte](0xFF.toByte)), null) + } + } diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/connector/catalog/CatalogLoadingSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/connector/catalog/CatalogLoadingSuite.java index 238b8ac04e7e6..0db155e88aea5 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/connector/catalog/CatalogLoadingSuite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/connector/catalog/CatalogLoadingSuite.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 22f24d8266177..088f0e21710d2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -53,9 +53,12 @@ object RandomDataGenerator { */ private val PROBABILITY_OF_NULL: Float = 0.1f - final val MAX_STR_LEN: Int = 1024 - final val MAX_ARR_SIZE: Int = 128 - final val MAX_MAP_SIZE: Int = 128 + final val MAX_STR_LEN: Int = + System.getProperty("spark.sql.test.randomDataGenerator.maxStrLen", "1024").toInt + final val MAX_ARR_SIZE: Int = + System.getProperty("spark.sql.test.randomDataGenerator.maxArraySize", "128").toInt + final val MAX_MAP_SIZE: Int = + System.getProperty("spark.sql.test.randomDataGenerator.maxMapSize", "128").toInt /** * Helper function for constructing a biased random number generator which returns "interesting" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala new file mode 100644 index 0000000000000..4cbffff184cd9 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils + +/** + * Benchmark for path escaping/unescaping + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "catalyst/Test/runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/Test/runMain " + * Results will be written to "benchmarks/EscapePathBenchmark-results.txt". 
+ * }}} + */ +object EscapePathBenchmark extends BenchmarkBase { + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + val N = 1000000 + runBenchmark("Escape") { + val benchmark = new Benchmark("Escape Tests", N, 10, output = output) + val paths = Seq( + "https://issues.apache.org/jira/browse/SPARK-48551", + "https...issues.apache.org/jira/browse/SPARK-48551", + "https...issues.apache.org.jira/browse/SPARK-48551", + "https...issues.apache.org.jira.browse/SPARK-48551", + "https...issues.apache.org.jira.browse.SPARK-48551") + benchmark.addCase("Legacy") { _ => + (1 to N).foreach(_ => paths.foreach(escapePathNameLegacy)) + } + + benchmark.addCase("New") { _ => + (1 to N).foreach(_ => { + paths.foreach(ExternalCatalogUtils.escapePathName) + }) + } + benchmark.run() + } + + runBenchmark("Unescape") { + val benchmark = new Benchmark("Unescape Tests", N, 10, output = output) + val paths = Seq( + "https%3A%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https:%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https:/%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org/jira/browse%2FSPARK-48551", + "https://issues.apache.org/jira/browse%2SPARK-48551", + "https://issues.apache.org/jira/browse/SPARK-48551") + benchmark.addCase("Legacy") { _ => + (1 to N).foreach(_ => paths.foreach(unescapePathNameLegacy)) + } + + benchmark.addCase("New") { _ => + (1 to N).foreach(_ => { + paths.foreach(ExternalCatalogUtils.unescapePathName) + }) + } + benchmark.run() + } + } + + /** + * Legacy implementation of escapePathName before Spark 4.0 + */ + def escapePathNameLegacy(path: String): String = { + val builder = new StringBuilder() + path.foreach { c => + if (ExternalCatalogUtils.needsEscaping(c)) { + builder.append('%') + builder.append(f"${c.asInstanceOf[Int]}%02X") + } else { + builder.append(c) + } + } + + builder.toString() + } + + def unescapePathNameLegacy(path: String): String = { + val sb = new StringBuilder + var i = 0 + while (i < path.length) { + val c = path.charAt(i) + if (c == '%' && i + 2 < path.length) { + val code: Int = try { + Integer.parseInt(path.substring(i + 1, i + 3), 16) + } catch { + case _: Exception => -1 + } + if (code >= 0) { + sb.append(code.asInstanceOf[Char]) + i += 3 + } else { + sb.append(c) + i += 1 + } + } else { + sb.append(c) + i += 1 + } + } + sb.toString() + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SQLKeywordSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SQLKeywordSuite.scala index 8806431ab4395..9977dcd83d6af 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SQLKeywordSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SQLKeywordSuite.scala @@ -98,7 +98,7 @@ trait SQLKeywordUtils extends SparkFunSuite with SQLHelper { } (symbol, literals) :: Nil } else { - val literal = literalDef.replaceAll("'", "").trim + val literal = literalDef.split("\\{")(0).replaceAll("'", "").trim // The case where a symbol string and its literal string are different, // e.g., `SETMINUS: 'MINUS';`. 
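// --- Illustrative aside (not part of the patch above) ---
// The EscapePathBenchmark above measures ExternalCatalogUtils.escapePathName, which
// percent-encodes characters that are unsafe in partition directory names. A minimal
// sketch of that scheme over a hypothetical, illustrative subset of unsafe characters
// (the real set lives in ExternalCatalogUtils.charToEscape):
object PathEscapeSketch {
  private val unsafe: Set[Char] = Set(':', '/', '%', '\n') // illustrative subset only

  def escape(path: String): String =
    path.map(c => if (unsafe.contains(c)) f"%%${c.toInt}%02X" else c.toString).mkString

  def main(args: Array[String]): Unit = {
    assert(escape("a:b") == "a%3Ab")
    assert(escape("a/b") == "a%2Fb")
    assert(escape("a%b") == "a%25b")
    assert(escape("a b") == "a b") // spaces are left as-is
  }
}
// --- end aside ---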
if (symbol != literal) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index f12d224096917..19eb3a418543d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -360,6 +360,40 @@ class AnalysisErrorSuite extends AnalysisTest with DataTypeErrorsBase { "inputType" -> "\"BOOLEAN\"", "requiredType" -> "\"INT\"")) + errorClassTest( + "the buckets of ntile window function is not foldable", + testRelation2.select( + WindowExpression( + NTile(Literal(99.9f)), + WindowSpecDefinition( + UnresolvedAttribute("a") :: Nil, + SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil, + UnspecifiedFrame)).as("window")), + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + messageParameters = Map( + "sqlExpr" -> "\"ntile(99.9)\"", + "paramIndex" -> "first", + "inputSql" -> "\"99.9\"", + "inputType" -> "\"FLOAT\"", + "requiredType" -> "\"INT\"")) + + + errorClassTest( + "the buckets of ntile window function is not int literal", + testRelation2.select( + WindowExpression( + NTile(AttributeReference("b", IntegerType)()), + WindowSpecDefinition( + UnresolvedAttribute("a") :: Nil, + SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil, + UnspecifiedFrame)).as("window")), + errorClass = "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + messageParameters = Map( + "sqlExpr" -> "\"ntile(b)\"", + "inputName" -> "`buckets`", + "inputExpr" -> "\"b\"", + "inputType" -> "\"INT\"")) + errorClassTest( "unresolved attributes", testRelation.select($"abcd"), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 3c628d35dcdb8..62856a96f7ee8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -759,9 +759,10 @@ class AnalysisSuite extends AnalysisTest with Matchers { testRelation, testRelation, cond, - UpdateAction(Some(cond), Assignment($"a", $"a") :: Nil) :: Nil, - Nil, - Nil + matchedActions = UpdateAction(Some(cond), Assignment($"a", $"a") :: Nil) :: Nil, + notMatchedActions = Nil, + notMatchedBySourceActions = Nil, + withSchemaEvolution = false ), "AMBIGUOUS_REFERENCE", Map("name" -> "`a`", "referenceNames" -> "[`a`, `a`]")) @@ -1795,4 +1796,15 @@ class AnalysisSuite extends AnalysisTest with Matchers { assert(refs.head.resolved) assert(refs.head.isStreaming) } + + test("SPARK-47927: ScalaUDF output nullability") { + val udf = ScalaUDF( + function = (i: Int) => i + 1, + dataType = IntegerType, + children = $"a" :: Nil, + nullable = false, + inputEncoders = Seq(Some(ExpressionEncoder[Int]().resolveAndBind()))) + val plan = testRelation.select(udf.as("u")).select($"u").analyze + assert(plan.output.head.nullable) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 4b58755e13ef6..4367cbbd24a89 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -747,6 +747,17 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite with SQLHelper with Quer ) } + test("hash expressions are prohibited on VariantType elements") { + val argument = Literal.create(null, VariantType) + val murmur3Hash = new Murmur3Hash(Seq(argument)) + assert(murmur3Hash.checkInputDataTypes() == + DataTypeMismatch( + errorSubClass = "HASH_VARIANT_TYPE", + messageParameters = Map("functionName" -> toSQLId(murmur3Hash.prettyName)) + ) + ) + } + test("check types for Lag") { val lag = Lag(Literal(1), NonFoldableLiteral(10), Literal(null), true) assert(lag.checkInputDataTypes() == @@ -800,4 +811,9 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite with SQLHelper with Quer "This should have been converted during analysis.")) ) } + + test("check that current time is foldable") { + val rnd = Rand(Month(CurrentDate())) + assert(rnd.checkInputDataTypes().isSuccess) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 953b2c8bb1011..39cf298aec434 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -86,4 +86,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) ) } + + test("SPARK-47895: group by all repeated analysis") { + val plan = testRelation.groupBy($"all")(Literal(100).as("a")).analyze + comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) + ) + + val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any)))) + // Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. + val copiedPlan = plan.transform { + case _: LocalRelation => testRelationWithData + } + comparePlans( + copiedPlan.analyze, // repeated analysis + testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) + ) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala new file mode 100644 index 0000000000000..4cdbda5494196 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.catalog + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.{escapePathName, unescapePathName} + +class ExternalCatalogUtilsSuite extends SparkFunSuite { + + test("SPARK-48551: escapePathName") { + ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach { c => + // Check parity with old conversion technique: + assert(escapePathName(c.toString) === "%" + f"$c%02X", + s"wrong escaping for $c") + } + assert(escapePathName("") === "") + assert(escapePathName(" ") === " ") + assert(escapePathName("\n") === "%0A") + assert(escapePathName("a b") === "a b") + assert(escapePathName("a:b") === "a%3Ab") + assert(escapePathName(":ab") === "%3Aab") + assert(escapePathName("ab:") === "ab%3A") + assert(escapePathName("a%b") === "a%25b") + assert(escapePathName("a,b") === "a,b") + assert(escapePathName("a/b") === "a%2Fb") + } + + test("SPARK-48551: unescapePathName") { + ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach { c => + // Check parity with old conversion technique: + assert(unescapePathName("%" + f"$c%02X") === c.toString, + s"wrong unescaping for $c") + } + assert(unescapePathName(null) === null) + assert(unescapePathName("") === "") + assert(unescapePathName(" ") === " ") + assert(unescapePathName("%0A") === "\n") + assert(unescapePathName("a b") === "a b") + assert(unescapePathName("a%3Ab") === "a:b") + assert(unescapePathName("%3Aab") === ":ab") + assert(unescapePathName("ab%3A") === "ab:") + assert(unescapePathName("a%25b") === "a%b") + assert(unescapePathName("a,b") === "a,b") + assert(unescapePathName("a%2Fb") === "a/b") + assert(unescapePathName("a%2") === "a%2") + assert(unescapePathName("a%F ") === "a%F ") + assert(unescapePathName("%0") === "%0") + assert(unescapePathName("0%") === "0%") + // scalastyle:off nonascii + assert(unescapePathName("a\u00FF") === "a\u00FF") + // scalastyle:on nonascii + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index e9a60ff17fc1b..48f829548bb65 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -933,17 +933,17 @@ abstract class SessionCatalogSuite extends AnalysisTest with Eventually { createTempView(catalog, "temp_view4", tempTable, overrideIfExists = false) createGlobalTempView(catalog, "global_temp_view1", tempTable, overrideIfExists = false) createGlobalTempView(catalog, "global_temp_view2", tempTable, overrideIfExists = false) - assert(catalog.listTables(catalog.globalTempViewManager.database, "*").toSet == + assert(catalog.listTables(catalog.globalTempDatabase, "*").toSet == Set(TableIdentifier("temp_view1"), TableIdentifier("temp_view4"), - TableIdentifier("global_temp_view1", Some(catalog.globalTempViewManager.database)), - TableIdentifier("global_temp_view2", Some(catalog.globalTempViewManager.database)))) - assert(catalog.listTables(catalog.globalTempViewManager.database, "*temp_view1").toSet == + TableIdentifier("global_temp_view1", Some(catalog.globalTempDatabase)), + TableIdentifier("global_temp_view2", Some(catalog.globalTempDatabase)))) + assert(catalog.listTables(catalog.globalTempDatabase, "*temp_view1").toSet == Set(TableIdentifier("temp_view1"), - 
TableIdentifier("global_temp_view1", Some(catalog.globalTempViewManager.database)))) - assert(catalog.listTables(catalog.globalTempViewManager.database, "global*").toSet == - Set(TableIdentifier("global_temp_view1", Some(catalog.globalTempViewManager.database)), - TableIdentifier("global_temp_view2", Some(catalog.globalTempViewManager.database)))) + TableIdentifier("global_temp_view1", Some(catalog.globalTempDatabase)))) + assert(catalog.listTables(catalog.globalTempDatabase, "global*").toSet == + Set(TableIdentifier("global_temp_view1", Some(catalog.globalTempDatabase)), + TableIdentifier("global_temp_view2", Some(catalog.globalTempDatabase)))) } } @@ -1906,9 +1906,9 @@ abstract class SessionCatalogSuite extends AnalysisTest with Eventually { assert(catalog.getCachedTable(qualifiedName1) != null) createGlobalTempView(catalog, "tbl2", Range(2, 10, 1, 10), false) - val qualifiedName2 = QualifiedTableName(catalog.globalTempViewManager.database, "tbl2") + val qualifiedName2 = QualifiedTableName(catalog.globalTempDatabase, "tbl2") catalog.cacheTable(qualifiedName2, Range(2, 10, 1, 10)) - catalog.refreshTable(TableIdentifier("tbl2", Some(catalog.globalTempViewManager.database))) + catalog.refreshTable(TableIdentifier("tbl2", Some(catalog.globalTempDatabase))) assert(catalog.getCachedTable(qualifiedName2) != null) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtilsSuite.scala index 2e94c723a6f26..d4b68500e0789 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtilsSuite.scala @@ -33,6 +33,15 @@ class CSVExprUtilsSuite extends SparkFunSuite { assert(CSVExprUtils.toChar("""\\""") === '\\') } + test("Does not accept null delimiter") { + checkError( + exception = intercept[SparkIllegalArgumentException]{ + CSVExprUtils.toDelimiterStr(null) + }, + errorClass = "INVALID_DELIMITER_VALUE.NULL_VALUE", + parameters = Map.empty) + } + test("Does not accept delimiter larger than one character") { checkError( exception = intercept[SparkIllegalArgumentException]{ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala index 82238de31f9fb..e29609c741633 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.encoders import scala.reflect.runtime.universe.TypeTag +import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -169,10 +170,10 @@ class EncoderResolutionSuite extends PlanTest { fromRow(InternalRow(new GenericArrayData(Array(1, 2)))) // If there is null value, it should throw runtime exception - val e = intercept[RuntimeException] { + val exception = intercept[SparkRuntimeException] { fromRow(InternalRow(new GenericArrayData(Array(1, null)))) } - assert(e.getCause.getMessage.contains("Null value appeared in non-nullable field")) + assert(exception.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } test("the real number of fields doesn't match encoder schema: tuple 
encoder") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index df73d50fdcd6b..01a3daa77d38d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.encoders import scala.collection.mutable import scala.util.Random +import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.{RandomDataGenerator, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.CodegenInterpretedPlanTest @@ -275,9 +276,10 @@ class RowEncoderSuite extends CodegenInterpretedPlanTest { test("RowEncoder should throw RuntimeException if input row object is null") { val schema = new StructType().add("int", IntegerType) val encoder = ExpressionEncoder(schema) - val e = intercept[RuntimeException](toRow(encoder, null)) - assert(e.getCause.getMessage.contains("Null value appeared in non-nullable field")) - assert(e.getCause.getMessage.contains("top level Product or row object")) + // Check the error class only since the parameters may change depending on how we are running + // this test case. + val exception = intercept[SparkRuntimeException](toRow(encoder, null)) + assert(exception.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } test("RowEncoder should validate external type") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseExpressionsSuite.scala index 9089c6f17d408..63602d04b5c79 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseExpressionsSuite.scala @@ -134,6 +134,47 @@ class BitwiseExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("BitCount") { + // null + val nullLongLiteral = Literal.create(null, LongType) + val nullIntLiteral = Literal.create(null, IntegerType) + val nullBooleanLiteral = Literal.create(null, BooleanType) + checkEvaluation(BitwiseCount(nullLongLiteral), null) + checkEvaluation(BitwiseCount(nullIntLiteral), null) + checkEvaluation(BitwiseCount(nullBooleanLiteral), null) + + // boolean + checkEvaluation(BitwiseCount(Literal(true)), 1) + checkEvaluation(BitwiseCount(Literal(false)), 0) + + // byte/tinyint + checkEvaluation(BitwiseCount(Literal(1.toByte)), 1) + checkEvaluation(BitwiseCount(Literal(2.toByte)), 1) + checkEvaluation(BitwiseCount(Literal(3.toByte)), 2) + + // short/smallint + checkEvaluation(BitwiseCount(Literal(1.toShort)), 1) + checkEvaluation(BitwiseCount(Literal(2.toShort)), 1) + checkEvaluation(BitwiseCount(Literal(3.toShort)), 2) + + // int + checkEvaluation(BitwiseCount(Literal(1)), 1) + checkEvaluation(BitwiseCount(Literal(2)), 1) + checkEvaluation(BitwiseCount(Literal(3)), 2) + + // long/bigint + checkEvaluation(BitwiseCount(Literal(1L)), 1) + checkEvaluation(BitwiseCount(Literal(2L)), 1) + checkEvaluation(BitwiseCount(Literal(3L)), 2) + + // negative num + checkEvaluation(BitwiseCount(Literal(-1L)), 64) + + // edge value + checkEvaluation(BitwiseCount(Literal(9223372036854775807L)), 63) + checkEvaluation(BitwiseCount(Literal(-9223372036854775808L)), 1) + } + test("BitGet") { 
val nullLongLiteral = Literal.create(null, LongType) val nullIntLiteral = Literal.create(null, IntegerType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala index 3366d99dd75e1..7e545d3321054 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala @@ -454,4 +454,29 @@ class CanonicalizeSuite extends SparkFunSuite { // different. assert(common3.canonicalized != common4.canonicalized) } + + test("SPARK-48035: Add/Multiply operator canonicalization should take into account the" + + "evaluation mode of the operands before operand reordering") { + Seq(1, 10) map { multiCommutativeOpOptThreshold => + val default = SQLConf.get.getConf(MULTI_COMMUTATIVE_OP_OPT_THRESHOLD) + SQLConf.get.setConfString(MULTI_COMMUTATIVE_OP_OPT_THRESHOLD.key, + multiCommutativeOpOptThreshold.toString) + try { + val l1 = Literal(1) + val l2 = Literal(2) + val l3 = Literal(3) + + val expr1 = Add(Add(l1, l2), l3) + val expr2 = Add(Add(l2, l1, EvalMode.TRY), l3) + assert(!expr1.semanticEquals(expr2)) + + val expr3 = Multiply(Multiply(l1, l2), l3) + val expr4 = Multiply(Multiply(l2, l1, EvalMode.TRY), l3) + assert(!expr3.semanticEquals(expr4)) + } finally { + SQLConf.get.setConfString(MULTI_COMMUTATIVE_OP_OPT_THRESHOLD.key, + default.toString) + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 4df8d87074fc2..4c045f9fda731 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -104,7 +104,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22543: split large if expressions into blocks due to JVM code size limit") { var strExpr: Expression = Literal("abc") for (_ <- 1 to 150) { - strExpr = StringDecode(Encode(strExpr, "utf-8"), "utf-8") + strExpr = StringTrimRight(StringTrimLeft(strExpr)) } val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala index da5bddb0c09fd..a843d43ae83b6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala @@ -89,7 +89,7 @@ class CodeGeneratorWithInterpretedFallbackSuite extends SparkFunSuite with PlanT FailedCodegenProjection.createObject(input) } }.getMessage - assert(errMsg.contains("failed to compile: org.codehaus.commons.compiler.CompileException:")) + assert(errMsg.contains("Failed to compile: org.codehaus.commons.compiler.CompileException:")) } test("SPARK-25358 Correctly handles NoOp in MutableProjection") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala index 537bac9aae9b4..a4651c6c4c7e9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("validate default collation") { @@ -32,8 +33,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test("collate against literal") { - val collateExpr = Collate(Literal("abc"), "UTF8_BINARY_LCASE") - val collationId = CollationFactory.collationNameToId("UTF8_BINARY_LCASE") + val collateExpr = Collate(Literal("abc"), "UTF8_LCASE") + val collationId = CollationFactory.collationNameToId("UTF8_LCASE") assert(collateExpr.dataType == StringType(collationId)) checkEvaluation(collateExpr, "abc") } @@ -62,7 +63,7 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { exception = intercept[SparkException] { Collate(Literal("abc"), "UTF8_BS") }, errorClass = "COLLATION_INVALID_NAME", sqlState = "42704", - parameters = Map("proposal" -> "UTF8_BINARY", "collationName" -> "UTF8_BS")) + parameters = Map("collationName" -> "UTF8_BS", "proposals" -> "UTF8_LCASE")) } test("collation on non-explicit default collation") { @@ -71,11 +72,12 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("collation on explicitly collated string") { checkEvaluation( - Collation(Literal.create("abc", StringType(1))).replacement, - "UTF8_BINARY_LCASE") + Collation(Literal.create("abc", + StringType(CollationFactory.UTF8_LCASE_COLLATION_ID))).replacement, + "UTF8_LCASE") checkEvaluation( - Collation(Collate(Literal("abc"), "UTF8_BINARY_LCASE")).replacement, - "UTF8_BINARY_LCASE") + Collation(Collate(Literal("abc"), "UTF8_LCASE")).replacement, + "UTF8_LCASE") } test("Array operations on arrays of collated strings") { @@ -90,8 +92,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { (Seq("a"), Seq("a"), true, "UTF8_BINARY"), (Seq("a"), Seq("b"), false, "UTF8_BINARY"), (Seq("a"), Seq("A"), false, "UTF8_BINARY"), - (Seq("a"), Seq("A"), true, "UTF8_BINARY_LCASE"), - (Seq("a", "B"), Seq("A", "b"), true, "UTF8_BINARY_LCASE"), + (Seq("a"), Seq("A"), true, "UTF8_LCASE"), + (Seq("a", "B"), Seq("A", "b"), true, "UTF8_LCASE"), (Seq("a"), Seq("A"), false, "UNICODE"), (Seq("a", "B"), Seq("A", "b"), true, "UNICODE_CI") ) @@ -106,8 +108,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { (Seq("a", "b", "c"), Seq("a", "b", "c"), "UTF8_BINARY"), (Seq("a", "a", "a"), Seq("a"), "UTF8_BINARY"), (Seq("aaa", "AAA", "Aaa", "aAa"), Seq("aaa", "AAA", "Aaa", "aAa"), "UTF8_BINARY"), - (Seq("aaa", "AAA", "Aaa", "aAa"), Seq("aaa"), "UTF8_BINARY_LCASE"), - (Seq("aaa", "AAA", "Aaa", "aAa", "b"), Seq("aaa", "b"), "UTF8_BINARY_LCASE"), + (Seq("aaa", "AAA", "Aaa", "aAa"), Seq("aaa"), "UTF8_LCASE"), + (Seq("aaa", "AAA", "Aaa", "aAa", "b"), Seq("aaa", "b"), "UTF8_LCASE"), (Seq("aaa", "AAA", "Aaa", "aAa"), Seq("aaa"), "UNICODE_CI") ) for ((in, out, collName) <- distinct) @@ -118,8 +120,8 @@ class 
CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { (Seq("a"), Seq("a"), Seq("a"), "UTF8_BINARY"), (Seq("a"), Seq("b"), Seq("a", "b"), "UTF8_BINARY"), (Seq("a"), Seq("A"), Seq("a", "A"), "UTF8_BINARY"), - (Seq("a"), Seq("A"), Seq("a"), "UTF8_BINARY_LCASE"), - (Seq("a", "B"), Seq("A", "b"), Seq("a", "B"), "UTF8_BINARY_LCASE"), + (Seq("a"), Seq("A"), Seq("a"), "UTF8_LCASE"), + (Seq("a", "B"), Seq("A", "b"), Seq("a", "B"), "UTF8_LCASE"), (Seq("a"), Seq("A"), Seq("a", "A"), "UNICODE"), (Seq("a", "B"), Seq("A", "b"), Seq("a", "B"), "UNICODE_CI") ) @@ -134,8 +136,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { (Seq("a"), Seq("a"), Seq("a"), "UTF8_BINARY"), (Seq("a"), Seq("b"), Seq.empty, "UTF8_BINARY"), (Seq("a"), Seq("A"), Seq.empty, "UTF8_BINARY"), - (Seq("a"), Seq("A"), Seq("a"), "UTF8_BINARY_LCASE"), - (Seq("a", "B"), Seq("A", "b"), Seq("a", "B"), "UTF8_BINARY_LCASE"), + (Seq("a"), Seq("A"), Seq("a"), "UTF8_LCASE"), + (Seq("a", "B"), Seq("A", "b"), Seq("a", "B"), "UTF8_LCASE"), (Seq("a"), Seq("A"), Seq.empty, "UNICODE"), (Seq("a", "B"), Seq("A", "b"), Seq("a", "B"), "UNICODE_CI") ) @@ -150,8 +152,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { (Seq("a"), Seq("a"), Seq.empty, "UTF8_BINARY"), (Seq("a"), Seq("b"), Seq("a"), "UTF8_BINARY"), (Seq("a"), Seq("A"), Seq("a"), "UTF8_BINARY"), - (Seq("a"), Seq("A"), Seq.empty, "UTF8_BINARY_LCASE"), - (Seq("a", "B"), Seq("A", "b"), Seq.empty, "UTF8_BINARY_LCASE"), + (Seq("a"), Seq("A"), Seq.empty, "UTF8_LCASE"), + (Seq("a", "B"), Seq("A", "b"), Seq.empty, "UTF8_LCASE"), (Seq("a"), Seq("A"), Seq("a"), "UNICODE"), (Seq("a", "B"), Seq("A", "b"), Seq.empty, "UNICODE_CI") ) @@ -161,4 +163,57 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ArrayExcept(left, right), out) } } + + test("CollationKey generates correct collation key for collated string") { + val testCases = Seq( + ("", "UTF8_BINARY", UTF8String.fromString("").getBytes), + ("aa", "UTF8_BINARY", UTF8String.fromString("aa").getBytes), + ("AA", "UTF8_BINARY", UTF8String.fromString("AA").getBytes), + ("aA", "UTF8_BINARY", UTF8String.fromString("aA").getBytes), + ("", "UTF8_LCASE", UTF8String.fromString("").getBytes), + ("aa", "UTF8_LCASE", UTF8String.fromString("aa").getBytes), + ("AA", "UTF8_LCASE", UTF8String.fromString("aa").getBytes), + ("aA", "UTF8_LCASE", UTF8String.fromString("aa").getBytes), + ("", "UNICODE", Array[Byte](1, 1, 0)), + ("aa", "UNICODE", Array[Byte](42, 42, 1, 6, 1, 6, 0)), + ("AA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -36, -36, 0)), + ("aA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -59, -36, 0)), + ("", "UNICODE_CI", Array[Byte](1, 0)), + ("aa", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)), + ("AA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)), + ("aA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)) + ) + for ((input, collation, expected) <- testCases) { + val str = Literal.create(input, StringType(collation)) + checkEvaluation(CollationKey(str), expected) + } + } + + test("collation name normalization in collation expression") { + Seq( + ("en_USA", "en_USA"), + ("en_CS", "en"), + ("en_AS", "en"), + ("en_CS_AS", "en"), + ("en_AS_CS", "en"), + ("en_CI", "en_CI"), + ("en_AI", "en_AI"), + ("en_AI_CI", "en_CI_AI"), + ("en_CI_AI", "en_CI_AI"), + ("en_CS_AI", "en_AI"), + ("en_AI_CS", "en_AI"), + ("en_CI_AS", "en_CI"), + ("en_AS_CI", "en_CI"), + ("en_USA_AI_CI", "en_USA_CI_AI"), + // randomized case + ("EN_USA", "en_USA"), + ("SR_CYRL", 
"sr_Cyrl"), + ("sr_cyrl_srb", "sr_Cyrl_SRB"), + ("sR_cYRl_sRb", "sr_Cyrl_SRB") + ).foreach { + case (collation, normalized) => + checkEvaluation(Collation(Literal.create("abc", StringType(collation))).replacement, + normalized) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala new file mode 100644 index 0000000000000..6f0d0c13b32a3 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.types._ + +class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { + + test("Like/ILike/RLike expressions with collated strings") { + case class LikeTestCase[R](l: String, regexLike: String, regexRLike: String, collation: String, + expectedLike: R, expectedILike: R, expectedRLike: R) + val testCases = Seq( + LikeTestCase("AbC", "%AbC%", ".b.", "UTF8_BINARY", true, true, true), + LikeTestCase("AbC", "%ABC%", ".B.", "UTF8_BINARY", false, true, false), + LikeTestCase("AbC", "%abc%", ".b.", "UTF8_LCASE", true, true, true), + LikeTestCase("", "", "", "UTF8_LCASE", true, true, true), + LikeTestCase("Foo", "", "", "UTF8_LCASE", false, false, true), + LikeTestCase("", "%foo%", ".o.", "UTF8_LCASE", false, false, false), + LikeTestCase("AbC", "%ABC%", ".B.", "UTF8_BINARY", false, true, false), + LikeTestCase(null, "%foo%", ".o.", "UTF8_BINARY", null, null, null), + LikeTestCase("Foo", null, null, "UTF8_BINARY", null, null, null), + LikeTestCase(null, null, null, "UTF8_BINARY", null, null, null) + ) + testCases.foreach(t => { + // Like + checkEvaluation(Like( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.regexLike, StringType), '\\'), t.expectedLike) + // ILike + checkEvaluation(ILike( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.regexLike, StringType), '\\').replacement, t.expectedILike) + // RLike + checkEvaluation(RLike( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.regexRLike, StringType)), t.expectedRLike) + }) + } + + test("StringSplit expression with collated strings") { + case class StringSplitTestCase[R](s: String, r: String, collation: String, expected: R) + val testCases = Seq( + 
StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_BINARY", Seq("1", "2", "3", "")), + StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY", Seq("1A2B3C")), + StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_LCASE", Seq("1", "2", "3", "")), + StringSplitTestCase("1A2B3C", "[abc]", "UTF8_LCASE", Seq("1", "2", "3", "")), + StringSplitTestCase("1A2B3C", "[1-9]+", "UTF8_BINARY", Seq("", "A", "B", "C")), + StringSplitTestCase("", "", "UTF8_BINARY", Seq("")), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C")), + StringSplitTestCase("", "[1-9]+", "UTF8_BINARY", Seq("")), + StringSplitTestCase(null, "[1-9]+", "UTF8_BINARY", null), + StringSplitTestCase("1A2B3C", null, "UTF8_BINARY", null), + StringSplitTestCase(null, null, "UTF8_BINARY", null) + ) + testCases.foreach(t => { + // StringSplit + checkEvaluation(StringSplit( + Literal.create(t.s, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), -1), t.expected) + }) + } + + test("Regexp expressions with collated strings") { + case class RegexpTestCase[R](l: String, r: String, collation: String, + expectedExtract: R, expectedExtractAll: R, expectedCount: R) + val testCases = Seq( + RegexpTestCase("AbC-aBc", ".b.", "UTF8_BINARY", "AbC", Seq("AbC"), 1), + RegexpTestCase("AbC-abc", ".b.", "UTF8_BINARY", "AbC", Seq("AbC", "abc"), 2), + RegexpTestCase("AbC-aBc", ".b.", "UTF8_LCASE", "AbC", Seq("AbC", "aBc"), 2), + RegexpTestCase("ABC-abc", ".b.", "UTF8_LCASE", "ABC", Seq("ABC", "abc"), 2), + RegexpTestCase("", "", "UTF8_LCASE", "", Seq(""), 1), + RegexpTestCase("Foo", "", "UTF8_LCASE", "", Seq("", "", "", ""), 4), + RegexpTestCase("", ".o.", "UTF8_LCASE", "", Seq(), 0), + RegexpTestCase("Foo", ".O.", "UTF8_BINARY", "", Seq(), 0), + RegexpTestCase(null, ".O.", "UTF8_BINARY", null, null, null), + RegexpTestCase("Foo", null, "UTF8_BINARY", null, null, null), + RegexpTestCase(null, null, "UTF8_BINARY", null, null, null) + ) + testCases.foreach(t => { + // RegExpExtract + checkEvaluation(RegExpExtract( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), 0), t.expectedExtract) + // RegExpExtractAll + checkEvaluation(RegExpExtractAll( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), 0), t.expectedExtractAll) + // RegExpCount + checkEvaluation(RegExpCount( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType)).replacement, t.expectedCount) + // RegExpInStr + def expectedInStr(count: Any): Any = count match { + case null => null + case 0 => 0 + case n: Int if n >= 1 => 1 + } + checkEvaluation(RegExpInStr( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), 0), expectedInStr(t.expectedCount)) + }) + } + + test("MultiLikeBase regexp expressions with collated strings") { + val nullStr = Literal.create(null, StringType) + // Supported collations (StringTypeBinaryLcase) + val binaryCollation = StringType(CollationFactory.collationNameToId("UTF8_BINARY")) + val lowercaseCollation = StringType(CollationFactory.collationNameToId("UTF8_LCASE")) + // LikeAll + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAll("%foo%", "%oo"), true) + 
checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", nullStr), null) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%feo%", nullStr), false) + checkEvaluation(Literal.create(null, binaryCollation).likeAll("%foo%", "%oo"), null) + // NotLikeAll + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", nullStr), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%feo%", nullStr), null) + checkEvaluation(Literal.create(null, binaryCollation).notLikeAll("%foo%", "%oo"), null) + // LikeAny + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", nullStr), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%feo%", nullStr), null) + checkEvaluation(Literal.create(null, binaryCollation).likeAny("%foo%", "%oo"), null) + // NotLikeAny + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%hoo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAny("%Foo%", "%hoo"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%Foo%", "%hoo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", nullStr), null) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%feo%", nullStr), true) + checkEvaluation(Literal.create(null, binaryCollation).notLikeAny("%foo%", "%oo"), null) + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index 5f135e46a3775..497b335289b11 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -28,7 +28,7 @@ import 
org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.TypeUtils.ordinalNumber import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{UTF8String, VariantVal} class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -359,6 +359,38 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { ) } + // map key can't be variant + val map6 = CreateMap(Seq( + Literal.create(new VariantVal(Array[Byte](), Array[Byte]())), + Literal.create(1) + )) + map6.checkInputDataTypes() match { + case TypeCheckResult.TypeCheckSuccess => fail("should not allow variant as a part of map key") + case TypeCheckResult.DataTypeMismatch(errorSubClass, messageParameters) => + assert(errorSubClass == "INVALID_MAP_KEY_TYPE") + assert(messageParameters === Map("keyType" -> "\"VARIANT\"")) + } + + // map key can't contain variant + val map7 = CreateMap( + Seq( + CreateStruct( + Seq(Literal.create(1), Literal.create(new VariantVal(Array[Byte](), Array[Byte]()))) + ), + Literal.create(1) + ) + ) + map7.checkInputDataTypes() match { + case TypeCheckResult.TypeCheckSuccess => fail("should not allow variant as a part of map key") + case TypeCheckResult.DataTypeMismatch(errorSubClass, messageParameters) => + assert(errorSubClass == "INVALID_MAP_KEY_TYPE") + assert( + messageParameters === Map( + "keyType" -> "\"STRUCT\"" + ) + ) + } + test("MapFromArrays") { val intSeq = Seq(5, 10, 15, 20, 25) val longSeq = intSeq.map(_.toLong) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 21e6b8692911d..a063e53486ad8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -71,10 +71,15 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB new ArrayBasedMapData(keyArray, valueArray) } + protected def replace(expr: Expression): Expression = expr match { + case r: RuntimeReplaceable => replace(r.replacement) + case _ => expr.mapChildren(replace) + } + private def prepareEvaluation(expression: Expression): Expression = { val serializer = new JavaSerializer(new SparkConf()).newInstance() val resolver = ResolveTimeZone - val expr = resolver.resolveTimeZones(expression) + val expr = resolver.resolveTimeZones(replace(expression)) assert(expr.resolved) serializer.deserialize(serializer.serialize(expr)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HexBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HexBenchmark.scala new file mode 100644 index 0000000000000..df3fcbb83906c --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HexBenchmark.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import java.util.Locale + +import org.apache.commons.codec.binary.{Hex => ApacheHex} + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.unsafe.types.UTF8String + +/** + * Benchmark for hex + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "catalyst/Test/runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/Test/runMain " + * Results will be written to "benchmarks/HexBenchmark-results.txt". + * }}} + */ +object HexBenchmark extends BenchmarkBase { + + private val hexStrings = { + var tmp = Seq("", "A", "AB", "ABC", "ABCD", "123ABCDEF") + tmp = tmp ++ tmp.map(_.toLowerCase(Locale.ROOT)) + (2 to 4).foreach { i => tmp = tmp ++ tmp.map(x => x * i) } + tmp.map(UTF8String.fromString(_).toString) + } + + private val hexBin = hexStrings.map(_.getBytes) + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("UnHex Comparison") { + val N = 1_000_000 + val benchmark = new Benchmark(s"Cardinality $N", N, 3, output = output) + benchmark.addCase("Common Codecs") { _ => + (1 to N).foreach(_ => hexStrings.foreach(y => apacheDecodeHex(y))) + } + + benchmark.addCase("Java") { _ => + (1 to N).foreach(_ => hexStrings.foreach(y => javaUnhex(y))) + } + + benchmark.addCase("Spark") { _ => + (1 to N).foreach(_ => hexStrings.foreach(y => builtinUnHex(y))) + } + + benchmark.addCase("Spark Binary") { _ => + (1 to N).foreach(_ => hexBin.foreach(y => builtinUnHex(y))) + } + benchmark.run() + } + } + + def apacheDecodeHex(value: String): Array[Byte] = { + val padding = if (value.length % 2 != 0) "0" else "" + ApacheHex.decodeHex(padding + value) + } + + def builtinUnHex(value: String): Array[Byte] = { + Hex.unhex(value) + } + + def builtinUnHex(value: Array[Byte]): Array[Byte] = { + Hex.unhex(value) + } + + def javaUnhex(value: String) : Array[Byte] = { + val padding = if ((value.length & 0x1) != 0) "0" else "" + java.util.HexFormat.of().parseHex(padding + value) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HexSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HexSuite.scala new file mode 100644 index 0000000000000..a3f963538f447 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HexSuite.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite + +class HexSuite extends SparkFunSuite { + test("SPARK-48596: hex long values") { + assert(Hex.hex(0).toString === "0") + assert(Hex.hex(1).toString === "1") + assert(Hex.hex(15).toString === "F") + assert(Hex.hex(16).toString === "10") + assert(Hex.hex(255).toString === "FF") + assert(Hex.hex(256).toString === "100") + assert(Hex.hex(4095).toString === "FFF") + assert(Hex.hex(4096).toString === "1000") + assert(Hex.hex(65535).toString === "FFFF") + assert(Hex.hex(65536).toString === "10000") + assert(Hex.hex(1048575).toString === "FFFFF") + assert(Hex.hex(1048576).toString === "100000") + assert(Hex.hex(-1).toString === "FFFFFFFFFFFFFFFF") + assert(Hex.hex(Long.MinValue).toString === "8000000000000000") + assert(Hex.hex(Long.MaxValue).toString === "7FFFFFFFFFFFFFFF") + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala index da8e11c0433eb..ace017b1cddc3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.Timestamp -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkFunSuite, SparkRuntimeException} import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull @@ -53,10 +53,13 @@ class NullExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("AssertNotNUll") { - val ex = intercept[RuntimeException] { - evaluateWithoutCodegen(AssertNotNull(Literal(null))) - }.getMessage - assert(ex.contains("Null value appeared in non-nullable field")) + checkError( + exception = intercept[SparkRuntimeException] { + evaluateWithoutCodegen(AssertNotNull(Literal(null))) + }, + errorClass = "NOT_NULL_ASSERT_VIOLATION", + sqlState = "42000", + parameters = Map("walkedTypePath" -> "\n\n")) } test("IsNaN") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 0fcceef392389..ebd7245434819 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -489,6 +489,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
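// --- Illustrative aside (not part of the patch above) ---
// The encode/decode checks below are charset round trips; the same property holds
// for the plain JDK charsets (UTF-32 is available on typical JDKs):
import java.nio.charset.Charset

object EncodeDecodeRoundTripSketch {
  def roundTrip(s: String, charsetName: String): String = {
    val cs = Charset.forName(charsetName)
    new String(s.getBytes(cs), cs)
  }

  def main(args: Array[String]): Unit = {
    assert(roundTrip("hello", "UTF-16LE") == "hello")
    assert(roundTrip("hello", "UTF-32") == "hello")
  }
}
// --- end aside ---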
checkEvaluation( StringDecode(Encode(Literal("大千世界"), Literal("UTF-16LE")), Literal("UTF-16LE")), "大千世界") + checkEvaluation( + StringDecode(Encode(Literal("大千世界"), Literal("UTF-32")), Literal("UTF-32")), "大千世界") checkEvaluation( StringDecode(Encode(a, Literal("utf-8")), Literal("utf-8")), "大千世界", create_row("大千世界")) checkEvaluation( @@ -503,8 +505,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringDecode(b, Literal.create(null, StringType)), null, create_row(null)) // Test escaping of charset - GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")) :: Nil) - GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil) + GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")).replacement :: Nil) + GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")).replacement :: Nil) } test("initcap unit test") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryEvalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryEvalSuite.scala index 780a2692e87f7..e082f2e3acccd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryEvalSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/TryEvalSuite.scala @@ -46,6 +46,19 @@ class TryEvalSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("try_remainder") { + Seq( + (3.0, 2.0, 1.0), + (1.0, 0.0, null), + (-1.0, 0.0, null) + ).foreach { case (a, b, expected) => + val left = Literal(a) + val right = Literal(b) + val input = Remainder(left, right, EvalMode.TRY) + checkEvaluation(input, expected) + } + } + test("try_subtract") { Seq( (1, 1, 0), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala index 574d5daa361e9..8fc72caa47860 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionEvalUtilsSuite.scala @@ -25,9 +25,13 @@ class VariantExpressionEvalUtilsSuite extends SparkFunSuite { test("parseJson type coercion") { def check(json: String, expectedValue: Array[Byte], expectedMetadata: Array[Byte]): Unit = { + // parse_json val actual = VariantExpressionEvalUtils.parseJson(UTF8String.fromString(json)) + // try_parse_json + val tryActual = VariantExpressionEvalUtils.parseJson(UTF8String.fromString(json), + failOnError = false) val expected = new VariantVal(expectedValue, expectedMetadata) - assert(actual === expected) + assert(actual === expected && tryActual === expected) } // Dictionary size is `0` for value 0. 
An empty dictionary contains one offset `0` for the @@ -104,6 +108,8 @@ class VariantExpressionEvalUtilsSuite extends SparkFunSuite { test("parseJson negative") { def checkException(json: String, errorClass: String, parameters: Map[String, String]): Unit = { + val try_parse_json_output = VariantExpressionEvalUtils.parseJson(UTF8String.fromString(json), + failOnError = false) checkError( exception = intercept[SparkThrowable] { VariantExpressionEvalUtils.parseJson(UTF8String.fromString(json)) @@ -111,6 +117,7 @@ class VariantExpressionEvalUtilsSuite extends SparkFunSuite { errorClass = errorClass, parameters = parameters ) + assert(try_parse_json_output === null) } for (json <- Seq("", "[", "+1", "1a", """{"a": 1, "b": 2, "a": "3"}""")) { checkException(json, "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION", @@ -122,4 +129,42 @@ class VariantExpressionEvalUtilsSuite extends SparkFunSuite { Map("sizeLimit" -> "16.0 MiB", "functionName" -> "`parse_json`")) } } + + test("isVariantNull") { + def check(json: String, expected: Boolean): Unit = { + if (json != null) { + val parsedVariant = VariantExpressionEvalUtils.parseJson(UTF8String.fromString(json)) + val actual = VariantExpressionEvalUtils.isVariantNull(parsedVariant) + assert(actual == expected) + } else { + val actual = VariantExpressionEvalUtils.isVariantNull(null) + assert(actual == expected) + } + } + + // Primitive types + check("null", expected = true) + check(null, expected = false) + check("0", expected = false) + check("13", expected = false) + check("-54", expected = false) + check("2147483647", expected = false) + check("2147483648", expected = false) + check("238457328534848", expected = false) + check("342.769", expected = false) + check("true", expected = false) + check("false", expected = false) + check("false", expected = false) + check("65.43", expected = false) + check("\"" + "spark" * 100 + "\"", expected = false) + // Short String + check("\"\"", expected = false) + check("\"null\"", expected = false) + // Array + check("[]", expected = false) + check("[null, null]", expected = false) + check("[{\"a\" : 13}, \"spark\"]", expected = false) + // Object + check("[{\"a\" : 13, \"b\" : null}]", expected = false) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala index 9aa1dcd2ef952..a758fa84f6fca 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.variant import java.time.{LocalDateTime, ZoneId, ZoneOffset} +import scala.collection.mutable import scala.reflect.runtime.universe.TypeTag import org.apache.spark.{SparkFunSuite, SparkRuntimeException} @@ -58,6 +59,9 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { check(Array(primitiveHeader(INT8), 0, 0, 0, 0, 0, 0, 0), emptyMetadata) // DECIMAL16 only has 15 byte content. check(Array(primitiveHeader(DECIMAL16)) ++ Array.fill(16)(0.toByte), emptyMetadata) + // 1e38 has a precision of 39. Even if it still fits into 16 bytes, it is not a valid decimal. + check(Array[Byte](primitiveHeader(DECIMAL16), 0) ++ + BigDecimal(1e38).toBigInt.toByteArray.reverse, emptyMetadata) // Short string content too short. 
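// Aside, not part of the patch: the parseJson(..., failOnError = false) calls added above back
// try_parse_json, which returns null instead of raising MALFORMED_RECORD_IN_PARSING on bad
// input. A Spark-free sketch of that contract; the object and helper names are hypothetical:
import scala.util.control.NonFatal

object TryParseSketch {
  // Evaluate the parse thunk; swallow recoverable failures and surface null instead.
  def tryOrNull[T >: Null](parse: => T): T =
    try parse catch { case NonFatal(_) => null }

  def main(args: Array[String]): Unit = {
    assert(tryOrNull(java.time.LocalDate.parse("2024-06-01")) != null) // well-formed input parses
    assert(tryOrNull(java.time.LocalDate.parse("[")) == null)          // malformed input yields null
  }
}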
check(Array(shortStrHeader(2), 'x'), emptyMetadata) // Long string length too short (requires 4 bytes). @@ -239,6 +243,13 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { check(expectedResult4, smallObject, smallMetadata) } + test("is_variant_null invalid input") { + checkErrorInExpression[SparkRuntimeException]( + IsVariantNull(Literal(new VariantVal(Array(), Array(1, 2, 3)))), + "MALFORMED_VARIANT" + ) + } + private def parseJson(input: String): VariantVal = VariantExpressionEvalUtils.parseJson(UTF8String.fromString(input)) @@ -800,6 +811,15 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { "Hello") } + test("SPARK-48150: ParseJson expression nullability") { + assert(!ParseJson(Literal("["), failOnError = true).replacement.nullable) + assert(ParseJson(Literal("["), failOnError = false).replacement.nullable) + checkEvaluation( + ParseJson(Literal("["), failOnError = false).replacement, + null + ) + } + test("cast to variant") { def check[T : TypeTag](input: T, expectedJson: String): Unit = { val cast = Cast(Literal.create(input), VariantType, evalMode = EvalMode.ANSI) @@ -807,9 +827,27 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } check(null.asInstanceOf[String], null) + // The following tests cover all allowed scalar types. for (input <- Seq[Any](false, true, 0.toByte, 1.toShort, 2, 3L, 4.0F, 5.0D)) { check(input, input.toString) } + for (precision <- Seq(9, 18, 38)) { + val input = BigDecimal("9" * precision) + check(Literal.create(input, DecimalType(precision, 0)), input.toString) + } + check("", "\"\"") + check("x" * 128, "\"" + ("x" * 128) + "\"") + check(Array[Byte](1, 2, 3), "\"AQID\"") + check(Literal(0, DateType), "\"1970-01-01\"") + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { + check(Literal(0L, TimestampType), "\"1970-01-01 00:00:00+00:00\"") + check(Literal(0L, TimestampNTZType), "\"1970-01-01 00:00:00\"") + } + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "America/Los_Angeles") { + check(Literal(0L, TimestampType), "\"1969-12-31 16:00:00-08:00\"") + check(Literal(0L, TimestampNTZType), "\"1970-01-01 00:00:00\"") + } + check(Array(null, "a", "b", "c"), """[null,"a","b","c"]""") check(Map("z" -> 1, "y" -> 2, "x" -> 3), """{"x":3,"y":2,"z":1}""") check(Array(parseJson("""{"a": 1,"b": [1, 2, 3]}"""), @@ -823,4 +861,50 @@ class VariantExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { StructType.fromDDL("c ARRAY,b MAP,a STRUCT")) check(struct, """{"a":{"i":0},"b":{"a":"123","b":"true","c":"f"},"c":["123","true","f"]}""") } + + test("schema_of_variant - schema merge") { + val nul = Literal(null, StringType) + val boolean = Literal.default(BooleanType) + val long = Literal.default(LongType) + val string = Literal.default(StringType) + val double = Literal.default(DoubleType) + val date = Literal.default(DateType) + val timestamp = Literal.default(TimestampType) + val timestampNtz = Literal.default(TimestampNTZType) + val float = Literal.default(FloatType) + val binary = Literal.default(BinaryType) + val decimal = Literal(Decimal("123.456"), DecimalType(6, 3)) + val array1 = Literal(Array(0L)) + val array2 = Literal(Array(0.0)) + val struct1 = Literal.default(StructType.fromDDL("a string")) + val struct2 = Literal.default(StructType.fromDDL("a boolean, b bigint")) + val inputs = Seq(nul, boolean, long, string, double, date, timestamp, timestampNtz, float, + binary, decimal, array1, array2, struct1, struct2) + + val results = 
mutable.HashMap.empty[(Literal, Literal), String] + for (i <- inputs) { + val inputType = if (i.value == null) "VOID" else i.dataType.sql + results.put((nul, i), inputType) + results.put((i, i), inputType) + } + results.put((long, double), "DOUBLE") + results.put((long, float), "FLOAT") + results.put((long, decimal), "DECIMAL(23,3)") + results.put((double, float), "DOUBLE") + results.put((double, decimal), "DOUBLE") + results.put((date, timestamp), "TIMESTAMP") + results.put((date, timestampNtz), "TIMESTAMP_NTZ") + results.put((timestamp, timestampNtz), "TIMESTAMP") + results.put((float, decimal), "DOUBLE") + results.put((array1, array2), "ARRAY") + results.put((struct1, struct2), "STRUCT") + + for (i1 <- inputs) { + for (i2 <- inputs) { + val expected = results.getOrElse((i1, i2), results.getOrElse((i2, i1), "VARIANT")) + val array = CreateArray(Seq(Cast(i1, VariantType), Cast(i2, VariantType))) + checkEvaluation(SchemaOfVariant(Cast(array, VariantType)).replacement, s"ARRAY<$expected>") + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala index a8dc2b20f56d8..8351e94c0c360 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.xml +import java.nio.charset.StandardCharsets import javax.xml.xpath.XPathConstants.STRING import org.w3c.dom.Node @@ -85,7 +86,7 @@ class UDFXPathUtilSuite extends SparkFunSuite { tempFile.deleteOnExit() val fname = tempFile.getAbsolutePath - FileUtils.writeStringToFile(tempFile, secretValue) + FileUtils.writeStringToFile(tempFile, secretValue, StandardCharsets.UTF_8) val xml = s""" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala index d2368908d0386..2bbdb4e689193 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala @@ -36,6 +36,8 @@ class BinaryComparisonSimplificationSuite extends PlanTest { EliminateSubqueryAliases) :: Batch("Infer Filters", Once, InferFiltersFromConstraints) :: + Batch("Compute current time", Once, + ComputeCurrentTime) :: Batch("Constant Folding", FixedPoint(50), NullPropagation, ConstantFolding, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala index c5f506d4d6832..e83f231c188e7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala @@ -20,11 +20,12 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Alias, Rand, UpdateFields} +import 
org.apache.spark.sql.catalyst.expressions.{Alias, CreateArray, Expression, GetArrayItem, PythonUDF, Rand, UpdateFields} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{ArrayType, IntegerType, MetadataBuilder} class CollapseProjectSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { @@ -266,4 +267,35 @@ class CollapseProjectSuite extends PlanTest { val expected = relation.select(($"a" + $"b").as("C")).analyze comparePlans(optimized, expected) } + + test("ES-1102888: collapse project duplicating aggregate expressions in UDF") { + withSQLConf(SQLConf.AVOID_COLLAPSE_UDF_WITH_EXPENSIVE_EXPR.key -> "true") { + val pythonUdf = (e: Expression) => { + PythonUDF("udf", null, ArrayType(IntegerType), Seq(e), 0, udfDeterministic = true) + } + + val query = testRelation + .groupBy($"a")(collectList($"b").as("l1")) + .select(pythonUdf($"l1").as("l2")) + .select(CreateArray(Seq( + GetArrayItem($"l2", 0), + GetArrayItem($"l2", 1), + GetArrayItem($"l2", 2), + GetArrayItem($"l2", 3) + ))) + .analyze + + val optimized = Optimize.execute(query) + val expected = testRelation + .groupBy($"a")(pythonUdf(collectList($"b")).as("l2")) + .select(CreateArray(Seq( + GetArrayItem($"l2", 0), + GetArrayItem($"l2", 1), + GetArrayItem($"l2", 2), + GetArrayItem($"l2", 3) + ))) + .analyze + comparePlans(optimized, expected) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 03e65412d166b..5027222be6b80 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -219,6 +219,17 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("Can't push down nondeterministic filter through aggregate") { + val originalQuery = testRelation + .groupBy($"a")($"a", count($"b") as "c") + .where(Rand(10) > $"a") + .analyze + + val optimized = Optimize.execute(originalQuery) + + comparePlans(optimized, originalQuery) + } + test("filters: combines filters") { val originalQuery = testRelation .select($"a") @@ -1483,14 +1494,16 @@ class FilterPushdownSuite extends PlanTest { test("SPARK-46707: push down predicate with sequence (without step) through aggregates") { val x = testRelation.subquery("x") - // do not push down when sequence has step param + // Always push down sequence as it's deterministic val queryWithStep = x.groupBy($"x.a", $"x.b")($"x.a", $"x.b") .where(IsNotNull(Sequence($"x.a", $"x.b", Some(Literal(1))))) .analyze val optimizedQueryWithStep = Optimize.execute(queryWithStep) - comparePlans(optimizedQueryWithStep, queryWithStep) + val correctAnswerWithStep = x.where(IsNotNull(Sequence($"x.a", $"x.b", Some(Literal(1))))) + .groupBy($"x.a", $"x.b")($"x.a", $"x.b") + .analyze + comparePlans(optimizedQueryWithStep, correctAnswerWithStep) - // push down when sequence does not have step param val queryWithoutStep = x.groupBy($"x.a", $"x.b")($"x.a", $"x.b") .where(IsNotNull(Sequence($"x.a", $"x.b", None))) .analyze diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala index 767ef38ea7f7d..5866f29e4e864 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala @@ -214,4 +214,15 @@ class FoldablePropagationSuite extends PlanTest { val expected = testRelation.select(foldableAttr, $"a").rebalance(foldableAttr, $"a").analyze comparePlans(optimized, expected) } + + test("SPARK-48419: Foldable propagation replace foldable column should use origin column name") { + val query = testRelation + .select($"a".as("x"), "str".as("Y"), $"b".as("z")) + .select($"x", $"y", $"z") + val optimized = Optimize.execute(query.analyze) + val correctAnswer = testRelation + .select($"a".as("x"), "str".as("Y"), $"b".as("z")) + .select($"x", "str".as("y"), $"z").analyze + comparePlans(optimized, correctAnswer) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTESuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTESuite.scala new file mode 100644 index 0000000000000..9d775a5335c67 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InlineCTESuite.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.analysis.TestRelation +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CTERelationDef, CTERelationRef, LogicalPlan, OneRowRelation, WithCTE} +import org.apache.spark.sql.catalyst.rules.RuleExecutor + +class InlineCTESuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("inline CTE", FixedPoint(100), InlineCTE()) :: Nil + } + + test("SPARK-48307: not-inlined CTE relation in command") { + val cteDef = CTERelationDef(OneRowRelation().select(rand(0).as("a"))) + val cteRef = CTERelationRef(cteDef.id, cteDef.resolved, cteDef.output, cteDef.isStreaming) + val plan = AppendData.byName( + TestRelation(Seq($"a".double)), + WithCTE(cteRef.except(cteRef, isAll = true), Seq(cteDef)) + ).analyze + comparePlans(Optimize.execute(plan), plan) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinSelectionHelperSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinSelectionHelperSuite.scala index 6acce44922f69..61fb68cfba863 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinSelectionHelperSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinSelectionHelperSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.AttributeMap import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} -import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, JoinHint, NO_BROADCAST_HASH, SHUFFLE_HASH} +import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, Join, JoinHint, NO_BROADCAST_HASH, SHUFFLE_HASH} import org.apache.spark.sql.catalyst.statsEstimation.StatsTestPlan import org.apache.spark.sql.internal.SQLConf @@ -38,16 +38,15 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { size = Some(1000), attributeStats = AttributeMap(Seq())) + private val join = Join(left, right, Inner, None, JoinHint(None, None)) + private val hintBroadcast = Some(HintInfo(Some(BROADCAST))) private val hintNotToBroadcast = Some(HintInfo(Some(NO_BROADCAST_HASH))) private val hintShuffleHash = Some(HintInfo(Some(SHUFFLE_HASH))) test("getBroadcastBuildSide (hintOnly = true) return BuildLeft with only a left hint") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(hintBroadcast, None), + join.copy(hint = JoinHint(hintBroadcast, None)), hintOnly = true, SQLConf.get ) @@ -56,10 +55,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getBroadcastBuildSide (hintOnly = true) return BuildRight with only a right hint") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(None, hintBroadcast), + join.copy(hint = JoinHint(None, hintBroadcast)), hintOnly = true, SQLConf.get ) @@ -68,10 +64,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getBroadcastBuildSide (hintOnly = true) return smaller side with both having hints") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(hintBroadcast, hintBroadcast), + join.copy(hint = JoinHint(hintBroadcast, hintBroadcast)), 
hintOnly = true, SQLConf.get ) @@ -80,10 +73,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getBroadcastBuildSide (hintOnly = true) return None when no side has a hint") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(None, None), + join.copy(hint = JoinHint(None, None)), hintOnly = true, SQLConf.get ) @@ -92,10 +82,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getBroadcastBuildSide (hintOnly = false) return BuildRight when right is broadcastable") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(None, None), + join.copy(hint = JoinHint(None, None)), hintOnly = false, SQLConf.get ) @@ -105,10 +92,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getBroadcastBuildSide (hintOnly = false) return None when right has no broadcast hint") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "10MB") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(None, hintNotToBroadcast ), + join.copy(hint = JoinHint(None, hintNotToBroadcast)), hintOnly = false, SQLConf.get ) @@ -118,10 +102,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getShuffleHashJoinBuildSide (hintOnly = true) return BuildLeft with only a left hint") { val broadcastSide = getShuffleHashJoinBuildSide( - left, - right, - Inner, - JoinHint(hintShuffleHash, None), + join.copy(hint = JoinHint(hintShuffleHash, None)), hintOnly = true, SQLConf.get ) @@ -130,10 +111,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getShuffleHashJoinBuildSide (hintOnly = true) return BuildRight with only a right hint") { val broadcastSide = getShuffleHashJoinBuildSide( - left, - right, - Inner, - JoinHint(None, hintShuffleHash), + join.copy(hint = JoinHint(None, hintShuffleHash)), hintOnly = true, SQLConf.get ) @@ -142,10 +120,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getShuffleHashJoinBuildSide (hintOnly = true) return smaller side when both have hints") { val broadcastSide = getShuffleHashJoinBuildSide( - left, - right, - Inner, - JoinHint(hintShuffleHash, hintShuffleHash), + join.copy(hint = JoinHint(hintShuffleHash, hintShuffleHash)), hintOnly = true, SQLConf.get ) @@ -154,10 +129,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getShuffleHashJoinBuildSide (hintOnly = true) return None when no side has a hint") { val broadcastSide = getShuffleHashJoinBuildSide( - left, - right, - Inner, - JoinHint(None, None), + join.copy(hint = JoinHint(None, None)), hintOnly = true, SQLConf.get ) @@ -166,10 +138,7 @@ class JoinSelectionHelperSuite extends PlanTest with JoinSelectionHelper { test("getShuffleHashJoinBuildSide (hintOnly = false) return BuildRight when right is smaller") { val broadcastSide = getBroadcastBuildSide( - left, - right, - Inner, - JoinHint(None, None), + join.copy(hint = JoinHint(None, None)), hintOnly = false, SQLConf.get ) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueriesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueriesSuite.scala index b640344658d40..b3444b0b43077 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueriesSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueriesSuite.scala @@ -38,7 +38,7 @@ class MergeScalarSubqueriesSuite extends PlanTest { val testRelation = LocalRelation(Symbol("a").int, Symbol("b").int, Symbol("c").string) val testRelationWithNonBinaryCollation = LocalRelation( Symbol("utf8_binary").string("UTF8_BINARY"), - Symbol("utf8_binary_lcase").string("UTF8_BINARY_LCASE")) + Symbol("utf8_lcase").string("UTF8_LCASE")) private def definitionNode(plan: LogicalPlan, cteIndex: Int) = { CTERelationDef(plan, cteIndex, underSubquery = true) @@ -204,7 +204,7 @@ class MergeScalarSubqueriesSuite extends PlanTest { val subquery1 = ScalarSubquery(testRelationWithNonBinaryCollation.groupBy( Symbol("utf8_binary"))(max(Symbol("utf8_binary")).as("max_utf8_binary"))) val subquery2 = ScalarSubquery(testRelationWithNonBinaryCollation.groupBy( - Symbol("utf8_binary_lcase"))(max(Symbol("utf8_binary_lcase")).as("utf8_binary_lcase"))) + Symbol("utf8_lcase"))(max(Symbol("utf8_lcase")).as("utf8_lcase"))) val originalQuery = testRelationWithNonBinaryCollation.select(subquery1, subquery2) Optimize.execute(originalQuery.analyze).collect { case WithCTE(_, _) => fail("Should not have merged") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala index bd0cc6216f7a2..38cd25cf491a1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala @@ -863,6 +863,27 @@ class NestedColumnAliasingSuite extends SchemaPruningTest { // The plan is expected to be unchanged. comparePlans(plan, RemoveNoopOperators.apply(optimized.get)) } + + test("SPARK-48428: Do not pushdown when attr is used in expression with mutliple references") { + val query = contact + .limit(5) + .select( + GetStructField(GetStructField(CreateStruct(Seq($"id", $"employer")), 1), 0), + $"employer.id") + .analyze + + val optimized = Optimize.execute(query) + + val expected = contact + .select($"id", $"employer") + .limit(5) + .select( + GetStructField(GetStructField(CreateStruct(Seq($"id", $"employer")), 1), 0), + $"employer.id") + .analyze + + comparePlans(optimized, expected) + } } object NestedColumnAliasingSuite { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinConditionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinConditionSuite.scala new file mode 100644 index 0000000000000..e7f090ec4d0dc --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeJoinConditionSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules._ + +class OptimizeJoinConditionSuite extends PlanTest { + + private object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("Optimize join condition", FixedPoint(1), + OptimizeJoinCondition) :: Nil + } + + val testRelation = LocalRelation($"a".int, $"b".int) + val testRelation1 = LocalRelation($"c".int, $"d".int) + + test("Replace equivalent expression to <=> in join condition") { + val x = testRelation.subquery("x") + val y = testRelation1.subquery("y") + val joinTypes = Seq(Inner, FullOuter, LeftOuter, RightOuter, LeftSemi, LeftSemi, Cross) + joinTypes.foreach(joinType => { + val originalQuery = + x.join(y, joinType, Option($"a" === $"c" || ($"a".isNull && $"c".isNull))) + val correctAnswer = + x.join(y, joinType, Option($"a" <=> $"c")) + comparePlans(Optimize.execute(originalQuery.analyze), correctAnswer.analyze) + }) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala index 29bc46eaa3ebe..cbd24bd7bb299 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala @@ -158,7 +158,8 @@ class PullupCorrelatedPredicatesSuite extends PlanTest { cond, Seq(DeleteAction(None)), Seq(InsertAction(None, Seq(Assignment($"a", $"c"), Assignment($"b", $"d")))), - Seq(DeleteAction(None))) + Seq(DeleteAction(None)), + withSchemaEvolution = false) val analyzedMergePlan = mergePlan.analyze assert(analyzedMergePlan.resolved) @@ -166,7 +167,7 @@ class PullupCorrelatedPredicatesSuite extends PlanTest { assert(optimized.resolved) optimized match { - case MergeIntoTable(_, _, s: InSubquery, _, _, _) => + case MergeIntoTable(_, _, s: InSubquery, _, _, _, _) => val outerRefs = SubExprUtils.getOuterReferences(s.query.plan) assert(outerRefs.isEmpty, "should be no outer refs") case other => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicateSuite.scala index 7d037799fba76..a50842a26b2ce 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicateSuite.scala @@ -500,7 +500,8 @@ class ReplaceNullWithFalseInPredicateSuite extends PlanTest { mergeCondition = expr, matchedActions, notMatchedActions, - notMatchedBySourceActions) + notMatchedBySourceActions, + withSchemaEvolution = false) } val originalPlan = func(testRelation, anotherTestRelation, originalCond).analyze val optimizedPlan = Optimize.execute(originalPlan) @@ -522,7 +523,8 @@ class ReplaceNullWithFalseInPredicateSuite extends PlanTest { mergeCondition = expr, matchedActions, notMatchedActions, - Seq.empty) + notMatchedBySourceActions = Seq.empty, + 
withSchemaEvolution = false) } val originalPlanWithStar = mergePlanWithStar(originalCond).analyze val optimizedPlanWithStar = Optimize.execute(originalPlanWithStar) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala index a386e9bf4efe6..0aeca961aa513 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala @@ -18,90 +18,96 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.analysis.TempResolvedColumn import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Coalesce, CommonExpressionDef, CommonExpressionRef, With} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.types.IntegerType class RewriteWithExpressionSuite extends PlanTest { object Optimizer extends RuleExecutor[LogicalPlan] { - val batches = Batch("Rewrite With expression", Once, RewriteWithExpression) :: Nil + val batches = Batch("Rewrite With expression", Once, + PullOutGroupingExpressions, + RewriteWithExpression) :: Nil } private val testRelation = LocalRelation($"a".int, $"b".int) private val testRelation2 = LocalRelation($"x".int, $"y".int) + private def normalizeCommonExpressionIds(plan: LogicalPlan): LogicalPlan = { + plan.transformAllExpressions { + case a: Alias if a.name.startsWith("_common_expr") => + a.withName("_common_expr_0") + case a: AttributeReference if a.name.startsWith("_common_expr") => + a.withName("_common_expr_0") + } + } + + override def comparePlans( + plan1: LogicalPlan, plan2: LogicalPlan, checkAnalysis: Boolean = true): Unit = { + super.comparePlans(normalizeCommonExpressionIds(plan1), normalizeCommonExpressionIds(plan2)) + } + test("simple common expression") { val a = testRelation.output.head - val commonExprDef = CommonExpressionDef(a) - val ref = new CommonExpressionRef(commonExprDef) - val plan = testRelation.select(With(ref + ref, Seq(commonExprDef)).as("col")) + val expr = With(a) { case Seq(ref) => + ref + ref + } + val plan = testRelation.select(expr.as("col")) comparePlans(Optimizer.execute(plan), testRelation.select((a + a).as("col"))) } test("non-cheap common expression") { val a = testRelation.output.head - val commonExprDef = CommonExpressionDef(a + a) - val ref = new CommonExpressionRef(commonExprDef) - val plan = testRelation.select(With(ref * ref, Seq(commonExprDef)).as("col")) - val commonExprName = "_common_expr_0" + val expr = With(a + a) { case Seq(ref) => + ref * ref + } + val plan = testRelation.select(expr.as("col")) comparePlans( Optimizer.execute(plan), testRelation - .select((testRelation.output :+ (a + a).as(commonExprName)): _*) - .select(($"$commonExprName" * $"$commonExprName").as("col")) + .select((testRelation.output :+ (a + a).as("_common_expr_0")): _*) + .select(($"_common_expr_0" * $"_common_expr_0").as("col")) .analyze ) } test("nested WITH expression in the definition expression") { - val a = testRelation.output.head - val 
commonExprDef = CommonExpressionDef(a + a) - val ref = new CommonExpressionRef(commonExprDef) - val innerExpr = With(ref + ref, Seq(commonExprDef)) - val innerCommonExprName = "_common_expr_0" - - val b = testRelation.output.last - val outerCommonExprDef = CommonExpressionDef(innerExpr + b) - val outerRef = new CommonExpressionRef(outerCommonExprDef) - val outerExpr = With(outerRef * outerRef, Seq(outerCommonExprDef)) - val outerCommonExprName = "_common_expr_0" + val Seq(a, b) = testRelation.output + val innerExpr = With(a + a) { case Seq(ref) => + ref + ref + } + val outerExpr = With(innerExpr + b) { case Seq(ref) => + ref * ref + } val plan = testRelation.select(outerExpr.as("col")) - val rewrittenOuterExpr = ($"$innerCommonExprName" + $"$innerCommonExprName" + b) - .as(outerCommonExprName) - val outerExprAttr = AttributeReference(outerCommonExprName, IntegerType)( - exprId = rewrittenOuterExpr.exprId) comparePlans( Optimizer.execute(plan), testRelation - .select((testRelation.output :+ (a + a).as(innerCommonExprName)): _*) - .select((testRelation.output :+ $"$innerCommonExprName" :+ rewrittenOuterExpr): _*) - .select((outerExprAttr * outerExprAttr).as("col")) + .select((testRelation.output :+ (a + a).as("_common_expr_0")): _*) + .select((testRelation.output ++ Seq($"_common_expr_0", + ($"_common_expr_0" + $"_common_expr_0" + b).as("_common_expr_1"))): _*) + .select(($"_common_expr_1" * $"_common_expr_1").as("col")) .analyze ) } test("nested WITH expression in the main expression") { - val a = testRelation.output.head - val commonExprDef = CommonExpressionDef(a + a) - val ref = new CommonExpressionRef(commonExprDef) - val innerExpr = With(ref + ref, Seq(commonExprDef)) - val innerCommonExprName = "_common_expr_0" - - val b = testRelation.output.last - val outerCommonExprDef = CommonExpressionDef(b + b) - val outerRef = new CommonExpressionRef(outerCommonExprDef) - val outerExpr = With(outerRef * outerRef + innerExpr, Seq(outerCommonExprDef)) - val outerCommonExprName = "_common_expr_0" + val Seq(a, b) = testRelation.output + val innerExpr = With(a + a) { case Seq(ref) => + ref + ref + } + val outerExpr = With(b + b) { case Seq(ref) => + ref * ref + innerExpr + } val plan = testRelation.select(outerExpr.as("col")) - val rewrittenInnerExpr = (a + a).as(innerCommonExprName) - val rewrittenOuterExpr = (b + b).as(outerCommonExprName) + val rewrittenInnerExpr = (a + a).as("_common_expr_0") + val rewrittenOuterExpr = (b + b).as("_common_expr_1") val finalExpr = rewrittenOuterExpr.toAttribute * rewrittenOuterExpr.toAttribute + (rewrittenInnerExpr.toAttribute + rewrittenInnerExpr.toAttribute) comparePlans( @@ -115,13 +121,12 @@ class RewriteWithExpressionSuite extends PlanTest { } test("correlated nested WITH expression is not supported") { - val b = testRelation.output.last - val outerCommonExprDef = CommonExpressionDef(b + b) + val Seq(a, b) = testRelation.output + val outerCommonExprDef = CommonExpressionDef(b + b, CommonExpressionId(0)) val outerRef = new CommonExpressionRef(outerCommonExprDef) - val a = testRelation.output.head // The inner expression definition references the outer expression - val commonExprDef1 = CommonExpressionDef(a + a + outerRef) + val commonExprDef1 = CommonExpressionDef(a + a + outerRef, CommonExpressionId(1)) val ref1 = new CommonExpressionRef(commonExprDef1) val innerExpr1 = With(ref1 + ref1, Seq(commonExprDef1)) @@ -139,15 +144,15 @@ class RewriteWithExpressionSuite extends PlanTest { test("WITH expression in filter") { val a = testRelation.output.head - val 
commonExprDef = CommonExpressionDef(a + a) - val ref = new CommonExpressionRef(commonExprDef) - val plan = testRelation.where(With(ref < 10 && ref > 0, Seq(commonExprDef))) - val commonExprName = "_common_expr_0" + val condition = With(a + a) { case Seq(ref) => + ref < 10 && ref > 0 + } + val plan = testRelation.where(condition) comparePlans( Optimizer.execute(plan), testRelation - .select((testRelation.output :+ (a + a).as(commonExprName)): _*) - .where($"$commonExprName" < 10 && $"$commonExprName" > 0) + .select((testRelation.output :+ (a + a).as("_common_expr_0")): _*) + .where($"_common_expr_0" < 10 && $"_common_expr_0" > 0) .select(testRelation.output: _*) .analyze ) @@ -155,16 +160,15 @@ class RewriteWithExpressionSuite extends PlanTest { test("WITH expression in join condition: only reference left child") { val a = testRelation.output.head - val commonExprDef = CommonExpressionDef(a + a) - val ref = new CommonExpressionRef(commonExprDef) - val condition = With(ref < 10 && ref > 0, Seq(commonExprDef)) + val condition = With(a + a) { case Seq(ref) => + ref < 10 && ref > 0 + } val plan = testRelation.join(testRelation2, condition = Some(condition)) - val commonExprName = "_common_expr_0" comparePlans( Optimizer.execute(plan), testRelation - .select((testRelation.output :+ (a + a).as(commonExprName)): _*) - .join(testRelation2, condition = Some($"$commonExprName" < 10 && $"$commonExprName" > 0)) + .select((testRelation.output :+ (a + a).as("_common_expr_0")): _*) + .join(testRelation2, condition = Some($"_common_expr_0" < 10 && $"_common_expr_0" > 0)) .select((testRelation.output ++ testRelation2.output): _*) .analyze ) @@ -172,17 +176,16 @@ class RewriteWithExpressionSuite extends PlanTest { test("WITH expression in join condition: only reference right child") { val x = testRelation2.output.head - val commonExprDef = CommonExpressionDef(x + x) - val ref = new CommonExpressionRef(commonExprDef) - val condition = With(ref < 10 && ref > 0, Seq(commonExprDef)) + val condition = With(x + x) { case Seq(ref) => + ref < 10 && ref > 0 + } val plan = testRelation.join(testRelation2, condition = Some(condition)) - val commonExprName = "_common_expr_0" comparePlans( Optimizer.execute(plan), testRelation .join( - testRelation2.select((testRelation2.output :+ (x + x).as(commonExprName)): _*), - condition = Some($"$commonExprName" < 10 && $"$commonExprName" > 0) + testRelation2.select((testRelation2.output :+ (x + x).as("_common_expr_0")): _*), + condition = Some($"_common_expr_0" < 10 && $"_common_expr_0" > 0) ) .select((testRelation.output ++ testRelation2.output): _*) .analyze @@ -192,9 +195,9 @@ class RewriteWithExpressionSuite extends PlanTest { test("WITH expression in join condition: reference both children") { val a = testRelation.output.head val x = testRelation2.output.head - val commonExprDef = CommonExpressionDef(a + x) - val ref = new CommonExpressionRef(commonExprDef) - val condition = With(ref < 10 && ref > 0, Seq(commonExprDef)) + val condition = With(a + x) { case Seq(ref) => + ref < 10 && ref > 0 + } val plan = testRelation.join(testRelation2, condition = Some(condition)) comparePlans( Optimizer.execute(plan), @@ -209,24 +212,244 @@ class RewriteWithExpressionSuite extends PlanTest { test("WITH expression inside conditional expression") { val a = testRelation.output.head - val commonExprDef = CommonExpressionDef(a + a) - val ref = new CommonExpressionRef(commonExprDef) - val expr = Coalesce(Seq(a, With(ref * ref, Seq(commonExprDef)))) + val expr = Coalesce(Seq(a, With(a + a) { case 
Seq(ref) => + ref * ref + })) val inlinedExpr = Coalesce(Seq(a, (a + a) * (a + a))) val plan = testRelation.select(expr.as("col")) // With in the conditional branches is always inlined. comparePlans(Optimizer.execute(plan), testRelation.select(inlinedExpr.as("col"))) - val expr2 = Coalesce(Seq(With(ref * ref, Seq(commonExprDef)), a)) + val expr2 = Coalesce(Seq(With(a + a) { case Seq(ref) => + ref * ref + }, a)) val plan2 = testRelation.select(expr2.as("col")) - val commonExprName = "_common_expr_0" // With in the always-evaluated branches can still be optimized. comparePlans( Optimizer.execute(plan2), testRelation - .select((testRelation.output :+ (a + a).as(commonExprName)): _*) - .select(Coalesce(Seq(($"$commonExprName" * $"$commonExprName"), a)).as("col")) + .select((testRelation.output :+ (a + a).as("_common_expr_0")): _*) + .select(Coalesce(Seq(($"_common_expr_0" * $"_common_expr_0"), a)).as("col")) + .analyze + ) + } + + test("WITH expression in grouping exprs") { + val a = testRelation.output.head + val expr1 = With(a + 1) { case Seq(ref) => + ref * ref + } + val expr2 = With(a + 1) { case Seq(ref) => + ref * ref + } + val expr3 = With(a + 1) { case Seq(ref) => + ref * ref + } + val plan = testRelation.groupBy(expr1)( + (expr2 + 2).as("col1"), + count(expr3 - 3).as("col2") + ) + comparePlans( + Optimizer.execute(plan), + testRelation + .select(testRelation.output :+ (a + 1).as("_common_expr_0"): _*) + .select(testRelation.output :+ + ($"_common_expr_0" * $"_common_expr_0").as("_groupingexpression"): _*) + .select(testRelation.output ++ Seq($"_groupingexpression", + (a + 1).as("_common_expr_1")): _*) + .groupBy($"_groupingexpression")( + $"_groupingexpression", + count($"_common_expr_1" * $"_common_expr_1" - 3).as("_aggregateexpression") + ) + .select(($"_groupingexpression" + 2).as("col1"), $"_aggregateexpression".as("col2")) + .analyze + ) + // Running CollapseProject after the rule cleans up the unnecessary projections. 
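// Aside, not part of the patch: the rewritten tests in this suite build common expressions via
// the With(child) { case Seq(ref) => body } helper instead of wiring CommonExpressionDef and
// CommonExpressionRef by hand; the helper hands one reference per definition to the closure.
// A Spark-free sketch of that shape with hypothetical names:
object WithBuilderSketch {
  final case class Def(id: Int, child: String)
  final case class Ref(id: Int)

  // One definition, one reference passed to the caller-supplied body.
  def withCommon[A](child: String)(body: Seq[Ref] => A): (Seq[Def], A) = {
    val defs = Seq(Def(0, child))
    (defs, body(defs.map(d => Ref(d.id))))
  }

  def main(args: Array[String]): Unit = {
    val (defs, expr) = withCommon("a + a") { case Seq(ref) => s"$ref * $ref" }
    assert(defs.size == 1 && expr == "Ref(0) * Ref(0)")
  }
}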
+ comparePlans( + CollapseProject(Optimizer.execute(plan)), + testRelation + .select(testRelation.output :+ (a + 1).as("_common_expr_0"): _*) + .select(testRelation.output ++ Seq( + ($"_common_expr_0" * $"_common_expr_0").as("_groupingexpression"), + (a + 1).as("_common_expr_1")): _*) + .groupBy($"_groupingexpression")( + ($"_groupingexpression" + 2).as("col1"), + count($"_common_expr_1" * $"_common_expr_1" - 3).as("col2") + ) + .analyze + ) + } + + test("WITH expression in aggregate exprs") { + val Seq(a, b) = testRelation.output + val expr1 = With(a + 1) { case Seq(ref) => + ref * ref + } + val expr2 = With(b + 2) { case Seq(ref) => + ref * ref + } + val plan = testRelation.groupBy(a)( + (a + 3).as("col1"), + expr1.as("col2"), + max(expr2).as("col3") + ) + comparePlans( + Optimizer.execute(plan), + testRelation + .select(testRelation.output :+ (b + 2).as("_common_expr_0"): _*) + .groupBy(a)(a, max($"_common_expr_0" * $"_common_expr_0").as("_aggregateexpression")) + .select(a, $"_aggregateexpression", (a + 1).as("_common_expr_1")) + .select( + (a + 3).as("col1"), + ($"_common_expr_1" * $"_common_expr_1").as("col2"), + $"_aggregateexpression".as("col3") + ) + .analyze + ) + } + + test("WITH common expression is aggregate function") { + val a = testRelation.output.head + val expr = With(count(a - 1)) { case Seq(ref) => + ref * ref + } + val plan = testRelation.groupBy(a)( + (a - 1).as("col1"), + expr.as("col2") + ) + comparePlans( + Optimizer.execute(plan), + testRelation + .groupBy(a)(a, count(a - 1).as("_aggregateexpression")) + .select( + (a - 1).as("col1"), + ($"_aggregateexpression" * $"_aggregateexpression").as("col2") + ) + .analyze + ) + } + + test("aggregate functions in child of WITH expression with ref is not supported") { + val a = testRelation.output.head + intercept[java.lang.AssertionError] { + val expr = With(a - 1) { case Seq(ref) => + sum(ref * ref) + } + val plan = testRelation.groupBy(a)( + (a - 1).as("col1"), + expr.as("col2") + ) + Optimizer.execute(plan) + } + } + + test("WITH expression nested in aggregate function") { + val a = testRelation.output.head + val expr = With(a + 1) { case Seq(ref) => + ref * ref + } + val nestedExpr = With(a - 1) { case Seq(ref) => + ref * max(expr) + ref + } + val plan = testRelation.groupBy(a)(nestedExpr.as("col")).analyze + comparePlans( + Optimizer.execute(plan), + testRelation + .select(testRelation.output :+ (a + 1).as("_common_expr_0"): _*) + .groupBy(a)(a, max($"_common_expr_0" * $"_common_expr_0").as("_aggregateexpression")) + .select($"a", $"_aggregateexpression", (a - 1).as("_common_expr_1")) + .select(($"_common_expr_1" * $"_aggregateexpression" + $"_common_expr_1").as("col")) + .analyze + ) + } + + test("WITH expression in window exprs") { + val Seq(a, b) = testRelation.output + val expr1 = With(a + 1) { case Seq(ref) => + ref * ref + } + val expr2 = With(b + 2) { case Seq(ref) => + ref * ref + } + val frame = SpecifiedWindowFrame(RowFrame, UnboundedPreceding, UnboundedFollowing) + val plan = testRelation + .window( + Seq(windowExpr(count(a), windowSpec(Seq(expr2), Nil, frame)).as("col2")), + Seq(expr2), + Nil + ) + .window( + Seq(windowExpr(sum(expr1), windowSpec(Seq(a), Nil, frame)).as("col3")), + Seq(a), + Nil + ) + .select((a - 1).as("col1"), $"col2", $"col3") + .analyze + comparePlans( + Optimizer.execute(plan), + testRelation + .select(a, b, (b + 2).as("_common_expr_0")) + .select(a, b, $"_common_expr_0", (b + 2).as("_common_expr_1")) + .window( + Seq(windowExpr(count(a), windowSpec(Seq($"_common_expr_0" * 
$"_common_expr_0"), Nil, + frame)).as("col2")), + Seq($"_common_expr_1" * $"_common_expr_1"), + Nil + ) + .select(a, b, $"col2") + .select(a, b, $"col2", (a + 1).as("_common_expr_2")) + .window( + Seq(windowExpr(sum($"_common_expr_2" * $"_common_expr_2"), + windowSpec(Seq(a), Nil, frame)).as("col3")), + Seq(a), + Nil + ) + .select(a, b, $"col2", $"col3") + .select((a - 1).as("col1"), $"col2", $"col3") .analyze ) } + + test("WITH common expression is window function") { + val a = testRelation.output.head + val frame = SpecifiedWindowFrame(RowFrame, UnboundedPreceding, UnboundedFollowing) + val winExpr = windowExpr(sum(a), windowSpec(Seq(a), Nil, frame)) + val expr = With(winExpr) { + case Seq(ref) => ref * ref + } + val plan = testRelation.select(expr.as("col")).analyze + comparePlans( + Optimizer.execute(plan), + testRelation + .select(a) + .window(Seq(winExpr.as("_we0")), Seq(a), Nil) + .select(a, $"_we0", ($"_we0" * $"_we0").as("col")) + .select($"col") + .analyze + ) + } + + test("window functions in child of WITH expression with ref is not supported") { + val a = testRelation.output.head + intercept[java.lang.AssertionError] { + val expr = With(a - 1) { case Seq(ref) => + ref + windowExpr(sum(ref), windowSpec(Seq(a), Nil, UnspecifiedFrame)) + } + val plan = testRelation.window(Seq(expr.as("col")), Seq(a), Nil) + Optimizer.execute(plan) + } + } + + test("SPARK-48252: TempResolvedColumn in common expression") { + val a = testRelation.output.head + val tempResolved = TempResolvedColumn(a, Seq("a")) + val expr = With(tempResolved) { case Seq(ref) => + ref === 1 + } + val plan = testRelation.having($"b")(avg("a").as("a"))(expr).analyze + comparePlans( + Optimizer.execute(plan), + testRelation.groupBy($"b")(avg("a").as("a")).where($"a" === 1).analyze + ) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index b306ca3cd18a5..8612a6e9c50ff 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -1075,19 +1075,11 @@ class DDLParserSuite extends AnalysisTest { ifExists = true)) } - // ALTER TABLE table_name SET TBLPROPERTIES ('comment' = new_comment); // ALTER TABLE table_name UNSET TBLPROPERTIES [IF EXISTS] ('comment', 'key'); test("alter table: alter table properties") { - val sql1_table = "ALTER TABLE table_name SET TBLPROPERTIES ('test' = 'test', " + - "'comment' = 'new_comment')" val sql2_table = "ALTER TABLE table_name UNSET TBLPROPERTIES ('comment', 'test')" val sql3_table = "ALTER TABLE table_name UNSET TBLPROPERTIES IF EXISTS ('comment', 'test')" - comparePlans( - parsePlan(sql1_table), - SetTableProperties( - UnresolvedTable(Seq("table_name"), "ALTER TABLE ... 
SET TBLPROPERTIES", true), - Map("test" -> "test", "comment" -> "new_comment"))) comparePlans( parsePlan(sql2_table), UnsetTableProperties( @@ -1875,7 +1867,8 @@ class DDLParserSuite extends AnalysisTest { Assignment(UnresolvedAttribute("target.col2"), UnresolvedAttribute("source.col2"))))), Seq(DeleteAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("delete")))), UpdateAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("update"))), - Seq(Assignment(UnresolvedAttribute("target.col3"), Literal("delete"))))))) + Seq(Assignment(UnresolvedAttribute("target.col3"), Literal("delete"))))), + withSchemaEvolution = false)) } test("merge into table: using subquery") { @@ -1906,7 +1899,8 @@ class DDLParserSuite extends AnalysisTest { Assignment(UnresolvedAttribute("target.col2"), UnresolvedAttribute("source.col2"))))), Seq(DeleteAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("delete")))), UpdateAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("update"))), - Seq(Assignment(UnresolvedAttribute("target.col3"), Literal("delete"))))))) + Seq(Assignment(UnresolvedAttribute("target.col3"), Literal("delete"))))), + withSchemaEvolution = false)) } test("merge into table: cte") { @@ -1939,7 +1933,8 @@ class DDLParserSuite extends AnalysisTest { Assignment(UnresolvedAttribute("target.col2"), UnresolvedAttribute("source.col2"))))), Seq(DeleteAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("delete")))), UpdateAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("update"))), - Seq(Assignment(UnresolvedAttribute("target.col3"), Literal("delete"))))))) + Seq(Assignment(UnresolvedAttribute("target.col3"), Literal("delete"))))), + withSchemaEvolution = false)) } test("merge into table: no additional condition") { @@ -1962,7 +1957,8 @@ class DDLParserSuite extends AnalysisTest { Seq(InsertAction(None, Seq(Assignment(UnresolvedAttribute("target.col1"), UnresolvedAttribute("source.col1")), Assignment(UnresolvedAttribute("target.col2"), UnresolvedAttribute("source.col2"))))), - Seq(DeleteAction(None)))) + Seq(DeleteAction(None)), + withSchemaEvolution = false)) } test("merge into table: star") { @@ -1983,7 +1979,8 @@ class DDLParserSuite extends AnalysisTest { Seq(DeleteAction(Some(EqualTo(UnresolvedAttribute("target.col2"), Literal("delete")))), UpdateStarAction(Some(EqualTo(UnresolvedAttribute("target.col2"), Literal("update"))))), Seq(InsertStarAction(Some(EqualTo(UnresolvedAttribute("target.col2"), Literal("insert"))))), - Seq.empty)) + Seq.empty, + withSchemaEvolution = false)) } test("merge into table: invalid star in not matched by source") { @@ -2024,7 +2021,8 @@ class DDLParserSuite extends AnalysisTest { Seq(Assignment(UnresolvedAttribute("target.col1"), Literal(1)), Assignment(UnresolvedAttribute("target.col2"), UnresolvedAttribute("source.col2")))), InsertStarAction(None)), - Seq.empty)) + Seq.empty, + withSchemaEvolution = false)) } test("merge into table: column aliases are not allowed") { @@ -2085,7 +2083,26 @@ class DDLParserSuite extends AnalysisTest { UpdateAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("update1"))), Seq(Assignment(UnresolvedAttribute("target.col3"), Literal(1)))), UpdateAction(Some(EqualTo(UnresolvedAttribute("target.col3"), Literal("update2"))), - Seq(Assignment(UnresolvedAttribute("target.col3"), Literal(2))))))) + Seq(Assignment(UnresolvedAttribute("target.col3"), Literal(2))))), + withSchemaEvolution = false)) + } + + test("merge into table: schema evolution") { + parseCompare( + 
""" + |MERGE WITH SCHEMA EVOLUTION INTO testcat1.ns1.ns2.tbl AS target + |USING testcat2.ns1.ns2.tbl AS source + |ON target.col1 = source.col1 + |WHEN NOT MATCHED BY SOURCE THEN DELETE + """.stripMargin, + MergeIntoTable( + SubqueryAlias("target", UnresolvedRelation(Seq("testcat1", "ns1", "ns2", "tbl"))), + SubqueryAlias("source", UnresolvedRelation(Seq("testcat2", "ns1", "ns2", "tbl"))), + EqualTo(UnresolvedAttribute("target.col1"), UnresolvedAttribute("source.col1")), + matchedActions = Seq.empty, + notMatchedActions = Seq.empty, + notMatchedBySourceActions = Seq(DeleteAction(None)), + withSchemaEvolution = true)) } test("merge into table: only the last matched clause can omit the condition") { @@ -2824,7 +2841,8 @@ class DDLParserSuite extends AnalysisTest { Seq(DeleteAction(Some(EqualTo(UnresolvedAttribute("target.col2"), Literal("delete")))), UpdateAction(Some(EqualTo(UnresolvedAttribute("target.col2"), Literal("update"))), Seq(Assignment(UnresolvedAttribute("target.col2"), - UnresolvedAttribute("DEFAULT"))))))) + UnresolvedAttribute("DEFAULT"))))), + withSchemaEvolution = false)) } test("SPARK-40944: Relax ordering constraint for CREATE TABLE column options") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala index 6fb37ae33fa8d..0130ae72a03c4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala @@ -323,4 +323,37 @@ class ErrorParserSuite extends AnalysisTest { parameters = Map("type" -> "\"CHARACTER\""), context = ExpectedContext(fragment = "Character", start = 19, stop = 27)) } + + test("'!' where only NOT should be allowed") { + checkError( + exception = parseException("SELECT 1 ! IN (2)"), + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + parameters = Map("clause" -> "!"), + context = ExpectedContext(fragment = "!", start = 9, stop = 9)) + checkError( + exception = parseException("SELECT 'a' ! LIKE 'b'"), + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + parameters = Map("clause" -> "!"), + context = ExpectedContext(fragment = "!", start = 11, stop = 11)) + checkError( + exception = parseException("SELECT 1 ! BETWEEN 1 AND 2"), + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + parameters = Map("clause" -> "!"), + context = ExpectedContext(fragment = "!", start = 9, stop = 9)) + checkError( + exception = parseException("SELECT 1 IS ! NULL"), + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + parameters = Map("clause" -> "!"), + context = ExpectedContext(fragment = "!", start = 12, stop = 12)) + checkError( + exception = parseException("CREATE TABLE IF ! EXISTS t(c1 INT)"), + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + parameters = Map("clause" -> "!"), + context = ExpectedContext(fragment = "!", start = 16, stop = 16)) + checkError( + exception = parseException("CREATE TABLE t(c1 INT ! 
NULL)"), + errorClass = "SYNTAX_DISCONTINUED.BANG_EQUALS_NOT", + parameters = Map("clause" -> "!"), + context = ExpectedContext(fragment = "!", start = 22, stop = 22)) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala index d9f3067d30e51..218304db3d591 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParserUtilsSuite.scala @@ -131,6 +131,18 @@ class ParserUtilsSuite extends SparkFunSuite { |cd\ef"""".stripMargin) == """ab |cdef""".stripMargin) + + // String with an invalid '\' as the last character. + assert(unescapeSQLString(""""abc\"""") == "abc\\") + + // Strings containing invalid Unicode escapes with non-hex characters. + assert(unescapeSQLString("\"abc\\uXXXXa\"") == "abcuXXXXa") + assert(unescapeSQLString("\"abc\\uxxxxa\"") == "abcuxxxxa") + assert(unescapeSQLString("\"abc\\UXXXXXXXXa\"") == "abcUXXXXXXXXa") + assert(unescapeSQLString("\"abc\\Uxxxxxxxxa\"") == "abcUxxxxxxxxa") + // Guard against off-by-one errors in the "all chars are hex" routine: + assert(unescapeSQLString("\"abc\\uAAAXa\"") == "abcuAAAXa") + // scalastyle:on nonascii } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 17dd7349e7bea..8d01040563361 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -1617,14 +1617,23 @@ class PlanParserSuite extends AnalysisTest { parameters = Map( "error" -> "'order'", "hint" -> "")) - val sql8 = s"select * from my_tvf(arg1 => table(select col1, col2, col3 from v2) " + - s"$partition by col1, col2 order by col2 asc, col3 desc)" + val sql8tableArg = "table(select col1, col2, col3 from v2)" + val sql8partition = s"$partition by col1, col2 order by col2 asc, col3 desc" + val sql8 = s"select * from my_tvf(arg1 => $sql8tableArg $sql8partition)" checkError( exception = parseException(sql8), - errorClass = "PARSE_SYNTAX_ERROR", + errorClass = "_LEGACY_ERROR_TEMP_0064", parameters = Map( - "error" -> "'order'", - "hint" -> ": extra input 'order'")) + "msg" -> + ("The table function call includes a table argument with an invalid " + + "partitioning/ordering specification: the PARTITION BY clause included multiple " + + "expressions without parentheses surrounding them; please add parentheses around " + + "these expressions and then retry the query again")), + context = ExpectedContext( + fragment = s"$sql8tableArg $sql8partition", + start = 29, + stop = 110 + partition.length) + ) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala new file mode 100644 index 0000000000000..657e4b2232ee9 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/SqlScriptingParserSuite.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.parser + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.plans.SQLHelper + +class SqlScriptingParserSuite extends SparkFunSuite with SQLHelper { + import CatalystSqlParser._ + + test("single select") { + val sqlScriptText = "SELECT 1;" + val tree = parseScript(sqlScriptText) + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[SingleStatement]) + val sparkStatement = tree.collection.head.asInstanceOf[SingleStatement] + assert(sparkStatement.getText(sqlScriptText) == "SELECT 1;") + } + + test("single select without ;") { + val sqlScriptText = "SELECT 1" + val tree = parseScript(sqlScriptText) + assert(tree.collection.length == 1) + assert(tree.collection.head.isInstanceOf[SingleStatement]) + val sparkStatement = tree.collection.head.asInstanceOf[SingleStatement] + assert(sparkStatement.getText(sqlScriptText) == "SELECT 1") + } + + test("multi select without ; - should fail") { + val sqlScriptText = "SELECT 1 SELECT 1" + val e = intercept[ParseException] { + parseScript(sqlScriptText) + } + assert(e.getErrorClass === "PARSE_SYNTAX_ERROR") + assert(e.getMessage.contains("Syntax error")) + assert(e.getMessage.contains("SELECT 1 SELECT 1")) + } + + test("multi select") { + val sqlScriptText = "BEGIN SELECT 1;SELECT 2; END" + val tree = parseScript(sqlScriptText) + assert(tree.collection.length == 2) + assert(tree.collection.forall(_.isInstanceOf[SingleStatement])) + + sqlScriptText.split(";") + .map(cleanupStatementString) + .zip(tree.collection) + .foreach { case (expected, statement) => + val sparkStatement = statement.asInstanceOf[SingleStatement] + val statementText = sparkStatement.getText(sqlScriptText) + assert(statementText == expected) + } + } + + test("empty BEGIN END block") { + val sqlScriptText = + """ + |BEGIN + |END""".stripMargin + val tree = parseScript(sqlScriptText) + assert(tree.collection.isEmpty) + } + + test("multiple ; in row - should fail") { + val sqlScriptText = + """ + |BEGIN + | SELECT 1;; + | SELECT 2; + |END""".stripMargin + val e = intercept[ParseException] { + parseScript(sqlScriptText) + } + assert(e.getErrorClass === "PARSE_SYNTAX_ERROR") + assert(e.getMessage.contains("Syntax error")) + assert(e.getMessage.contains("at or near ';'")) + } + + test("without ; in last statement - should fail") { + val sqlScriptText = + """ + |BEGIN + | SELECT 1; + | SELECT 2 + |END""".stripMargin + val e = intercept[ParseException] { + parseScript(sqlScriptText) + } + assert(e.getErrorClass === "PARSE_SYNTAX_ERROR") + assert(e.getMessage.contains("Syntax error")) + assert(e.getMessage.contains("at or near end of input")) + } + + test("multi statement") { + val sqlScriptText = + """ + |BEGIN + | SELECT 1; + | SELECT 2; + | INSERT INTO A VALUES (a, b, 3); + | SELECT a, b, c FROM T; + | SELECT * FROM T; + |END""".stripMargin + val tree = parseScript(sqlScriptText) + assert(tree.collection.length == 5) + 
assert(tree.collection.forall(_.isInstanceOf[SingleStatement])) + sqlScriptText.split(";") + .map(cleanupStatementString) + .zip(tree.collection) + .foreach { case (expected, statement) => + val sparkStatement = statement.asInstanceOf[SingleStatement] + val statementText = sparkStatement.getText(sqlScriptText) + assert(statementText == expected) + } + } + + test("nested begin end") { + val sqlScriptText = + """ + |BEGIN + | BEGIN + | SELECT 1; + | END; + | BEGIN + | BEGIN + | SELECT 2; + | SELECT 3; + | END; + | END; + |END""".stripMargin + val tree = parseScript(sqlScriptText) + assert(tree.collection.length == 2) + assert(tree.collection.head.isInstanceOf[CompoundBody]) + val body1 = tree.collection.head.asInstanceOf[CompoundBody] + assert(body1.collection.length == 1) + assert(body1.collection.head.asInstanceOf[SingleStatement].getText(sqlScriptText) + == "SELECT 1") + + val body2 = tree.collection(1).asInstanceOf[CompoundBody] + assert(body2.collection.length == 1) + assert(body2.collection.head.isInstanceOf[CompoundBody]) + val nestedBody = body2.collection.head.asInstanceOf[CompoundBody] + assert(nestedBody.collection.head.asInstanceOf[SingleStatement].getText(sqlScriptText) + == "SELECT 2") + assert(nestedBody.collection(1).asInstanceOf[SingleStatement].getText(sqlScriptText) + == "SELECT 3") + } + + // Helper methods + def cleanupStatementString(statementStr: String): String = { + statementStr + .replace("\n", "") + .replace("BEGIN", "") + .replace("END", "") + .trim + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/JoinTypesTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/JoinTypesTest.scala index 43221bf60ca34..886b043ad79e6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/JoinTypesTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/JoinTypesTest.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.AnalysisException class JoinTypesTest extends SparkFunSuite { @@ -61,4 +62,18 @@ class JoinTypesTest extends SparkFunSuite { assert(JoinType("cross") === Cross) } + test("unsupported join type") { + val joinType = "unknown" + checkError( + exception = intercept[AnalysisException]( + JoinType(joinType) + ), + errorClass = "UNSUPPORTED_JOIN_TYPE", + sqlState = "0A000", + parameters = Map( + "typ" -> joinType, + "supported" -> JoinType.supported.mkString("'", "', '", "'") + ) + ) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala index 31f7e07143c50..f783083d0a44f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.plans +import scala.annotation.nowarn + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -83,6 +85,26 @@ class LogicalPlanSuite extends SparkFunSuite { } test("transformExpressions works with a Stream") { + val id1 = NamedExpression.newExprId + val id2 = NamedExpression.newExprId + @nowarn("cat=deprecation") + val plan = Project(Stream( + Alias(Literal(1), "a")(exprId = id1), + Alias(Literal(2), "b")(exprId = id2)), + OneRowRelation()) 
+ val result = plan.transformExpressions { + case Literal(v: Int, IntegerType) if v != 1 => + Literal(v + 1, IntegerType) + } + @nowarn("cat=deprecation") + val expected = Project(Stream( + Alias(Literal(1), "a")(exprId = id1), + Alias(Literal(3), "b")(exprId = id2)), + OneRowRelation()) + assert(result.sameResult(expected)) + } + + test("SPARK-45685: transformExpressions works with a LazyList") { val id1 = NamedExpression.newExprId val id2 = NamedExpression.newExprId val plan = Project(LazyList( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 4dbadef93a071..21542d43eac98 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.trees import java.math.BigInteger import java.util.UUID +import scala.annotation.nowarn import scala.collection.mutable.ArrayBuffer import org.json4s.JsonAST._ @@ -693,6 +694,22 @@ class TreeNodeSuite extends SparkFunSuite with SQLHelper { } test("transform works on stream of children") { + @nowarn("cat=deprecation") + val before = Coalesce(Stream(Literal(1), Literal(2))) + // Note it is a bit tricky to exhibit the broken behavior. Basically we want to create the + // situation in which the TreeNode.mapChildren function's change detection is not triggered. A + // stream's first element is typically materialized, so in order to not trip the TreeNode change + // detection logic, we should not change the first element in the sequence. + val result = before.transform { + case Literal(v: Int, IntegerType) if v != 1 => + Literal(v + 1, IntegerType) + } + @nowarn("cat=deprecation") + val expected = Coalesce(Stream(Literal(1), Literal(3))) + assert(result === expected) + } + + test("SPARK-45685: transform works on LazyList of children") { val before = Coalesce(LazyList(Literal(1), Literal(2))) // Note it is a bit tricky to exhibit the broken behavior. Basically we want to create the // situation in which the TreeNode.mapChildren function's change detection is not triggered. 
A @@ -707,6 +724,16 @@ class TreeNodeSuite extends SparkFunSuite with SQLHelper { } test("withNewChildren on stream of children") { + @nowarn("cat=deprecation") + val before = Coalesce(Stream(Literal(1), Literal(2))) + @nowarn("cat=deprecation") + val result = before.withNewChildren(Stream(Literal(1), Literal(3))) + @nowarn("cat=deprecation") + val expected = Coalesce(Stream(Literal(1), Literal(3))) + assert(result === expected) + } + + test("SPARK-45685: withNewChildren on LazyList of children") { val before = Coalesce(LazyList(Literal(1), Literal(2))) val result = before.withNewChildren(LazyList(Literal(1), Literal(3))) val expected = Coalesce(LazyList(Literal(1), Literal(3))) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index f68d485ac95fd..8d8669aece894 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -26,7 +26,7 @@ import java.util.concurrent.TimeUnit import org.scalatest.matchers.must.Matchers import org.scalatest.matchers.should.Matchers._ -import org.apache.spark.{SparkException, SparkFunSuite, SparkIllegalArgumentException} +import org.apache.spark.{SparkFunSuite, SparkIllegalArgumentException} import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ @@ -1040,11 +1040,14 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { } checkError( - exception = intercept[SparkException] { + exception = intercept[SparkIllegalArgumentException] { timestampAdd("SECS", 1, date(1969, 1, 1, 0, 0, 0, 1, getZoneId("UTC")), getZoneId("UTC")) }, - errorClass = "INTERNAL_ERROR", - parameters = Map("message" -> "Got the unexpected unit 'SECS'.")) + errorClass = "INVALID_PARAMETER_VALUE.DATETIME_UNIT", + parameters = Map( + "functionName" -> "`TIMESTAMPADD`", + "parameter" -> "`unit`", + "invalidValue" -> "'SECS'")) } test("SPARK-38284: difference between two timestamps in units") { @@ -1092,14 +1095,17 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { } checkError( - exception = intercept[SparkException] { + exception = intercept[SparkIllegalArgumentException] { timestampDiff( "SECS", date(1969, 1, 1, 0, 0, 0, 1, getZoneId("UTC")), date(2022, 1, 1, 0, 0, 0, 1, getZoneId("UTC")), getZoneId("UTC")) }, - errorClass = "INTERNAL_ERROR", - parameters = Map("message" -> "Got the unexpected unit 'SECS'.")) + errorClass = "INVALID_PARAMETER_VALUE.DATETIME_UNIT", + parameters = + Map("functionName" -> "`TIMESTAMPDIFF`", + "parameter" -> "`unit`", + "invalidValue" -> "'SECS'")) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapperBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapperBenchmark.scala new file mode 100644 index 0000000000000..cc28e85525162 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/InternalRowComparableWrapperBenchmark.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning +import org.apache.spark.sql.connector.catalog.PartitionInternalRow +import org.apache.spark.sql.types.IntegerType + +/** + * Benchmark for [[InternalRowComparableWrapper]]. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> + * 2. build/sbt "catalyst/Test/runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/Test/runMain <this class>" + * Results will be written to "benchmarks/InternalRowComparableWrapperBenchmark-results.txt". + * }}} + */ +object InternalRowComparableWrapperBenchmark extends BenchmarkBase { + + private def constructAndRunBenchmark(): Unit = { + val partitionNum = 200_000 + val bucketNum = 4096 + val day = 20240401 + val partitions = (0 until partitionNum).map { i => + val bucketId = i % bucketNum + PartitionInternalRow.apply(Array(day, bucketId)); + } + val benchmark = new Benchmark("internal row comparable wrapper", partitionNum, output = output) + + benchmark.addCase("toSet") { _ => + val distinct = partitions + .map(new InternalRowComparableWrapper(_, Seq(IntegerType, IntegerType))) + .toSet + assert(distinct.size == bucketNum) + } + + benchmark.addCase("mergePartitions") { _ => + // just to mock the data types + val expressions = (Seq(Literal(day, IntegerType), Literal(0, IntegerType))) + + val leftPartitioning = KeyGroupedPartitioning(expressions, bucketNum, partitions) + val rightPartitioning = KeyGroupedPartitioning(expressions, bucketNum, partitions) + val merged = InternalRowComparableWrapper.mergePartitions( + leftPartitioning, rightPartitioning, expressions) + assert(merged.size == bucketNum) + } + + benchmark.run() + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + constructAndRunBenchmark() + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtilsSuite.scala index b6e87c456de0c..0b3f1f1bdb79d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeRowUtilsSuite.scala @@ -93,7 +93,7 @@ class UnsafeRowUtilsSuite extends SparkFunSuite { } test("isBinaryStable on complex types containing collated strings") { - val nonBinaryStringType = StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE")) + val nonBinaryStringType = StringType(CollationFactory.collationNameToId("UTF8_LCASE")) // simple checks assert(UnsafeRowUtils.isBinaryStable(IntegerType)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index 3293957282e22..8fd9b7c43a659 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -23,11 +23,13 @@ import org.apache.spark.{SparkException, SparkFunSuite, SparkIllegalArgumentExce import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, caseSensitiveResolution} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.types.DataTypeUtils -import org.apache.spark.sql.catalyst.util.StringConcat +import org.apache.spark.sql.catalyst.util.{CollationFactory, StringConcat} import org.apache.spark.sql.types.DataTypeTestUtils.{dayTimeIntervalTypes, yearMonthIntervalTypes} class DataTypeSuite extends SparkFunSuite { + private val UNICODE_COLLATION_ID = CollationFactory.collationNameToId("UNICODE") + test("construct an ArrayType") { val array = ArrayType(StringType) @@ -310,8 +312,8 @@ class DataTypeSuite extends SparkFunSuite { exception = intercept[SparkIllegalArgumentException] { DataType.fromJson("""{"fields": [{"a":123}], "type": "struct"}""") }, - errorClass = "_LEGACY_ERROR_TEMP_3250", - parameters = Map("other" -> """{"a":123}""")) + errorClass = "INVALID_JSON_DATA_TYPE", + parameters = Map("invalidType" -> """{"a":123}""")) // Malformed JSON string val message = intercept[JsonParseException] { @@ -687,6 +689,115 @@ class DataTypeSuite extends SparkFunSuite { false, caseSensitive = true) + def checkEqualsIgnoreCompatibleCollation( + from: DataType, + to: DataType, + expected: Boolean): Unit = { + val testName = s"equalsIgnoreCompatibleCollation: (from: $from, to: $to)" + + test(testName) { + assert(DataType.equalsIgnoreCompatibleCollation(from, to) === expected) + } + } + + // Simple types. + checkEqualsIgnoreCompatibleCollation(IntegerType, IntegerType, expected = true) + checkEqualsIgnoreCompatibleCollation(BooleanType, BooleanType, expected = true) + checkEqualsIgnoreCompatibleCollation(StringType, StringType, expected = true) + checkEqualsIgnoreCompatibleCollation(IntegerType, BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(BooleanType, IntegerType, expected = false) + checkEqualsIgnoreCompatibleCollation(StringType, BooleanType, expected = false) + checkEqualsIgnoreCompatibleCollation(BooleanType, StringType, expected = false) + checkEqualsIgnoreCompatibleCollation(StringType, IntegerType, expected = false) + checkEqualsIgnoreCompatibleCollation(IntegerType, StringType, expected = false) + // Collated `StringType`. + checkEqualsIgnoreCompatibleCollation(StringType, StringType("UTF8_LCASE"), + expected = true) + checkEqualsIgnoreCompatibleCollation( + StringType("UTF8_BINARY"), StringType("UTF8_LCASE"), expected = true) + // Complex types. 
+ checkEqualsIgnoreCompatibleCollation( + ArrayType(StringType), + ArrayType(StringType("UTF8_LCASE")), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(StringType), + ArrayType(ArrayType(StringType("UTF8_LCASE"))), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(ArrayType(StringType)), + ArrayType(ArrayType(StringType("UTF8_LCASE"))), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType, StringType), + MapType(StringType, StringType("UTF8_LCASE")), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), StringType), + MapType(StringType, StringType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType("UTF8_LCASE"), ArrayType(StringType)), + MapType(StringType("UTF8_LCASE"), ArrayType(StringType("UTF8_LCASE"))), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + MapType(ArrayType(StringType), IntegerType), + MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), + MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", ArrayType(StringType)) :: Nil), + StructType(StructField("a", ArrayType(StringType("UTF8_LCASE"))) :: Nil), + expected = true + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", MapType(StringType, IntegerType)) :: Nil), + StructType(StructField("a", MapType(StringType("UTF8_LCASE"), IntegerType)) :: Nil), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("b", StringType("UTF8_LCASE")) :: Nil), + expected = false + ) + // Null compatibility checks. 
+ checkEqualsIgnoreCompatibleCollation( + ArrayType(StringType, containsNull = true), + ArrayType(StringType, containsNull = false), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + ArrayType(StringType, containsNull = true), + ArrayType(StringType("UTF8_LCASE"), containsNull = false), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + MapType(StringType, StringType, valueContainsNull = true), + MapType(StringType, StringType, valueContainsNull = false), + expected = false + ) + checkEqualsIgnoreCompatibleCollation( + StructType(StructField("a", StringType) :: Nil), + StructType(StructField("a", StringType, nullable = false) :: Nil), + expected = false + ) + test("SPARK-25031: MapType should produce current formatted string for complex types") { val keyType: DataType = StructType(Seq( StructField("a", DataTypes.IntegerType), @@ -712,4 +823,190 @@ class DataTypeSuite extends SparkFunSuite { assert(result === expected) } + + test("schema with collation should not change during ser/de") { + val simpleStruct = StructType( + StructField("c1", StringType(UNICODE_COLLATION_ID)) :: Nil) + + val nestedStruct = StructType( + StructField("nested", simpleStruct) :: Nil) + + val caseInsensitiveNames = StructType( + StructField("c1", StringType(UNICODE_COLLATION_ID)) :: + StructField("C1", StringType(UNICODE_COLLATION_ID)) :: Nil) + + val specialCharsInName = StructType( + StructField("c1.*23?", StringType(UNICODE_COLLATION_ID)) :: Nil) + + val arrayInSchema = StructType( + StructField("arrayField", ArrayType(StringType(UNICODE_COLLATION_ID))) :: Nil) + + val mapInSchema = StructType( + StructField("mapField", + MapType(StringType(UNICODE_COLLATION_ID), StringType(UNICODE_COLLATION_ID))) :: Nil) + + val mapWithKeyInNameInSchema = StructType( + StructField("name.key", StringType) :: + StructField("name", + MapType(StringType(UNICODE_COLLATION_ID), StringType(UNICODE_COLLATION_ID))) :: Nil) + + val arrayInMapInNestedSchema = StructType( + StructField("arrInMap", + MapType(StringType(UNICODE_COLLATION_ID), + ArrayType(StringType(UNICODE_COLLATION_ID)))) :: Nil) + + val nestedArrayInMap = StructType( + StructField("nestedArrayInMap", + ArrayType(MapType(StringType(UNICODE_COLLATION_ID), + ArrayType(ArrayType(StringType(UNICODE_COLLATION_ID)))))) :: Nil) + + val schemaWithMultipleFields = StructType( + simpleStruct.fields ++ nestedStruct.fields ++ arrayInSchema.fields ++ mapInSchema.fields ++ + mapWithKeyInNameInSchema ++ arrayInMapInNestedSchema.fields ++ nestedArrayInMap.fields) + + Seq( + simpleStruct, caseInsensitiveNames, specialCharsInName, nestedStruct, arrayInSchema, + mapInSchema, mapWithKeyInNameInSchema, nestedArrayInMap, arrayInMapInNestedSchema, + schemaWithMultipleFields) + .foreach { schema => + val json = schema.json + val parsed = DataType.fromJson(json) + assert(parsed === schema) + } + } + + test("non string field has collation metadata") { + val json = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "c1", + | "type": "integer", + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "c1": "icu.UNICODE" + | } + | } + | } + | ] + |} + |""".stripMargin + + checkError( + exception = intercept[SparkIllegalArgumentException] { + DataType.fromJson(json) + }, + errorClass = "INVALID_JSON_DATA_TYPE_FOR_COLLATIONS", + parameters = Map("jsonType" -> "integer") + ) + } + + test("non string field in map key has collation metadata") { + val json = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": 
"mapField", + | "type": { + | "type": "map", + | "keyType": "string", + | "valueType": "integer", + | "valueContainsNull": true + | }, + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "mapField.value": "icu.UNICODE" + | } + | } + | } + | ] + |} + |""".stripMargin + + checkError( + exception = intercept[SparkIllegalArgumentException] { + DataType.fromJson(json) + }, + errorClass = "INVALID_JSON_DATA_TYPE_FOR_COLLATIONS", + parameters = Map("jsonType" -> "integer") + ) + } + + test("map field has collation metadata") { + val json = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "mapField", + | "type": { + | "type": "map", + | "keyType": "string", + | "valueType": "integer", + | "valueContainsNull": true + | }, + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "mapField": "icu.UNICODE" + | } + | } + | } + | ] + |} + |""".stripMargin + + checkError( + exception = intercept[SparkIllegalArgumentException] { + DataType.fromJson(json) + }, + errorClass = "INVALID_JSON_DATA_TYPE_FOR_COLLATIONS", + parameters = Map("jsonType" -> "map") + ) + } + + test("non existing collation provider") { + val json = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "c1", + | "type": "string", + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "c1": "badProvider.UNICODE" + | } + | } + | } + | ] + |} + |""".stripMargin + + checkError( + exception = intercept[SparkException] { + DataType.fromJson(json) + }, + errorClass = "COLLATION_INVALID_PROVIDER", + parameters = Map("provider" -> "badProvider", "supportedProviders" -> "spark, icu") + ) + } + + test("SPARK-48680: Add CharType and VarcharType to DataTypes JAVA API") { + assert(DataTypes.createCharType(1) === CharType(1)) + assert(DataTypes.createVarcharType(100) === VarcharType(100)) + val exception = intercept[IllegalArgumentException] { + DataTypes.createVarcharType(-1) + } + assert(exception.getMessage.contains("The length of varchar type cannot be negative.")) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala index c165ab1bf61bd..562febe381130 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import com.fasterxml.jackson.databind.ObjectMapper + import org.apache.spark.{SparkException, SparkFunSuite, SparkIllegalArgumentException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, caseSensitiveResolution} @@ -36,6 +38,10 @@ class StructTypeSuite extends SparkFunSuite with SQLHelper { private val s = StructType.fromDDL("a INT, b STRING") + private val UNICODE_COLLATION = "UNICODE" + private val UTF8_LCASE_COLLATION = "UTF8_LCASE" + private val mapper = new ObjectMapper() + test("lookup a single missing field should output existing fields") { checkError( exception = intercept[SparkIllegalArgumentException](s("c")), @@ -606,4 +612,181 @@ class StructTypeSuite extends SparkFunSuite with SQLHelper { "b STRING NOT NULL,c STRING COMMENT 'nullable comment'") assert(fromDDL(struct.toDDL) === struct) } + + test("simple struct with collations to json") { + val simpleStruct = StructType( + StructField("c1", StringType(UNICODE_COLLATION)) :: Nil) + + val 
expectedJson = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "c1", + | "type": "string", + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "c1": "icu.$UNICODE_COLLATION" + | } + | } + | } + | ] + |} + |""".stripMargin + + assert(mapper.readTree(simpleStruct.json) == mapper.readTree(expectedJson)) + } + + test("nested struct with collations to json") { + val nestedStruct = StructType( + StructField("nested", StructType( + StructField("c1", StringType(UTF8_LCASE_COLLATION)) :: Nil)) :: Nil) + + val expectedJson = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "nested", + | "type": { + | "type": "struct", + | "fields": [ + | { + | "name": "c1", + | "type": "string", + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "c1": "spark.$UTF8_LCASE_COLLATION" + | } + | } + | } + | ] + | }, + | "nullable": true, + | "metadata": {} + | } + | ] + |} + |""".stripMargin + + assert(mapper.readTree(nestedStruct.json) == mapper.readTree(expectedJson)) + } + + test("array with collations in schema to json") { + val arrayInSchema = StructType( + StructField("arrayField", ArrayType(StringType(UNICODE_COLLATION))) :: Nil) + + val expectedJson = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "arrayField", + | "type": { + | "type": "array", + | "elementType": "string", + | "containsNull": true + | }, + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "arrayField.element": "icu.$UNICODE_COLLATION" + | } + | } + | } + | ] + |} + |""".stripMargin + + assert(mapper.readTree(arrayInSchema.json) == mapper.readTree(expectedJson)) + } + + test("map with collations in schema to json") { + val arrayInSchema = StructType( + StructField("mapField", + MapType(StringType(UNICODE_COLLATION), StringType(UNICODE_COLLATION))) :: Nil) + + val expectedJson = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "mapField", + | "type": { + | "type": "map", + | "keyType": "string", + | "valueType": "string", + | "valueContainsNull": true + | }, + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "mapField.key": "icu.$UNICODE_COLLATION", + | "mapField.value": "icu.$UNICODE_COLLATION" + | } + | } + | } + | ] + |} + |""".stripMargin + + assert(mapper.readTree(arrayInSchema.json) == mapper.readTree(expectedJson)) + } + + test("nested array with collations in map to json" ) { + val mapWithNestedArray = StructType( + StructField("column", ArrayType(MapType( + StringType(UNICODE_COLLATION), + ArrayType(ArrayType(ArrayType(StringType(UNICODE_COLLATION))))))) :: Nil) + + val expectedJson = + s""" + |{ + | "type": "struct", + | "fields": [ + | { + | "name": "column", + | "type": { + | "type": "array", + | "elementType": { + | "type": "map", + | "keyType": "string", + | "valueType": { + | "type": "array", + | "elementType": { + | "type": "array", + | "elementType": { + | "type": "array", + | "elementType": "string", + | "containsNull": true + | }, + | "containsNull": true + | }, + | "containsNull": true + | }, + | "valueContainsNull": true + | }, + | "containsNull": true + | }, + | "nullable": true, + | "metadata": { + | "${DataType.COLLATIONS_METADATA_KEY}": { + | "column.element.key": "icu.$UNICODE_COLLATION", + | "column.element.value.element.element.element": "icu.$UNICODE_COLLATION" + | } + | } + | } + | ] + |} + |""".stripMargin + + assert( + mapper.readTree(mapWithNestedArray.json) == 
mapper.readTree(expectedJson)) + } } diff --git a/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt b/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt index c68ab02cc98dc..15fe089202fb8 100644 --- a/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/AggregateBenchmark-jdk21-results.txt @@ -2,147 +2,147 @@ aggregate without grouping ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor agg w/o group: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -agg w/o group wholestage off 34004 34304 425 61.7 16.2 1.0X -agg w/o group wholestage on 717 728 10 2925.5 0.3 47.4X +agg w/o group wholestage off 35342 35962 876 59.3 16.9 1.0X +agg w/o group wholestage on 2831 2851 16 740.7 1.4 12.5X ================================================================================================ stat functions ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor stddev: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -stddev wholestage off 4163 4196 47 25.2 39.7 1.0X -stddev wholestage on 979 984 3 107.1 9.3 4.3X +stddev wholestage off 4117 4150 47 25.5 39.3 1.0X +stddev wholestage on 976 980 4 107.4 9.3 4.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor kurtosis: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -kurtosis wholestage off 20832 20961 182 5.0 198.7 1.0X -kurtosis wholestage on 983 992 6 106.7 9.4 21.2X +kurtosis wholestage off 19477 19555 110 5.4 185.7 1.0X +kurtosis wholestage on 986 994 6 106.3 9.4 19.8X ================================================================================================ aggregate with linear keys ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 6917 6934 24 12.1 82.5 1.0X -codegen = T, hashmap = F 4302 4328 43 19.5 51.3 1.6X -codegen = T, row-based hashmap = T 1092 1102 10 76.8 13.0 6.3X -codegen = T, vectorized hashmap = T 758 766 6 110.6 9.0 9.1X +codegen = F 6889 6905 23 12.2 82.1 1.0X +codegen = T, hashmap = F 3899 3935 32 21.5 46.5 1.8X +codegen = T, row-based hashmap = T 1248 1254 6 67.2 14.9 5.5X +codegen = T, vectorized hashmap = T 825 837 12 101.7 9.8 8.4X ================================================================================================ 
aggregate with randomized keys ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 7543 7559 23 11.1 89.9 1.0X -codegen = T, hashmap = F 4895 4908 20 17.1 58.4 1.5X -codegen = T, row-based hashmap = T 1671 1676 3 50.2 19.9 4.5X -codegen = T, vectorized hashmap = T 983 995 12 85.3 11.7 7.7X +codegen = F 7875 7877 2 10.7 93.9 1.0X +codegen = T, hashmap = F 4903 4941 43 17.1 58.4 1.6X +codegen = T, row-based hashmap = T 1807 1818 9 46.4 21.5 4.4X +codegen = T, vectorized hashmap = T 1300 1344 49 64.5 15.5 6.1X ================================================================================================ aggregate with string key ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w string key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2312 2332 28 9.1 110.3 1.0X -codegen = T, hashmap = F 1605 1630 23 13.1 76.5 1.4X -codegen = T, row-based hashmap = T 1198 1208 12 17.5 57.1 1.9X -codegen = T, vectorized hashmap = T 920 936 24 22.8 43.9 2.5X +codegen = F 2552 2573 30 8.2 121.7 1.0X +codegen = T, hashmap = F 1537 1545 9 13.6 73.3 1.7X +codegen = T, row-based hashmap = T 887 921 27 23.6 42.3 2.9X +codegen = T, vectorized hashmap = T 744 752 8 28.2 35.5 3.4X ================================================================================================ aggregate with decimal key ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w decimal key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2162 2193 43 9.7 103.1 1.0X -codegen = T, hashmap = F 1356 1361 8 15.5 64.6 1.6X -codegen = T, row-based hashmap = T 435 453 11 48.2 20.8 5.0X -codegen = T, vectorized hashmap = T 285 291 4 73.6 13.6 7.6X +codegen = F 2460 2464 6 8.5 117.3 1.0X +codegen = T, hashmap = F 1601 1611 14 13.1 76.4 1.5X +codegen = T, row-based hashmap = T 475 497 29 44.1 22.7 5.2X +codegen = T, vectorized hashmap = T 336 343 5 62.5 16.0 7.3X ================================================================================================ aggregate with multiple key types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w multiple keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -codegen = F 4053 4053 0 5.2 193.3 1.0X -codegen = T, hashmap = F 2346 2352 9 8.9 111.9 1.7X -codegen = T, row-based hashmap = T 1855 1860 7 11.3 88.4 2.2X -codegen = T, vectorized hashmap = T 1918 1947 41 10.9 91.5 2.1X +codegen = F 4272 4302 43 4.9 203.7 1.0X +codegen = T, hashmap = F 2260 2262 2 9.3 107.8 1.9X +codegen = T, row-based hashmap = T 1652 1665 19 12.7 78.8 2.6X +codegen = T, vectorized hashmap = T 1519 1527 11 13.8 72.4 2.8X ================================================================================================ max function bytecode size of wholestagecodegen ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor max function bytecode size: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 362 380 29 1.8 552.9 1.0X -codegen = T, hugeMethodLimit = 10000 128 144 14 5.1 195.7 2.8X -codegen = T, hugeMethodLimit = 1500 122 138 21 5.4 186.8 3.0X +codegen = F 375 407 29 1.7 572.3 1.0X +codegen = T, hugeMethodLimit = 10000 137 160 19 4.8 209.5 2.7X +codegen = T, hugeMethodLimit = 1500 132 143 13 5.0 201.8 2.8X ================================================================================================ cube ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cube: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cube wholestage off 2080 2088 11 2.5 396.7 1.0X -cube wholestage on 1154 1167 12 4.5 220.1 1.8X +cube wholestage off 1986 2005 26 2.6 378.9 1.0X +cube wholestage on 1079 1106 46 4.9 205.8 1.8X ================================================================================================ hash and BytesToBytesMap ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor BytesToBytesMap: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ UnsafeRowhash 146 146 1 143.7 7.0 1.0X -murmur3 hash 53 53 0 392.6 2.5 2.7X -fast hash 24 24 1 887.8 1.1 6.2X -arrayEqual 130 130 0 161.6 6.2 1.1X -Java HashMap (Long) 65 69 4 322.2 3.1 2.2X -Java HashMap (two ints) 87 90 2 240.9 4.2 1.7X -Java HashMap (UnsafeRow) 499 501 2 42.1 23.8 0.3X -LongToUnsafeRowMap (opt=false) 349 350 1 60.1 16.6 0.4X -LongToUnsafeRowMap (opt=true) 76 78 6 275.1 3.6 1.9X -BytesToBytesMap (off Heap) 581 584 2 36.1 27.7 0.3X -BytesToBytesMap (on Heap) 580 588 6 36.2 27.6 0.3X -Aggregate HashMap 30 31 2 698.9 1.4 4.9X +murmur3 hash 53 54 0 392.5 2.5 2.7X +fast hash 24 24 0 887.4 1.1 6.2X +arrayEqual 136 136 0 153.9 6.5 1.1X +Java HashMap (Long) 62 67 6 337.2 3.0 
2.3X +Java HashMap (two ints) 87 91 8 242.3 4.1 1.7X +Java HashMap (UnsafeRow) 505 509 4 41.5 24.1 0.3X +LongToUnsafeRowMap (opt=false) 351 352 2 59.8 16.7 0.4X +LongToUnsafeRowMap (opt=true) 76 77 1 274.8 3.6 1.9X +BytesToBytesMap (off Heap) 450 460 9 46.6 21.5 0.3X +BytesToBytesMap (on Heap) 462 473 10 45.4 22.0 0.3X +Aggregate HashMap 30 30 1 699.0 1.4 4.9X diff --git a/sql/core/benchmarks/AggregateBenchmark-results.txt b/sql/core/benchmarks/AggregateBenchmark-results.txt index a546e2fb0bc61..bdfa6bd673586 100644 --- a/sql/core/benchmarks/AggregateBenchmark-results.txt +++ b/sql/core/benchmarks/AggregateBenchmark-results.txt @@ -2,147 +2,147 @@ aggregate without grouping ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor agg w/o group: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -agg w/o group wholestage off 30915 32941 2865 67.8 14.7 1.0X -agg w/o group wholestage on 717 720 2 2924.3 0.3 43.1X +agg w/o group wholestage off 38161 38820 933 55.0 18.2 1.0X +agg w/o group wholestage on 2472 2488 10 848.5 1.2 15.4X ================================================================================================ stat functions ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor stddev: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -stddev wholestage off 4304 4311 11 24.4 41.0 1.0X -stddev wholestage on 980 982 2 107.0 9.3 4.4X +stddev wholestage off 4488 4498 14 23.4 42.8 1.0X +stddev wholestage on 961 975 8 109.1 9.2 4.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor kurtosis: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -kurtosis wholestage off 20793 20816 32 5.0 198.3 1.0X -kurtosis wholestage on 988 993 4 106.1 9.4 21.0X +kurtosis wholestage off 20771 20817 65 5.0 198.1 1.0X +kurtosis wholestage on 1004 1009 4 104.5 9.6 20.7X ================================================================================================ aggregate with linear keys ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 6582 6606 34 12.7 78.5 1.0X -codegen = T, hashmap = F 3769 3785 18 22.3 44.9 1.7X -codegen = T, row-based hashmap = T 1124 1132 9 74.6 13.4 5.9X -codegen = T, vectorized hashmap = T 766 775 6 109.5 9.1 8.6X +codegen = F 
6648 6749 142 12.6 79.3 1.0X +codegen = T, hashmap = F 3893 3974 83 21.6 46.4 1.7X +codegen = T, row-based hashmap = T 1198 1218 17 70.0 14.3 5.5X +codegen = T, vectorized hashmap = T 815 826 7 102.9 9.7 8.2X ================================================================================================ aggregate with randomized keys ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 7355 7372 24 11.4 87.7 1.0X -codegen = T, hashmap = F 4568 4627 61 18.4 54.5 1.6X -codegen = T, row-based hashmap = T 1635 1660 21 51.3 19.5 4.5X -codegen = T, vectorized hashmap = T 1084 1180 131 77.4 12.9 6.8X +codegen = F 7395 7411 24 11.3 88.1 1.0X +codegen = T, hashmap = F 4675 4833 165 17.9 55.7 1.6X +codegen = T, row-based hashmap = T 1658 1709 82 50.6 19.8 4.5X +codegen = T, vectorized hashmap = T 1066 1080 23 78.7 12.7 6.9X ================================================================================================ aggregate with string key ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w string key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2409 2432 33 8.7 114.9 1.0X -codegen = T, hashmap = F 1476 1503 33 14.2 70.4 1.6X -codegen = T, row-based hashmap = T 947 950 3 22.2 45.1 2.5X -codegen = T, vectorized hashmap = T 717 722 4 29.3 34.2 3.4X +codegen = F 2400 2406 8 8.7 114.5 1.0X +codegen = T, hashmap = F 1499 1512 19 14.0 71.5 1.6X +codegen = T, row-based hashmap = T 942 972 28 22.3 44.9 2.5X +codegen = T, vectorized hashmap = T 756 764 5 27.7 36.1 3.2X ================================================================================================ aggregate with decimal key ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w decimal key: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 2024 2029 7 10.4 96.5 1.0X -codegen = T, hashmap = F 1333 1333 0 15.7 63.6 1.5X -codegen = T, row-based hashmap = T 491 506 14 42.7 23.4 4.1X -codegen = T, vectorized hashmap = T 284 288 3 74.0 13.5 7.1X +codegen = F 2103 2115 17 10.0 100.3 1.0X +codegen = T, hashmap = F 1324 1330 9 15.8 63.1 1.6X +codegen = T, row-based hashmap = T 454 473 13 46.1 21.7 4.6X +codegen = T, vectorized hashmap = T 313 324 6 66.9 14.9 6.7X ================================================================================================ aggregate with multiple key types ================================================================================================ 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Aggregate w multiple keys: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 4035 4061 37 5.2 192.4 1.0X -codegen = T, hashmap = F 2336 2343 9 9.0 111.4 1.7X -codegen = T, row-based hashmap = T 1599 1630 43 13.1 76.2 2.5X -codegen = T, vectorized hashmap = T 1481 1508 38 14.2 70.6 2.7X +codegen = F 4133 4161 39 5.1 197.1 1.0X +codegen = T, hashmap = F 2402 2405 4 8.7 114.5 1.7X +codegen = T, row-based hashmap = T 1618 1626 12 13.0 77.1 2.6X +codegen = T, vectorized hashmap = T 1516 1525 12 13.8 72.3 2.7X ================================================================================================ max function bytecode size of wholestagecodegen ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor max function bytecode size: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -codegen = F 358 377 18 1.8 546.8 1.0X -codegen = T, hugeMethodLimit = 10000 127 144 12 5.2 193.6 2.8X -codegen = T, hugeMethodLimit = 1500 123 136 13 5.3 188.0 2.9X +codegen = F 402 410 4 1.6 614.1 1.0X +codegen = T, hugeMethodLimit = 10000 164 194 14 4.0 249.7 2.5X +codegen = T, hugeMethodLimit = 1500 132 153 15 5.0 201.8 3.0X ================================================================================================ cube ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cube: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cube wholestage off 2001 2011 14 2.6 381.7 1.0X -cube wholestage on 1063 1079 29 4.9 202.7 1.9X +cube wholestage off 2101 2141 56 2.5 400.8 1.0X +cube wholestage on 1072 1084 13 4.9 204.5 2.0X ================================================================================================ hash and BytesToBytesMap ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor BytesToBytesMap: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeRowhash 210 211 1 100.0 10.0 1.0X -murmur3 hash 68 69 1 309.1 3.2 3.1X -fast hash 67 72 1 311.9 3.2 3.1X -arrayEqual 143 145 1 146.2 6.8 1.5X -Java HashMap (Long) 62 64 3 340.9 2.9 3.4X -Java HashMap (two ints) 81 83 2 260.4 3.8 2.6X -Java HashMap (UnsafeRow) 522 527 5 40.1 24.9 0.4X -LongToUnsafeRowMap (opt=false) 345 346 1 60.9 16.4 0.6X -LongToUnsafeRowMap (opt=true) 77 78 1 273.6 3.7 2.7X -BytesToBytesMap (off Heap) 
486 492 5 43.2 23.2 0.4X -BytesToBytesMap (on Heap) 502 507 6 41.8 23.9 0.4X -Aggregate HashMap 30 31 3 689.7 1.4 6.9X +UnsafeRowhash 198 203 14 106.1 9.4 1.0X +murmur3 hash 66 68 1 320.1 3.1 3.0X +fast hash 69 71 1 305.8 3.3 2.9X +arrayEqual 142 145 2 148.0 6.8 1.4X +Java HashMap (Long) 64 68 4 327.5 3.1 3.1X +Java HashMap (two ints) 82 84 2 257.0 3.9 2.4X +Java HashMap (UnsafeRow) 537 542 5 39.1 25.6 0.4X +LongToUnsafeRowMap (opt=false) 335 338 2 62.5 16.0 0.6X +LongToUnsafeRowMap (opt=true) 74 75 2 281.7 3.6 2.7X +BytesToBytesMap (off Heap) 489 494 7 42.9 23.3 0.4X +BytesToBytesMap (on Heap) 496 499 3 42.3 23.7 0.4X +Aggregate HashMap 30 31 2 705.1 1.4 6.6X diff --git a/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt b/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt index ceb70f9f4ded3..c9f022901b947 100644 --- a/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/AnsiIntervalSortBenchmark-jdk21-results.txt @@ -1,28 +1,28 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor year month interval one column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -year month interval one column enable radix 22445 22598 237 4.5 224.4 1.0X -year month interval one column disable radix 32401 32416 20 3.1 324.0 0.7X +year month interval one column enable radix 22681 22902 342 4.4 226.8 1.0X +year month interval one column disable radix 31984 32121 199 3.1 319.8 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor year month interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -year month interval two columns enable radix 33624 33664 36 3.0 336.2 1.0X -year month interval two columns disable radix 33562 33647 79 3.0 335.6 1.0X +year month interval two columns enable radix 33382 33682 365 3.0 333.8 1.0X +year month interval two columns disable radix 33180 33612 586 3.0 331.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor day time interval one columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval one columns enable radix 22444 22552 166 4.5 224.4 1.0X -day time interval one columns disable radix 34000 34058 90 2.9 340.0 0.7X +day time interval one columns enable radix 20327 20446 140 4.9 203.3 1.0X +day time interval one columns disable radix 31683 32138 457 3.2 316.8 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor day time interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval two columns enable radix 35780 35816 34 2.8 357.8 1.0X -day 
time interval two columns disable radix 36041 36575 472 2.8 360.4 1.0X +day time interval two columns enable radix 32522 32691 224 3.1 325.2 1.0X +day time interval two columns disable radix 32478 32743 285 3.1 324.8 1.0X diff --git a/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt b/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt index 9fb2e3b6b8c3f..def37b0d628ab 100644 --- a/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt +++ b/sql/core/benchmarks/AnsiIntervalSortBenchmark-results.txt @@ -1,28 +1,28 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor year month interval one column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -year month interval one column enable radix 23013 23045 49 4.3 230.1 1.0X -year month interval one column disable radix 33043 33140 166 3.0 330.4 0.7X +year month interval one column enable radix 22540 22607 70 4.4 225.4 1.0X +year month interval one column disable radix 32453 32592 126 3.1 324.5 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor year month interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -year month interval two columns enable radix 33865 33990 109 3.0 338.6 1.0X -year month interval two columns disable radix 35043 35124 81 2.9 350.4 1.0X +year month interval two columns enable radix 33313 33384 114 3.0 333.1 1.0X +year month interval two columns disable radix 33284 33357 83 3.0 332.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor day time interval one columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval one columns enable radix 20614 20666 45 4.9 206.1 1.0X -day time interval one columns disable radix 33399 33655 242 3.0 334.0 0.6X +day time interval one columns enable radix 21112 21150 63 4.7 211.1 1.0X +day time interval one columns disable radix 32667 32837 159 3.1 326.7 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor day time interval two columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -day time interval two columns enable radix 35242 35495 284 2.8 352.4 1.0X -day time interval two columns disable radix 35315 35481 181 2.8 353.1 1.0X +day time interval two columns enable radix 36203 36323 110 2.8 362.0 1.0X +day time interval two columns disable radix 34964 35031 70 2.9 349.6 1.0X diff --git a/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt b/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt index 9f156cfca71b0..f9ddb8465f4f0 100644 --- a/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt +++ 
b/sql/core/benchmarks/Base64Benchmark-jdk21-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2210 2233 38 9.1 110.5 1.0X -apache 11572 11644 66 1.7 578.6 0.2X +java 2038 2103 63 9.8 101.9 1.0X +apache 11269 11369 86 1.8 563.4 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2586 2604 18 7.7 129.3 1.0X -apache 12940 12986 41 1.5 647.0 0.2X +java 2462 2507 76 8.1 123.1 1.0X +apache 12414 12475 54 1.6 620.7 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3053 3065 16 6.6 152.7 1.0X -apache 14277 14321 60 1.4 713.9 0.2X +java 3084 3093 8 6.5 154.2 1.0X +apache 13548 13629 86 1.5 677.4 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3106 3113 10 6.4 155.3 1.0X -apache 15346 15382 37 1.3 767.3 0.2X +java 3182 3189 9 6.3 159.1 1.0X +apache 14637 14678 40 1.4 731.8 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3494 3500 5 5.7 174.7 1.0X -apache 12827 12950 107 1.6 641.4 0.3X +java 3117 3254 175 6.4 155.8 1.0X +apache 12666 12718 47 1.6 633.3 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 4203 4205 3 4.8 210.1 1.0X -apache 15071 15122 61 1.3 753.5 0.3X +java 3759 3765 6 5.3 187.9 1.0X +apache 13854 13870 15 1.4 692.7 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 5025 5042 16 4.0 251.3 1.0X -apache 17056 17140 88 
1.2 852.8 0.3X +java 4773 4781 12 4.2 238.6 1.0X +apache 15439 15482 42 1.3 771.9 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 5414 5497 139 3.7 270.7 1.0X -apache 18678 18825 130 1.1 933.9 0.3X +java 5200 5228 25 3.8 260.0 1.0X +apache 16847 16904 69 1.2 842.4 0.3X diff --git a/sql/core/benchmarks/Base64Benchmark-results.txt b/sql/core/benchmarks/Base64Benchmark-results.txt index 4e574da76c9a1..975e6b9bcad23 100644 --- a/sql/core/benchmarks/Base64Benchmark-results.txt +++ b/sql/core/benchmarks/Base64Benchmark-results.txt @@ -1,56 +1,56 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 2334 2374 61 8.6 116.7 1.0X -apache 10888 10892 8 1.8 544.4 0.2X +java 2292 2321 30 8.7 114.6 1.0X +apache 11003 11085 79 1.8 550.2 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3172 3209 48 6.3 158.6 1.0X -apache 12136 12153 15 1.6 606.8 0.3X +java 2945 2956 18 6.8 147.3 1.0X +apache 12199 12295 135 1.6 609.9 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3525 3530 4 5.7 176.3 1.0X -apache 13421 13449 30 1.5 671.1 0.3X +java 3477 3489 12 5.8 173.9 1.0X +apache 13666 13776 96 1.5 683.3 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3917 3924 9 5.1 195.9 1.0X -apache 14449 14474 22 1.4 722.5 0.3X +java 3958 3973 22 5.1 197.9 1.0X +apache 14953 14993 45 1.3 747.7 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 3412 3656 212 5.9 170.6 1.0X -apache 12318 12326 14 1.6 615.9 0.3X +java 3223 3313 79 6.2 161.1 1.0X +apache 12096 12321 196 1.7 604.8 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 4950 4964 12 4.0 247.5 1.0X -apache 15030 15050 34 1.3 751.5 0.3X +java 4637 4655 18 4.3 231.9 1.0X +apache 14167 14307 124 1.4 708.4 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 6552 6562 10 3.1 327.6 1.0X -apache 16803 16855 46 1.2 840.2 0.4X +java 5811 5821 9 3.4 290.6 1.0X +apache 15871 15886 15 1.3 793.6 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java 7340 7346 8 2.7 367.0 1.0X -apache 18708 18789 76 1.1 935.4 0.4X +java 6410 6436 23 3.1 320.5 1.0X +apache 17301 17395 109 1.2 865.1 0.4X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt index 2be64a77ae5d7..fd0fe949392b3 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt @@ -2,191 +2,191 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 7752 7991 337 12.9 77.5 1.0X -With bloom filter 10081 10242 228 9.9 100.8 0.8X +Without bloom filter 8033 8137 147 12.4 80.3 1.0X +With bloom filter 10113 10202 125 9.9 101.1 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 838 890 45 119.3 8.4 1.0X -With bloom filter, blocksize: 2097152 579 596 17 172.8 5.8 1.4X +Without bloom filter, blocksize: 2097152 895 909 17 111.7 8.9 1.0X +With bloom filter, blocksize: 2097152 592 603 9 169.1 5.9 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit 
Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 824 848 22 121.3 8.2 1.0X -With bloom filter, blocksize: 4194304 554 574 22 180.4 5.5 1.5X +Without bloom filter, blocksize: 4194304 852 871 23 117.4 8.5 1.0X +With bloom filter, blocksize: 4194304 542 573 37 184.6 5.4 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 805 822 20 124.3 8.0 1.0X -With bloom filter, blocksize: 6291456 527 582 72 189.9 5.3 1.5X +Without bloom filter, blocksize: 6291456 834 857 32 120.0 8.3 1.0X +With bloom filter, blocksize: 6291456 547 567 26 182.9 5.5 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 796 819 23 125.6 8.0 1.0X -With bloom filter, blocksize: 8388608 742 757 15 134.7 7.4 1.1X +Without bloom filter, blocksize: 8388608 819 844 21 122.0 8.2 1.0X +With bloom filter, blocksize: 8388608 542 572 25 184.4 5.4 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 1266 1267 1 79.0 12.7 1.0X -With bloom filter, blocksize: 12582912 766 778 16 130.6 7.7 1.7X +Without bloom filter, blocksize: 12582912 866 882 16 115.5 8.7 1.0X +With bloom filter, blocksize: 12582912 537 560 21 186.1 5.4 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 1209 1215 8 82.7 12.1 1.0X -With bloom filter, blocksize: 16777216 760 766 6 131.5 7.6 1.6X +Without bloom filter, blocksize: 16777216 805 829 21 124.3 8.0 1.0X +With bloom filter, blocksize: 16777216 537 567 30 186.2 5.4 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 1220 1221 2 82.0 12.2 1.0X -With bloom filter, blocksize: 33554432 745 763 23 134.3 7.4 1.6X +Without bloom filter, blocksize: 33554432 807 826 17 123.9 8.1 1.0X +With bloom filter, blocksize: 33554432 535 552 12 186.9 5.3 1.5X ================================================================================================ Parquet Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 9950 9963 19 10.1 99.5 1.0X -With bloom filter 12809 12908 140 7.8 128.1 0.8X +Without bloom filter 10510 10559 69 9.5 105.1 1.0X +With bloom filter 13372 13429 81 7.5 133.7 0.8X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 403 419 12 248.3 4.0 1.0X -With bloom filter, blocksize: 2097152 149 160 10 672.9 1.5 2.7X +Without bloom filter, blocksize: 2097152 400 430 24 250.0 4.0 1.0X +With bloom filter, blocksize: 2097152 148 160 12 677.1 1.5 2.7X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 387 397 11 258.4 3.9 1.0X 
-With bloom filter, blocksize: 4194304 104 109 5 966.1 1.0 3.7X +Without bloom filter, blocksize: 4194304 380 385 4 263.0 3.8 1.0X +With bloom filter, blocksize: 4194304 103 115 8 972.0 1.0 3.7X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 388 394 6 257.6 3.9 1.0X -With bloom filter, blocksize: 6291456 114 124 5 878.6 1.1 3.4X +Without bloom filter, blocksize: 6291456 380 384 5 263.0 3.8 1.0X +With bloom filter, blocksize: 6291456 123 136 7 811.1 1.2 3.1X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 386 402 19 259.0 3.9 1.0X -With bloom filter, blocksize: 8388608 159 174 11 628.9 1.6 2.4X +Without bloom filter, blocksize: 8388608 382 391 13 261.8 3.8 1.0X +With bloom filter, blocksize: 8388608 175 188 7 571.1 1.8 2.2X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 404 432 24 247.4 4.0 1.0X -With bloom filter, blocksize: 12582912 267 281 14 375.1 2.7 1.5X +Without bloom filter, blocksize: 12582912 386 393 7 259.1 3.9 1.0X +With bloom filter, blocksize: 12582912 316 322 5 316.2 3.2 1.2X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 391 398 6 255.9 3.9 1.0X -With bloom filter, blocksize: 16777216 351 358 9 285.2 3.5 1.1X +Without bloom filter, blocksize: 16777216 390 395 6 256.6 3.9 1.0X +With bloom 
filter, blocksize: 16777216 299 305 4 334.1 3.0 1.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 398 402 3 251.4 4.0 1.0X -With bloom filter, blocksize: 33554432 401 409 7 249.3 4.0 1.0X +Without bloom filter, blocksize: 33554432 397 409 10 252.0 4.0 1.0X +With bloom filter, blocksize: 33554432 583 640 39 171.7 5.8 0.7X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index cb1be863cb651..7dfcdecded143 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -2,191 +2,191 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 7507 7658 213 13.3 75.1 1.0X -With bloom filter 9532 9564 44 10.5 95.3 0.8X +Without bloom filter 7751 7823 102 12.9 77.5 1.0X +With bloom filter 9924 9966 59 10.1 99.2 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 834 888 78 119.8 8.3 1.0X -With bloom filter, blocksize: 2097152 626 647 17 159.9 6.3 1.3X +Without bloom filter, blocksize: 2097152 882 896 20 113.3 8.8 1.0X +With bloom filter, blocksize: 2097152 589 597 8 169.7 5.9 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 811 817 6 123.4 8.1 1.0X -With bloom filter, blocksize: 4194304 535 541 6 187.1 5.3 1.5X +Without bloom filter, blocksize: 4194304 817 823 5 122.4 8.2 1.0X +With bloom filter, blocksize: 4194304 524 
534 10 191.0 5.2 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 788 809 18 126.9 7.9 1.0X -With bloom filter, blocksize: 6291456 513 531 20 195.0 5.1 1.5X +Without bloom filter, blocksize: 6291456 821 828 9 121.7 8.2 1.0X +With bloom filter, blocksize: 6291456 516 531 10 193.7 5.2 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 774 777 4 129.3 7.7 1.0X -With bloom filter, blocksize: 8388608 519 526 5 192.5 5.2 1.5X +Without bloom filter, blocksize: 8388608 791 811 24 126.5 7.9 1.0X +With bloom filter, blocksize: 8388608 531 566 27 188.5 5.3 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 1273 1282 13 78.6 12.7 1.0X -With bloom filter, blocksize: 12582912 761 771 9 131.4 7.6 1.7X +Without bloom filter, blocksize: 12582912 851 861 15 117.5 8.5 1.0X +With bloom filter, blocksize: 12582912 500 513 9 199.9 5.0 1.7X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 1232 1234 2 81.1 12.3 1.0X -With bloom filter, blocksize: 16777216 756 764 8 132.3 7.6 1.6X +Without bloom filter, blocksize: 16777216 811 819 8 123.2 8.1 1.0X +With bloom filter, blocksize: 16777216 502 516 9 199.0 5.0 1.6X ================================================================================================ ORC Read 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 1239 1249 15 80.7 12.4 1.0X -With bloom filter, blocksize: 33554432 760 785 32 131.6 7.6 1.6X +Without bloom filter, blocksize: 33554432 819 843 21 122.1 8.2 1.0X +With bloom filter, blocksize: 33554432 512 517 6 195.3 5.1 1.6X ================================================================================================ Parquet Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 10496 10555 84 9.5 105.0 1.0X -With bloom filter 14059 14124 92 7.1 140.6 0.7X +Without bloom filter 11467 11609 202 8.7 114.7 1.0X +With bloom filter 14502 14626 176 6.9 145.0 0.8X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 439 474 38 227.9 4.4 1.0X -With bloom filter, blocksize: 2097152 148 162 11 677.5 1.5 3.0X +Without bloom filter, blocksize: 2097152 430 450 14 232.6 4.3 1.0X +With bloom filter, blocksize: 2097152 146 158 9 684.2 1.5 2.9X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 414 421 8 241.6 4.1 1.0X -With bloom filter, blocksize: 4194304 112 120 9 895.8 1.1 3.7X +Without bloom filter, blocksize: 4194304 410 414 3 243.8 4.1 1.0X +With bloom filter, blocksize: 4194304 103 109 4 968.2 1.0 4.0X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 406 410 3 246.0 4.1 1.0X -With bloom filter, blocksize: 6291456 138 147 6 722.6 1.4 2.9X +Without bloom filter, blocksize: 6291456 409 414 5 244.5 4.1 1.0X +With bloom filter, blocksize: 6291456 130 139 7 772.2 1.3 3.2X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 404 421 12 247.3 4.0 1.0X -With bloom filter, blocksize: 8388608 222 232 6 451.4 2.2 1.8X +Without bloom filter, blocksize: 8388608 413 417 4 242.3 4.1 1.0X +With bloom filter, blocksize: 8388608 179 191 13 559.3 1.8 2.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 407 410 3 245.4 4.1 1.0X -With bloom filter, blocksize: 12582912 266 277 10 376.2 2.7 1.5X +Without bloom filter, blocksize: 12582912 412 418 5 242.7 4.1 1.0X +With bloom filter, blocksize: 12582912 346 351 3 288.8 3.5 1.2X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 408 417 19 245.1 4.1 1.0X -With bloom filter, blocksize: 16777216 434 458 20 230.5 4.3 0.9X +Without bloom filter, blocksize: 16777216 416 424 12 240.3 4.2 1.0X +With bloom filter, blocksize: 16777216 327 336 7 306.2 3.3 1.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 494 506 10 202.6 4.9 1.0X -With bloom filter, blocksize: 33554432 421 456 55 237.6 4.2 1.2X +Without bloom filter, blocksize: 33554432 423 427 3 236.7 4.2 1.0X +With bloom filter, blocksize: 33554432 683 695 11 146.5 6.8 0.6X diff --git a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt index 32cc37491423e..a260bc0396455 100644 --- a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-jdk21-results.txt @@ -2,69 +2,69 @@ Parquet writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_1_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1744 1797 75 9.0 110.9 1.0X -Output Single Double Column 1753 1775 31 9.0 111.4 1.0X -Output Int and String Column 4384 4389 8 3.6 278.7 0.4X -Output Partitions 2948 3021 103 5.3 187.4 0.6X -Output Buckets 4110 4219 154 3.8 261.3 0.4X +Output Single Int Column 1732 1745 19 9.1 110.1 1.0X +Output Single Double Column 1754 1758 7 9.0 111.5 1.0X +Output Int and String Column 4309 4363 76 3.7 273.9 0.4X +Output Partitions 3252 3350 139 4.8 206.8 0.5X +Output Buckets 4487 4575 124 3.5 285.3 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_2_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1605 1640 50 9.8 102.0 1.0X -Output Single Double Column 1709 1726 25 9.2 108.6 0.9X -Output Int and String Column 4570 4587 24 3.4 290.5 0.4X -Output Partitions 2943 2960 23 5.3 187.1 0.5X -Output Buckets 3816 3851 50 4.1 242.6 0.4X +Output Single Int Column 1938 1978 55 8.1 123.2 1.0X +Output Single Double Column 1762 1769 10 8.9 112.0 1.1X +Output Int and String Column 4920 4932 17 3.2 312.8 0.4X +Output Partitions 3385 3389 7 4.6 215.2 0.6X +Output Buckets 4528 4538 14 3.5 287.9 0.4X ================================================================================================ ORC writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor ORC writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 932 946 21 16.9 59.3 1.0X -Output Single Double Column 1571 1577 8 10.0 99.9 0.6X -Output Int and String Column 3621 3693 103 4.3 230.2 0.3X -Output Partitions 2301 2303 2 6.8 146.3 0.4X -Output Buckets 3033 3037 5 5.2 192.9 
0.3X +Output Single Int Column 1137 1142 7 13.8 72.3 1.0X +Output Single Double Column 1700 1705 6 9.3 108.1 0.7X +Output Int and String Column 4028 4096 97 3.9 256.1 0.3X +Output Partitions 2562 2582 28 6.1 162.9 0.4X +Output Buckets 3524 3530 9 4.5 224.1 0.3X ================================================================================================ JSON writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor JSON writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1606 1607 2 9.8 102.1 1.0X -Output Single Double Column 2363 2367 6 6.7 150.2 0.7X -Output Int and String Column 4054 4062 12 3.9 257.7 0.4X -Output Partitions 2924 2976 74 5.4 185.9 0.5X -Output Buckets 3826 3830 7 4.1 243.2 0.4X +Output Single Int Column 1618 1645 37 9.7 102.9 1.0X +Output Single Double Column 2398 2399 1 6.6 152.5 0.7X +Output Int and String Column 3766 3778 17 4.2 239.5 0.4X +Output Partitions 3162 3164 3 5.0 201.0 0.5X +Output Buckets 4015 4028 18 3.9 255.3 0.4X ================================================================================================ CSV writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor CSV writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 3863 3909 65 4.1 245.6 1.0X -Output Single Double Column 4411 4471 85 3.6 280.4 0.9X -Output Int and String Column 6697 6702 7 2.3 425.8 0.6X -Output Partitions 5281 5298 24 3.0 335.8 0.7X -Output Buckets 6902 6903 2 2.3 438.8 0.6X +Output Single Int Column 3985 3993 11 3.9 253.4 1.0X +Output Single Double Column 4148 4210 88 3.8 263.7 1.0X +Output Int and String Column 6728 6741 18 2.3 427.8 0.6X +Output Partitions 5431 5447 23 2.9 345.3 0.7X +Output Buckets 6927 6942 22 2.3 440.4 0.6X diff --git a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt index a63258d40ee57..e43b3b53dfb25 100644 --- a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt +++ b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt @@ -2,69 +2,69 @@ Parquet writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_1_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1657 1701 63 9.5 105.3 1.0X -Output Single Double Column 1766 1786 28 8.9 112.3 0.9X -Output Int and String Column 4365 4375 15 3.6 277.5 0.4X -Output Partitions 3042 3082 57 5.2 193.4 0.5X 
-Output Buckets 4252 4277 35 3.7 270.4 0.4X +Output Single Int Column 1813 1881 96 8.7 115.3 1.0X +Output Single Double Column 1976 1977 1 8.0 125.6 0.9X +Output Int and String Column 4403 4438 50 3.6 279.9 0.4X +Output Partitions 3388 3421 46 4.6 215.4 0.5X +Output Buckets 4670 4680 15 3.4 296.9 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet(PARQUET_2_0) writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1666 1667 2 9.4 105.9 1.0X -Output Single Double Column 1686 1689 5 9.3 107.2 1.0X -Output Int and String Column 4778 4845 94 3.3 303.8 0.3X -Output Partitions 3067 3068 2 5.1 195.0 0.5X -Output Buckets 4045 4082 51 3.9 257.2 0.4X +Output Single Int Column 1903 1926 33 8.3 121.0 1.0X +Output Single Double Column 1998 1998 0 7.9 127.0 1.0X +Output Int and String Column 4916 4936 29 3.2 312.6 0.4X +Output Partitions 3366 3375 13 4.7 214.0 0.6X +Output Buckets 4560 4583 33 3.4 289.9 0.4X ================================================================================================ ORC writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor ORC writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 980 983 4 16.1 62.3 1.0X -Output Single Double Column 1564 1573 12 10.1 99.5 0.6X -Output Int and String Column 3639 3673 47 4.3 231.4 0.3X -Output Partitions 2345 2349 6 6.7 149.1 0.4X -Output Buckets 3579 3579 0 4.4 227.5 0.3X +Output Single Int Column 1034 1039 7 15.2 65.8 1.0X +Output Single Double Column 1687 1691 7 9.3 107.2 0.6X +Output Int and String Column 3941 3955 20 4.0 250.6 0.3X +Output Partitions 2553 2674 172 6.2 162.3 0.4X +Output Buckets 3544 3548 6 4.4 225.3 0.3X ================================================================================================ JSON writer benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor JSON writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 1600 1600 0 9.8 101.7 1.0X -Output Single Double Column 2347 2353 9 6.7 149.2 0.7X -Output Int and String Column 3992 4010 25 3.9 253.8 0.4X -Output Partitions 3035 3048 18 5.2 193.0 0.5X -Output Buckets 3985 3996 15 3.9 253.4 0.4X +Output Single Int Column 1669 1686 24 9.4 106.1 1.0X +Output Single Double Column 2342 2369 37 6.7 148.9 0.7X +Output Int and String Column 3776 3805 42 4.2 240.0 0.4X +Output Partitions 3060 3064 7 5.1 194.5 0.5X +Output Buckets 4009 4052 60 3.9 254.9 0.4X ================================================================================================ CSV writer benchmark 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor CSV writer benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Output Single Int Column 3633 3640 11 4.3 231.0 1.0X -Output Single Double Column 4154 4164 15 3.8 264.1 0.9X -Output Int and String Column 6469 6477 10 2.4 411.3 0.6X -Output Partitions 5158 5164 8 3.0 328.0 0.7X -Output Buckets 6467 6477 15 2.4 411.1 0.6X +Output Single Int Column 3877 3889 18 4.1 246.5 1.0X +Output Single Double Column 4079 4086 10 3.9 259.3 1.0X +Output Int and String Column 6266 6269 4 2.5 398.4 0.6X +Output Partitions 5432 5438 8 2.9 345.4 0.7X +Output Buckets 6528 6530 4 2.4 415.0 0.6X diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt index 4f1571f639984..7fe68e003db73 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk21-results.txt @@ -2,26 +2,26 @@ byte array comparisons ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Byte Array compareTo: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -2-7 byte 254 258 2 257.9 3.9 1.0X -8-16 byte 408 443 46 160.8 6.2 0.6X -16-32 byte 407 408 1 161.2 6.2 0.6X -512-1024 byte 545 548 1 120.2 8.3 0.5X -512 byte slow 1524 1554 22 43.0 23.3 0.2X -2-7 byte 313 313 1 209.5 4.8 0.8X +2-7 byte 254 258 2 257.7 3.9 1.0X +8-16 byte 386 408 28 170.0 5.9 0.7X +16-32 byte 384 386 1 170.5 5.9 0.7X +512-1024 byte 518 521 3 126.5 7.9 0.5X +512 byte slow 1530 1555 22 42.8 23.4 0.2X +2-7 byte 313 314 1 209.2 4.8 0.8X ================================================================================================ byte array equals ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Byte Array equals 516 518 1 310.0 3.2 1.0X +Byte Array equals 517 518 1 309.7 3.2 1.0X diff --git a/sql/core/benchmarks/ByteArrayBenchmark-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-results.txt index ae09ecd0f50e7..028b1ea55b5d6 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-results.txt @@ -2,26 +2,26 @@ byte array comparisons ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Byte Array compareTo: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -2-7 byte 258 259 1 254.0 3.9 1.0X -8-16 byte 432 463 22 151.7 6.6 0.6X -16-32 byte 477 479 1 137.4 7.3 0.5X -512-1024 byte 606 610 2 108.2 9.2 0.4X -512 byte slow 1493 1502 9 43.9 22.8 0.2X -2-7 byte 276 276 1 237.6 4.2 0.9X +2-7 byte 259 260 1 253.4 3.9 1.0X +8-16 byte 411 445 24 159.4 6.3 0.6X +16-32 byte 458 461 5 143.0 7.0 0.6X +512-1024 byte 587 591 3 111.7 9.0 0.4X +512 byte slow 1496 1507 9 43.8 22.8 0.2X +2-7 byte 276 277 1 237.4 4.2 0.9X ================================================================================================ byte array equals ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Byte Array equals 518 521 5 309.1 3.2 1.0X +Byte Array equals 523 524 4 306.2 3.3 1.0X diff --git a/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt index 7c0f30840b7b0..b37d2fa3060e7 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk21-results.txt @@ -2,69 +2,69 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 23926 24027 88 0.0 478511.1 1.0X +One quoted string 23353 23432 75 0.0 467067.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 57625 57971 589 0.0 57624.8 1.0X -Select 100 columns 23109 23250 123 0.0 23109.2 2.5X -Select one column 19951 20034 74 0.1 19951.3 2.9X -count() 3690 3911 361 0.3 3689.7 15.6X -Select 100 columns, one bad input field 33099 33184 113 0.0 33099.2 1.7X -Select 100 columns, corrupt record field 36824 36966 141 0.0 36824.4 1.6X +Select 1000 columns 56825 57244 679 0.0 56825.1 1.0X +Select 100 columns 20482 20568 86 0.0 20481.7 2.8X +Select one column 16968 17000 36 0.1 16967.7 3.3X +count() 3366 3378 11 0.3 3366.4 16.9X +Select 100 columns, one bad input field 28347 28379 30 0.0 28346.6 2.0X +Select 100 columns, corrupt record field 32401 32450 42 0.0 32401.2 1.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 11170 11203 49 0.9 1117.0 1.0X -Select 1 column + count() 7659 7674 13 1.3 765.9 1.5X -count() 1712 1718 5 5.8 171.2 6.5X +Select 10 columns + count() 11174 11195 18 0.9 1117.4 1.0X +Select 1 column + count() 7666 7694 24 1.3 766.6 1.5X +count() 2042 2048 5 4.9 204.2 5.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 855 860 5 11.7 85.5 1.0X -to_csv(timestamp) 5694 5705 15 1.8 569.4 0.2X -write timestamps to files 6203 6210 7 1.6 620.3 0.1X -Create a dataset of dates 945 959 12 10.6 94.5 0.9X -to_csv(date) 3955 3958 3 2.5 395.5 0.2X -write dates to files 4158 4175 26 2.4 415.8 0.2X +Create a dataset of timestamps 854 882 27 11.7 85.4 1.0X +to_csv(timestamp) 6166 6174 13 1.6 616.6 0.1X +write timestamps to files 6480 6575 158 1.5 648.0 0.1X +Create a dataset of dates 948 949 1 10.6 94.8 0.9X +to_csv(date) 4471 4474 3 2.2 447.1 0.2X +write dates to files 4599 4616 15 2.2 459.9 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1143 1232 150 8.7 114.3 1.0X -read timestamps from files 11824 11858 33 0.8 1182.4 0.1X -infer timestamps from files 23198 23221 20 0.4 2319.8 0.0X -read date text from files 1046 1051 5 9.6 104.6 1.1X -read date from files 10592 10603 10 0.9 1059.2 0.1X -infer date from files 21896 21934 52 0.5 2189.6 0.1X -timestamp strings 1422 1423 2 7.0 142.2 0.8X -parse timestamps from Dataset[String] 13380 13423 38 0.7 1338.0 0.1X -infer timestamps from Dataset[String] 24873 24906 33 0.4 2487.3 0.0X -date strings 1796 1801 4 5.6 179.6 0.6X -parse dates from Dataset[String] 12513 12563 45 0.8 1251.3 0.1X -from_csv(timestamp) 12067 12103 40 0.8 1206.7 0.1X -from_csv(date) 11803 11816 12 0.8 1180.3 0.1X -infer error timestamps from Dataset[String] with default format 15522 15627 105 0.6 1552.2 0.1X -infer error timestamps from Dataset[String] with user-provided format 15493 15561 61 0.6 1549.3 0.1X -infer error timestamps from Dataset[String] with legacy format 15507 15571 73 0.6 1550.7 0.1X +read timestamp text from files 1200 1213 12 8.3 120.0 1.0X +read timestamps from files 11576 11601 22 0.9 1157.6 0.1X +infer timestamps from files 23234 23253 16 0.4 2323.4 0.1X +read date text from files 1115 1162 44 9.0 111.5 1.1X +read date from files 10978 11006 43 0.9 1097.8 0.1X +infer date from files 22588 22604 13 0.4 2258.8 0.1X +timestamp strings 1224 1236 21 8.2 122.4 1.0X +parse timestamps from Dataset[String] 13566 13595 41 0.7 1356.6 0.1X +infer timestamps from Dataset[String] 25057 25094 36 0.4 2505.7 0.0X +date strings 1618 1626 7 6.2 161.8 0.7X +parse dates from Dataset[String] 12784 12816 34 0.8 1278.4 0.1X +from_csv(timestamp) 12008 12088 69 0.8 1200.8 0.1X +from_csv(date) 11930 
11938 12 0.8 1193.0 0.1X +infer error timestamps from Dataset[String] with default format 14366 14394 35 0.7 1436.6 0.1X +infer error timestamps from Dataset[String] with user-provided format 14380 14412 52 0.7 1438.0 0.1X +infer error timestamps from Dataset[String] with legacy format 14439 14453 21 0.7 1443.9 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 4097 4114 16 0.0 40974.1 1.0X -pushdown disabled 4043 4059 15 0.0 40426.6 1.0X -w/ filters 742 747 5 0.1 7417.0 5.5X +w/o filters 4302 4383 137 0.0 43020.6 1.0X +pushdown disabled 4206 4220 13 0.0 42058.8 1.0X +w/ filters 776 784 10 0.1 7756.3 5.5X diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt b/sql/core/benchmarks/CSVBenchmark-results.txt index 9d941fa7b4cc8..522e164f80c8c 100644 --- a/sql/core/benchmarks/CSVBenchmark-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-results.txt @@ -2,69 +2,69 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 25479 25496 15 0.0 509588.9 1.0X +One quoted string 24681 24724 74 0.0 493616.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 53810 54394 906 0.0 53810.4 1.0X -Select 100 columns 22994 23062 77 0.0 22994.3 2.3X -Select one column 19875 20001 126 0.1 19875.1 2.7X -count() 4243 4309 90 0.2 4243.2 12.7X -Select 100 columns, one bad input field 35226 35392 146 0.0 35226.2 1.5X -Select 100 columns, corrupt record field 39131 39211 71 0.0 39130.6 1.4X +Select 1000 columns 55362 55719 576 0.0 55361.6 1.0X +Select 100 columns 22947 22975 36 0.0 22946.7 2.4X +Select one column 19695 19714 18 0.1 19694.7 2.8X +count() 3474 3514 54 0.3 3473.8 15.9X +Select 100 columns, one bad input field 32366 32417 47 0.0 32365.6 1.7X +Select 100 columns, corrupt record field 35921 35986 77 0.0 35921.3 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 9294 9305 15 1.1 929.4 1.0X -Select 1 column + count() 6665 6689 29 1.5 666.5 1.4X -count() 1882 1892 12 5.3 188.2 4.9X +Select 10 columns + count() 9523 9537 15 1.1 952.3 1.0X +Select 1 column + count() 6868 6883 13 1.5 686.8 
1.4X +count() 1820 1836 20 5.5 182.0 5.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 916 926 13 10.9 91.6 1.0X -to_csv(timestamp) 7008 7013 4 1.4 700.8 0.1X -write timestamps to files 7146 7159 14 1.4 714.6 0.1X -Create a dataset of dates 1163 1168 5 8.6 116.3 0.8X -to_csv(date) 4703 4715 12 2.1 470.3 0.2X -write dates to files 4524 4532 8 2.2 452.4 0.2X +Create a dataset of timestamps 899 912 12 11.1 89.9 1.0X +to_csv(timestamp) 7355 7371 14 1.4 735.5 0.1X +write timestamps to files 7751 7761 12 1.3 775.1 0.1X +Create a dataset of dates 1171 1174 6 8.5 117.1 0.8X +to_csv(date) 5040 5044 4 2.0 504.0 0.2X +write dates to files 5277 5292 24 1.9 527.7 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1269 1270 1 7.9 126.9 1.0X -read timestamps from files 10019 10032 16 1.0 1001.9 0.1X -infer timestamps from files 19853 19855 2 0.5 1985.3 0.1X -read date text from files 1142 1143 1 8.8 114.2 1.1X -read date from files 10157 10160 3 1.0 1015.7 0.1X -infer date from files 21005 21040 31 0.5 2100.5 0.1X -timestamp strings 1306 1312 9 7.7 130.6 1.0X -parse timestamps from Dataset[String] 11618 11712 85 0.9 1161.8 0.1X -infer timestamps from Dataset[String] 21545 21613 105 0.5 2154.5 0.1X -date strings 1741 1742 1 5.7 174.1 0.7X -parse dates from Dataset[String] 12115 12187 63 0.8 1211.5 0.1X -from_csv(timestamp) 10162 10220 71 1.0 1016.2 0.1X -from_csv(date) 11185 11296 177 0.9 1118.5 0.1X -infer error timestamps from Dataset[String] with default format 12461 12582 107 0.8 1246.1 0.1X -infer error timestamps from Dataset[String] with user-provided format 12575 12592 16 0.8 1257.5 0.1X -infer error timestamps from Dataset[String] with legacy format 12598 12609 13 0.8 1259.8 0.1X +read timestamp text from files 1245 1251 7 8.0 124.5 1.0X +read timestamps from files 10059 10074 23 1.0 1005.9 0.1X +infer timestamps from files 20189 20223 36 0.5 2018.9 0.1X +read date text from files 1151 1167 24 8.7 115.1 1.1X +read date from files 10547 10568 25 0.9 1054.7 0.1X +infer date from files 21527 21540 11 0.5 2152.7 0.1X +timestamp strings 1355 1364 15 7.4 135.5 0.9X +parse timestamps from Dataset[String] 11522 11553 28 0.9 1152.2 0.1X +infer timestamps from Dataset[String] 21195 21203 10 0.5 2119.5 0.1X +date strings 1785 1788 5 5.6 178.5 0.7X +parse dates from Dataset[String] 12245 12288 44 0.8 1224.5 0.1X +from_csv(timestamp) 10102 10144 51 1.0 1010.2 0.1X +from_csv(date) 11329 11353 29 0.9 1132.9 0.1X +infer error timestamps from Dataset[String] with default format 12067 12091 36 0.8 1206.7 0.1X +infer error timestamps from Dataset[String] with user-provided format 12077 12093 24 0.8 1207.7 0.1X +infer error timestamps from Dataset[String] with legacy format 12047 12076 26 0.8 1204.7 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 4243 4254 13 0.0 42431.9 1.0X -pushdown disabled 4271 4316 51 0.0 42714.3 1.0X -w/ filters 735 743 8 0.1 7354.2 5.8X +w/o filters 4119 4139 17 0.0 41191.2 1.0X +pushdown disabled 4092 4110 16 0.0 40922.3 1.0X +w/ filters 691 702 13 0.1 6911.5 6.0X diff --git a/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt b/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt index 347255e96cb78..25b685baf20d1 100644 --- a/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CharVarcharBenchmark-jdk21-results.txt @@ -2,121 +2,121 @@ Char Varchar Write Side Perf w/o Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 5848 6125 383 6.8 146.2 1.0X -write char with length 5 9155 9177 21 4.4 228.9 0.6X -write varchar with length 5 6536 6634 100 6.1 163.4 0.9X +write string with length 5 6814 6942 128 5.9 170.4 1.0X +write char with length 5 9886 9963 67 4.0 247.1 0.7X +write varchar with length 5 7603 7633 27 5.3 190.1 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 3012 3029 26 6.6 150.6 1.0X -write char with length 10 5692 5714 19 3.5 284.6 0.5X -write varchar with length 10 3263 3277 23 6.1 163.1 0.9X +write string with length 10 3542 3558 21 5.6 177.1 1.0X +write char with length 10 6351 6465 100 3.1 317.5 0.6X +write varchar with length 10 3772 3776 4 5.3 188.6 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 1540 1559 18 6.5 154.0 1.0X -write char with length 20 4242 4252 12 2.4 424.2 0.4X -write varchar with length 20 1697 1709 13 5.9 169.7 0.9X +write string with length 20 1810 1841 42 5.5 181.0 1.0X +write char with length 20 4390 4411 18 2.3 439.0 0.4X +write varchar with length 20 2008 2023 19 5.0 200.8 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -write string with length 40 857 865 10 5.8 171.3 1.0X -write char with length 40 3436 3449 16 1.5 687.3 0.2X -write varchar with length 40 907 912 8 5.5 181.4 0.9X +write string with length 40 1061 1068 6 4.7 212.3 1.0X +write char with length 40 3566 3568 2 1.4 713.3 0.3X +write varchar with length 40 1042 1052 9 4.8 208.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 585 589 6 5.7 175.4 1.0X -write char with length 60 3126 3133 13 1.1 937.7 0.2X -write varchar with length 60 627 637 9 5.3 188.1 0.9X +write string with length 60 683 689 5 4.9 204.9 1.0X +write char with length 60 3179 3188 7 1.0 953.7 0.2X +write varchar with length 60 741 753 11 4.5 222.4 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 450 455 5 5.6 179.8 1.0X -write char with length 80 2967 2973 7 0.8 1186.9 0.2X -write varchar with length 80 496 498 3 5.0 198.5 0.9X +write string with length 80 577 590 20 4.3 230.9 1.0X +write char with length 80 3064 3071 11 0.8 1225.5 0.2X +write varchar with length 80 554 560 7 4.5 221.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 374 380 6 5.4 186.8 1.0X -write char with length 100 2934 2941 6 0.7 1466.9 0.1X -write varchar with length 100 407 423 14 4.9 203.7 0.9X +write string with length 100 472 491 16 4.2 235.9 1.0X +write char with length 100 2972 2975 5 0.7 1485.8 0.2X +write varchar with length 100 479 485 5 4.2 239.6 1.0X ================================================================================================ Char Varchar Write Side Perf w/ Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 8686 8701 16 4.6 217.2 1.0X -write char with length 5 10316 10357 63 3.9 257.9 0.8X -write varchar with length 5 10300 10324 36 3.9 257.5 0.8X +write string with length 5 10481 10507 33 3.8 262.0 1.0X +write char with length 5 11773 11799 33 3.4 294.3 0.9X +write varchar with length 5 11851 11879 28 3.4 296.3 0.9X -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 4575 4589 21 4.4 228.7 1.0X -write char with length 10 6856 6868 14 2.9 342.8 0.7X -write varchar with length 10 6752 6759 9 3.0 337.6 0.7X +write string with length 10 5211 5215 4 3.8 260.6 1.0X +write char with length 10 7437 7455 16 2.7 371.8 0.7X +write varchar with length 10 7284 7301 22 2.7 364.2 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 2895 2900 5 3.5 289.5 1.0X -write char with length 20 5061 5073 11 2.0 506.1 0.6X -write varchar with length 20 4975 4980 4 2.0 497.5 0.6X +write string with length 20 3315 3339 23 3.0 331.5 1.0X +write char with length 20 5353 5358 8 1.9 535.3 0.6X +write varchar with length 20 5318 5322 4 1.9 531.8 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 40 2010 2018 7 2.5 402.0 1.0X -write char with length 40 4130 4136 8 1.2 826.1 0.5X -write varchar with length 40 4068 4075 6 1.2 813.7 0.5X +write string with length 40 2229 2231 2 2.2 445.8 1.0X +write char with length 40 4283 4287 3 1.2 856.6 0.5X +write varchar with length 40 4269 4270 1 1.2 853.8 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 1758 1768 9 1.9 527.3 1.0X -write char with length 60 3836 3839 3 0.9 1150.9 0.5X -write varchar with length 60 3780 3790 12 0.9 1134.1 0.5X +write string with length 60 1839 1845 5 1.8 551.8 1.0X +write char with length 60 3958 3961 4 0.8 1187.3 0.5X +write varchar with length 60 3895 3900 4 0.9 1168.5 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 1588 1590 2 1.6 635.3 1.0X -write char with length 80 3698 3722 24 0.7 1479.3 0.4X -write varchar with length 80 3644 3649 6 0.7 1457.5 0.4X +write string with length 80 1646 1650 4 1.5 658.4 1.0X +write char with length 80 3789 3790 2 0.7 1515.4 0.4X +write varchar with length 80 3704 3705 1 0.7 1481.5 0.4X -OpenJDK 64-Bit Server 
VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 1475 1485 12 1.4 737.4 1.0X -write char with length 100 3669 3677 12 0.5 1834.4 0.4X -write varchar with length 100 3542 3546 4 0.6 1770.9 0.4X +write string with length 100 1543 1547 4 1.3 771.6 1.0X +write char with length 100 3663 3676 21 0.5 1831.6 0.4X +write varchar with length 100 3611 3612 2 0.6 1805.3 0.4X diff --git a/sql/core/benchmarks/CharVarcharBenchmark-results.txt b/sql/core/benchmarks/CharVarcharBenchmark-results.txt index 7a41f9deacae2..e3d4e34db1489 100644 --- a/sql/core/benchmarks/CharVarcharBenchmark-results.txt +++ b/sql/core/benchmarks/CharVarcharBenchmark-results.txt @@ -2,121 +2,121 @@ Char Varchar Write Side Perf w/o Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 6314 6378 96 6.3 157.9 1.0X -write char with length 5 9041 9108 96 4.4 226.0 0.7X -write varchar with length 5 6868 6878 14 5.8 171.7 0.9X +write string with length 5 6296 6549 352 6.4 157.4 1.0X +write char with length 5 9227 9375 209 4.3 230.7 0.7X +write varchar with length 5 6706 6722 15 6.0 167.7 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 3209 3214 6 6.2 160.5 1.0X -write char with length 10 5946 5992 76 3.4 297.3 0.5X -write varchar with length 10 3496 3514 18 5.7 174.8 0.9X +write string with length 10 3218 3245 23 6.2 160.9 1.0X +write char with length 10 6113 6165 55 3.3 305.6 0.5X +write varchar with length 10 3604 3621 19 5.5 180.2 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 1701 1704 4 5.9 170.1 1.0X -write char with length 20 4598 4600 2 2.2 459.8 0.4X -write varchar with length 20 1851 1854 4 5.4 185.1 0.9X +write string with length 20 1700 1717 25 5.9 170.0 1.0X +write char with length 20 4275 4283 9 2.3 427.5 0.4X +write varchar with length 20 1737 1743 7 5.8 173.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -write string with length 40 864 868 5 5.8 172.7 1.0X -write char with length 40 3482 3494 15 1.4 696.4 0.2X -write varchar with length 40 983 1002 17 5.1 196.7 0.9X +write string with length 40 915 916 1 5.5 183.0 1.0X +write char with length 40 3430 3456 22 1.5 686.0 0.3X +write varchar with length 40 958 969 17 5.2 191.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 625 629 7 5.3 187.4 1.0X -write char with length 60 3217 3227 15 1.0 965.1 0.2X -write varchar with length 60 693 699 8 4.8 208.0 0.9X +write string with length 60 614 629 15 5.4 184.1 1.0X +write char with length 60 3176 3189 12 1.0 952.8 0.2X +write varchar with length 60 643 648 6 5.2 192.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 483 486 2 5.2 193.3 1.0X -write char with length 80 3032 3036 4 0.8 1212.8 0.2X -write varchar with length 80 505 512 5 4.9 202.1 1.0X +write string with length 80 500 503 2 5.0 200.2 1.0X +write char with length 80 3003 3004 2 0.8 1201.1 0.2X +write varchar with length 80 507 517 11 4.9 202.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 409 430 30 4.9 204.3 1.0X -write char with length 100 2915 2945 41 0.7 1457.4 0.1X -write varchar with length 100 423 430 11 4.7 211.7 1.0X +write string with length 100 388 404 14 5.2 193.9 1.0X +write char with length 100 2927 2932 6 0.7 1463.6 0.1X +write varchar with length 100 422 431 10 4.7 211.1 0.9X ================================================================================================ Char Varchar Write Side Perf w/ Tailing Spaces ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 5 8099 8131 28 4.9 202.5 1.0X -write char with length 5 10486 10511 23 3.8 262.1 0.8X -write varchar with length 5 10656 10711 68 3.8 266.4 0.8X +write string with length 5 8732 8757 29 4.6 218.3 1.0X +write char with length 5 10464 10517 46 3.8 261.6 0.8X +write varchar with length 5 10783 10834 50 3.7 269.6 0.8X -OpenJDK 64-Bit Server 
VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 10 4834 4844 14 4.1 241.7 1.0X -write char with length 10 6757 6795 44 3.0 337.8 0.7X -write varchar with length 10 6773 6816 38 3.0 338.6 0.7X +write string with length 10 4713 4733 21 4.2 235.6 1.0X +write char with length 10 6723 6746 37 3.0 336.2 0.7X +write varchar with length 10 6682 6694 11 3.0 334.1 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 20 3028 3041 13 3.3 302.8 1.0X -write char with length 20 4762 4766 4 2.1 476.2 0.6X -write varchar with length 20 4813 4817 6 2.1 481.3 0.6X +write string with length 20 3067 3081 12 3.3 306.7 1.0X +write char with length 20 4638 4654 17 2.2 463.8 0.7X +write varchar with length 20 4698 4705 7 2.1 469.8 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 40: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 40 1958 1966 11 2.6 391.6 1.0X -write char with length 40 3628 3635 8 1.4 725.6 0.5X -write varchar with length 40 3658 3667 14 1.4 731.6 0.5X +write string with length 40 1967 1971 4 2.5 393.4 1.0X +write char with length 40 3615 3628 20 1.4 723.0 0.5X +write varchar with length 40 3603 3609 5 1.4 720.7 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 60: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 60 1692 1697 5 2.0 507.7 1.0X -write char with length 60 3372 3389 16 1.0 1011.6 0.5X -write varchar with length 60 3361 3367 7 1.0 1008.3 0.5X +write string with length 60 1651 1658 8 2.0 495.3 1.0X +write char with length 60 3360 3370 10 1.0 1007.9 0.5X +write varchar with length 60 3305 3307 2 1.0 991.6 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 80: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 80 1533 1549 24 1.6 613.1 1.0X -write char with length 80 3168 3176 10 0.8 1267.0 0.5X -write varchar with length 80 3240 3245 9 0.8 1296.0 0.5X +write string with length 80 1470 1475 5 1.7 587.9 1.0X +write char with length 80 3158 3168 10 0.8 1263.1 0.5X +write varchar with length 80 3091 3101 15 0.8 1236.4 0.5X 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write with length 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -write string with length 100 1436 1438 4 1.4 717.9 1.0X -write char with length 100 3200 3206 5 0.6 1599.9 0.4X -write varchar with length 100 3154 3178 28 0.6 1577.2 0.5X +write string with length 100 1348 1358 10 1.5 673.8 1.0X +write char with length 100 3034 3040 7 0.7 1517.0 0.4X +write varchar with length 100 3029 3033 5 0.7 1514.3 0.4X diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index 32cbbc74e9112..56b1523344a72 100644 --- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -1,27 +1,54 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 6910 6912 3 0.0 69099.7 1.0X -UNICODE 4367 4368 1 0.0 43669.6 1.6X -UTF8_BINARY 4361 4364 4 0.0 43606.5 1.6X -UNICODE_CI 46480 46526 66 0.0 464795.7 0.1X +UTF8_BINARY 1352 1352 1 0.1 13516.4 1.0X +UTF8_LCASE 4678 4693 21 0.0 46778.6 0.3X +UNICODE 17213 17223 13 0.0 172131.7 0.1X +UNICODE_CI 17101 17133 46 0.0 171009.6 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 6522 6526 4 0.0 65223.9 1.0X -UNICODE 45792 45797 7 0.0 457922.3 0.1X -UTF8_BINARY 7092 7112 29 0.0 70921.7 0.9X -UNICODE_CI 47548 47564 22 0.0 475476.7 0.1X +UTF8_BINARY 1775 1775 0 0.1 17749.8 1.0X +UTF8_LCASE 5342 5367 35 0.0 53423.0 0.3X +UNICODE 17011 17020 12 0.0 170110.1 0.1X +UNICODE_CI 16734 16760 37 0.0 167338.2 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 11716 11716 1 0.0 117157.9 1.0X -UNICODE 180133 180137 5 0.0 1801332.1 0.1X -UTF8_BINARY 10476 10477 1 0.0 104757.4 1.1X -UNICODE_CI 148171 148190 28 0.0 1481705.6 0.1X +UTF8_BINARY 7253 7256 4 0.0 72529.6 1.0X +UTF8_LCASE 16634 16676 59 0.0 166342.5 0.4X +UNICODE 66146 66163 23 0.0 661461.1 0.1X +UNICODE_CI 54563 54606 62 0.0 545625.5 0.1X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 6570 6577 10 0.0 65696.6 1.0X +UTF8_LCASE 120073 120137 91 0.0 1200726.4 0.1X +UNICODE 364027 364291 374 0.0 3640267.9 0.0X +UNICODE_CI 421444 422138 981 0.0 4214438.7 0.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 6573 6596 32 0.0 65733.4 1.0X +UTF8_LCASE 60284 60293 12 0.0 602844.4 0.1X +UNICODE 363685 364220 757 0.0 3636848.4 0.0X +UNICODE_CI 422761 423000 337 0.0 4227611.0 0.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 6507 6508 1 0.0 65068.3 1.0X +UTF8_LCASE 59098 59118 28 0.0 590983.6 0.1X +UNICODE 378437 378790 499 0.0 3784367.4 0.0X +UNICODE_CI 433987 434294 435 0.0 4339869.2 0.0X diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index 4028b0f005a37..09847bbcaa260 100644 --- a/sql/core/benchmarks/CollationBenchmark-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -1,27 +1,54 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 7692 7731 55 0.0 76919.2 1.0X -UNICODE 4378 4379 0 0.0 43784.6 1.8X -UTF8_BINARY 4382 4396 19 0.0 43821.6 1.8X -UNICODE_CI 48344 48360 23 0.0 483436.5 0.2X +UTF8_BINARY 1372 1374 3 0.1 13718.1 1.0X +UTF8_LCASE 6311 6311 0 0.0 63106.7 0.2X +UNICODE 19273 19300 37 0.0 192731.3 0.1X +UNICODE_CI 18991 18998 10 0.0 189906.3 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 9819 9820 0 0.0 98194.9 1.0X -UNICODE 49507 49518 17 0.0 495066.2 0.2X -UTF8_BINARY 7354 7365 17 0.0 73536.3 1.3X -UNICODE_CI 52149 52163 20 0.0 521489.4 0.2X +UTF8_BINARY 1725 1726 2 0.1 17249.0 1.0X +UTF8_LCASE 5806 5828 31 0.0 58061.5 0.3X +UNICODE 19105 19111 8 0.0 191051.5 0.1X +UNICODE_CI 18991 18996 7 0.0 189913.3 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 18110 18127 24 0.0 
181103.9 1.0X -UNICODE 171375 171435 85 0.0 1713752.3 0.1X -UTF8_BINARY 14012 14030 26 0.0 140116.7 1.3X -UNICODE_CI 153847 153901 76 0.0 1538471.1 0.1X +UTF8_BINARY 3019 3021 3 0.0 30194.7 1.0X +UTF8_LCASE 19437 19439 3 0.0 194372.6 0.2X +UNICODE 63550 63568 25 0.0 635504.3 0.0X +UNICODE_CI 57839 57866 39 0.0 578385.0 0.1X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 7022 7023 2 0.0 70216.8 1.0X +UTF8_LCASE 118836 118887 72 0.0 1188364.9 0.1X +UNICODE 376381 376546 234 0.0 3763807.3 0.0X +UNICODE_CI 427858 427981 174 0.0 4278584.6 0.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 6720 6723 5 0.0 67197.9 1.0X +UTF8_LCASE 67132 67177 63 0.0 671324.6 0.1X +UNICODE 368690 369292 852 0.0 3686899.6 0.0X +UNICODE_CI 431481 431583 144 0.0 4314814.9 0.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 7097 7099 3 0.0 70970.8 1.0X +UTF8_LCASE 57327 57351 35 0.0 573265.4 0.1X +UNICODE 375819 376473 924 0.0 3758191.8 0.0X +UNICODE_CI 445167 445212 64 0.0 4451666.7 0.0X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt index dc68b747203fa..a7d2afb3fffc9 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt @@ -1,27 +1,54 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 18244 18258 20 0.0 456096.4 1.0X -UNICODE 498 498 0 0.1 12440.3 36.7X -UTF8_BINARY 499 500 1 0.1 12467.7 36.6X -UNICODE_CI 13429 13443 19 0.0 335725.4 1.4X +UTF8_BINARY 171 171 1 0.2 4268.9 1.0X +UTF8_LCASE 6540 6549 12 0.0 163512.0 0.0X +UNICODE 5195 5196 2 0.0 129870.7 0.0X +UNICODE_CI 5129 5134 7 0.0 128222.9 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 18377 18399 31 0.0 459430.5 1.0X -UNICODE 14238 14240 3 0.0 355957.4 1.3X -UTF8_BINARY 975 976 1 0.0 24371.3 18.9X -UNICODE_CI 13819 13826 10 0.0 345482.6 1.3X +UTF8_BINARY 316 316 1 0.1 
7899.6 1.0X +UTF8_LCASE 6525 6528 3 0.0 163136.8 0.0X +UNICODE 5186 5201 21 0.0 129654.8 0.1X +UNICODE_CI 5119 5120 0 0.0 127985.4 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 9183 9230 67 0.0 229564.0 1.0X -UNICODE 38937 38952 22 0.0 973421.3 0.2X -UTF8_BINARY 1376 1376 0 0.0 34397.5 6.7X -UNICODE_CI 32881 32882 1 0.0 822027.4 0.3X +UTF8_BINARY 384 386 2 0.1 9604.9 1.0X +UTF8_LCASE 3250 3255 7 0.0 81245.9 0.1X +UNICODE 14666 14668 3 0.0 366645.0 0.0X +UNICODE_CI 11055 11073 25 0.0 276376.4 0.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 1439 1440 1 0.0 35973.9 1.0X +UTF8_LCASE 33643 33664 30 0.0 841072.8 0.0X +UNICODE 69901 69945 62 0.0 1747527.1 0.0X +UNICODE_CI 78298 78390 129 0.0 1957458.9 0.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 1019 1020 1 0.0 25470.5 1.0X +UTF8_LCASE 18811 18833 32 0.0 470272.7 0.1X +UNICODE 67687 67758 101 0.0 1692181.1 0.0X +UNICODE_CI 77039 77148 154 0.0 1925975.7 0.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 1020 1031 16 0.0 25493.0 1.0X +UTF8_LCASE 18574 18583 12 0.0 464350.7 0.1X +UNICODE 73937 74335 563 0.0 1848436.0 0.0X +UNICODE_CI 82022 82303 397 0.0 2050548.1 0.0X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt index bb58968764c7a..06d2e883cf788 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt @@ -1,27 +1,54 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 17881 17885 6 0.0 447017.7 1.0X -UNICODE 493 495 2 0.1 12328.9 36.3X -UTF8_BINARY 493 494 1 0.1 12331.4 36.3X -UNICODE_CI 13731 13737 8 0.0 343284.6 1.3X +UTF8_BINARY 133 133 0 0.3 3318.0 1.0X +UTF8_LCASE 9483 9494 16 0.0 237079.3 0.0X +UNICODE 5963 5965 3 0.0 149081.4 0.0X +UNICODE_CI 5661 5663 3 0.0 141518.7 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 
17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 18041 18047 8 0.0 451030.2 1.0X -UNICODE 14023 14047 34 0.0 350573.9 1.3X -UTF8_BINARY 1387 1397 14 0.0 34680.4 13.0X -UNICODE_CI 14232 14242 14 0.0 355808.4 1.3X +UTF8_BINARY 560 561 1 0.1 14008.3 1.0X +UTF8_LCASE 7535 7550 21 0.0 188384.6 0.1X +UNICODE 5868 5873 8 0.0 146691.2 0.1X +UNICODE_CI 5838 5839 1 0.0 145945.7 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 10494 10499 6 0.0 262360.0 1.0X -UNICODE 40410 40422 17 0.0 1010261.8 0.3X -UTF8_BINARY 2035 2035 1 0.0 50877.8 5.2X -UNICODE_CI 31470 31493 32 0.0 786752.4 0.3X +UTF8_BINARY 420 420 0 0.1 10489.3 1.0X +UTF8_LCASE 3524 3529 7 0.0 88101.6 0.1X +UNICODE 15630 15659 40 0.0 390755.8 0.0X +UNICODE_CI 12822 12838 22 0.0 320560.2 0.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 1269 1270 2 0.0 31731.7 1.0X +UTF8_LCASE 34422 34448 37 0.0 860554.5 0.0X +UNICODE 66641 66780 196 0.0 1666024.0 0.0X +UNICODE_CI 76047 76084 52 0.0 1901185.9 0.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 1114 1115 2 0.0 27839.1 1.0X +UTF8_LCASE 19656 19659 4 0.0 491401.9 0.1X +UNICODE 65990 66056 93 0.0 1649760.3 0.0X +UNICODE_CI 75764 75877 161 0.0 1894091.8 0.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY 1137 1137 0 0.0 28427.5 1.0X +UTF8_LCASE 18734 18765 44 0.0 468347.9 0.1X +UNICODE 74629 74646 24 0.0 1865724.7 0.0X +UNICODE_CI 83838 83888 70 0.0 2095948.2 0.0X diff --git a/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt b/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt index e45790a0d92d7..daa0fff3b464b 100644 --- a/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ColumnarBatchBenchmark-jdk21-results.txt @@ -2,58 +2,58 @@ Int Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Int Read/Write: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Java Array 122 123 0 2679.5 0.4 1.0X -ByteBuffer Unsafe 198 205 5 1658.3 0.6 0.6X -ByteBuffer API 426 426 0 768.6 1.3 0.3X -DirectByteBuffer 476 477 1 687.9 1.5 0.3X -Unsafe Buffer 153 154 1 2135.2 0.5 0.8X -Column(on heap) 123 124 2 2668.8 0.4 1.0X -Column(off heap) 154 154 0 2133.7 0.5 0.8X -Column(off heap direct) 154 154 1 2126.9 0.5 0.8X -UnsafeRow (on heap) 460 460 0 712.8 1.4 0.3X -UnsafeRow (off heap) 293 294 1 1119.3 0.9 0.4X -Column On Heap Append 336 337 1 974.7 1.0 0.4X +Java Array 118 119 1 2768.7 0.4 1.0X +ByteBuffer Unsafe 198 208 4 1655.2 0.6 0.6X +ByteBuffer API 388 394 7 845.5 1.2 0.3X +DirectByteBuffer 494 496 5 663.8 1.5 0.2X +Unsafe Buffer 154 154 0 2129.1 0.5 0.8X +Column(on heap) 123 123 0 2663.7 0.4 1.0X +Column(off heap) 154 154 0 2129.4 0.5 0.8X +Column(off heap direct) 154 154 0 2124.4 0.5 0.8X +UnsafeRow (on heap) 460 461 1 712.8 1.4 0.3X +UnsafeRow (off heap) 293 294 1 1116.6 0.9 0.4X +Column On Heap Append 336 337 2 975.8 1.0 0.4X ================================================================================================ Boolean Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Boolean Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Bitset 408 409 1 821.5 1.2 1.0X -Byte Array 249 249 1 1350.1 0.7 1.6X +Bitset 420 421 2 799.8 1.3 1.0X +Byte Array 236 236 0 1421.4 0.7 1.8X ================================================================================================ String Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap 122 128 5 134.2 7.5 1.0X -Off Heap 348 365 14 47.1 21.2 0.4X +On Heap 126 127 1 130.0 7.7 1.0X +Off Heap 427 463 23 38.4 26.0 0.3X ================================================================================================ Array Vector Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array Vector Read: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap Read Size Only 87 88 0 1874.5 0.5 1.0X -Off Heap Read Size Only 411 411 0 398.9 2.5 0.2X -On Heap Read Elements 2465 2465 0 66.5 15.0 0.0X -Off Heap Read Elements 2529 2529 0 64.8 15.4 0.0X +On Heap Read Size Only 87 88 1 1873.1 0.5 1.0X +Off Heap Read Size Only 306 306 1 535.8 1.9 0.3X +On Heap Read Elements 2464 2464 1 66.5 15.0 0.0X +Off Heap Read 
Elements 2518 2519 2 65.1 15.4 0.0X diff --git a/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt index 5d1109fb52915..cd00e0bbd71e9 100644 --- a/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt +++ b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt @@ -2,58 +2,58 @@ Int Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Int Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Java Array 174 174 1 1883.6 0.5 1.0X -ByteBuffer Unsafe 283 284 1 1156.3 0.9 0.6X -ByteBuffer API 496 496 1 660.8 1.5 0.4X -DirectByteBuffer 485 486 1 675.4 1.5 0.4X -Unsafe Buffer 161 163 0 2032.4 0.5 1.1X -Column(on heap) 177 177 0 1855.5 0.5 1.0X -Column(off heap) 162 162 0 2022.7 0.5 1.1X -Column(off heap direct) 156 156 1 2105.3 0.5 1.1X -UnsafeRow (on heap) 447 448 1 732.6 1.4 0.4X -UnsafeRow (off heap) 313 316 2 1045.3 1.0 0.6X -Column On Heap Append 361 362 0 906.5 1.1 0.5X +Java Array 174 174 1 1884.2 0.5 1.0X +ByteBuffer Unsafe 283 284 0 1157.5 0.9 0.6X +ByteBuffer API 499 500 1 656.1 1.5 0.3X +DirectByteBuffer 404 405 1 812.1 1.2 0.4X +Unsafe Buffer 161 163 1 2039.7 0.5 1.1X +Column(on heap) 177 177 0 1855.7 0.5 1.0X +Column(off heap) 162 162 0 2025.9 0.5 1.1X +Column(off heap direct) 155 156 1 2108.0 0.5 1.1X +UnsafeRow (on heap) 447 448 0 732.3 1.4 0.4X +UnsafeRow (off heap) 312 316 2 1049.5 1.0 0.6X +Column On Heap Append 361 388 65 907.1 1.1 0.5X ================================================================================================ Boolean Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Boolean Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Bitset 512 513 2 655.1 1.5 1.0X -Byte Array 320 321 1 1049.6 1.0 1.6X +Bitset 516 518 2 650.3 1.5 1.0X +Byte Array 314 315 2 1067.8 0.9 1.6X ================================================================================================ String Read/Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String Read/Write: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap 137 137 0 119.6 8.4 1.0X -Off Heap 365 398 21 44.9 22.3 0.4X +On Heap 193 242 21 84.7 11.8 1.0X +Off Heap 340 375 21 48.2 20.7 0.6X ================================================================================================ Array Vector Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server 
VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array Vector Read: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -On Heap Read Size Only 89 90 1 1835.7 0.5 1.0X -Off Heap Read Size Only 84 85 0 1941.2 0.5 1.1X -On Heap Read Elements 2295 2297 2 71.4 14.0 0.0X -Off Heap Read Elements 2681 2683 2 61.1 16.4 0.0X +On Heap Read Size Only 89 90 1 1831.4 0.5 1.0X +Off Heap Read Size Only 85 85 1 1937.9 0.5 1.1X +On Heap Read Elements 2298 2302 6 71.3 14.0 0.0X +Off Heap Read Elements 2615 2617 3 62.7 16.0 0.0X diff --git a/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt b/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt index b33de7d152e80..df10c315b6871 100644 --- a/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CompressionSchemeBenchmark-jdk21-results.txt @@ -2,136 +2,136 @@ Compression Scheme Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor BOOLEAN Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 1 1 0 45933.0 0.0 1.0X -RunLengthEncoding(2.498) 931 933 2 72.1 13.9 0.0X -BooleanBitSet(0.125) 295 296 0 227.3 4.4 0.0X +PassThrough(1.000) 1 1 0 49573.5 0.0 1.0X +RunLengthEncoding(2.501) 931 957 41 72.1 13.9 0.0X +BooleanBitSet(0.125) 295 296 1 227.6 4.4 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor BOOLEAN Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 210 211 1 319.9 3.1 1.0X -RunLengthEncoding 590 591 1 113.8 8.8 0.4X -BooleanBitSet 683 684 1 98.3 10.2 0.3X +PassThrough 210 212 2 319.6 3.1 1.0X +RunLengthEncoding 593 594 0 113.1 8.8 0.4X +BooleanBitSet 681 684 3 98.5 10.1 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23353.5 0.0 1.0X -RunLengthEncoding(1.495) 1058 1060 3 63.4 15.8 0.0X +PassThrough(1.000) 3 3 0 24654.7 0.0 1.0X +RunLengthEncoding(1.502) 1208 1209 1 55.6 18.0 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 607 609 2 110.5 9.0 1.0X -RunLengthEncoding 999 1000 2 67.2 14.9 0.6X +PassThrough 894 895 2 75.1 13.3 1.0X +RunLengthEncoding 1053 1055 2 63.7 15.7 0.8X -OpenJDK 64-Bit 
Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23461.0 0.0 1.0X -RunLengthEncoding(1.996) 1126 1132 8 59.6 16.8 0.0X +PassThrough(1.000) 3 3 0 23243.9 0.0 1.0X +RunLengthEncoding(1.994) 1224 1226 2 54.8 18.2 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 566 596 20 118.6 8.4 1.0X -RunLengthEncoding 947 949 2 70.9 14.1 0.6X +PassThrough 894 896 4 75.1 13.3 1.0X +RunLengthEncoding 1000 1001 1 67.1 14.9 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11431.5 0.1 1.0X -RunLengthEncoding(1.001) 1029 1030 1 65.2 15.3 0.0X -DictionaryEncoding(0.500) 339 340 1 197.8 5.1 0.0X -IntDelta(0.250) 109 110 0 613.5 1.6 0.1X +PassThrough(1.000) 6 6 0 11188.7 0.1 1.0X +RunLengthEncoding(1.003) 1245 1246 2 53.9 18.6 0.0X +DictionaryEncoding(0.500) 1376 1379 4 48.8 20.5 0.0X +IntDelta(0.250) 110 113 9 612.8 1.6 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 652 654 2 102.9 9.7 1.0X -RunLengthEncoding 1115 1117 3 60.2 16.6 0.6X -DictionaryEncoding 541 544 5 124.0 8.1 1.2X -IntDelta 498 498 0 134.8 7.4 1.3X +PassThrough 644 645 2 104.2 9.6 1.0X +RunLengthEncoding 1155 1156 0 58.1 17.2 0.6X +DictionaryEncoding 523 525 3 128.4 7.8 1.2X +IntDelta 499 501 3 134.5 7.4 1.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11179.4 0.1 1.0X -RunLengthEncoding(1.344) 1060 1061 0 63.3 15.8 0.0X -DictionaryEncoding(0.501) 342 343 1 196.0 5.1 0.0X -IntDelta(0.250) 111 111 1 607.1 1.6 0.1X +PassThrough(1.000) 6 6 0 11231.0 0.1 1.0X +RunLengthEncoding(1.337) 1087 1104 24 61.7 16.2 0.0X +DictionaryEncoding(0.501) 565 567 3 118.9 8.4 0.0X +IntDelta(0.250) 109 110 1 613.0 1.6 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 712 714 2 94.3 10.6 1.0X -RunLengthEncoding 1214 1214 1 55.3 18.1 0.6X -DictionaryEncoding 667 670 3 100.6 9.9 1.1X -IntDelta 519 547 53 129.3 7.7 1.4X +PassThrough 706 709 4 95.0 10.5 1.0X +RunLengthEncoding 1132 1136 5 59.3 16.9 0.6X +DictionaryEncoding 659 663 4 101.8 9.8 1.1X +IntDelta 624 626 4 107.5 9.3 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 18 18 0 3788.5 0.3 1.0X -RunLengthEncoding(0.753) 1059 1059 0 63.4 15.8 0.0X -DictionaryEncoding(0.250) 397 398 2 169.0 5.9 0.0X -LongDelta(0.125) 110 110 0 609.4 1.6 0.2X +PassThrough(1.000) 13 13 0 5066.3 0.2 1.0X +RunLengthEncoding(0.744) 1050 1056 8 63.9 15.6 0.0X +DictionaryEncoding(0.250) 593 595 2 113.1 8.8 0.0X +LongDelta(0.125) 110 111 1 608.6 1.6 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 770 771 1 87.2 11.5 1.0X -RunLengthEncoding 1220 1220 0 55.0 18.2 0.6X -DictionaryEncoding 710 712 2 94.6 10.6 1.1X -LongDelta 540 541 1 124.3 8.0 1.4X +PassThrough 646 647 2 103.9 9.6 1.0X +RunLengthEncoding 1203 1205 3 55.8 17.9 0.5X +DictionaryEncoding 724 725 2 92.7 10.8 0.9X +LongDelta 524 525 1 128.1 7.8 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 18 18 0 3782.6 0.3 1.0X -RunLengthEncoding(1.006) 1099 1100 1 61.1 16.4 0.0X -DictionaryEncoding(0.251) 397 398 1 169.1 5.9 0.0X -LongDelta(0.125) 111 111 1 603.8 1.7 0.2X +PassThrough(1.000) 13 13 0 5065.9 0.2 1.0X +RunLengthEncoding(1.002) 1099 1107 10 61.0 16.4 0.0X +DictionaryEncoding(0.251) 603 605 2 111.2 9.0 0.0X +LongDelta(0.125) 110 111 1 608.5 1.6 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 770 771 2 87.2 11.5 1.0X -RunLengthEncoding 1215 1215 1 55.3 18.1 0.6X -DictionaryEncoding 710 712 2 94.5 10.6 1.1X -LongDelta 667 668 2 100.6 9.9 1.2X +PassThrough 750 754 4 89.5 11.2 1.0X +RunLengthEncoding 1213 1216 3 55.3 18.1 0.6X +DictionaryEncoding 722 725 3 92.9 10.8 1.0X +LongDelta 653 653 0 102.8 9.7 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure 
AMD EPYC 7763 64-Core Processor STRING Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 24 27 1 2851.8 0.4 1.0X -RunLengthEncoding(0.892) 1611 1620 13 41.7 24.0 0.0X -DictionaryEncoding(0.167) 1953 1955 3 34.4 29.1 0.0X +PassThrough(1.000) 29 29 0 2332.2 0.4 1.0X +RunLengthEncoding(0.889) 1744 1745 1 38.5 26.0 0.0X +DictionaryEncoding(0.167) 1441 1443 2 46.6 21.5 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor STRING Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 1421 1435 19 47.2 21.2 1.0X -RunLengthEncoding 2070 2071 2 32.4 30.9 0.7X -DictionaryEncoding 1782 1786 6 37.7 26.6 0.8X +PassThrough 1613 1614 1 41.6 24.0 1.0X +RunLengthEncoding 2107 2108 1 31.8 31.4 0.8X +DictionaryEncoding 1830 1832 3 36.7 27.3 0.9X diff --git a/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt index f58c63afc691d..ea810d3a26e36 100644 --- a/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt +++ b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt @@ -2,136 +2,136 @@ Compression Scheme Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor BOOLEAN Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 1 1 0 49430.1 0.0 1.0X -RunLengthEncoding(2.491) 895 899 5 75.0 13.3 0.0X -BooleanBitSet(0.125) 289 289 0 232.2 4.3 0.0X +PassThrough(1.000) 1 1 0 50902.6 0.0 1.0X +RunLengthEncoding(2.510) 897 898 2 74.8 13.4 0.0X +BooleanBitSet(0.125) 229 229 0 293.3 3.4 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor BOOLEAN Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 167 167 1 402.2 2.5 1.0X -RunLengthEncoding 560 561 1 119.9 8.3 0.3X -BooleanBitSet 660 660 1 101.7 9.8 0.3X +PassThrough 166 167 1 403.2 2.5 1.0X +RunLengthEncoding 534 535 2 125.7 8.0 0.3X +BooleanBitSet 659 663 2 101.8 9.8 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23350.8 0.0 1.0X -RunLengthEncoding(1.500) 1041 1042 2 64.5 15.5 0.0X +PassThrough(1.000) 3 3 0 23313.0 0.0 1.0X +RunLengthEncoding(1.503) 1142 1144 4 58.8 17.0 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 
17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 644 646 2 104.2 9.6 1.0X -RunLengthEncoding 1047 1048 1 64.1 15.6 0.6X +PassThrough 812 868 62 82.7 12.1 1.0X +RunLengthEncoding 1094 1094 1 61.3 16.3 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 3 3 0 23448.1 0.0 1.0X -RunLengthEncoding(2.007) 1074 1075 2 62.5 16.0 0.0X +PassThrough(1.000) 3 3 0 23874.9 0.0 1.0X +RunLengthEncoding(2.021) 1131 1133 3 59.4 16.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SHORT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 572 573 2 117.4 8.5 1.0X -RunLengthEncoding 946 947 1 70.9 14.1 0.6X +PassThrough 851 877 39 78.8 12.7 1.0X +RunLengthEncoding 1063 1068 6 63.1 15.8 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11484.4 0.1 1.0X -RunLengthEncoding(0.995) 994 996 2 67.5 14.8 0.0X -DictionaryEncoding(0.500) 384 386 1 174.6 5.7 0.0X -IntDelta(0.250) 110 110 1 612.4 1.6 0.1X +PassThrough(1.000) 6 6 0 11690.1 0.1 1.0X +RunLengthEncoding(1.001) 948 959 12 70.8 14.1 0.0X +DictionaryEncoding(0.500) 628 631 3 106.9 9.4 0.0X +IntDelta(0.250) 112 115 1 600.4 1.7 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 644 647 4 104.2 9.6 1.0X -RunLengthEncoding 1197 1199 2 56.0 17.8 0.5X -DictionaryEncoding 522 527 5 128.5 7.8 1.2X -IntDelta 457 458 1 146.8 6.8 1.4X +PassThrough 632 639 7 106.2 9.4 1.0X +RunLengthEncoding 1053 1053 1 63.7 15.7 0.6X +DictionaryEncoding 502 506 4 133.7 7.5 1.3X +IntDelta 449 456 4 149.3 6.7 1.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 6 6 0 11058.8 0.1 1.0X -RunLengthEncoding(1.335) 1028 1028 0 65.3 15.3 0.0X -DictionaryEncoding(0.501) 386 387 1 
173.9 5.7 0.0X -IntDelta(0.250) 110 110 1 612.7 1.6 0.1X +PassThrough(1.000) 6 6 0 11598.2 0.1 1.0X +RunLengthEncoding(1.336) 979 984 7 68.5 14.6 0.0X +DictionaryEncoding(0.501) 640 646 5 104.9 9.5 0.0X +IntDelta(0.250) 114 115 1 589.0 1.7 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor INT Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 728 729 2 92.2 10.8 1.0X -RunLengthEncoding 1144 1146 3 58.7 17.0 0.6X -DictionaryEncoding 542 545 5 123.7 8.1 1.3X -IntDelta 662 663 2 101.4 9.9 1.1X +PassThrough 764 766 2 87.8 11.4 1.0X +RunLengthEncoding 1142 1144 3 58.8 17.0 0.7X +DictionaryEncoding 671 679 7 100.0 10.0 1.1X +IntDelta 466 470 2 143.9 6.9 1.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Encode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 13 13 0 5058.4 0.2 1.0X -RunLengthEncoding(0.751) 1037 1038 2 64.7 15.4 0.0X -DictionaryEncoding(0.250) 426 428 3 157.4 6.4 0.0X -LongDelta(0.125) 110 111 1 607.8 1.6 0.1X +PassThrough(1.000) 13 13 0 5217.0 0.2 1.0X +RunLengthEncoding(0.751) 990 990 1 67.8 14.7 0.0X +DictionaryEncoding(0.250) 615 616 2 109.2 9.2 0.0X +LongDelta(0.125) 108 110 1 622.0 1.6 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Decode (Lower Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 753 760 6 89.1 11.2 1.0X -RunLengthEncoding 1208 1208 0 55.5 18.0 0.6X -DictionaryEncoding 772 774 3 86.9 11.5 1.0X -LongDelta 520 521 2 129.1 7.7 1.4X +PassThrough 740 759 16 90.7 11.0 1.0X +RunLengthEncoding 1169 1178 12 57.4 17.4 0.6X +DictionaryEncoding 757 763 7 88.7 11.3 1.0X +LongDelta 499 502 2 134.5 7.4 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Encode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 13 13 0 5066.8 0.2 1.0X -RunLengthEncoding(1.002) 1060 1062 2 63.3 15.8 0.0X -DictionaryEncoding(0.251) 427 429 2 157.0 6.4 0.0X -LongDelta(0.125) 110 110 1 609.5 1.6 0.1X +PassThrough(1.000) 13 19 2 5062.3 0.2 1.0X +RunLengthEncoding(1.001) 1005 1008 4 66.8 15.0 0.0X +DictionaryEncoding(0.251) 612 613 1 109.7 9.1 0.0X +LongDelta(0.125) 106 110 1 634.4 1.6 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LONG Decode (Higher Skew): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -PassThrough 752 753 1 89.3 11.2 1.0X -RunLengthEncoding 1206 1207 0 55.6 18.0 0.6X -DictionaryEncoding 773 773 0 86.8 11.5 1.0X -LongDelta 662 687 41 101.4 9.9 1.1X +PassThrough 865 875 9 77.6 12.9 1.0X +RunLengthEncoding 1185 1188 4 56.6 17.7 0.7X +DictionaryEncoding 754 761 7 89.0 11.2 1.1X +LongDelta 660 667 7 101.8 9.8 1.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor STRING Encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough(1.000) 20 20 0 3386.0 0.3 1.0X -RunLengthEncoding(0.890) 1563 1570 11 42.9 23.3 0.0X -DictionaryEncoding(0.167) 1956 1958 2 34.3 29.1 0.0X +PassThrough(1.000) 27 27 0 2497.2 0.4 1.0X +RunLengthEncoding(0.888) 1584 1586 3 42.4 23.6 0.0X +DictionaryEncoding(0.167) 1597 1600 4 42.0 23.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor STRING Decode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -PassThrough 1131 1141 14 59.3 16.9 1.0X -RunLengthEncoding 1884 1887 4 35.6 28.1 0.6X -DictionaryEncoding 1705 1706 2 39.4 25.4 0.7X +PassThrough 1465 1466 1 45.8 21.8 1.0X +RunLengthEncoding 1894 1894 1 35.4 28.2 0.8X +DictionaryEncoding 1775 1776 2 37.8 26.4 0.8X diff --git a/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt b/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt index 4a39ed8843fbe..8c09e95988536 100644 --- a/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ConstantColumnVectorBenchmark-jdk21-results.txt @@ -1,280 +1,280 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 365173.7 0.0 1.0X -OnHeapColumnVector 2655 2664 12 154.3 6.5 0.0X -OffHeapColumnVector 4597 4611 19 89.1 11.2 0.0X +ConstantColumnVector 1 1 0 386171.0 0.0 1.0X +OnHeapColumnVector 2709 2711 3 151.2 6.6 0.0X +OffHeapColumnVector 5028 5031 4 81.5 12.3 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 432863.4 0.0 1.0X -OnHeapColumnVector 3271 3289 26 125.2 8.0 0.0X -OffHeapColumnVector 4300 4320 28 95.3 10.5 0.0X +ConstantColumnVector 1 1 0 382579.9 0.0 1.0X +OnHeapColumnVector 3353 3353 1 122.2 8.2 0.0X +OffHeapColumnVector 5136 5142 7 79.7 12.5 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 446014.9 0.0 1.0X -OnHeapColumnVector 3576 3581 8 114.5 8.7 0.0X -OffHeapColumnVector 4502 4512 13 91.0 11.0 0.0X +ConstantColumnVector 1 1 0 382916.1 0.0 1.0X +OnHeapColumnVector 3715 3715 1 110.3 9.1 0.0X +OffHeapColumnVector 5746 5747 2 71.3 14.0 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 445490.0 0.0 1.0X -OnHeapColumnVector 3539 3549 15 115.8 8.6 0.0X -OffHeapColumnVector 4221 4223 3 97.0 10.3 0.0X +ConstantColumnVector 1 1 0 382658.2 0.0 1.0X +OnHeapColumnVector 3514 3522 11 116.5 8.6 0.0X +OffHeapColumnVector 5138 5142 6 79.7 12.5 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 446005.7 0.0 1.0X -OnHeapColumnVector 3684 3693 12 111.2 9.0 0.0X -OffHeapColumnVector 4422 4422 1 92.6 10.8 0.0X +ConstantColumnVector 1 1 0 382564.9 0.0 1.0X +OnHeapColumnVector 3808 3810 3 107.6 9.3 0.0X +OffHeapColumnVector 5476 5481 7 74.8 13.4 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 411269.8 0.0 1.0X -OnHeapColumnVector 3895 3900 7 105.2 9.5 0.0X -OffHeapColumnVector 4402 4403 1 93.0 10.7 0.0X +ConstantColumnVector 1 1 0 382528.8 0.0 1.0X +OnHeapColumnVector 3875 3877 3 105.7 9.5 0.0X +OffHeapColumnVector 5487 5495 11 74.6 13.4 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 720547.0 0.0 1.0X -OnHeapColumnVector 15 16 0 27430.8 0.0 0.0X -OffHeapColumnVector 62 64 1 6642.4 0.2 0.0X +ConstantColumnVector 1 1 0 632701.2 0.0 1.0X +OnHeapColumnVector 16 16 0 25265.4 0.0 0.0X +OffHeapColumnVector 65 67 7 6265.1 0.2 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 686246.6 0.0 1.0X -OnHeapColumnVector 31 32 1 13203.3 0.1 0.0X -OffHeapColumnVector 64 67 2 6377.9 0.2 0.0X +ConstantColumnVector 1 1 0 603950.7 0.0 1.0X +OnHeapColumnVector 33 33 0 12426.1 0.1 0.0X +OffHeapColumnVector 68 68 1 6051.2 0.2 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 496467.4 0.0 1.0X -OnHeapColumnVector 16 17 1 26196.4 0.0 0.1X -OffHeapColumnVector 122 125 2 3347.4 0.3 0.0X +ConstantColumnVector 1 1 0 491622.3 0.0 1.0X +OnHeapColumnVector 17 17 0 24658.2 0.0 0.1X +OffHeapColumnVector 127 128 0 3214.9 0.3 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 575876.2 0.0 1.0X -OnHeapColumnVector 34 36 3 12183.4 0.1 0.0X -OffHeapColumnVector 123 127 2 3342.3 0.3 0.0X +ConstantColumnVector 1 1 0 491621.7 0.0 1.0X +OnHeapColumnVector 34 34 3 12179.0 0.1 0.0X +OffHeapColumnVector 129 129 0 3181.8 0.3 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 94 97 2 4342.9 0.2 0.0X -OffHeapColumnVector 2065 2072 11 198.4 5.0 0.0X +ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X +OnHeapColumnVector 99 99 1 4150.4 0.2 0.0X +OffHeapColumnVector 1988 1997 12 206.0 4.9 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 94 96 2 4372.5 0.2 0.0X -OffHeapColumnVector 2033 2048 22 201.5 5.0 0.0X +ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X +OnHeapColumnVector 99 99 0 4149.0 0.2 0.0X +OffHeapColumnVector 2065 2065 1 198.4 5.0 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X 
-OnHeapColumnVector 93 95 1 4393.5 0.2 0.0X -OffHeapColumnVector 2031 2033 2 201.6 5.0 0.0X +ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X +OnHeapColumnVector 99 99 0 4150.1 0.2 0.0X +OffHeapColumnVector 1980 1983 5 206.9 4.8 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 94 98 1 4353.0 0.2 0.0X -OffHeapColumnVector 2083 2110 38 196.6 5.1 0.0X +ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X +OnHeapColumnVector 99 102 13 4152.9 0.2 0.0X +OffHeapColumnVector 1980 1981 2 206.8 4.8 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 96 98 1 4279.0 0.2 0.0X -OffHeapColumnVector 2086 2089 4 196.4 5.1 0.0X +ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X +OnHeapColumnVector 99 99 1 4152.0 0.2 0.0X +OffHeapColumnVector 1977 1978 2 207.2 4.8 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 99 99 0 4157.8 0.2 0.0X -OffHeapColumnVector 2048 2052 6 200.0 5.0 0.0X +ConstantColumnVector 0 0 0 409190809.2 0.0 1.0X +OnHeapColumnVector 99 99 1 4148.5 0.2 0.0X +OffHeapColumnVector 1973 1975 2 207.6 4.8 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 14395670.1 0.0 1.0X -OnHeapColumnVector 0 0 0 1601132.1 0.0 0.1X -OffHeapColumnVector 883 887 3 464.0 2.2 0.0X +ConstantColumnVector 0 0 0 53029518.4 0.0 1.0X +OnHeapColumnVector 0 0 0 1203831.3 0.0 0.0X +OffHeapColumnVector 889 889 1 461.0 2.2 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1904 1910 8 215.1 4.6 1.0X -OnHeapColumnVector 2039 2045 9 200.9 5.0 0.9X -OffHeapColumnVector 2580 2581 1 158.8 6.3 0.7X +ConstantColumnVector 1935 1935 0 211.7 4.7 1.0X +OnHeapColumnVector 2089 2097 11 196.0 5.1 0.9X 
+OffHeapColumnVector 2593 2594 1 157.9 6.3 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1822 1823 1 224.8 4.4 1.0X -OnHeapColumnVector 2071 2078 10 197.8 5.1 0.9X -OffHeapColumnVector 2588 2590 2 158.3 6.3 0.7X +ConstantColumnVector 2011 2011 1 203.7 4.9 1.0X +OnHeapColumnVector 2196 2199 4 186.5 5.4 0.9X +OffHeapColumnVector 2606 2623 25 157.2 6.4 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1960 1962 3 209.0 4.8 1.0X -OnHeapColumnVector 2079 2087 12 197.1 5.1 0.9X -OffHeapColumnVector 2724 2730 7 150.3 6.7 0.7X +ConstantColumnVector 2112 2112 1 194.0 5.2 1.0X +OnHeapColumnVector 2255 2257 2 181.6 5.5 0.9X +OffHeapColumnVector 2759 2792 45 148.4 6.7 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1920 1926 8 213.3 4.7 1.0X -OnHeapColumnVector 4208 4210 3 97.3 10.3 0.5X -OffHeapColumnVector 4045 4053 11 101.3 9.9 0.5X +ConstantColumnVector 1936 1941 8 211.6 4.7 1.0X +OnHeapColumnVector 4457 4459 2 91.9 10.9 0.4X +OffHeapColumnVector 3980 3982 4 102.9 9.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1908 1910 4 214.7 4.7 1.0X -OnHeapColumnVector 4214 4214 1 97.2 10.3 0.5X -OffHeapColumnVector 4065 4068 4 100.8 9.9 0.5X +ConstantColumnVector 1936 1936 1 211.6 4.7 1.0X +OnHeapColumnVector 4453 4459 9 92.0 10.9 0.4X +OffHeapColumnVector 3974 3974 0 103.1 9.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1919 1922 3 213.4 4.7 1.0X -OnHeapColumnVector 4209 4210 1 97.3 10.3 0.5X -OffHeapColumnVector 4036 4045 13 101.5 9.9 0.5X +ConstantColumnVector 1928 1930 3 212.4 4.7 1.0X +OnHeapColumnVector 4460 4462 3 91.8 10.9 0.4X +OffHeapColumnVector 3961 3966 8 103.4 9.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1933 1934 0 211.8 4.7 1.0X -OnHeapColumnVector 4185 4194 12 97.9 10.2 0.5X -OffHeapColumnVector 4077 4097 28 100.5 10.0 0.5X +ConstantColumnVector 1934 1938 5 211.8 4.7 1.0X +OnHeapColumnVector 4458 4462 6 91.9 10.9 0.4X +OffHeapColumnVector 3978 3980 3 103.0 9.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1922 1933 15 213.1 4.7 1.0X -OnHeapColumnVector 4195 4200 7 97.6 10.2 0.5X -OffHeapColumnVector 4079 4097 26 100.4 10.0 0.5X +ConstantColumnVector 1937 1938 2 211.5 4.7 1.0X +OnHeapColumnVector 4465 4467 3 91.7 10.9 0.4X +OffHeapColumnVector 3979 3983 5 102.9 9.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1934 1936 3 211.8 4.7 1.0X -OnHeapColumnVector 4212 4214 3 97.2 10.3 0.5X -OffHeapColumnVector 4064 4083 28 100.8 9.9 0.5X +ConstantColumnVector 1941 1944 4 211.0 4.7 1.0X +OnHeapColumnVector 4453 4455 2 92.0 10.9 0.4X +OffHeapColumnVector 3981 3982 1 102.9 9.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 751 755 3 545.5 1.8 1.0X -OnHeapColumnVector 888 888 0 461.2 2.2 0.8X -OffHeapColumnVector 885 886 1 463.0 2.2 0.8X +ConstantColumnVector 888 888 0 461.4 2.2 1.0X +OnHeapColumnVector 1020 1020 1 401.7 2.5 0.9X +OffHeapColumnVector 888 889 1 461.5 2.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 2816 2818 3 145.5 6.9 1.0X -OnHeapColumnVector 2959 2960 2 138.4 7.2 1.0X -OffHeapColumnVector 2950 2951 1 138.8 7.2 1.0X +ConstantColumnVector 2849 2849 0 143.8 7.0 1.0X +OnHeapColumnVector 2971 2974 5 137.9 7.3 1.0X +OffHeapColumnVector 2978 2979 1 137.5 7.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD 
EPYC 7763 64-Core Processor Test write and read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 2837 2841 5 144.4 6.9 1.0X -OnHeapColumnVector 2960 2961 2 138.4 7.2 1.0X -OffHeapColumnVector 2954 2956 3 138.7 7.2 1.0X +ConstantColumnVector 2859 2865 8 143.3 7.0 1.0X +OnHeapColumnVector 3111 3112 1 131.6 7.6 0.9X +OffHeapColumnVector 2981 2982 2 137.4 7.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 2968 2979 15 138.0 7.2 1.0X -OnHeapColumnVector 3071 3083 16 133.4 7.5 1.0X -OffHeapColumnVector 3099 3101 3 132.2 7.6 1.0X +ConstantColumnVector 2363 2365 3 173.3 5.8 1.0X +OnHeapColumnVector 3130 3133 4 130.9 7.6 0.8X +OffHeapColumnVector 3127 3127 0 131.0 7.6 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with StringType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3602335.9 0.0 0.0X -OffHeapColumnVector 0 0 0 439957035.4 0.0 1.0X +ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X +OnHeapColumnVector 0 0 0 3321143.9 0.0 0.0X +OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3602335.9 0.0 0.0X -OffHeapColumnVector 0 0 0 439957035.4 0.0 1.0X +ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X +OnHeapColumnVector 0 0 0 3321143.9 0.0 0.0X +OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3602367.6 0.0 0.0X -OffHeapColumnVector 0 0 0 439957035.4 0.0 1.0X +ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X +OnHeapColumnVector 0 0 0 3321143.9 0.0 0.0X +OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3602335.9 0.0 0.0X -OffHeapColumnVector 0 0 0 439957035.4 0.0 1.0X +ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X +OnHeapColumnVector 0 0 0 3321143.9 0.0 0.0X +OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 439957035.4 0.0 1.0X -OnHeapColumnVector 0 0 0 3602367.6 0.0 0.0X -OffHeapColumnVector 0 0 0 439957035.4 0.0 1.0X +ConstantColumnVector 0 0 0 405143422.4 0.0 1.0X +OnHeapColumnVector 0 0 0 3321386.3 0.0 0.0X +OffHeapColumnVector 0 0 0 405143422.4 0.0 1.0X diff --git a/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt b/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt index 745e822b1b1db..71245ab989f80 100644 --- a/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt +++ b/sql/core/benchmarks/ConstantColumnVectorBenchmark-results.txt @@ -1,280 +1,280 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 328565.8 0.0 1.0X -OnHeapColumnVector 2907 2907 0 140.9 7.1 0.0X -OffHeapColumnVector 3321 3326 7 123.4 8.1 0.0X +ConstantColumnVector 1 1 0 375313.9 0.0 1.0X +OnHeapColumnVector 2845 2855 14 144.0 6.9 0.0X +OffHeapColumnVector 3241 3246 7 126.4 7.9 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387374.0 0.0 1.0X -OnHeapColumnVector 3630 3632 3 112.8 8.9 0.0X -OffHeapColumnVector 4341 4341 0 94.3 10.6 0.0X +ConstantColumnVector 1 1 0 385212.6 0.0 1.0X +OnHeapColumnVector 3602 3608 9 113.7 8.8 0.0X +OffHeapColumnVector 4441 4442 1 92.2 10.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387407.0 0.0 1.0X -OnHeapColumnVector 3607 3608 1 113.5 8.8 0.0X -OffHeapColumnVector 4164 4166 3 98.4 10.2 0.0X +ConstantColumnVector 1 1 0 385394.2 0.0 1.0X +OnHeapColumnVector 3931 3932 3 104.2 9.6 0.0X +OffHeapColumnVector 4526 4527 1 90.5 11.1 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 
17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 337614.8 0.0 1.0X -OnHeapColumnVector 3563 3563 1 115.0 8.7 0.0X -OffHeapColumnVector 5135 5138 4 79.8 12.5 0.0X +ConstantColumnVector 1 1 0 385444.6 0.0 1.0X +OnHeapColumnVector 3625 3639 20 113.0 8.9 0.0X +OffHeapColumnVector 4792 4792 0 85.5 11.7 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387278.8 0.0 1.0X -OnHeapColumnVector 3562 3568 8 115.0 8.7 0.0X -OffHeapColumnVector 5469 5475 8 74.9 13.4 0.0X +ConstantColumnVector 1 1 0 385238.3 0.0 1.0X +OnHeapColumnVector 3706 3711 7 110.5 9.0 0.0X +OffHeapColumnVector 5015 5015 1 81.7 12.2 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 1 1 0 387345.1 0.0 1.0X -OnHeapColumnVector 3790 3794 6 108.1 9.3 0.0X -OffHeapColumnVector 5339 5340 1 76.7 13.0 0.0X +ConstantColumnVector 1 1 0 385509.9 0.0 1.0X +OnHeapColumnVector 4026 4051 35 101.7 9.8 0.0X +OffHeapColumnVector 5144 5166 30 79.6 12.6 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 699317.4 0.0 1.0X -OnHeapColumnVector 16 16 0 25569.9 0.0 0.0X -OffHeapColumnVector 65 65 0 6297.2 0.2 0.0X +ConstantColumnVector 1 1 0 699304.3 0.0 1.0X +OnHeapColumnVector 16 16 0 25324.1 0.0 0.0X +OffHeapColumnVector 66 66 0 6241.0 0.2 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 664350.5 0.0 1.0X -OnHeapColumnVector 34 34 0 12217.2 0.1 0.0X -OffHeapColumnVector 67 67 0 6090.0 0.2 0.0X +ConstantColumnVector 1 1 0 664338.7 0.0 1.0X +OnHeapColumnVector 34 35 0 11906.0 0.1 0.0X +OffHeapColumnVector 66 66 0 6201.7 0.2 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 577132.3 0.0 1.0X -OnHeapColumnVector 16 16 0 24993.9 0.0 0.0X -OffHeapColumnVector 127 127 0 3215.5 0.3 0.0X +ConstantColumnVector 1 1 0 553053.1 0.0 1.0X +OnHeapColumnVector 16 16 0 25324.9 0.0 0.0X +OffHeapColumnVector 127 127 0 3216.6 0.3 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1 1 0 553055.4 0.0 1.0X -OnHeapColumnVector 34 34 0 12093.7 0.1 0.0X -OffHeapColumnVector 128 129 0 3189.9 0.3 0.0X +ConstantColumnVector 1 1 0 553051.6 0.0 1.0X +OnHeapColumnVector 36 36 0 11516.6 0.1 0.0X +OffHeapColumnVector 128 129 0 3190.1 0.3 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 918 918 1 446.4 2.2 1.0X -OnHeapColumnVector 1925 1925 0 212.8 4.7 0.5X -OffHeapColumnVector 2933 2934 2 139.7 7.2 0.3X +ConstantColumnVector 693 698 9 591.4 1.7 1.0X +OnHeapColumnVector 1672 1673 0 244.9 4.1 0.4X +OffHeapColumnVector 3044 3046 2 134.5 7.4 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 784 785 1 522.6 1.9 1.0X -OnHeapColumnVector 3273 3273 0 125.2 8.0 0.2X -OffHeapColumnVector 2991 2992 1 136.9 7.3 0.3X +ConstantColumnVector 795 797 2 515.0 1.9 1.0X +OnHeapColumnVector 3428 3429 1 119.5 8.4 0.2X +OffHeapColumnVector 3089 3101 18 132.6 7.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 785 791 6 521.5 1.9 1.0X -OnHeapColumnVector 3407 3411 5 120.2 8.3 0.2X -OffHeapColumnVector 2992 2993 0 136.9 7.3 0.3X +ConstantColumnVector 793 795 3 516.4 1.9 1.0X +OnHeapColumnVector 3442 3443 2 119.0 8.4 0.2X +OffHeapColumnVector 3083 3085 4 132.9 7.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 785 789 6 522.1 1.9 1.0X 
-OnHeapColumnVector 3424 3426 3 119.6 8.4 0.2X -OffHeapColumnVector 3003 3004 2 136.4 7.3 0.3X +ConstantColumnVector 796 799 4 514.9 1.9 1.0X +OnHeapColumnVector 3460 3462 3 118.4 8.4 0.2X +OffHeapColumnVector 3073 3073 1 133.3 7.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 784 784 0 522.6 1.9 1.0X -OnHeapColumnVector 3397 3399 2 120.6 8.3 0.2X -OffHeapColumnVector 2999 2999 0 136.6 7.3 0.3X +ConstantColumnVector 795 796 1 515.1 1.9 1.0X +OnHeapColumnVector 3447 3447 0 118.8 8.4 0.2X +OffHeapColumnVector 3076 3084 12 133.2 7.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 785 787 1 521.5 1.9 1.0X -OnHeapColumnVector 3402 3406 5 120.4 8.3 0.2X -OffHeapColumnVector 2996 2999 4 136.7 7.3 0.3X +ConstantColumnVector 795 795 1 515.5 1.9 1.0X +OnHeapColumnVector 3453 3453 1 118.6 8.4 0.2X +OffHeapColumnVector 3084 3084 0 132.8 7.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 1811 1815 5 226.2 4.4 1.0X -OnHeapColumnVector 2077 2078 2 197.2 5.1 0.9X -OffHeapColumnVector 2575 2576 1 159.0 6.3 0.7X +ConstantColumnVector 0 0 0 6641264.7 0.0 1.0X +OnHeapColumnVector 0 0 0 1476254.1 0.0 0.2X +OffHeapColumnVector 762 786 39 537.5 1.9 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 2657186.6 0.0 1.0X -OnHeapColumnVector 0 0 0 949055.5 0.0 0.4X -OffHeapColumnVector 763 764 1 537.1 1.9 0.0X +ConstantColumnVector 0 0 0 3321170.8 0.0 1.0X +OnHeapColumnVector 0 0 0 1328632.1 0.0 0.4X +OffHeapColumnVector 762 762 0 537.2 1.9 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 2657186.6 0.0 1.0X -OnHeapColumnVector 0 1 0 949055.5 0.0 0.4X -OffHeapColumnVector 762 763 2 537.3 1.9 0.0X +ConstantColumnVector 0 0 0 2952306.1 0.0 1.0X +OnHeapColumnVector 0 1 0 1207849.8 0.0 0.4X +OffHeapColumnVector 762 
765 2 537.5 1.9 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test read with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1207529.4 0.0 1.0X -OnHeapColumnVector 1 1 0 738059.2 0.0 0.6X -OffHeapColumnVector 763 765 2 537.0 1.9 0.0X +ConstantColumnVector 0 0 0 2214341.3 0.0 1.0X +OnHeapColumnVector 0 0 0 1207885.5 0.0 0.5X +OffHeapColumnVector 890 891 2 460.5 2.2 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 1107203.6 0.0 1.0X -OnHeapColumnVector 2359 2359 1 173.6 5.8 0.0X -OffHeapColumnVector 1793 1793 1 228.5 4.4 0.0X +ConstantColumnVector 0 0 0 1897930.6 0.0 1.0X +OnHeapColumnVector 2249 2251 3 182.1 5.5 0.0X +OffHeapColumnVector 1965 1966 1 208.4 4.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -ConstantColumnVector 0 0 0 1107203.6 0.0 1.0X -OnHeapColumnVector 2375 2384 12 172.5 5.8 0.0X -OffHeapColumnVector 1803 1803 1 227.2 4.4 0.0X +ConstantColumnVector 0 0 0 1897930.6 0.0 1.0X +OnHeapColumnVector 2249 2252 4 182.1 5.5 0.0X +OffHeapColumnVector 1976 1980 5 207.2 4.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1107203.6 0.0 1.0X -OnHeapColumnVector 2366 2372 9 173.2 5.8 0.0X -OffHeapColumnVector 1798 1809 15 227.8 4.4 0.0X +ConstantColumnVector 0 0 0 1897948.2 0.0 1.0X +OnHeapColumnVector 2272 2272 1 180.3 5.5 0.0X +OffHeapColumnVector 1980 1989 12 206.9 4.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 15: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1107203.6 0.0 1.0X -OnHeapColumnVector 2369 2369 1 172.9 5.8 0.0X -OffHeapColumnVector 1799 1800 0 227.6 4.4 0.0X +ConstantColumnVector 0 0 0 1897939.4 0.0 1.0X +OnHeapColumnVector 2267 2273 9 180.7 5.5 0.0X +OffHeapColumnVector 1973 1974 3 207.7 4.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 20: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1107203.6 0.0 1.0X -OnHeapColumnVector 2364 2365 1 173.3 5.8 0.0X -OffHeapColumnVector 1792 1793 1 228.5 4.4 0.0X +ConstantColumnVector 0 0 0 1897939.4 0.0 1.0X +OnHeapColumnVector 2263 2265 4 181.0 5.5 0.0X +OffHeapColumnVector 1979 1981 4 207.0 4.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with StringType, row length = 30: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 1107203.6 0.0 1.0X -OnHeapColumnVector 2367 2368 1 173.0 5.8 0.0X -OffHeapColumnVector 1795 1796 1 228.2 4.4 0.0X +ConstantColumnVector 0 0 0 1897939.4 0.0 1.0X +OnHeapColumnVector 2253 2253 0 181.8 5.5 0.0X +OffHeapColumnVector 1966 1969 4 208.4 4.8 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 3429 3430 1 119.4 8.4 1.0X -OnHeapColumnVector 2572 2572 0 159.3 6.3 1.3X -OffHeapColumnVector 2581 2589 11 158.7 6.3 1.3X +ConstantColumnVector 761 762 0 538.1 1.9 1.0X +OnHeapColumnVector 888 891 4 461.3 2.2 0.9X +OffHeapColumnVector 888 889 1 461.3 2.2 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 761 761 1 538.3 1.9 1.0X -OnHeapColumnVector 763 764 1 537.0 1.9 1.0X -OffHeapColumnVector 762 763 1 537.2 1.9 1.0X +ConstantColumnVector 761 762 1 538.3 1.9 1.0X +OnHeapColumnVector 763 764 0 536.6 1.9 1.0X +OffHeapColumnVector 764 764 1 536.5 1.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 760 761 0 538.7 1.9 1.0X -OnHeapColumnVector 763 763 1 537.1 1.9 1.0X -OffHeapColumnVector 762 763 1 537.2 1.9 1.0X +ConstantColumnVector 761 764 4 538.0 1.9 1.0X +OnHeapColumnVector 766 767 1 534.9 1.9 1.0X +OffHeapColumnVector 762 763 1 537.6 1.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test write and read with DoubleType: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 888 888 1 461.5 2.2 1.0X -OnHeapColumnVector 763 774 18 536.8 1.9 1.2X -OffHeapColumnVector 762 763 1 537.4 1.9 1.2X +ConstantColumnVector 761 762 1 538.3 1.9 1.0X +OnHeapColumnVector 889 889 0 460.7 2.2 0.9X +OffHeapColumnVector 890 894 5 460.1 2.2 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with StringType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 209728622.6 0.0 1.0X -OnHeapColumnVector 0 0 0 2211961.7 0.0 0.0X -OffHeapColumnVector 0 0 0 209728622.6 0.0 1.0X +ConstantColumnVector 0 0 0 105648697.4 0.0 1.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X +OffHeapColumnVector 0 0 0 105648697.4 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with IntegerType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 209728622.6 0.0 1.0X -OnHeapColumnVector 0 0 0 2211961.7 0.0 0.0X -OffHeapColumnVector 0 0 0 209728622.6 0.0 1.0X +ConstantColumnVector 0 0 0 105648697.4 0.0 1.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X +OffHeapColumnVector 0 0 0 105648697.4 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with LongType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 209728622.6 0.0 1.0X -OnHeapColumnVector 0 0 0 2211961.7 0.0 0.0X -OffHeapColumnVector 0 0 0 209728622.6 0.0 1.0X +ConstantColumnVector 0 0 0 105648697.4 0.0 1.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X +OffHeapColumnVector 0 0 0 105648697.4 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with FloatType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 209728622.6 0.0 1.0X +ConstantColumnVector 0 0 0 105648697.4 0.0 1.0X OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X -OffHeapColumnVector 0 0 0 209728622.6 0.0 1.0X +OffHeapColumnVector 0 0 0 105648697.4 0.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Test isNull with DoubleType: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ConstantColumnVector 0 0 0 209728622.6 0.0 1.0X -OnHeapColumnVector 0 0 0 2211961.7 0.0 0.0X 
-OffHeapColumnVector 0 0 0 209728622.6 0.0 1.0X +ConstantColumnVector 0 0 0 105648697.4 0.0 1.0X +OnHeapColumnVector 0 0 0 2211949.7 0.0 0.0X +OffHeapColumnVector 0 0 0 105648697.4 0.0 1.0X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt index 8217e1561c174..43d7eb15b0ea5 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk21-results.txt @@ -2,430 +2,430 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7930 7984 77 2.0 504.2 1.0X -SQL Json 8135 8250 163 1.9 517.2 1.0X -SQL Parquet Vectorized: DataPageV1 76 87 9 205.7 4.9 103.7X -SQL Parquet Vectorized: DataPageV2 55 65 8 285.3 3.5 143.8X -SQL Parquet MR: DataPageV1 1785 1787 3 8.8 113.5 4.4X -SQL Parquet MR: DataPageV2 1643 1680 52 9.6 104.5 4.8X -SQL ORC Vectorized 114 124 10 138.2 7.2 69.7X -SQL ORC MR 1494 1496 3 10.5 95.0 5.3X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 9893 9962 97 1.6 629.0 1.0X +SQL Json 7942 8051 155 2.0 504.9 1.2X +SQL Parquet Vectorized: DataPageV1 84 96 8 187.9 5.3 118.2X +SQL Parquet Vectorized: DataPageV2 95 107 9 166.3 6.0 104.6X +SQL Parquet MR: DataPageV1 1727 1730 3 9.1 109.8 5.7X +SQL Parquet MR: DataPageV2 1615 1615 1 9.7 102.6 6.1X +SQL ORC Vectorized 135 146 8 116.4 8.6 73.2X +SQL ORC MR 1495 1511 22 10.5 95.0 6.6X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 35 36 1 449.0 2.2 1.0X -ParquetReader Vectorized: DataPageV2 25 26 1 638.4 1.6 1.4X -ParquetReader Vectorized -> Row: DataPageV1 29 30 1 548.0 1.8 1.2X -ParquetReader Vectorized -> Row: DataPageV2 18 20 2 851.6 1.2 1.9X +ParquetReader Vectorized: DataPageV1 92 93 1 170.7 5.9 1.0X +ParquetReader Vectorized: DataPageV2 112 113 1 140.8 7.1 0.8X +ParquetReader Vectorized -> Row: DataPageV1 72 73 1 218.6 4.6 1.3X +ParquetReader Vectorized -> Row: DataPageV2 94 96 2 167.5 6.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9218 9237 26 1.7 586.1 1.0X -SQL Json 8885 8900 21 1.8 564.9 1.0X -SQL Parquet Vectorized: DataPageV1 74 86 9 212.6 4.7 124.6X -SQL Parquet Vectorized: DataPageV2 74 88 12 211.4 4.7 123.9X -SQL Parquet MR: DataPageV1 1832 1837 8 8.6 116.5 5.0X -SQL Parquet MR: DataPageV2 1761 1763 3 8.9 112.0 5.2X -SQL ORC Vectorized 104 114 11 150.9 6.6 88.5X -SQL ORC MR 1523 1560 52 10.3 96.8 6.1X - -OpenJDK 64-Bit Server 
VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 9431 9439 11 1.7 599.6 1.0X +SQL Json 8552 8570 26 1.8 543.7 1.1X +SQL Parquet Vectorized: DataPageV1 96 105 9 164.4 6.1 98.6X +SQL Parquet Vectorized: DataPageV2 93 104 9 168.4 5.9 101.0X +SQL Parquet MR: DataPageV1 1816 1821 6 8.7 115.5 5.2X +SQL Parquet MR: DataPageV2 1742 1746 5 9.0 110.8 5.4X +SQL ORC Vectorized 107 113 6 146.6 6.8 87.9X +SQL ORC MR 1582 1598 22 9.9 100.6 6.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 125 138 14 125.8 7.9 1.0X -ParquetReader Vectorized: DataPageV2 125 137 11 126.2 7.9 1.0X -ParquetReader Vectorized -> Row: DataPageV1 44 47 5 355.9 2.8 2.8X -ParquetReader Vectorized -> Row: DataPageV2 44 47 5 357.8 2.8 2.8X +ParquetReader Vectorized: DataPageV1 66 68 2 238.1 4.2 1.0X +ParquetReader Vectorized: DataPageV2 66 67 1 239.4 4.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 44 46 3 357.8 2.8 1.5X +ParquetReader Vectorized -> Row: DataPageV2 44 45 1 357.9 2.8 1.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9794 9896 144 1.6 622.7 1.0X -SQL Json 9146 9163 24 1.7 581.5 1.1X -SQL Parquet Vectorized: DataPageV1 109 117 7 144.1 6.9 89.7X -SQL Parquet Vectorized: DataPageV2 126 136 5 124.8 8.0 77.7X -SQL Parquet MR: DataPageV1 2090 2102 16 7.5 132.9 4.7X -SQL Parquet MR: DataPageV2 1898 1907 14 8.3 120.6 5.2X -SQL ORC Vectorized 138 149 14 114.1 8.8 71.0X -SQL ORC MR 1574 1605 43 10.0 100.1 6.2X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 9996 10013 25 1.6 635.5 1.0X +SQL Json 8898 8902 5 1.8 565.7 1.1X +SQL Parquet Vectorized: DataPageV1 121 137 14 129.7 7.7 82.4X +SQL Parquet Vectorized: DataPageV2 139 153 14 113.1 8.8 71.9X +SQL Parquet MR: DataPageV1 2015 2035 28 7.8 128.1 5.0X +SQL Parquet MR: DataPageV2 2000 2012 17 7.9 127.2 5.0X +SQL ORC Vectorized 143 174 27 109.8 9.1 69.8X +SQL ORC MR 1959 1990 44 8.0 124.6 5.1X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 140 161 67 112.2 8.9 1.0X -ParquetReader Vectorized: DataPageV2 163 166 3 96.4 10.4 0.9X -ParquetReader Vectorized -> Row: DataPageV1 139 140 2 113.1 8.8 1.0X -ParquetReader Vectorized -> Row: DataPageV2 166 182 10 94.8 10.6 0.8X +ParquetReader Vectorized: DataPageV1 151 160 8 104.3 9.6 1.0X +ParquetReader Vectorized: DataPageV2 168 180 14 93.5 10.7 0.9X +ParquetReader Vectorized -> Row: DataPageV1 160 166 6 98.3 10.2 0.9X +ParquetReader Vectorized -> Row: DataPageV2 164 175 12 96.1 10.4 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD 
EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11232 11256 33 1.4 714.1 1.0X -SQL Json 9725 9740 22 1.6 618.3 1.2X -SQL Parquet Vectorized: DataPageV1 84 97 15 187.8 5.3 134.1X -SQL Parquet Vectorized: DataPageV2 162 181 13 96.8 10.3 69.1X -SQL Parquet MR: DataPageV1 1882 1900 26 8.4 119.6 6.0X -SQL Parquet MR: DataPageV2 1898 1899 2 8.3 120.7 5.9X -SQL ORC Vectorized 148 157 13 106.1 9.4 75.7X -SQL ORC MR 1667 1674 10 9.4 106.0 6.7X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 11250 11336 121 1.4 715.3 1.0X +SQL Json 9272 9279 10 1.7 589.5 1.2X +SQL Parquet Vectorized: DataPageV1 109 126 14 144.4 6.9 103.3X +SQL Parquet Vectorized: DataPageV2 190 195 5 82.8 12.1 59.2X +SQL Parquet MR: DataPageV1 2338 2342 6 6.7 148.6 4.8X +SQL Parquet MR: DataPageV2 2332 2343 17 6.7 148.2 4.8X +SQL ORC Vectorized 179 193 12 87.9 11.4 62.9X +SQL ORC MR 2094 2095 1 7.5 133.2 5.4X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 130 140 11 121.1 8.3 1.0X -ParquetReader Vectorized: DataPageV2 213 230 10 74.0 13.5 0.6X -ParquetReader Vectorized -> Row: DataPageV1 128 132 6 122.9 8.1 1.0X -ParquetReader Vectorized -> Row: DataPageV2 222 226 5 70.7 14.1 0.6X +ParquetReader Vectorized: DataPageV1 134 138 2 117.7 8.5 1.0X +ParquetReader Vectorized: DataPageV2 210 215 7 74.8 13.4 0.6X +ParquetReader Vectorized -> Row: DataPageV1 128 133 8 123.3 8.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 225 232 6 70.0 14.3 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14617 14690 103 1.1 929.3 1.0X -SQL Json 10772 10780 11 1.5 684.9 1.4X -SQL Parquet Vectorized: DataPageV1 118 132 13 133.4 7.5 124.0X -SQL Parquet Vectorized: DataPageV2 268 300 20 58.7 17.0 54.5X -SQL Parquet MR: DataPageV1 2289 2314 36 6.9 145.5 6.4X -SQL Parquet MR: DataPageV2 1993 1995 3 7.9 126.7 7.3X -SQL ORC Vectorized 215 224 12 73.1 13.7 68.0X -SQL ORC MR 1840 1851 17 8.6 117.0 7.9X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 11683 11683 1 1.3 742.8 1.0X +SQL Json 9457 9460 4 1.7 601.3 1.2X +SQL Parquet Vectorized: DataPageV1 277 312 21 56.9 17.6 42.2X +SQL Parquet Vectorized: DataPageV2 281 291 10 56.0 17.9 41.6X +SQL Parquet MR: DataPageV1 2506 2517 15 6.3 159.4 4.7X +SQL Parquet MR: DataPageV2 2053 2058 7 7.7 130.5 5.7X +SQL ORC Vectorized 166 172 12 95.0 10.5 70.5X +SQL ORC MR 1709 1738 40 9.2 108.7 6.8X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 167 179 12 94.0 10.6 1.0X -ParquetReader Vectorized: DataPageV2 324 331 4 48.5 20.6 0.5X -ParquetReader Vectorized -> Row: DataPageV1 181 185 5 87.1 11.5 0.9X -ParquetReader Vectorized -> Row: DataPageV2 322 331 6 48.8 20.5 0.5X +ParquetReader Vectorized: DataPageV1 311 331 16 50.6 19.8 1.0X +ParquetReader Vectorized: DataPageV2 265 280 21 59.4 16.8 1.2X +ParquetReader Vectorized -> Row: DataPageV1 317 321 3 49.6 20.2 1.0X +ParquetReader Vectorized -> Row: DataPageV2 254 262 13 62.0 16.1 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11070 11076 9 1.4 703.8 1.0X -SQL Json 11574 11602 39 1.4 735.9 1.0X -SQL Parquet Vectorized: DataPageV1 86 97 15 182.7 5.5 128.6X -SQL Parquet Vectorized: DataPageV2 94 103 5 166.9 6.0 117.4X -SQL Parquet MR: DataPageV1 2065 2130 93 7.6 131.3 5.4X -SQL Parquet MR: DataPageV2 2157 2169 17 7.3 137.1 5.1X -SQL ORC Vectorized 266 288 20 59.0 16.9 41.5X -SQL ORC MR 1740 1780 57 9.0 110.6 6.4X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 11446 11452 8 1.4 727.7 1.0X +SQL Json 10952 10955 4 1.4 696.3 1.0X +SQL Parquet Vectorized: DataPageV1 83 97 16 189.5 5.3 137.9X +SQL Parquet Vectorized: DataPageV2 82 94 12 192.7 5.2 140.2X +SQL Parquet MR: DataPageV1 2107 2120 18 7.5 134.0 5.4X +SQL Parquet MR: DataPageV2 1975 2003 40 8.0 125.5 5.8X +SQL ORC Vectorized 235 245 14 66.9 14.9 48.7X +SQL ORC MR 1779 1801 30 8.8 113.1 6.4X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 144 144 1 109.5 9.1 1.0X -ParquetReader Vectorized: DataPageV2 140 142 1 112.1 8.9 1.0X -ParquetReader Vectorized -> Row: DataPageV1 149 156 6 105.6 9.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 148 153 5 106.2 9.4 1.0X +ParquetReader Vectorized: DataPageV1 134 141 8 117.1 8.5 1.0X +ParquetReader Vectorized: DataPageV2 147 151 4 107.3 9.3 0.9X +ParquetReader Vectorized -> Row: DataPageV1 144 151 7 109.2 9.2 0.9X +ParquetReader Vectorized -> Row: DataPageV2 128 139 7 123.3 8.1 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14612 14718 150 1.1 929.0 1.0X -SQL Json 14802 14812 14 1.1 941.1 1.0X -SQL Parquet Vectorized: DataPageV1 126 144 15 124.3 8.0 115.5X -SQL Parquet Vectorized: DataPageV2 161 167 5 97.4 10.3 90.5X -SQL Parquet MR: DataPageV1 2239 2249 14 7.0 142.4 6.5X -SQL Parquet MR: DataPageV2 2125 2169 63 7.4 135.1 6.9X -SQL ORC Vectorized 352 366 11 44.6 22.4 41.5X -SQL ORC MR 1823 1824 1 8.6 
115.9 8.0X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 11723 11745 31 1.3 745.3 1.0X +SQL Json 11373 11395 31 1.4 723.1 1.0X +SQL Parquet Vectorized: DataPageV1 304 316 11 51.7 19.3 38.6X +SQL Parquet Vectorized: DataPageV2 276 301 16 56.9 17.6 42.4X +SQL Parquet MR: DataPageV1 2427 2438 16 6.5 154.3 4.8X +SQL Parquet MR: DataPageV2 2365 2381 22 6.7 150.4 5.0X +SQL ORC Vectorized 577 580 2 27.3 36.7 20.3X +SQL ORC MR 2149 2174 35 7.3 136.6 5.5X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 202 205 2 77.7 12.9 1.0X -ParquetReader Vectorized: DataPageV2 200 205 5 78.5 12.7 1.0X -ParquetReader Vectorized -> Row: DataPageV1 182 187 5 86.2 11.6 1.1X -ParquetReader Vectorized -> Row: DataPageV2 182 186 4 86.3 11.6 1.1X +ParquetReader Vectorized: DataPageV1 325 333 5 48.4 20.6 1.0X +ParquetReader Vectorized: DataPageV2 324 333 8 48.5 20.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 312 326 14 50.4 19.8 1.0X +ParquetReader Vectorized -> Row: DataPageV2 323 329 6 48.6 20.6 1.0X ================================================================================================ SQL Single Numeric Column Scan in Struct ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 1989 2016 38 7.9 126.4 1.0X -SQL ORC Vectorized (Nested Column Disabled) 1965 1966 2 8.0 124.9 1.0X -SQL ORC Vectorized (Nested Column Enabled) 195 207 15 80.6 12.4 10.2X -SQL Parquet MR: DataPageV1 2261 2267 9 7.0 143.7 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2698 2708 14 5.8 171.5 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 100 105 4 157.5 6.3 19.9X -SQL Parquet MR: DataPageV2 2108 2109 1 7.5 134.0 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2617 2636 27 6.0 166.4 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 90 98 9 175.2 5.7 22.2X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2116 2119 4 7.4 134.5 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2127 2157 42 7.4 135.3 1.0X +SQL ORC Vectorized (Nested Column Enabled) 146 153 9 107.5 9.3 14.5X +SQL Parquet MR: DataPageV1 2589 2609 28 6.1 164.6 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2883 2886 4 5.5 183.3 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 104 121 18 151.6 6.6 20.4X +SQL Parquet MR: DataPageV2 2472 2505 46 6.4 157.2 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2834 2851 25 5.6 180.2 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 106 121 13 148.8 6.7 20.0X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2099 2122 32 7.5 133.5 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2154 2157 4 7.3 137.0 1.0X -SQL ORC Vectorized (Nested Column Enabled) 275 287 14 57.3 17.5 7.6X -SQL Parquet MR: DataPageV1 2310 2320 15 6.8 146.9 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2891 2907 23 5.4 183.8 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 99 122 23 158.5 6.3 21.2X -SQL Parquet MR: DataPageV2 2250 2254 7 7.0 143.0 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2848 2874 37 5.5 181.0 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 124 137 10 127.2 7.9 17.0X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2332 2378 65 6.7 148.3 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2331 2360 41 6.7 148.2 1.0X +SQL ORC Vectorized (Nested Column Enabled) 257 270 10 61.2 16.3 9.1X +SQL Parquet MR: DataPageV1 2383 2385 2 6.6 151.5 1.0X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2944 2945 1 5.3 187.2 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 120 134 16 130.9 7.6 19.4X +SQL Parquet MR: DataPageV2 2323 2334 17 6.8 147.7 1.0X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2983 2992 12 5.3 189.7 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 231 263 16 68.0 14.7 10.1X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2147 2182 49 7.3 136.5 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2138 2160 30 7.4 136.0 1.0X -SQL ORC Vectorized (Nested Column Enabled) 307 315 10 51.2 19.5 7.0X -SQL Parquet MR: DataPageV1 2349 2351 3 6.7 149.3 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2783 2823 56 5.7 177.0 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 92 111 18 170.3 5.9 23.3X -SQL Parquet MR: DataPageV2 2394 2416 31 6.6 152.2 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2774 2776 3 5.7 176.4 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 206 227 18 76.3 13.1 10.4X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2196 2201 7 7.2 139.6 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2243 2312 97 7.0 142.6 1.0X +SQL ORC Vectorized (Nested Column Enabled) 278 292 18 56.6 17.7 7.9X +SQL Parquet MR: DataPageV1 2539 2540 1 6.2 161.4 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3499 3514 20 4.5 222.5 0.6X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 112 117 4 139.9 7.2 19.5X +SQL Parquet MR: DataPageV2 2555 2563 12 6.2 162.4 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3424 3441 25 4.6 217.7 0.6X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 242 250 5 64.9 15.4 9.1X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- 
-SQL ORC MR 2253 2258 7 7.0 143.3 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2311 2324 18 6.8 146.9 1.0X -SQL ORC Vectorized (Nested Column Enabled) 356 377 29 44.2 22.6 6.3X -SQL Parquet MR: DataPageV1 2600 2609 13 6.0 165.3 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3090 3097 9 5.1 196.5 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 128 144 16 122.6 8.2 17.6X -SQL Parquet MR: DataPageV2 2303 2325 31 6.8 146.4 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2816 2821 7 5.6 179.0 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 332 339 7 47.3 21.1 6.8X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2219 2229 15 7.1 141.1 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2234 2248 21 7.0 142.0 1.0X +SQL ORC Vectorized (Nested Column Enabled) 290 309 18 54.2 18.5 7.6X +SQL Parquet MR: DataPageV1 2806 2812 8 5.6 178.4 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3281 3296 20 4.8 208.6 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 326 335 10 48.3 20.7 6.8X +SQL Parquet MR: DataPageV2 2430 2454 34 6.5 154.5 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2898 2912 20 5.4 184.3 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 251 288 23 62.6 16.0 8.8X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2274 2315 58 6.9 144.6 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2291 2319 40 6.9 145.6 1.0X -SQL ORC Vectorized (Nested Column Enabled) 364 381 25 43.3 23.1 6.3X -SQL Parquet MR: DataPageV1 2379 2384 7 6.6 151.3 1.0X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2946 2955 13 5.3 187.3 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 98 105 5 160.0 6.2 23.1X -SQL Parquet MR: DataPageV2 2303 2311 11 6.8 146.4 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2866 2878 18 5.5 182.2 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 90 101 8 175.1 5.7 25.3X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2313 2372 83 6.8 147.1 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2405 2419 19 6.5 152.9 1.0X +SQL ORC Vectorized (Nested Column Enabled) 337 355 19 46.6 21.5 6.9X +SQL Parquet MR: DataPageV1 2604 2617 17 6.0 165.6 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3103 3112 12 5.1 197.3 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 95 100 4 165.2 6.1 24.3X +SQL Parquet MR: DataPageV2 2674 2698 34 5.9 170.0 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3215 3237 32 4.9 204.4 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 87 101 9 180.4 5.5 26.5X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2265 2272 10 6.9 144.0 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2301 2323 31 6.8 146.3 1.0X -SQL ORC Vectorized (Nested Column 
Enabled) 442 457 19 35.6 28.1 5.1X -SQL Parquet MR: DataPageV1 2573 2587 20 6.1 163.6 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3230 3231 1 4.9 205.4 0.7X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 130 148 15 120.9 8.3 17.4X -SQL Parquet MR: DataPageV2 2539 2555 23 6.2 161.4 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3169 3176 9 5.0 201.5 0.7X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 128 145 13 122.4 8.2 17.6X +SQL ORC MR 2676 2684 12 5.9 170.1 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2595 2596 2 6.1 165.0 1.0X +SQL ORC Vectorized (Nested Column Enabled) 697 708 16 22.6 44.3 3.8X +SQL Parquet MR: DataPageV1 2836 2854 25 5.5 180.3 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3428 3435 10 4.6 218.0 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 307 319 11 51.2 19.5 8.7X +SQL Parquet MR: DataPageV2 2903 2904 2 5.4 184.6 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3511 3518 9 4.5 223.2 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 317 322 4 49.7 20.1 8.5X ================================================================================================ SQL Nested Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Nested Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 12831 12987 144 0.1 12236.2 1.0X -SQL ORC Vectorized (Nested Column Disabled) 12819 12984 139 0.1 12224.7 1.0X -SQL ORC Vectorized (Nested Column Enabled) 7154 7188 21 0.1 6822.6 1.8X -SQL Parquet MR: DataPageV1 8782 8811 21 0.1 8375.1 1.5X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9453 9503 28 0.1 9015.2 1.4X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5994 6037 29 0.2 5716.6 2.1X -SQL Parquet MR: DataPageV2 9566 9608 23 0.1 9123.0 1.3X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 9873 9912 24 0.1 9415.5 1.3X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5739 5775 25 0.2 5473.3 2.2X +SQL ORC MR 12857 12956 97 0.1 12261.0 1.0X +SQL ORC Vectorized (Nested Column Disabled) 12868 12963 93 0.1 12272.0 1.0X +SQL ORC Vectorized (Nested Column Enabled) 7063 7109 31 0.1 6735.6 1.8X +SQL Parquet MR: DataPageV1 9067 9173 81 0.1 8646.8 1.4X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9287 9373 59 0.1 8856.4 1.4X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5899 5931 25 0.2 5625.7 2.2X +SQL Parquet MR: DataPageV2 9529 9579 54 0.1 9087.2 1.3X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 9864 10035 165 0.1 9406.6 1.3X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5650 5702 49 0.2 5388.4 2.3X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10391 10454 88 1.0 991.0 1.0X -SQL Json 10116 10160 62 1.0 964.7 1.0X -SQL Parquet Vectorized: DataPageV1 1684 1687 4 6.2 160.6 6.2X -SQL Parquet Vectorized: DataPageV2 1919 1938 26 5.5 183.1 5.4X -SQL Parquet MR: DataPageV1 3735 3735 1 2.8 356.2 2.8X -SQL Parquet MR: DataPageV2 3767 3770 5 2.8 359.2 2.8X -SQL ORC Vectorized 1725 1744 27 6.1 164.5 6.0X -SQL ORC MR 3469 3469 1 3.0 330.8 3.0X +SQL CSV 10098 10209 156 1.0 963.0 1.0X +SQL Json 9940 9993 75 1.1 947.9 1.0X +SQL Parquet Vectorized: DataPageV1 1682 1707 36 6.2 160.4 6.0X +SQL Parquet Vectorized: DataPageV2 1912 1930 25 5.5 182.4 5.3X +SQL Parquet MR: DataPageV1 3861 3870 13 2.7 368.2 2.6X +SQL Parquet MR: DataPageV2 3961 3969 10 2.6 377.8 2.5X +SQL ORC Vectorized 1768 1780 18 5.9 168.6 5.7X +SQL ORC MR 3478 3493 21 3.0 331.7 2.9X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5749 5778 41 1.8 548.3 1.0X -SQL Json 6428 6433 7 1.6 613.0 0.9X -SQL Parquet Vectorized: DataPageV1 428 437 9 24.5 40.9 13.4X -SQL Parquet Vectorized: DataPageV2 440 464 17 23.8 42.0 13.1X -SQL Parquet MR: DataPageV1 1640 1669 40 6.4 156.4 3.5X -SQL Parquet MR: DataPageV2 1652 1670 25 6.3 157.6 3.5X -SQL ORC Vectorized 365 369 4 28.7 34.9 15.7X -SQL ORC MR 1628 1628 1 6.4 155.2 3.5X +SQL CSV 5870 5882 17 1.8 559.8 1.0X +SQL Json 6337 6345 10 1.7 604.4 0.9X +SQL Parquet Vectorized: DataPageV1 457 473 22 23.0 43.5 12.9X +SQL Parquet Vectorized: DataPageV2 491 501 8 21.3 46.9 11.9X +SQL Parquet MR: DataPageV1 1631 1648 24 6.4 155.6 3.6X +SQL Parquet MR: DataPageV2 1580 1606 36 6.6 150.7 3.7X +SQL ORC Vectorized 372 378 8 28.2 35.5 15.8X +SQL ORC MR 1732 1735 5 6.1 165.1 3.4X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 14556 14605 69 1.1 925.5 1.0X -Data column - Json 10309 10312 4 1.5 655.4 1.4X -Data column - Parquet Vectorized: DataPageV1 139 147 11 113.4 8.8 105.0X -Data column - Parquet Vectorized: DataPageV2 265 283 15 59.4 16.8 54.9X -Data column - Parquet MR: DataPageV1 2451 2456 8 6.4 155.8 5.9X -Data column - Parquet MR: DataPageV2 2284 2303 27 6.9 145.2 6.4X -Data column - ORC Vectorized 184 193 12 85.5 11.7 79.1X -Data column - ORC MR 1952 1971 27 8.1 124.1 7.5X -Partition column - CSV 4241 4257 23 3.7 269.6 3.4X -Partition column - Json 9027 9034 10 1.7 573.9 1.6X -Partition column - Parquet Vectorized: DataPageV1 21 27 7 
732.6 1.4 678.0X -Partition column - Parquet Vectorized: DataPageV2 21 27 7 746.0 1.3 690.4X -Partition column - Parquet MR: DataPageV1 1219 1220 2 12.9 77.5 11.9X -Partition column - Parquet MR: DataPageV2 1190 1197 10 13.2 75.6 12.2X -Partition column - ORC Vectorized 22 28 6 699.8 1.4 647.6X -Partition column - ORC MR 1259 1266 9 12.5 80.0 11.6X -Both columns - CSV 14815 14838 32 1.1 941.9 1.0X -Both columns - Json 11180 11208 40 1.4 710.8 1.3X -Both columns - Parquet Vectorized: DataPageV1 158 173 15 99.8 10.0 92.3X -Both columns - Parquet Vectorized: DataPageV2 300 310 11 52.5 19.1 48.6X -Both columns - Parquet MR: DataPageV1 2543 2548 6 6.2 161.7 5.7X -Both columns - Parquet MR: DataPageV2 2264 2275 15 6.9 143.9 6.4X -Both columns - ORC Vectorized 225 243 27 70.1 14.3 64.8X -Both columns - ORC MR 2090 2096 7 7.5 132.9 7.0X +Data column - CSV 10956 10967 15 1.4 696.5 1.0X +Data column - Json 9169 9189 29 1.7 583.0 1.2X +Data column - Parquet Vectorized: DataPageV1 108 126 16 145.8 6.9 101.6X +Data column - Parquet Vectorized: DataPageV2 217 233 20 72.5 13.8 50.5X +Data column - Parquet MR: DataPageV1 2229 2346 166 7.1 141.7 4.9X +Data column - Parquet MR: DataPageV2 2224 2240 23 7.1 141.4 4.9X +Data column - ORC Vectorized 178 184 4 88.3 11.3 61.5X +Data column - ORC MR 2040 2069 41 7.7 129.7 5.4X +Partition column - CSV 3493 3514 30 4.5 222.1 3.1X +Partition column - Json 8200 8367 236 1.9 521.3 1.3X +Partition column - Parquet Vectorized: DataPageV1 29 36 7 543.6 1.8 378.6X +Partition column - Parquet Vectorized: DataPageV2 28 35 7 560.2 1.8 390.2X +Partition column - Parquet MR: DataPageV1 1233 1255 31 12.8 78.4 8.9X +Partition column - Parquet MR: DataPageV2 1239 1248 13 12.7 78.8 8.8X +Partition column - ORC Vectorized 29 34 6 547.3 1.8 381.2X +Partition column - ORC MR 1300 1304 5 12.1 82.6 8.4X +Both columns - CSV 10899 10923 34 1.4 693.0 1.0X +Both columns - Json 9755 9777 31 1.6 620.2 1.1X +Both columns - Parquet Vectorized: DataPageV1 187 215 18 83.9 11.9 58.5X +Both columns - Parquet Vectorized: DataPageV2 266 290 24 59.0 16.9 41.1X +Both columns - Parquet MR: DataPageV1 2368 2379 15 6.6 150.6 4.6X +Both columns - Parquet MR: DataPageV2 2315 2323 11 6.8 147.2 4.7X +Both columns - ORC Vectorized 181 210 27 86.8 11.5 60.4X +Both columns - ORC MR 2214 2274 86 7.1 140.7 4.9X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7259 7326 95 1.4 692.3 1.0X -SQL Json 8815 8826 16 1.2 840.7 0.8X -SQL Parquet Vectorized: DataPageV1 995 1005 13 10.5 94.9 7.3X -SQL Parquet Vectorized: DataPageV2 1355 1355 0 7.7 129.2 5.4X -SQL Parquet MR: DataPageV1 3218 3225 10 3.3 306.9 2.3X -SQL Parquet MR: DataPageV2 3445 3492 67 3.0 328.5 2.1X -ParquetReader Vectorized: DataPageV1 692 700 9 15.1 66.0 10.5X -ParquetReader Vectorized: DataPageV2 997 999 2 10.5 95.1 7.3X -SQL ORC Vectorized 782 803 30 13.4 74.6 9.3X -SQL ORC MR 2808 2824 23 3.7 267.8 2.6X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 6979 7004 35 1.5 665.6 
1.0X +SQL Json 8795 8811 23 1.2 838.8 0.8X +SQL Parquet Vectorized: DataPageV1 1153 1174 30 9.1 110.0 6.1X +SQL Parquet Vectorized: DataPageV2 1419 1454 51 7.4 135.3 4.9X +SQL Parquet MR: DataPageV1 3349 3358 14 3.1 319.3 2.1X +SQL Parquet MR: DataPageV2 3710 3720 13 2.8 353.8 1.9X +ParquetReader Vectorized: DataPageV1 788 795 10 13.3 75.2 8.9X +ParquetReader Vectorized: DataPageV2 1033 1057 35 10.2 98.5 6.8X +SQL ORC Vectorized 815 820 4 12.9 77.7 8.6X +SQL ORC MR 2914 2955 58 3.6 277.9 2.4X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5382 5390 10 1.9 513.3 1.0X -SQL Json 7375 7386 15 1.4 703.4 0.7X -SQL Parquet Vectorized: DataPageV1 748 777 35 14.0 71.4 7.2X -SQL Parquet Vectorized: DataPageV2 975 990 14 10.8 93.0 5.5X -SQL Parquet MR: DataPageV1 2691 2695 6 3.9 256.6 2.0X -SQL Parquet MR: DataPageV2 2885 2885 0 3.6 275.1 1.9X -ParquetReader Vectorized: DataPageV1 650 654 4 16.1 62.0 8.3X -ParquetReader Vectorized: DataPageV2 861 864 2 12.2 82.1 6.2X -SQL ORC Vectorized 934 949 24 11.2 89.1 5.8X -SQL ORC MR 2598 2634 51 4.0 247.8 2.1X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 5379 5379 1 1.9 513.0 1.0X +SQL Json 7512 7522 14 1.4 716.4 0.7X +SQL Parquet Vectorized: DataPageV1 766 773 10 13.7 73.1 7.0X +SQL Parquet Vectorized: DataPageV2 953 973 29 11.0 90.9 5.6X +SQL Parquet MR: DataPageV1 2627 2634 11 4.0 250.5 2.0X +SQL Parquet MR: DataPageV2 2857 2863 8 3.7 272.4 1.9X +ParquetReader Vectorized: DataPageV1 686 701 22 15.3 65.4 7.8X +ParquetReader Vectorized: DataPageV2 868 882 16 12.1 82.8 6.2X +SQL ORC Vectorized 952 980 34 11.0 90.8 5.6X +SQL ORC MR 2794 2796 3 3.8 266.4 1.9X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4165 4169 6 2.5 397.2 1.0X -SQL Json 5382 5391 14 1.9 513.2 0.8X -SQL Parquet Vectorized: DataPageV1 167 174 11 62.9 15.9 25.0X -SQL Parquet Vectorized: DataPageV2 196 202 5 53.5 18.7 21.3X -SQL Parquet MR: DataPageV1 1736 1737 1 6.0 165.6 2.4X -SQL Parquet MR: DataPageV2 1628 1642 20 6.4 155.2 2.6X -ParquetReader Vectorized: DataPageV1 141 142 1 74.4 13.4 29.5X -ParquetReader Vectorized: DataPageV2 168 170 1 62.5 16.0 24.8X -SQL ORC Vectorized 307 320 10 34.1 29.3 13.6X -SQL ORC MR 1506 1510 5 7.0 143.6 2.8X +SQL CSV 4196 4197 2 2.5 400.2 1.0X +SQL Json 5466 5479 19 1.9 521.3 0.8X +SQL Parquet Vectorized: DataPageV1 156 159 4 67.2 14.9 26.9X +SQL Parquet Vectorized: DataPageV2 184 190 6 57.0 17.5 22.8X +SQL Parquet MR: DataPageV1 1656 1659 4 6.3 157.9 2.5X +SQL Parquet MR: DataPageV2 1604 1604 0 6.5 153.0 2.6X +ParquetReader Vectorized: DataPageV1 163 164 1 64.5 15.5 25.8X +ParquetReader Vectorized: DataPageV2 190 193 2 55.3 18.1 22.1X +SQL ORC Vectorized 315 322 6 33.3 30.0 13.3X +SQL ORC MR 1610 1615 6 6.5 153.6 2.6X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 1559 1559 0 0.7 1487.0 1.0X -SQL Json 1935 1953 25 0.5 1845.4 0.8X -SQL Parquet Vectorized: DataPageV1 25 29 5 42.5 23.5 63.2X -SQL Parquet Vectorized: DataPageV2 35 40 5 29.8 33.6 44.3X -SQL Parquet MR: DataPageV1 171 182 12 6.1 162.8 9.1X -SQL Parquet MR: DataPageV2 150 159 9 7.0 142.9 10.4X -SQL ORC Vectorized 32 39 8 33.0 30.3 49.0X -SQL ORC MR 131 138 7 8.0 125.2 11.9X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 1157 1159 3 0.9 1103.2 1.0X +SQL Json 1698 1702 5 0.6 1619.7 0.7X +SQL Parquet Vectorized: DataPageV1 24 29 6 43.3 23.1 47.8X +SQL Parquet Vectorized: DataPageV2 32 38 6 32.5 30.8 35.8X +SQL Parquet MR: DataPageV1 163 170 8 6.4 155.3 7.1X +SQL Parquet MR: DataPageV2 159 167 6 6.6 151.8 7.3X +SQL ORC Vectorized 28 34 7 37.5 26.7 41.4X +SQL ORC MR 130 136 6 8.1 123.8 8.9X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 3490 3503 19 0.3 3328.3 1.0X -SQL Json 6330 6341 15 0.2 6037.0 0.6X -SQL Parquet Vectorized: DataPageV1 33 41 7 32.0 31.3 106.3X -SQL Parquet Vectorized: DataPageV2 42 47 8 24.8 40.3 82.7X -SQL Parquet MR: DataPageV1 190 199 8 5.5 181.5 18.3X -SQL Parquet MR: DataPageV2 161 168 6 6.5 153.9 21.6X -SQL ORC Vectorized 40 45 5 26.0 38.5 86.5X -SQL ORC MR 147 155 10 7.1 140.5 23.7X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +SQL CSV 2485 2523 55 0.4 2369.7 1.0X +SQL Json 5915 5940 35 0.2 5641.2 0.4X +SQL Parquet Vectorized: DataPageV1 29 36 7 36.2 27.6 85.8X +SQL Parquet Vectorized: DataPageV2 35 39 6 30.2 33.1 71.6X +SQL Parquet MR: DataPageV1 168 173 4 6.2 160.0 14.8X +SQL Parquet MR: DataPageV2 164 175 8 6.4 156.2 15.2X +SQL ORC Vectorized 32 35 5 33.1 30.2 78.5X +SQL ORC MR 142 148 5 7.4 135.1 17.5X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5835 5860 36 0.2 5564.9 1.0X -SQL Json 11235 11374 197 0.1 10714.3 0.5X -SQL Parquet Vectorized: DataPageV1 48 60 11 22.0 45.5 122.2X -SQL Parquet Vectorized: DataPageV2 52 60 6 20.2 49.6 112.2X -SQL Parquet MR: DataPageV1 204 212 7 5.1 194.6 28.6X -SQL Parquet MR: DataPageV2 178 183 4 5.9 169.6 32.8X -SQL ORC Vectorized 52 61 9 20.3 49.4 112.7X -SQL ORC MR 162 175 16 6.5 154.4 36.0X +SQL CSV 4100 4175 105 0.3 3910.5 1.0X +SQL Json 9817 9951 190 0.1 9362.4 0.4X +SQL Parquet Vectorized: DataPageV1 34 45 10 31.0 32.2 121.4X +SQL Parquet Vectorized: DataPageV2 41 47 7 25.5 39.2 99.7X +SQL Parquet MR: DataPageV1 179 187 8 5.9 170.8 22.9X +SQL Parquet MR: DataPageV2 169 183 14 6.2 161.0 24.3X +SQL ORC Vectorized 38 45 9 27.4 36.5 107.1X +SQL ORC MR 143 146 3 7.3 136.1 28.7X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt 
b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index 36a1bc00be182..76bbbfa26ae96 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -1,431 +1,431 @@ -================================================================================================ +DataSourceReadBenchmark-jdk21-results.txt================================================================================================ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9411 9431 29 1.7 598.3 1.0X -SQL Json 8995 9008 19 1.7 571.9 1.0X -SQL Parquet Vectorized: DataPageV1 68 82 11 232.7 4.3 139.2X -SQL Parquet Vectorized: DataPageV2 49 57 6 318.5 3.1 190.5X -SQL Parquet MR: DataPageV1 1798 1846 69 8.7 114.3 5.2X -SQL Parquet MR: DataPageV2 1712 1712 1 9.2 108.8 5.5X -SQL ORC Vectorized 115 125 9 137.3 7.3 82.1X -SQL ORC MR 1533 1540 9 10.3 97.5 6.1X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 10363 10364 2 1.5 658.9 1.0X +SQL Json 8667 8699 46 1.8 551.0 1.2X +SQL Parquet Vectorized: DataPageV1 103 114 8 153.3 6.5 101.0X +SQL Parquet Vectorized: DataPageV2 101 111 6 155.4 6.4 102.4X +SQL Parquet MR: DataPageV1 1809 1813 6 8.7 115.0 5.7X +SQL Parquet MR: DataPageV2 1715 1720 8 9.2 109.0 6.0X +SQL ORC Vectorized 139 146 5 113.1 8.8 74.5X +SQL ORC MR 1508 1511 5 10.4 95.8 6.9X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 37 38 2 430.0 2.3 1.0X -ParquetReader Vectorized: DataPageV2 27 28 1 586.2 1.7 1.4X -ParquetReader Vectorized -> Row: DataPageV1 29 30 1 536.5 1.9 1.2X -ParquetReader Vectorized -> Row: DataPageV2 20 21 3 790.6 1.3 1.8X +ParquetReader Vectorized: DataPageV1 88 90 2 178.9 5.6 1.0X +ParquetReader Vectorized: DataPageV2 95 96 1 166.2 6.0 0.9X +ParquetReader Vectorized -> Row: DataPageV1 73 74 1 215.3 4.6 1.2X +ParquetReader Vectorized -> Row: DataPageV2 81 83 1 193.1 5.2 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9874 9959 120 1.6 627.8 1.0X -SQL Json 9359 9367 11 1.7 595.0 1.1X -SQL Parquet Vectorized: DataPageV1 75 83 7 211.0 4.7 132.5X -SQL Parquet Vectorized: DataPageV2 74 82 7 213.5 4.7 134.0X -SQL Parquet MR: DataPageV1 1856 1868 17 8.5 118.0 5.3X -SQL Parquet MR: DataPageV2 1759 1776 25 8.9 111.8 5.6X -SQL ORC Vectorized 120 124 4 131.3 7.6 82.4X -SQL ORC MR 1545 1549 6 10.2 98.2 6.4X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure 
+SQL CSV 11538 11589 73 1.4 733.5 1.0X +SQL Json 9586 9596 14 1.6 609.5 1.2X +SQL Parquet Vectorized: DataPageV1 109 116 6 144.8 6.9 106.2X +SQL Parquet Vectorized: DataPageV2 110 118 8 142.6 7.0 104.6X +SQL Parquet MR: DataPageV1 1901 1953 74 8.3 120.9 6.1X +SQL Parquet MR: DataPageV2 1817 1832 22 8.7 115.5 6.4X +SQL ORC Vectorized 118 126 7 133.6 7.5 98.0X +SQL ORC MR 1505 1535 43 10.5 95.7 7.7X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 69 70 2 228.9 4.4 1.0X -ParquetReader Vectorized: DataPageV2 69 70 2 228.4 4.4 1.0X -ParquetReader Vectorized -> Row: DataPageV1 48 50 3 328.0 3.0 1.4X -ParquetReader Vectorized -> Row: DataPageV2 48 49 2 327.4 3.1 1.4X +ParquetReader Vectorized: DataPageV1 93 94 1 169.9 5.9 1.0X +ParquetReader Vectorized: DataPageV2 93 94 1 169.1 5.9 1.0X +ParquetReader Vectorized -> Row: DataPageV1 61 62 1 258.0 3.9 1.5X +ParquetReader Vectorized -> Row: DataPageV2 61 62 1 258.4 3.9 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10495 10514 26 1.5 667.3 1.0X -SQL Json 9793 9794 2 1.6 622.6 1.1X -SQL Parquet Vectorized: DataPageV1 87 95 9 181.8 5.5 121.3X -SQL Parquet Vectorized: DataPageV2 115 120 4 136.8 7.3 91.3X -SQL Parquet MR: DataPageV1 1892 1903 16 8.3 120.3 5.5X -SQL Parquet MR: DataPageV2 1852 1859 10 8.5 117.7 5.7X -SQL ORC Vectorized 141 144 3 111.6 9.0 74.5X -SQL ORC MR 1612 1614 2 9.8 102.5 6.5X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 12200 12203 5 1.3 775.7 1.0X +SQL Json 9813 9854 57 1.6 623.9 1.2X +SQL Parquet Vectorized: DataPageV1 101 107 6 156.1 6.4 121.0X +SQL Parquet Vectorized: DataPageV2 129 135 6 122.3 8.2 94.9X +SQL Parquet MR: DataPageV1 1968 1989 29 8.0 125.1 6.2X +SQL Parquet MR: DataPageV2 1913 1916 3 8.2 121.6 6.4X +SQL ORC Vectorized 130 135 6 120.8 8.3 93.7X +SQL ORC MR 1593 1600 10 9.9 101.3 7.7X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 139 140 1 113.3 8.8 1.0X -ParquetReader Vectorized: DataPageV2 168 170 1 93.6 10.7 0.8X -ParquetReader Vectorized -> Row: DataPageV1 137 138 2 115.1 8.7 1.0X -ParquetReader Vectorized -> Row: DataPageV2 164 166 2 95.7 10.4 0.8X +ParquetReader Vectorized: DataPageV1 138 140 2 113.9 8.8 1.0X +ParquetReader Vectorized: DataPageV2 166 168 3 94.8 10.6 0.8X +ParquetReader Vectorized -> Row: DataPageV1 136 138 6 115.6 8.6 1.0X +ParquetReader Vectorized -> Row: DataPageV2 164 166 2 96.1 10.4 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column 
Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11467 11490 31 1.4 729.1 1.0X -SQL Json 10337 10343 9 1.5 657.2 1.1X -SQL Parquet Vectorized: DataPageV1 86 91 4 181.9 5.5 132.6X -SQL Parquet Vectorized: DataPageV2 163 168 6 96.5 10.4 70.4X -SQL Parquet MR: DataPageV1 1979 1992 19 7.9 125.8 5.8X -SQL Parquet MR: DataPageV2 1929 1941 17 8.2 122.6 5.9X -SQL ORC Vectorized 163 168 4 96.8 10.3 70.5X -SQL ORC MR 1692 1697 7 9.3 107.6 6.8X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 13361 13368 9 1.2 849.5 1.0X +SQL Json 10099 10118 27 1.6 642.1 1.3X +SQL Parquet Vectorized: DataPageV1 108 131 29 145.0 6.9 123.2X +SQL Parquet Vectorized: DataPageV2 177 185 7 88.9 11.3 75.5X +SQL Parquet MR: DataPageV1 2031 2083 74 7.7 129.1 6.6X +SQL Parquet MR: DataPageV2 2022 2026 5 7.8 128.6 6.6X +SQL ORC Vectorized 146 151 4 107.7 9.3 91.5X +SQL ORC MR 1642 1642 0 9.6 104.4 8.1X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 141 145 7 111.9 8.9 1.0X -ParquetReader Vectorized: DataPageV2 215 217 3 73.2 13.7 0.7X -ParquetReader Vectorized -> Row: DataPageV1 138 140 2 113.6 8.8 1.0X -ParquetReader Vectorized -> Row: DataPageV2 212 213 2 74.3 13.5 0.7X +ParquetReader Vectorized: DataPageV1 141 143 2 111.9 8.9 1.0X +ParquetReader Vectorized: DataPageV2 209 210 1 75.3 13.3 0.7X +ParquetReader Vectorized -> Row: DataPageV1 138 140 2 113.9 8.8 1.0X +ParquetReader Vectorized -> Row: DataPageV2 207 210 7 76.1 13.1 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15201 15213 17 1.0 966.4 1.0X -SQL Json 11480 11486 7 1.4 729.9 1.3X -SQL Parquet Vectorized: DataPageV1 123 128 5 127.6 7.8 123.3X -SQL Parquet Vectorized: DataPageV2 215 220 7 73.1 13.7 70.7X -SQL Parquet MR: DataPageV1 2300 2335 49 6.8 146.2 6.6X -SQL Parquet MR: DataPageV2 2082 2100 25 7.6 132.4 7.3X -SQL ORC Vectorized 218 221 3 72.0 13.9 69.6X -SQL ORC MR 1722 1727 7 9.1 109.5 8.8X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 13316 13326 13 1.2 846.6 1.0X +SQL Json 9808 9885 109 1.6 623.6 1.4X +SQL Parquet Vectorized: DataPageV1 290 293 3 54.3 18.4 46.0X +SQL Parquet Vectorized: DataPageV2 235 238 3 66.9 14.9 56.6X +SQL Parquet MR: DataPageV1 2404 2409 7 6.5 152.9 5.5X +SQL Parquet MR: DataPageV2 2007 2030 33 7.8 127.6 6.6X +SQL ORC Vectorized 150 153 3 104.8 9.5 88.7X +SQL ORC MR 1625 1634 13 9.7 103.3 8.2X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 177 181 
4 88.7 11.3 1.0X -ParquetReader Vectorized: DataPageV2 266 269 2 59.1 16.9 0.7X -ParquetReader Vectorized -> Row: DataPageV1 175 177 1 89.7 11.2 1.0X -ParquetReader Vectorized -> Row: DataPageV2 266 267 2 59.1 16.9 0.7X +ParquetReader Vectorized: DataPageV1 334 335 2 47.1 21.2 1.0X +ParquetReader Vectorized: DataPageV2 277 279 2 56.9 17.6 1.2X +ParquetReader Vectorized -> Row: DataPageV1 351 355 3 44.8 22.3 1.0X +ParquetReader Vectorized -> Row: DataPageV2 297 303 7 52.9 18.9 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12187 12207 29 1.3 774.8 1.0X -SQL Json 12291 12296 7 1.3 781.4 1.0X -SQL Parquet Vectorized: DataPageV1 83 87 7 190.1 5.3 147.3X -SQL Parquet Vectorized: DataPageV2 82 86 3 191.1 5.2 148.1X -SQL Parquet MR: DataPageV1 2020 2023 4 7.8 128.4 6.0X -SQL Parquet MR: DataPageV2 1938 1944 9 8.1 123.2 6.3X -SQL ORC Vectorized 270 272 2 58.3 17.1 45.2X -SQL ORC MR 1810 1837 38 8.7 115.1 6.7X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 13826 13835 13 1.1 879.0 1.0X +SQL Json 11577 11606 40 1.4 736.1 1.2X +SQL Parquet Vectorized: DataPageV1 87 103 11 181.0 5.5 159.1X +SQL Parquet Vectorized: DataPageV2 88 101 7 178.8 5.6 157.2X +SQL Parquet MR: DataPageV1 2072 2075 4 7.6 131.7 6.7X +SQL Parquet MR: DataPageV2 2075 2087 17 7.6 131.9 6.7X +SQL ORC Vectorized 261 273 10 60.2 16.6 52.9X +SQL ORC MR 1720 1726 8 9.1 109.4 8.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 137 138 2 115.2 8.7 1.0X -ParquetReader Vectorized: DataPageV2 134 135 1 117.3 8.5 1.0X -ParquetReader Vectorized -> Row: DataPageV1 133 135 3 118.1 8.5 1.0X -ParquetReader Vectorized -> Row: DataPageV2 133 134 1 118.1 8.5 1.0X +ParquetReader Vectorized: DataPageV1 135 138 5 116.9 8.6 1.0X +ParquetReader Vectorized: DataPageV2 134 135 2 117.7 8.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 149 155 5 105.3 9.5 0.9X +ParquetReader Vectorized -> Row: DataPageV2 133 140 11 118.4 8.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15788 15813 35 1.0 1003.8 1.0X -SQL Json 15330 15351 29 1.0 974.7 1.0X -SQL Parquet Vectorized: DataPageV1 122 128 7 128.7 7.8 129.2X -SQL Parquet Vectorized: DataPageV2 120 124 4 130.6 7.7 131.1X -SQL Parquet MR: DataPageV1 2321 2322 2 6.8 147.5 6.8X -SQL Parquet MR: DataPageV2 2193 2236 60 7.2 139.5 7.2X -SQL ORC Vectorized 356 357 2 44.2 22.6 44.3X -SQL ORC MR 1816 1825 11 8.7 115.5 8.7X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 14086 14095 13 1.1 895.6 1.0X +SQL Json 11716 11726 14 1.3 744.9 1.2X +SQL Parquet 
Vectorized: DataPageV1 280 291 8 56.2 17.8 50.3X +SQL Parquet Vectorized: DataPageV2 282 287 4 55.8 17.9 50.0X +SQL Parquet MR: DataPageV1 2479 2498 27 6.3 157.6 5.7X +SQL Parquet MR: DataPageV2 2492 2509 23 6.3 158.4 5.7X +SQL ORC Vectorized 622 628 7 25.3 39.5 22.6X +SQL ORC MR 2084 2093 14 7.5 132.5 6.8X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized: DataPageV1 175 176 2 90.1 11.1 1.0X -ParquetReader Vectorized: DataPageV2 174 176 5 90.5 11.0 1.0X -ParquetReader Vectorized -> Row: DataPageV1 172 175 2 91.3 11.0 1.0X -ParquetReader Vectorized -> Row: DataPageV2 172 175 3 91.4 10.9 1.0X +ParquetReader Vectorized: DataPageV1 346 348 2 45.4 22.0 1.0X +ParquetReader Vectorized: DataPageV2 347 349 4 45.4 22.0 1.0X +ParquetReader Vectorized -> Row: DataPageV1 355 358 4 44.3 22.6 1.0X +ParquetReader Vectorized -> Row: DataPageV2 354 357 5 44.4 22.5 1.0X ================================================================================================ SQL Single Numeric Column Scan in Struct ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2124 2313 267 7.4 135.1 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2112 2131 27 7.4 134.3 1.0X -SQL ORC Vectorized (Nested Column Enabled) 183 189 3 86.0 11.6 11.6X -SQL Parquet MR: DataPageV1 2359 2374 22 6.7 150.0 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2774 2777 5 5.7 176.3 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 93 100 5 169.8 5.9 22.9X -SQL Parquet MR: DataPageV2 2225 2231 9 7.1 141.5 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2579 2588 13 6.1 164.0 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 92 97 3 170.5 5.9 23.0X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2210 2239 41 7.1 140.5 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2196 2226 43 7.2 139.6 1.0X +SQL ORC Vectorized (Nested Column Enabled) 106 138 35 148.1 6.8 20.8X +SQL Parquet MR: DataPageV1 2436 2446 14 6.5 154.9 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2790 2819 40 5.6 177.4 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 107 113 7 146.4 6.8 20.6X +SQL Parquet MR: DataPageV2 2308 2310 4 6.8 146.7 1.0X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2855 2862 9 5.5 181.5 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 125 137 11 125.9 7.9 17.7X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2155 2186 45 
7.3 137.0 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2221 2222 2 7.1 141.2 1.0X -SQL ORC Vectorized (Nested Column Enabled) 272 279 6 57.8 17.3 7.9X -SQL Parquet MR: DataPageV1 2470 2485 21 6.4 157.0 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2805 2816 17 5.6 178.3 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 99 105 5 159.1 6.3 21.8X -SQL Parquet MR: DataPageV2 2333 2338 6 6.7 148.4 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2649 2655 8 5.9 168.4 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 138 143 5 114.2 8.8 15.6X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2174 2175 2 7.2 138.2 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2170 2183 19 7.2 137.9 1.0X +SQL ORC Vectorized (Nested Column Enabled) 272 279 7 57.7 17.3 8.0X +SQL Parquet MR: DataPageV1 2539 2547 11 6.2 161.4 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2723 2741 25 5.8 173.1 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 131 140 8 119.7 8.4 16.5X +SQL Parquet MR: DataPageV2 2430 2430 0 6.5 154.5 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2748 2749 2 5.7 174.7 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 244 254 8 64.4 15.5 8.9X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2159 2227 95 7.3 137.3 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2200 2206 9 7.2 139.9 1.0X -SQL ORC Vectorized (Nested Column Enabled) 295 306 17 53.3 18.8 7.3X -SQL Parquet MR: DataPageV1 2378 2387 13 6.6 151.2 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2754 2755 2 5.7 175.1 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 102 109 6 154.9 6.5 21.3X -SQL Parquet MR: DataPageV2 2345 2352 10 6.7 149.1 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2696 2706 14 5.8 171.4 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 262 272 10 60.0 16.7 8.2X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2156 2188 46 7.3 137.1 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2176 2228 73 7.2 138.4 1.0X +SQL ORC Vectorized (Nested Column Enabled) 272 295 19 57.8 17.3 7.9X +SQL Parquet MR: DataPageV1 2542 2544 3 6.2 161.6 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2963 2973 14 5.3 188.4 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 135 144 9 116.8 8.6 16.0X +SQL Parquet MR: DataPageV2 2393 2412 28 6.6 152.1 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2939 2942 4 5.4 186.9 0.7X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 267 275 7 58.9 17.0 8.1X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2348 2371 32 6.7 149.3 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2447 2500 74 6.4 155.6 1.0X -SQL ORC Vectorized (Nested Column Enabled) 418 428 7 37.7 26.5 5.6X 
-SQL Parquet MR: DataPageV1 2602 2634 45 6.0 165.4 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3019 3030 17 5.2 191.9 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 135 150 14 116.9 8.6 17.4X -SQL Parquet MR: DataPageV2 2419 2420 2 6.5 153.8 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2814 2822 12 5.6 178.9 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 244 257 16 64.5 15.5 9.6X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2236 2261 35 7.0 142.2 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2212 2256 63 7.1 140.6 1.0X +SQL ORC Vectorized (Nested Column Enabled) 279 294 17 56.3 17.8 8.0X +SQL Parquet MR: DataPageV1 2785 2796 15 5.6 177.1 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3213 3327 162 4.9 204.3 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 308 321 10 51.1 19.6 7.3X +SQL Parquet MR: DataPageV2 2454 2496 59 6.4 156.0 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2719 2744 36 5.8 172.9 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 278 285 3 56.6 17.7 8.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2297 2315 25 6.8 146.0 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2319 2356 51 6.8 147.5 1.0X -SQL ORC Vectorized (Nested Column Enabled) 393 406 12 40.0 25.0 5.8X -SQL Parquet MR: DataPageV1 2318 2344 37 6.8 147.4 1.0X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 2881 2927 66 5.5 183.1 0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 97 117 12 163.0 6.1 23.8X -SQL Parquet MR: DataPageV2 2290 2307 25 6.9 145.6 1.0X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 2808 2812 6 5.6 178.5 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 96 115 14 164.6 6.1 24.0X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL ORC MR 2286 2327 57 6.9 145.4 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2290 2299 13 6.9 145.6 1.0X +SQL ORC Vectorized (Nested Column Enabled) 356 385 18 44.2 22.6 6.4X +SQL Parquet MR: DataPageV1 2374 2410 51 6.6 150.9 1.0X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3159 3169 14 5.0 200.8 0.7X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 103 122 14 153.3 6.5 22.3X +SQL Parquet MR: DataPageV2 2446 2456 14 6.4 155.5 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3008 3010 3 5.2 191.3 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 93 107 10 169.1 5.9 24.6X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan in Struct: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 2441 2492 73 6.4 155.2 1.0X -SQL ORC Vectorized (Nested Column Disabled) 2452 2464 16 6.4 155.9 1.0X -SQL ORC Vectorized (Nested Column Enabled) 482 502 13 32.6 30.6 5.1X -SQL Parquet MR: DataPageV1 2649 2650 2 5.9 168.4 0.9X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3132 3143 15 5.0 199.2 
0.8X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 149 164 11 105.7 9.5 16.4X -SQL Parquet MR: DataPageV2 2656 2661 8 5.9 168.8 0.9X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3102 3157 79 5.1 197.2 0.8X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 129 145 13 121.9 8.2 18.9X +SQL ORC MR 2626 2658 45 6.0 167.0 1.0X +SQL ORC Vectorized (Nested Column Disabled) 2738 2746 11 5.7 174.1 1.0X +SQL ORC Vectorized (Nested Column Enabled) 778 779 1 20.2 49.5 3.4X +SQL Parquet MR: DataPageV1 2911 2911 1 5.4 185.0 0.9X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 3340 3354 19 4.7 212.4 0.8X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 298 310 9 52.7 19.0 8.8X +SQL Parquet MR: DataPageV2 2959 2966 11 5.3 188.1 0.9X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 3281 3289 10 4.8 208.6 0.8X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 297 305 8 52.9 18.9 8.8X ================================================================================================ SQL Nested Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor SQL Nested Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------- -SQL ORC MR 13140 13312 105 0.1 12530.8 1.0X -SQL ORC Vectorized (Nested Column Disabled) 13227 13351 118 0.1 12614.2 1.0X -SQL ORC Vectorized (Nested Column Enabled) 7145 7209 34 0.1 6813.8 1.8X -SQL Parquet MR: DataPageV1 8789 8807 17 0.1 8382.1 1.5X -SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9112 9130 12 0.1 8690.1 1.4X -SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5897 6081 111 0.2 5623.7 2.2X -SQL Parquet MR: DataPageV2 9841 9939 78 0.1 9385.0 1.3X -SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 9850 9884 16 0.1 9393.3 1.3X -SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5678 5705 28 0.2 5414.9 2.3X +SQL ORC MR 13102 13223 110 0.1 12495.0 1.0X +SQL ORC Vectorized (Nested Column Disabled) 12894 13024 101 0.1 12296.2 1.0X +SQL ORC Vectorized (Nested Column Enabled) 7180 7220 36 0.1 6847.0 1.8X +SQL Parquet MR: DataPageV1 8625 8658 23 0.1 8225.2 1.5X +SQL Parquet Vectorized: DataPageV1 (Nested Column Disabled) 9197 9324 94 0.1 8771.2 1.4X +SQL Parquet Vectorized: DataPageV1 (Nested Column Enabled) 5862 6041 81 0.2 5590.5 2.2X +SQL Parquet MR: DataPageV2 9564 9731 184 0.1 9120.6 1.4X +SQL Parquet Vectorized: DataPageV2 (Nested Column Disabled) 9814 9865 50 0.1 9359.5 1.3X +SQL Parquet Vectorized: DataPageV2 (Nested Column Enabled) 5651 5735 38 0.2 5389.3 2.3X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10869 10983 161 1.0 1036.6 1.0X -SQL Json 
10536 10542 8 1.0 1004.8 1.0X -SQL Parquet Vectorized: DataPageV1 1780 1786 9 5.9 169.7 6.1X -SQL Parquet Vectorized: DataPageV2 1891 1893 4 5.5 180.3 5.7X -SQL Parquet MR: DataPageV1 4057 4065 11 2.6 386.9 2.7X -SQL Parquet MR: DataPageV2 3947 3957 14 2.7 376.4 2.8X -SQL ORC Vectorized 1797 1806 12 5.8 171.4 6.0X -SQL ORC MR 3511 3513 3 3.0 334.8 3.1X +SQL CSV 12381 12387 8 0.8 1180.8 1.0X +SQL Json 10369 10422 75 1.0 988.8 1.2X +SQL Parquet Vectorized: DataPageV1 1801 1809 12 5.8 171.8 6.9X +SQL Parquet Vectorized: DataPageV2 2010 2024 21 5.2 191.7 6.2X +SQL Parquet MR: DataPageV1 3932 3944 16 2.7 375.0 3.1X +SQL Parquet MR: DataPageV2 4029 4043 20 2.6 384.2 3.1X +SQL ORC Vectorized 1838 1839 2 5.7 175.3 6.7X +SQL ORC MR 3529 3549 28 3.0 336.5 3.5X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6037 6105 96 1.7 575.7 1.0X -SQL Json 6878 6911 47 1.5 655.9 0.9X -SQL Parquet Vectorized: DataPageV1 388 392 3 27.0 37.0 15.6X -SQL Parquet Vectorized: DataPageV2 390 391 1 26.9 37.2 15.5X -SQL Parquet MR: DataPageV1 1712 1731 26 6.1 163.3 3.5X -SQL Parquet MR: DataPageV2 1620 1638 26 6.5 154.5 3.7X -SQL ORC Vectorized 392 396 4 26.8 37.4 15.4X -SQL ORC MR 1791 1794 4 5.9 170.8 3.4X +SQL CSV 7396 7452 80 1.4 705.4 1.0X +SQL Json 6836 6847 14 1.5 652.0 1.1X +SQL Parquet Vectorized: DataPageV1 468 474 5 22.4 44.6 15.8X +SQL Parquet Vectorized: DataPageV2 458 475 12 22.9 43.7 16.1X +SQL Parquet MR: DataPageV1 1621 1625 4 6.5 154.6 4.6X +SQL Parquet MR: DataPageV2 1645 1654 13 6.4 156.8 4.5X +SQL ORC Vectorized 390 395 3 26.9 37.2 19.0X +SQL ORC MR 1787 1791 5 5.9 170.4 4.1X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 15303 15419 164 1.0 972.9 1.0X -Data column - Json 11147 11172 35 1.4 708.7 1.4X -Data column - Parquet Vectorized: DataPageV1 122 140 14 129.3 7.7 125.8X -Data column - Parquet Vectorized: DataPageV2 231 242 7 68.0 14.7 66.1X -Data column - Parquet MR: DataPageV1 2670 2681 16 5.9 169.7 5.7X -Data column - Parquet MR: DataPageV2 2364 2373 12 6.7 150.3 6.5X -Data column - ORC Vectorized 227 257 24 69.4 14.4 67.5X -Data column - ORC MR 2301 2323 31 6.8 146.3 6.7X -Partition column - CSV 4486 4490 6 3.5 285.2 3.4X -Partition column - Json 9443 9449 8 1.7 600.4 1.6X -Partition column - Parquet Vectorized: DataPageV1 24 26 4 669.2 1.5 651.1X -Partition column - Parquet Vectorized: DataPageV2 23 27 4 681.3 1.5 662.9X -Partition column - Parquet MR: DataPageV1 1328 1341 18 11.8 84.4 11.5X 
-Partition column - Parquet MR: DataPageV2 1333 1335 3 11.8 84.8 11.5X -Partition column - ORC Vectorized 24 28 4 642.2 1.6 624.8X -Partition column - ORC MR 1431 1435 5 11.0 91.0 10.7X -Both columns - CSV 15209 15218 11 1.0 967.0 1.0X -Both columns - Json 11530 11541 16 1.4 733.0 1.3X -Both columns - Parquet Vectorized: DataPageV1 191 201 10 82.5 12.1 80.3X -Both columns - Parquet Vectorized: DataPageV2 294 311 10 53.5 18.7 52.0X -Both columns - Parquet MR: DataPageV1 2794 2797 4 5.6 177.7 5.5X -Both columns - Parquet MR: DataPageV2 2405 2435 44 6.5 152.9 6.4X -Both columns - ORC Vectorized 227 237 10 69.3 14.4 67.4X -Both columns - ORC MR 2330 2338 12 6.7 148.2 6.6X +Data column - CSV 13711 13750 55 1.1 871.7 1.0X +Data column - Json 9919 9951 44 1.6 630.7 1.4X +Data column - Parquet Vectorized: DataPageV1 111 130 16 142.2 7.0 124.0X +Data column - Parquet Vectorized: DataPageV2 259 274 9 60.7 16.5 52.9X +Data column - Parquet MR: DataPageV1 2372 2381 13 6.6 150.8 5.8X +Data column - Parquet MR: DataPageV2 2337 2339 4 6.7 148.6 5.9X +Data column - ORC Vectorized 139 162 16 113.0 8.9 98.5X +Data column - ORC MR 2068 2078 15 7.6 131.4 6.6X +Partition column - CSV 3797 3846 69 4.1 241.4 3.6X +Partition column - Json 8388 8396 10 1.9 533.3 1.6X +Partition column - Parquet Vectorized: DataPageV1 32 35 4 498.4 2.0 434.5X +Partition column - Parquet Vectorized: DataPageV2 31 35 4 500.3 2.0 436.1X +Partition column - Parquet MR: DataPageV1 1241 1242 1 12.7 78.9 11.1X +Partition column - Parquet MR: DataPageV2 1222 1224 3 12.9 77.7 11.2X +Partition column - ORC Vectorized 30 33 3 531.0 1.9 462.9X +Partition column - ORC MR 1232 1241 13 12.8 78.3 11.1X +Both columns - CSV 13510 13516 9 1.2 858.9 1.0X +Both columns - Json 10324 10374 71 1.5 656.4 1.3X +Both columns - Parquet Vectorized: DataPageV1 121 144 18 130.3 7.7 113.6X +Both columns - Parquet Vectorized: DataPageV2 259 274 16 60.8 16.4 53.0X +Both columns - Parquet MR: DataPageV1 2338 2356 25 6.7 148.7 5.9X +Both columns - Parquet MR: DataPageV2 2320 2322 2 6.8 147.5 5.9X +Both columns - ORC Vectorized 177 193 17 89.1 11.2 77.7X +Both columns - ORC MR 2109 2135 36 7.5 134.1 6.5X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7592 7616 34 1.4 724.1 1.0X -SQL Json 8827 8828 2 1.2 841.8 0.9X -SQL Parquet Vectorized: DataPageV1 1194 1203 13 8.8 113.9 6.4X -SQL Parquet Vectorized: DataPageV2 1232 1248 23 8.5 117.4 6.2X -SQL Parquet MR: DataPageV1 3481 3531 71 3.0 332.0 2.2X -SQL Parquet MR: DataPageV2 3585 3585 1 2.9 341.8 2.1X -ParquetReader Vectorized: DataPageV1 786 787 1 13.3 75.0 9.7X -ParquetReader Vectorized: DataPageV2 858 861 3 12.2 81.8 8.8X -SQL ORC Vectorized 891 908 22 11.8 85.0 8.5X -SQL ORC MR 2943 2972 41 3.6 280.7 2.6X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 8866 8885 26 1.2 845.5 1.0X +SQL Json 9201 9207 8 1.1 877.5 1.0X +SQL Parquet Vectorized: DataPageV1 1286 1291 6 8.2 122.7 6.9X +SQL Parquet Vectorized: DataPageV2 1554 1566 17 6.7 148.2 
5.7X +SQL Parquet MR: DataPageV1 3482 3506 34 3.0 332.1 2.5X +SQL Parquet MR: DataPageV2 3607 3635 40 2.9 344.0 2.5X +ParquetReader Vectorized: DataPageV1 792 794 2 13.2 75.5 11.2X +ParquetReader Vectorized: DataPageV2 1116 1123 10 9.4 106.5 7.9X +SQL ORC Vectorized 912 934 20 11.5 87.0 9.7X +SQL ORC MR 2987 3000 18 3.5 284.9 3.0X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5836 5847 15 1.8 556.6 1.0X -SQL Json 7771 7775 6 1.3 741.1 0.8X -SQL Parquet Vectorized: DataPageV1 791 795 4 13.3 75.4 7.4X -SQL Parquet Vectorized: DataPageV2 854 858 4 12.3 81.5 6.8X -SQL Parquet MR: DataPageV1 2893 2906 19 3.6 275.9 2.0X -SQL Parquet MR: DataPageV2 3072 3078 8 3.4 293.0 1.9X -ParquetReader Vectorized: DataPageV1 726 730 7 14.5 69.2 8.0X -ParquetReader Vectorized: DataPageV2 794 800 7 13.2 75.7 7.3X -SQL ORC Vectorized 991 997 6 10.6 94.5 5.9X -SQL ORC MR 2815 2823 11 3.7 268.4 2.1X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 6247 6258 16 1.7 595.8 1.0X +SQL Json 7887 7902 22 1.3 752.1 0.8X +SQL Parquet Vectorized: DataPageV1 824 836 19 12.7 78.5 7.6X +SQL Parquet Vectorized: DataPageV2 1027 1033 10 10.2 97.9 6.1X +SQL Parquet MR: DataPageV1 2799 2799 0 3.7 266.9 2.2X +SQL Parquet MR: DataPageV2 2883 2893 15 3.6 274.9 2.2X +ParquetReader Vectorized: DataPageV1 740 741 1 14.2 70.6 8.4X +ParquetReader Vectorized: DataPageV2 905 906 1 11.6 86.3 6.9X +SQL ORC Vectorized 983 986 3 10.7 93.8 6.4X +SQL ORC MR 2738 2741 4 3.8 261.1 2.3X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4310 4329 27 2.4 411.0 1.0X -SQL Json 5772 5797 35 1.8 550.5 0.7X -SQL Parquet Vectorized: DataPageV1 162 166 3 64.8 15.4 26.6X -SQL Parquet Vectorized: DataPageV2 178 181 2 58.9 17.0 24.2X -SQL Parquet MR: DataPageV1 1905 1906 1 5.5 181.7 2.3X -SQL Parquet MR: DataPageV2 1843 1845 2 5.7 175.8 2.3X -ParquetReader Vectorized: DataPageV1 166 167 2 63.3 15.8 26.0X -ParquetReader Vectorized: DataPageV2 183 185 4 57.4 17.4 23.6X -SQL ORC Vectorized 324 329 7 32.4 30.9 13.3X -SQL ORC MR 1760 1823 89 6.0 167.9 2.4X +SQL CSV 4395 4398 4 2.4 419.2 1.0X +SQL Json 5649 5663 20 1.9 538.7 0.8X +SQL Parquet Vectorized: DataPageV1 164 170 7 64.1 15.6 26.9X +SQL Parquet Vectorized: DataPageV2 186 190 4 56.4 17.7 23.6X +SQL Parquet MR: DataPageV1 1769 1771 2 5.9 168.7 2.5X +SQL Parquet MR: DataPageV2 1721 1730 13 6.1 164.2 2.6X +ParquetReader Vectorized: DataPageV1 169 170 2 62.1 16.1 26.0X +ParquetReader Vectorized: DataPageV2 193 195 2 54.3 18.4 22.8X +SQL ORC Vectorized 313 316 3 33.5 29.9 14.0X +SQL ORC MR 1580 1592 18 6.6 150.6 2.8X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 10 
columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 1745 1776 43 0.6 1664.2 1.0X -SQL Json 2042 2046 5 0.5 1947.8 0.9X -SQL Parquet Vectorized: DataPageV1 27 31 5 39.1 25.6 65.1X -SQL Parquet Vectorized: DataPageV2 32 36 4 32.3 31.0 53.7X -SQL Parquet MR: DataPageV1 183 188 3 5.7 174.9 9.5X -SQL Parquet MR: DataPageV2 160 164 3 6.6 152.5 10.9X -SQL ORC Vectorized 32 35 4 32.3 31.0 53.7X -SQL ORC MR 133 136 3 7.9 127.2 13.1X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 1197 1198 1 0.9 1141.7 1.0X +SQL Json 1855 1857 3 0.6 1769.2 0.6X +SQL Parquet Vectorized: DataPageV1 25 29 4 41.4 24.2 47.3X +SQL Parquet Vectorized: DataPageV2 34 37 5 30.9 32.4 35.2X +SQL Parquet MR: DataPageV1 160 167 6 6.6 152.7 7.5X +SQL Parquet MR: DataPageV2 154 158 4 6.8 146.7 7.8X +SQL ORC Vectorized 29 32 3 36.6 27.3 41.8X +SQL ORC MR 135 148 37 7.8 128.3 8.9X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 3718 3734 23 0.3 3545.9 1.0X -SQL Json 7458 7541 116 0.1 7112.8 0.5X -SQL Parquet Vectorized: DataPageV1 35 38 4 30.2 33.1 107.2X -SQL Parquet Vectorized: DataPageV2 40 44 5 26.1 38.3 92.5X -SQL Parquet MR: DataPageV1 194 198 5 5.4 184.9 19.2X -SQL Parquet MR: DataPageV2 172 177 3 6.1 164.0 21.6X -SQL ORC Vectorized 42 46 7 25.2 39.7 89.3X -SQL ORC MR 143 147 3 7.3 136.8 25.9X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +SQL CSV 2630 2651 29 0.4 2508.3 1.0X +SQL Json 6628 6696 96 0.2 6321.0 0.4X +SQL Parquet Vectorized: DataPageV1 29 33 4 36.2 27.6 90.8X +SQL Parquet Vectorized: DataPageV2 38 41 4 27.7 36.1 69.4X +SQL Parquet MR: DataPageV1 164 167 2 6.4 156.9 16.0X +SQL Parquet MR: DataPageV2 160 165 4 6.5 152.9 16.4X +SQL ORC Vectorized 33 36 4 31.6 31.6 79.3X +SQL ORC MR 141 145 6 7.5 134.2 18.7X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6223 6233 14 0.2 5935.2 1.0X -SQL Json 13981 14087 151 0.1 13332.9 0.4X -SQL Parquet Vectorized: DataPageV1 50 59 8 20.8 48.1 123.5X -SQL Parquet Vectorized: DataPageV2 54 56 4 19.4 51.5 115.4X -SQL Parquet MR: DataPageV1 214 218 3 4.9 204.4 29.0X -SQL Parquet MR: DataPageV2 192 199 4 5.5 183.4 32.4X -SQL ORC Vectorized 52 56 5 20.1 49.7 119.4X -SQL ORC MR 155 158 2 6.8 147.8 40.1X +SQL CSV 4436 4536 141 0.2 4230.6 1.0X +SQL Json 12445 12624 253 0.1 11868.7 0.4X +SQL Parquet Vectorized: DataPageV1 36 39 4 29.2 34.3 123.5X +SQL Parquet Vectorized: DataPageV2 46 49 3 23.0 43.5 97.3X +SQL Parquet MR: DataPageV1 176 182 4 6.0 167.8 25.2X +SQL Parquet MR: DataPageV2 172 180 7 6.1 164.4 25.7X +SQL ORC Vectorized 39 43 4 26.8 37.3 113.6X +SQL ORC MR 148 154 11 7.1 141.5 29.9X diff --git a/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt b/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt index a0aa9ec3b08dd..321b8c5014cb8 100644 --- a/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt +++ 
b/sql/core/benchmarks/DatasetBenchmark-jdk21-results.txt @@ -2,45 +2,45 @@ Dataset Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back map long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 6427 6526 141 15.6 64.3 1.0X -DataFrame 1136 1225 126 88.0 11.4 5.7X -Dataset 1920 1994 105 52.1 19.2 3.3X +RDD 6408 6469 86 15.6 64.1 1.0X +DataFrame 1151 1152 2 86.9 11.5 5.6X +Dataset 1725 1850 177 58.0 17.2 3.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 7373 7383 14 13.6 73.7 1.0X -DataFrame 2811 2824 18 35.6 28.1 2.6X -Dataset 6370 6564 274 15.7 63.7 1.2X +RDD 7374 7378 6 13.6 73.7 1.0X +DataFrame 3111 3119 11 32.1 31.1 2.4X +Dataset 6397 6516 168 15.6 64.0 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back filter Long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 3972 4002 42 25.2 39.7 1.0X -DataFrame 775 789 15 129.1 7.7 5.1X -Dataset 1540 1547 10 64.9 15.4 2.6X +RDD 4022 4162 197 24.9 40.2 1.0X +DataFrame 701 722 19 142.6 7.0 5.7X +Dataset 1528 1545 24 65.4 15.3 2.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 2069 2082 19 48.3 20.7 1.0X -DataFrame 102 117 13 978.9 1.0 20.3X -Dataset 2242 2258 22 44.6 22.4 0.9X +RDD 2065 2095 43 48.4 20.6 1.0X +DataFrame 106 120 12 944.9 1.1 19.5X +Dataset 2302 2326 34 43.4 23.0 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor aggregate: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD sum 1493 1518 36 67.0 14.9 1.0X -DataFrame sum 34 46 8 2962.3 0.3 44.2X -Dataset sum using Aggregator 1937 1976 55 51.6 19.4 0.8X -Dataset complex Aggregator 4915 5032 166 20.3 49.1 0.3X +RDD sum 1424 1452 40 70.2 14.2 1.0X +DataFrame sum 61 73 9 1636.9 0.6 23.3X +Dataset sum using Aggregator 1953 2020 94 51.2 19.5 0.7X +Dataset complex Aggregator 5019 5030 16 19.9 50.2 0.3X diff --git a/sql/core/benchmarks/DatasetBenchmark-results.txt b/sql/core/benchmarks/DatasetBenchmark-results.txt index bb1fe1af99330..79a6ed1e9ce33 100644 --- a/sql/core/benchmarks/DatasetBenchmark-results.txt +++ 
b/sql/core/benchmarks/DatasetBenchmark-results.txt @@ -2,45 +2,45 @@ Dataset Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back map long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 6839 7062 315 14.6 68.4 1.0X -DataFrame 1279 1369 127 78.2 12.8 5.3X -Dataset 1689 1755 93 59.2 16.9 4.0X +RDD 6764 6789 36 14.8 67.6 1.0X +DataFrame 1296 1297 0 77.1 13.0 5.2X +Dataset 1448 1472 35 69.1 14.5 4.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 8003 8003 0 12.5 80.0 1.0X -DataFrame 2833 2849 23 35.3 28.3 2.8X -Dataset 7895 7909 20 12.7 78.9 1.0X +RDD 8070 8080 14 12.4 80.7 1.0X +DataFrame 3215 3220 7 31.1 32.2 2.5X +Dataset 7842 7868 36 12.8 78.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back filter Long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 4231 4342 157 23.6 42.3 1.0X -DataFrame 754 775 28 132.6 7.5 5.6X -Dataset 1634 1656 30 61.2 16.3 2.6X +RDD 4145 4221 107 24.1 41.5 1.0X +DataFrame 758 782 32 131.9 7.6 5.5X +Dataset 1601 1622 29 62.5 16.0 2.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor back-to-back filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD 2244 2351 151 44.6 22.4 1.0X -DataFrame 106 123 16 939.3 1.1 21.1X -Dataset 2484 2541 81 40.3 24.8 0.9X +RDD 2267 2394 180 44.1 22.7 1.0X +DataFrame 110 121 10 907.1 1.1 20.6X +Dataset 2384 2413 42 42.0 23.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor aggregate: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -RDD sum 1405 1406 1 71.2 14.1 1.0X -DataFrame sum 35 46 6 2824.3 0.4 39.7X -Dataset sum using Aggregator 2130 2174 62 46.9 21.3 0.7X -Dataset complex Aggregator 5208 5275 95 19.2 52.1 0.3X +RDD sum 1406 1433 38 71.1 14.1 1.0X +DataFrame sum 69 81 11 1459.0 0.7 20.5X +Dataset sum using Aggregator 2216 2225 13 45.1 22.2 0.6X +Dataset complex Aggregator 4974 5165 269 20.1 49.7 0.3X diff --git a/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt b/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt index 143f433a31604..dc4af0de55307 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt +++ 
b/sql/core/benchmarks/DateTimeBenchmark-jdk21-results.txt @@ -2,460 +2,460 @@ datetime +/- interval ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 850 887 33 11.8 85.0 1.0X -date + interval(m, d) 863 864 2 11.6 86.3 1.0X -date + interval(m, d, ms) 3507 3511 5 2.9 350.7 0.2X -date - interval(m) 841 851 9 11.9 84.1 1.0X -date - interval(m, d) 864 870 5 11.6 86.4 1.0X -date - interval(m, d, ms) 3518 3519 2 2.8 351.8 0.2X -timestamp + interval(m) 1756 1759 5 5.7 175.6 0.5X -timestamp + interval(m, d) 1802 1805 4 5.5 180.2 0.5X -timestamp + interval(m, d, ms) 1958 1961 4 5.1 195.8 0.4X -timestamp - interval(m) 1744 1745 2 5.7 174.4 0.5X -timestamp - interval(m, d) 1796 1799 4 5.6 179.6 0.5X -timestamp - interval(m, d, ms) 1944 1947 5 5.1 194.4 0.4X +date + interval(m) 845 871 24 11.8 84.5 1.0X +date + interval(m, d) 871 882 10 11.5 87.1 1.0X +date + interval(m, d, ms) 3744 3747 4 2.7 374.4 0.2X +date - interval(m) 833 836 5 12.0 83.3 1.0X +date - interval(m, d) 879 886 9 11.4 87.9 1.0X +date - interval(m, d, ms) 3703 3710 9 2.7 370.3 0.2X +timestamp + interval(m) 1768 1771 5 5.7 176.8 0.5X +timestamp + interval(m, d) 1809 1811 2 5.5 180.9 0.5X +timestamp + interval(m, d, ms) 1739 1746 10 5.8 173.9 0.5X +timestamp - interval(m) 1519 1530 16 6.6 151.9 0.6X +timestamp - interval(m, d) 1565 1584 27 6.4 156.5 0.5X +timestamp - interval(m, d, ms) 1734 1736 3 5.8 173.4 0.5X ================================================================================================ Extract components ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 209 209 0 47.9 20.9 1.0X -cast to timestamp wholestage on 209 225 15 47.8 20.9 1.0X +cast to timestamp wholestage off 197 199 3 50.8 19.7 1.0X +cast to timestamp wholestage on 207 217 7 48.3 20.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 639 640 0 15.6 63.9 1.0X -year of timestamp wholestage on 631 635 6 15.9 63.1 1.0X +year of timestamp wholestage off 628 636 11 15.9 62.8 1.0X +year of timestamp wholestage on 626 632 11 16.0 62.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 685 694 12 14.6 68.5 1.0X -quarter of timestamp wholestage on 676 681 5 14.8 67.6 1.0X +quarter of timestamp wholestage off 664 665 2 15.1 66.4 1.0X +quarter of timestamp wholestage on 666 668 3 15.0 66.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 651 653 3 15.4 65.1 1.0X -month of timestamp wholestage on 644 649 4 15.5 64.4 1.0X +month of timestamp wholestage off 642 643 2 15.6 64.2 1.0X +month of timestamp wholestage on 631 636 3 15.8 63.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 920 920 0 10.9 92.0 1.0X -weekofyear of timestamp wholestage on 1054 1062 5 9.5 105.4 0.9X +weekofyear of timestamp wholestage off 1004 1005 2 10.0 100.4 1.0X +weekofyear of timestamp wholestage on 1059 1068 6 9.4 105.9 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 667 670 4 15.0 66.7 1.0X -day of timestamp wholestage on 657 662 6 15.2 65.7 1.0X +day of timestamp wholestage off 655 666 15 15.3 65.5 1.0X +day of timestamp wholestage on 643 648 4 15.5 64.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 714 714 1 14.0 71.4 1.0X -dayofyear of timestamp wholestage on 689 695 4 14.5 68.9 1.0X +dayofyear of timestamp wholestage off 685 686 1 14.6 68.5 1.0X +dayofyear of timestamp wholestage on 680 685 5 14.7 68.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 685 686 2 14.6 68.5 1.0X -dayofmonth of timestamp wholestage on 653 657 4 15.3 65.3 1.1X +dayofmonth of timestamp wholestage off 671 677 8 14.9 67.1 1.0X +dayofmonth of timestamp wholestage on 639 643 3 15.6 63.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 822 830 11 12.2 82.2 1.0X -dayofweek of timestamp wholestage on 807 812 3 12.4 80.7 1.0X +dayofweek of timestamp wholestage off 813 818 8 12.3 81.3 1.0X +dayofweek of timestamp wholestage on 804 810 6 12.4 80.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 753 765 16 13.3 75.3 1.0X -weekday of timestamp wholestage on 751 753 2 13.3 75.1 1.0X +weekday of timestamp wholestage off 745 748 5 13.4 74.5 1.0X +weekday of timestamp wholestage on 746 752 7 13.4 74.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 554 554 1 18.1 55.4 1.0X -hour of timestamp wholestage on 559 567 9 17.9 55.9 1.0X +hour of timestamp wholestage off 537 541 6 18.6 53.7 1.0X +hour of timestamp wholestage on 544 547 3 18.4 54.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 552 558 8 18.1 55.2 1.0X -minute of timestamp wholestage on 557 561 5 18.0 55.7 1.0X +minute of timestamp wholestage off 547 548 1 18.3 54.7 1.0X +minute of timestamp wholestage on 554 557 3 18.1 55.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 555 563 11 18.0 55.5 1.0X -second of timestamp wholestage on 558 561 4 17.9 55.8 1.0X +second of timestamp wholestage off 540 544 6 18.5 54.0 1.0X +second of timestamp wholestage on 546 551 5 18.3 54.6 1.0X ================================================================================================ Current date and time ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 184 188 6 54.4 18.4 1.0X -current_date wholestage on 215 219 5 46.6 21.5 0.9X +current_date wholestage off 172 174 3 58.2 17.2 1.0X +current_date wholestage on 207 212 6 48.3 20.7 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 186 199 18 53.7 18.6 1.0X -current_timestamp wholestage on 221 245 29 45.2 22.1 0.8X +current_timestamp wholestage off 172 174 2 58.0 17.2 1.0X +current_timestamp wholestage on 224 239 24 44.7 22.4 0.8X ================================================================================================ Date arithmetic ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 652 658 8 15.3 65.2 1.0X -cast to date wholestage on 670 689 37 14.9 67.0 1.0X +cast to date wholestage off 599 602 4 16.7 59.9 1.0X +cast to date wholestage on 603 606 4 16.6 60.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 735 737 3 13.6 73.5 1.0X -last_day wholestage on 728 729 2 13.7 72.8 1.0X +last_day wholestage off 714 718 6 14.0 71.4 1.0X +last_day wholestage on 700 708 6 14.3 70.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 697 702 7 14.3 69.7 1.0X -next_day wholestage on 696 703 7 14.4 69.6 1.0X +next_day wholestage off 637 639 4 15.7 63.7 1.0X +next_day wholestage on 635 644 13 15.7 63.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 626 633 10 16.0 62.6 1.0X -date_add wholestage on 631 638 11 15.9 63.1 1.0X +date_add wholestage off 585 600 22 17.1 58.5 1.0X +date_add wholestage on 596 598 1 16.8 59.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD 
EPYC 7763 64-Core Processor date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 630 630 1 15.9 63.0 1.0X -date_sub wholestage on 625 632 5 16.0 62.5 1.0X +date_sub wholestage off 585 588 4 17.1 58.5 1.0X +date_sub wholestage on 597 600 2 16.7 59.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 858 859 1 11.7 85.8 1.0X -add_months wholestage on 848 851 4 11.8 84.8 1.0X +add_months wholestage off 816 817 1 12.3 81.6 1.0X +add_months wholestage on 830 835 5 12.1 83.0 1.0X ================================================================================================ Formatting dates ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 3217 3224 9 3.1 321.7 1.0X -format date wholestage on 3156 3172 14 3.2 315.6 1.0X +format date wholestage off 3079 3082 5 3.2 307.9 1.0X +format date wholestage on 3310 3363 31 3.0 331.0 0.9X ================================================================================================ Formatting timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 2765 2768 5 3.6 276.5 1.0X -from_unixtime wholestage on 2774 2784 8 3.6 277.4 1.0X +from_unixtime wholestage off 2774 2777 3 3.6 277.4 1.0X +from_unixtime wholestage on 2749 2794 67 3.6 274.9 1.0X ================================================================================================ Convert timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 664 665 2 15.1 66.4 1.0X -from_utc_timestamp wholestage on 786 792 4 12.7 78.6 0.8X +from_utc_timestamp wholestage off 616 621 6 16.2 61.6 1.0X +from_utc_timestamp wholestage on 736 740 3 13.6 73.6 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure 
AMD EPYC 7763 64-Core Processor to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 775 814 56 12.9 77.5 1.0X -to_utc_timestamp wholestage on 816 822 4 12.3 81.6 0.9X +to_utc_timestamp wholestage off 764 764 1 13.1 76.4 1.0X +to_utc_timestamp wholestage on 825 827 2 12.1 82.5 0.9X ================================================================================================ Intervals ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 221 229 12 45.3 22.1 1.0X -cast interval wholestage on 242 245 5 41.4 24.2 0.9X +cast interval wholestage off 228 232 6 43.8 22.8 1.0X +cast interval wholestage on 207 220 17 48.2 20.7 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1061 1065 6 9.4 106.1 1.0X -datediff wholestage on 1081 1085 3 9.3 108.1 1.0X +datediff wholestage off 999 999 1 10.0 99.9 1.0X +datediff wholestage on 997 999 2 10.0 99.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 3517 3525 11 2.8 351.7 1.0X -months_between wholestage on 3514 3521 7 2.8 351.4 1.0X +months_between wholestage off 3358 3360 3 3.0 335.8 1.0X +months_between wholestage on 3328 3347 22 3.0 332.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 310 324 19 3.2 310.3 1.0X -window wholestage on 623 638 16 1.6 622.8 0.5X +window wholestage off 375 382 11 2.7 374.7 1.0X +window wholestage on 676 687 13 1.5 675.8 0.6X ================================================================================================ Truncation ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-date_trunc YEAR wholestage off 1870 1881 16 5.3 187.0 1.0X -date_trunc YEAR wholestage on 1839 1850 16 5.4 183.9 1.0X +date_trunc YEAR wholestage off 1715 1719 5 5.8 171.5 1.0X +date_trunc YEAR wholestage on 1742 1743 2 5.7 174.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 1873 1874 2 5.3 187.3 1.0X -date_trunc YYYY wholestage on 1849 1851 2 5.4 184.9 1.0X +date_trunc YYYY wholestage off 1722 1732 14 5.8 172.2 1.0X +date_trunc YYYY wholestage on 1734 1738 2 5.8 173.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 1869 1870 1 5.3 186.9 1.0X -date_trunc YY wholestage on 1849 1851 2 5.4 184.9 1.0X +date_trunc YY wholestage off 1726 1730 5 5.8 172.6 1.0X +date_trunc YY wholestage on 1739 1745 7 5.8 173.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 1902 1906 5 5.3 190.2 1.0X -date_trunc MON wholestage on 2046 2048 2 4.9 204.6 0.9X +date_trunc MON wholestage off 1703 1705 3 5.9 170.3 1.0X +date_trunc MON wholestage on 1745 1748 3 5.7 174.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 1900 1907 9 5.3 190.0 1.0X -date_trunc MONTH wholestage on 2048 2049 2 4.9 204.8 0.9X +date_trunc MONTH wholestage off 1703 1703 1 5.9 170.3 1.0X +date_trunc MONTH wholestage on 1744 1748 3 5.7 174.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 1899 1900 1 5.3 189.9 1.0X -date_trunc MM wholestage on 2045 2049 4 4.9 204.5 0.9X +date_trunc MM wholestage off 1701 1703 3 5.9 170.1 1.0X +date_trunc MM wholestage on 1744 1762 20 5.7 174.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 1251 1255 7 8.0 125.1 1.0X -date_trunc DAY wholestage on 1198 1203 4 8.3 119.8 1.0X +date_trunc DAY wholestage off 1228 1233 8 8.1 122.8 1.0X +date_trunc DAY wholestage on 1199 1207 7 8.3 119.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 1257 1260 4 8.0 125.7 1.0X -date_trunc DD wholestage on 1200 1205 5 8.3 120.0 1.0X +date_trunc DD wholestage off 1228 1229 3 8.1 122.8 1.0X +date_trunc DD wholestage on 1204 1206 2 8.3 120.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 1259 1262 4 7.9 125.9 1.0X -date_trunc HOUR wholestage on 1241 1246 5 8.1 124.1 1.0X +date_trunc HOUR wholestage off 1238 1244 8 8.1 123.8 1.0X +date_trunc HOUR wholestage on 1228 1238 15 8.1 122.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 1219 1220 1 8.2 121.9 1.0X -date_trunc MINUTE wholestage on 1201 1208 11 8.3 120.1 1.0X +date_trunc MINUTE wholestage off 1223 1228 8 8.2 122.3 1.0X +date_trunc MINUTE wholestage on 1226 1239 15 8.2 122.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 309 314 8 32.4 30.9 1.0X -date_trunc SECOND wholestage on 279 285 4 35.8 27.9 1.1X +date_trunc SECOND wholestage off 326 336 13 30.6 32.6 1.0X +date_trunc SECOND wholestage on 295 300 6 33.9 29.5 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 1788 1789 0 5.6 178.8 1.0X -date_trunc WEEK wholestage on 1760 1764 5 5.7 176.0 1.0X +date_trunc WEEK wholestage off 1648 1649 0 6.1 164.8 1.0X +date_trunc WEEK wholestage on 1633 1638 6 6.1 163.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc QUARTER: 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 2416 2417 1 4.1 241.6 1.0X -date_trunc QUARTER wholestage on 2419 2421 3 4.1 241.9 1.0X +date_trunc QUARTER wholestage off 2063 2064 2 4.8 206.3 1.0X +date_trunc QUARTER wholestage on 2064 2069 4 4.8 206.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 296 300 7 33.8 29.6 1.0X -trunc year wholestage on 275 278 2 36.3 27.5 1.1X +trunc year wholestage off 821 822 1 12.2 82.1 1.0X +trunc year wholestage on 793 796 3 12.6 79.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 301 305 5 33.2 30.1 1.0X -trunc yyyy wholestage on 271 275 3 36.9 27.1 1.1X +trunc yyyy wholestage off 822 823 2 12.2 82.2 1.0X +trunc yyyy wholestage on 790 799 11 12.7 79.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 295 295 0 33.9 29.5 1.0X -trunc yy wholestage on 270 277 4 37.0 27.0 1.1X +trunc yy wholestage off 818 820 2 12.2 81.8 1.0X +trunc yy wholestage on 792 795 2 12.6 79.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 296 297 1 33.8 29.6 1.0X -trunc mon wholestage on 275 278 2 36.4 27.5 1.1X +trunc mon wholestage off 767 767 0 13.0 76.7 1.0X +trunc mon wholestage on 741 745 5 13.5 74.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 304 308 6 32.9 30.4 1.0X -trunc month wholestage on 276 280 4 36.2 27.6 1.1X +trunc month wholestage off 765 766 2 13.1 76.5 1.0X +trunc month wholestage on 742 746 2 13.5 74.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 297 298 1 33.6 29.7 1.0X -trunc mm wholestage on 274 276 2 36.5 27.4 1.1X +trunc mm wholestage off 766 767 2 13.1 76.6 1.0X +trunc mm wholestage on 742 744 3 13.5 74.2 1.0X ================================================================================================ Parsing ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 94 95 1 10.6 94.1 1.0X -to timestamp str wholestage on 93 96 2 10.7 93.2 1.0X +to timestamp str wholestage off 99 100 2 10.1 98.7 1.0X +to timestamp str wholestage on 93 99 6 10.8 93.0 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 659 659 1 1.5 658.7 1.0X -to_timestamp wholestage on 654 659 9 1.5 654.2 1.0X +to_timestamp wholestage off 656 664 10 1.5 656.4 1.0X +to_timestamp wholestage on 664 668 4 1.5 664.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 649 652 5 1.5 648.7 1.0X -to_unix_timestamp wholestage on 656 660 4 1.5 656.0 1.0X +to_unix_timestamp wholestage off 679 681 3 1.5 679.3 1.0X +to_unix_timestamp wholestage on 658 660 1 1.5 658.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 119 121 2 8.4 119.3 1.0X -to date str wholestage on 117 121 4 8.5 117.3 1.0X +to date str wholestage off 129 133 6 7.8 128.6 1.0X +to date str wholestage on 123 129 6 8.1 122.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 1071 1073 2 0.9 1071.3 1.0X -to_date wholestage on 1055 1057 2 0.9 1055.1 1.0X +to_date wholestage off 662 664 3 1.5 661.6 1.0X +to_date wholestage on 659 665 4 1.5 659.4 1.0X ================================================================================================ Conversion from/to 
external types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 293 299 5 17.0 58.7 1.0X -From java.time.LocalDate 241 246 6 20.7 48.3 1.2X -Collect java.sql.Date 1243 1366 169 4.0 248.5 0.2X -Collect java.time.LocalDate 1042 1080 40 4.8 208.3 0.3X -From java.sql.Timestamp 237 251 14 21.1 47.4 1.2X -From java.time.Instant 195 210 14 25.7 38.9 1.5X -Collect longs 979 1100 106 5.1 195.8 0.3X -Collect java.sql.Timestamp 1115 1165 60 4.5 223.0 0.3X -Collect java.time.Instant 1021 1095 98 4.9 204.3 0.3X -java.sql.Date to Hive string 4109 4252 124 1.2 821.8 0.1X -java.time.LocalDate to Hive string 3162 3288 125 1.6 632.3 0.1X -java.sql.Timestamp to Hive string 3714 3835 106 1.3 742.8 0.1X -java.time.Instant to Hive string 4319 4344 23 1.2 863.8 0.1X +From java.sql.Date 335 337 1 14.9 67.0 1.0X +From java.time.LocalDate 232 238 6 21.6 46.3 1.4X +Collect java.sql.Date 1204 1314 95 4.2 240.9 0.3X +Collect java.time.LocalDate 946 1072 112 5.3 189.3 0.4X +From java.sql.Timestamp 239 256 15 20.9 47.8 1.4X +From java.time.Instant 201 219 26 24.9 40.2 1.7X +Collect longs 948 1007 61 5.3 189.7 0.4X +Collect java.sql.Timestamp 1067 1183 100 4.7 213.5 0.3X +Collect java.time.Instant 961 1044 75 5.2 192.2 0.3X +java.sql.Date to Hive string 4059 4129 64 1.2 811.7 0.1X +java.time.LocalDate to Hive string 3007 3166 141 1.7 601.3 0.1X +java.sql.Timestamp to Hive string 3795 3876 88 1.3 759.1 0.1X +java.time.Instant to Hive string 4301 4398 84 1.2 860.1 0.1X diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index e972533e0b10b..051b9107109d3 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -2,460 +2,460 @@ datetime +/- interval ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 1248 1273 35 8.0 124.8 1.0X -date + interval(m, d) 1247 1263 24 8.0 124.7 1.0X -date + interval(m, d, ms) 4155 4156 2 2.4 415.5 0.3X -date - interval(m) 1294 1326 46 7.7 129.4 1.0X -date - interval(m, d) 1342 1342 1 7.5 134.2 0.9X -date - interval(m, d, ms) 4168 4176 11 2.4 416.8 0.3X -timestamp + interval(m) 1903 1913 15 5.3 190.3 0.7X -timestamp + interval(m, d) 1986 1986 0 5.0 198.6 0.6X -timestamp + interval(m, d, ms) 2216 2225 13 4.5 221.6 0.6X -timestamp - interval(m) 1936 1942 8 5.2 193.6 0.6X -timestamp - interval(m, d) 2033 2037 5 4.9 203.3 0.6X -timestamp - interval(m, d, ms) 2219 2239 28 4.5 221.9 0.6X +date + interval(m) 1014 1017 5 9.9 101.4 1.0X +date + interval(m, d) 1015 1015 0 9.9 101.5 1.0X +date + interval(m, d, ms) 3966 3979 19 2.5 396.6 0.3X +date - interval(m) 971 985 13 10.3 97.1 1.0X +date - interval(m, d) 
1008 1011 5 9.9 100.8 1.0X +date - interval(m, d, ms) 4016 4024 11 2.5 401.6 0.3X +timestamp + interval(m) 1917 1939 31 5.2 191.7 0.5X +timestamp + interval(m, d) 1948 1959 17 5.1 194.8 0.5X +timestamp + interval(m, d, ms) 2056 2057 0 4.9 205.6 0.5X +timestamp - interval(m) 1814 1822 11 5.5 181.4 0.6X +timestamp - interval(m, d) 1871 1877 9 5.3 187.1 0.5X +timestamp - interval(m, d, ms) 2012 2017 7 5.0 201.2 0.5X ================================================================================================ Extract components ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 196 219 33 51.0 19.6 1.0X -cast to timestamp wholestage on 203 220 15 49.2 20.3 1.0X +cast to timestamp wholestage off 205 218 19 48.9 20.5 1.0X +cast to timestamp wholestage on 225 228 3 44.5 22.5 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 838 847 12 11.9 83.8 1.0X -year of timestamp wholestage on 846 850 5 11.8 84.6 1.0X +year of timestamp wholestage off 777 788 16 12.9 77.7 1.0X +year of timestamp wholestage on 777 781 4 12.9 77.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 861 863 3 11.6 86.1 1.0X -quarter of timestamp wholestage on 877 882 4 11.4 87.7 1.0X +quarter of timestamp wholestage off 793 804 16 12.6 79.3 1.0X +quarter of timestamp wholestage on 791 801 11 12.6 79.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 850 854 6 11.8 85.0 1.0X -month of timestamp wholestage on 836 843 8 12.0 83.6 1.0X +month of timestamp wholestage off 766 769 5 13.1 76.6 1.0X +month of timestamp wholestage on 772 775 2 13.0 77.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1240 1250 13 8.1 124.0 1.0X -weekofyear of timestamp 
wholestage on 1266 1269 4 7.9 126.6 1.0X +weekofyear of timestamp wholestage off 1114 1124 15 9.0 111.4 1.0X +weekofyear of timestamp wholestage on 1190 1196 4 8.4 119.0 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 848 849 2 11.8 84.8 1.0X -day of timestamp wholestage on 842 850 7 11.9 84.2 1.0X +day of timestamp wholestage off 766 773 10 13.1 76.6 1.0X +day of timestamp wholestage on 770 775 5 13.0 77.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 880 881 1 11.4 88.0 1.0X -dayofyear of timestamp wholestage on 912 914 2 11.0 91.2 1.0X +dayofyear of timestamp wholestage off 809 812 5 12.4 80.9 1.0X +dayofyear of timestamp wholestage on 811 815 4 12.3 81.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 870 871 2 11.5 87.0 1.0X -dayofmonth of timestamp wholestage on 845 848 3 11.8 84.5 1.0X +dayofmonth of timestamp wholestage off 779 783 5 12.8 77.9 1.0X +dayofmonth of timestamp wholestage on 769 773 4 13.0 76.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 982 989 10 10.2 98.2 1.0X -dayofweek of timestamp wholestage on 1030 1037 8 9.7 103.0 1.0X +dayofweek of timestamp wholestage off 918 918 0 10.9 91.8 1.0X +dayofweek of timestamp wholestage on 915 920 5 10.9 91.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 950 956 9 10.5 95.0 1.0X -weekday of timestamp wholestage on 976 980 5 10.3 97.6 1.0X +weekday of timestamp wholestage off 868 868 0 11.5 86.8 1.0X +weekday of timestamp wholestage on 874 880 5 11.4 87.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 600 601 2 16.7 60.0 1.0X -hour of timestamp wholestage on 606 610 3 16.5 60.6 1.0X +hour of timestamp wholestage off 604 608 5 16.6 60.4 1.0X +hour of timestamp wholestage on 619 625 5 16.2 61.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 595 597 4 16.8 59.5 1.0X -minute of timestamp wholestage on 608 613 4 16.4 60.8 1.0X +minute of timestamp wholestage off 608 616 12 16.4 60.8 1.0X +minute of timestamp wholestage on 614 621 4 16.3 61.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 630 633 4 15.9 63.0 1.0X -second of timestamp wholestage on 609 610 1 16.4 60.9 1.0X +second of timestamp wholestage off 606 612 9 16.5 60.6 1.0X +second of timestamp wholestage on 616 620 4 16.2 61.6 1.0X ================================================================================================ Current date and time ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 181 183 2 55.1 18.1 1.0X -current_date wholestage on 215 218 4 46.5 21.5 0.8X +current_date wholestage off 192 194 3 52.1 19.2 1.0X +current_date wholestage on 214 228 16 46.8 21.4 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 189 193 6 52.9 18.9 1.0X -current_timestamp wholestage on 214 259 48 46.8 21.4 0.9X +current_timestamp wholestage off 187 189 3 53.4 18.7 1.0X +current_timestamp wholestage on 227 238 13 44.0 22.7 0.8X ================================================================================================ Date arithmetic ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 792 792 1 12.6 79.2 1.0X -cast to date wholestage on 795 805 7 12.6 79.5 1.0X +cast to date wholestage off 705 708 5 14.2 70.5 1.0X +cast to date wholestage on 677 681 2 14.8 67.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 924 928 5 10.8 92.4 1.0X -last_day wholestage on 927 931 4 10.8 92.7 1.0X +last_day wholestage off 791 791 1 12.6 79.1 1.0X +last_day wholestage on 775 782 6 12.9 77.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 820 820 0 12.2 82.0 1.0X -next_day wholestage on 862 866 5 11.6 86.2 1.0X +next_day wholestage off 696 697 2 14.4 69.6 1.0X +next_day wholestage on 704 706 2 14.2 70.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 795 798 3 12.6 79.5 1.0X -date_add wholestage on 800 806 11 12.5 80.0 1.0X +date_add wholestage off 646 648 3 15.5 64.6 1.0X +date_add wholestage on 648 651 2 15.4 64.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 772 772 0 13.0 77.2 1.0X -date_sub wholestage on 787 791 5 12.7 78.7 1.0X +date_sub wholestage off 646 647 1 15.5 64.6 1.0X +date_sub wholestage on 653 659 9 15.3 65.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 1168 1169 1 8.6 116.8 1.0X -add_months wholestage on 1209 1214 6 8.3 120.9 1.0X +add_months wholestage off 922 930 11 10.8 92.2 1.0X +add_months wholestage on 908 911 2 11.0 90.8 1.0X ================================================================================================ Formatting dates ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor format date: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 3809 3809 0 2.6 380.9 1.0X -format date wholestage on 3724 3733 9 2.7 372.4 1.0X +format date wholestage off 3228 3232 6 3.1 322.8 1.0X +format date wholestage on 3205 3215 10 3.1 320.5 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 4096 4097 2 2.4 409.6 1.0X -from_unixtime wholestage on 3998 4007 12 2.5 399.8 1.0X +from_unixtime wholestage off 3681 3684 5 2.7 368.1 1.0X +from_unixtime wholestage on 3557 3603 27 2.8 355.7 1.0X ================================================================================================ Convert timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 738 739 2 13.6 73.8 1.0X -from_utc_timestamp wholestage on 821 825 3 12.2 82.1 0.9X +from_utc_timestamp wholestage off 690 692 4 14.5 69.0 1.0X +from_utc_timestamp wholestage on 782 785 2 12.8 78.2 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 1059 1059 0 9.4 105.9 1.0X -to_utc_timestamp wholestage on 1037 1045 10 9.6 103.7 1.0X +to_utc_timestamp wholestage off 1040 1040 0 9.6 104.0 1.0X +to_utc_timestamp wholestage on 1026 1032 7 9.7 102.6 1.0X ================================================================================================ Intervals ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 209 213 5 47.9 20.9 1.0X -cast interval wholestage on 198 208 8 50.4 19.8 1.1X +cast interval wholestage off 225 226 2 44.5 22.5 1.0X +cast interval wholestage on 216 225 7 46.4 21.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1374 1380 8 7.3 137.4 1.0X -datediff wholestage on 1384 1390 5 7.2 138.4 1.0X +datediff wholestage off 1139 1141 4 8.8 113.9 1.0X +datediff wholestage on 1157 1162 5 8.6 115.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 3564 3569 8 2.8 356.4 1.0X -months_between wholestage on 3601 3603 2 2.8 360.1 1.0X +months_between wholestage off 3217 3219 4 3.1 321.7 1.0X +months_between wholestage on 3254 3266 9 3.1 325.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 390 415 35 2.6 390.0 1.0X -window wholestage on 616 636 14 1.6 616.3 0.6X +window wholestage off 406 410 6 2.5 406.2 1.0X +window wholestage on 667 677 10 1.5 667.2 0.6X ================================================================================================ Truncation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 1871 1878 10 5.3 187.1 1.0X -date_trunc YEAR wholestage on 1823 1829 8 5.5 182.3 1.0X +date_trunc YEAR wholestage off 1720 1726 9 5.8 172.0 1.0X +date_trunc YEAR wholestage on 1742 1752 7 5.7 174.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 1878 1888 15 5.3 187.8 1.0X -date_trunc YYYY wholestage on 1838 1843 4 5.4 183.8 1.0X +date_trunc YYYY wholestage off 1722 1724 2 5.8 172.2 1.0X +date_trunc YYYY wholestage on 1734 1748 11 5.8 173.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 1877 1878 1 5.3 187.7 1.0X -date_trunc YY wholestage on 1839 1844 4 5.4 183.9 1.0X +date_trunc YY wholestage off 1721 1722 1 5.8 
172.1 1.0X +date_trunc YY wholestage on 1738 1749 8 5.8 173.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 1978 1978 0 5.1 197.8 1.0X -date_trunc MON wholestage on 1937 1939 1 5.2 193.7 1.0X +date_trunc MON wholestage off 1806 1808 4 5.5 180.6 1.0X +date_trunc MON wholestage on 1863 1870 5 5.4 186.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 1986 1991 7 5.0 198.6 1.0X -date_trunc MONTH wholestage on 1939 1950 12 5.2 193.9 1.0X +date_trunc MONTH wholestage off 1808 1811 5 5.5 180.8 1.0X +date_trunc MONTH wholestage on 1865 1871 8 5.4 186.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 1987 1989 3 5.0 198.7 1.0X -date_trunc MM wholestage on 1944 1958 12 5.1 194.4 1.0X +date_trunc MM wholestage off 1809 1820 15 5.5 180.9 1.0X +date_trunc MM wholestage on 1862 1865 2 5.4 186.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 1392 1394 3 7.2 139.2 1.0X -date_trunc DAY wholestage on 1320 1323 5 7.6 132.0 1.1X +date_trunc DAY wholestage off 1364 1365 2 7.3 136.4 1.0X +date_trunc DAY wholestage on 1339 1344 3 7.5 133.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 1386 1388 2 7.2 138.6 1.0X -date_trunc DD wholestage on 1314 1320 4 7.6 131.4 1.1X +date_trunc DD wholestage off 1371 1374 3 7.3 137.1 1.0X +date_trunc DD wholestage on 1336 1338 3 7.5 133.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 1380 1382 3 7.2 138.0 1.0X -date_trunc HOUR wholestage on 1340 1355 22 7.5 134.0 1.0X 
+date_trunc HOUR wholestage off 1408 1409 1 7.1 140.8 1.0X +date_trunc HOUR wholestage on 1316 1322 9 7.6 131.6 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 1396 1400 6 7.2 139.6 1.0X -date_trunc MINUTE wholestage on 1346 1355 11 7.4 134.6 1.0X +date_trunc MINUTE wholestage off 1412 1413 1 7.1 141.2 1.0X +date_trunc MINUTE wholestage on 1317 1321 4 7.6 131.7 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 366 373 10 27.3 36.6 1.0X -date_trunc SECOND wholestage on 309 315 4 32.4 30.9 1.2X +date_trunc SECOND wholestage off 357 361 6 28.0 35.7 1.0X +date_trunc SECOND wholestage on 306 308 3 32.7 30.6 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 1847 1849 2 5.4 184.7 1.0X -date_trunc WEEK wholestage on 1819 1827 6 5.5 181.9 1.0X +date_trunc WEEK wholestage off 1646 1664 25 6.1 164.6 1.0X +date_trunc WEEK wholestage on 1667 1671 7 6.0 166.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 2679 2694 21 3.7 267.9 1.0X -date_trunc QUARTER wholestage on 2691 2704 12 3.7 269.1 1.0X +date_trunc QUARTER wholestage off 2239 2241 3 4.5 223.9 1.0X +date_trunc QUARTER wholestage on 2199 2202 4 4.5 219.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 335 337 3 29.9 33.5 1.0X -trunc year wholestage on 302 303 1 33.1 30.2 1.1X +trunc year wholestage off 1006 1010 5 9.9 100.6 1.0X +trunc year wholestage on 925 931 8 10.8 92.5 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 332 333 2 30.1 33.2 1.0X 
-trunc yyyy wholestage on 303 304 1 33.0 30.3 1.1X +trunc yyyy wholestage off 1008 1009 2 9.9 100.8 1.0X +trunc yyyy wholestage on 925 927 2 10.8 92.5 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 332 333 2 30.1 33.2 1.0X -trunc yy wholestage on 301 303 1 33.2 30.1 1.1X +trunc yy wholestage off 1004 1010 9 10.0 100.4 1.0X +trunc yy wholestage on 926 928 1 10.8 92.6 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 332 340 10 30.1 33.2 1.0X -trunc mon wholestage on 302 303 2 33.2 30.2 1.1X +trunc mon wholestage off 947 947 0 10.6 94.7 1.0X +trunc mon wholestage on 896 902 6 11.2 89.6 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 333 340 9 30.0 33.3 1.0X -trunc month wholestage on 301 308 5 33.2 30.1 1.1X +trunc month wholestage off 948 950 2 10.5 94.8 1.0X +trunc month wholestage on 895 899 6 11.2 89.5 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 332 338 9 30.1 33.2 1.0X -trunc mm wholestage on 301 304 4 33.2 30.1 1.1X +trunc mm wholestage off 950 950 1 10.5 95.0 1.0X +trunc mm wholestage on 895 897 2 11.2 89.5 1.1X ================================================================================================ Parsing ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 111 111 0 9.0 111.4 1.0X -to timestamp str wholestage on 97 102 3 10.3 96.7 1.2X +to timestamp str wholestage off 102 107 6 9.8 102.2 1.0X +to timestamp str wholestage on 94 99 4 10.6 94.4 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 729 729 1 1.4 729.0 1.0X -to_timestamp wholestage on 745 748 2 1.3 744.8 1.0X +to_timestamp wholestage off 717 719 3 1.4 716.5 1.0X +to_timestamp wholestage on 704 706 2 1.4 703.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 743 750 10 1.3 742.7 1.0X -to_unix_timestamp wholestage on 739 742 2 1.4 739.4 1.0X +to_unix_timestamp wholestage off 706 707 0 1.4 706.2 1.0X +to_unix_timestamp wholestage on 698 701 2 1.4 697.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 118 120 3 8.5 118.1 1.0X -to date str wholestage on 126 131 7 7.9 126.5 0.9X +to date str wholestage off 140 142 3 7.1 139.9 1.0X +to date str wholestage on 131 137 3 7.6 131.0 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 1175 1175 0 0.9 1175.3 1.0X -to_date wholestage on 1173 1176 2 0.9 1173.3 1.0X +to_date wholestage off 596 597 2 1.7 596.2 1.0X +to_date wholestage on 604 606 2 1.7 603.6 1.0X ================================================================================================ Conversion from/to external types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 307 310 2 16.3 61.4 1.0X -From java.time.LocalDate 322 322 1 15.6 64.3 1.0X -Collect java.sql.Date 1093 1298 190 4.6 218.6 0.3X -Collect java.time.LocalDate 1122 1227 163 4.5 224.5 0.3X -From java.sql.Timestamp 261 275 13 19.2 52.1 1.2X -From java.time.Instant 228 237 8 21.9 45.7 1.3X -Collect longs 952 1083 132 5.3 190.5 0.3X -Collect java.sql.Timestamp 1152 1197 40 4.3 230.3 0.3X -Collect java.time.Instant 919 1033 115 5.4 183.8 0.3X -java.sql.Date to Hive string 3984 4102 145 1.3 796.9 0.1X -java.time.LocalDate to Hive string 3606 3709 90 1.4 721.2 0.1X -java.sql.Timestamp to Hive string 3774 3865 102 1.3 754.8 0.1X -java.time.Instant to Hive string 5409 5480 84 0.9 1081.7 0.1X +From java.sql.Date 304 317 13 16.4 60.8 1.0X +From java.time.LocalDate 269 271 1 18.6 53.9 1.1X +Collect java.sql.Date 1269 1296 38 3.9 253.8 0.2X +Collect 
java.time.LocalDate 987 1054 59 5.1 197.5 0.3X +From java.sql.Timestamp 251 264 22 19.9 50.3 1.2X +From java.time.Instant 194 201 6 25.7 38.9 1.6X +Collect longs 876 977 87 5.7 175.1 0.3X +Collect java.sql.Timestamp 1028 1081 53 4.9 205.5 0.3X +Collect java.time.Instant 1017 1114 85 4.9 203.4 0.3X +java.sql.Date to Hive string 3697 3897 178 1.4 739.3 0.1X +java.time.LocalDate to Hive string 3366 3505 131 1.5 673.1 0.1X +java.sql.Timestamp to Hive string 3490 3648 151 1.4 698.1 0.1X +java.time.Instant to Hive string 5279 5339 97 0.9 1055.7 0.1X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt index ac6fff4fe5b30..8119824cb769f 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk21-results.txt @@ -2,153 +2,153 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save DATE to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 10761 10761 0 9.3 107.6 1.0X -before 1582, noop 6737 6737 0 14.8 67.4 1.6X -after 1582, rebase EXCEPTION 18652 18652 0 5.4 186.5 0.6X -after 1582, rebase LEGACY 18558 18558 0 5.4 185.6 0.6X -after 1582, rebase CORRECTED 18805 18805 0 5.3 188.0 0.6X -before 1582, rebase LEGACY 14983 14983 0 6.7 149.8 0.7X -before 1582, rebase CORRECTED 14583 14583 0 6.9 145.8 0.7X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1582, noop 10287 10287 0 9.7 102.9 1.0X +before 1582, noop 6015 6015 0 16.6 60.2 1.7X +after 1582, rebase EXCEPTION 19200 19200 0 5.2 192.0 0.5X +after 1582, rebase LEGACY 19267 19267 0 5.2 192.7 0.5X +after 1582, rebase CORRECTED 19175 19175 0 5.2 191.7 0.5X +before 1582, rebase LEGACY 15262 15262 0 6.6 152.6 0.7X +before 1582, rebase CORRECTED 15273 15273 0 6.5 152.7 0.7X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load DATE from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase EXCEPTION 10738 10862 144 9.3 107.4 1.0X -after 1582, vec off, rebase LEGACY 10910 10937 25 9.2 109.1 1.0X -after 1582, vec off, rebase CORRECTED 10842 10901 63 9.2 108.4 1.0X -after 1582, vec on, rebase EXCEPTION 2454 2484 35 40.8 24.5 4.4X -after 1582, vec on, rebase LEGACY 2464 2484 20 40.6 24.6 4.4X -after 1582, vec on, rebase CORRECTED 2457 2493 32 40.7 24.6 4.4X -before 1582, vec off, rebase LEGACY 11112 11125 21 9.0 111.1 1.0X -before 1582, vec off, rebase CORRECTED 10859 10899 35 9.2 108.6 1.0X -before 1582, vec on, rebase LEGACY 2836 2848 13 35.3 28.4 3.8X -before 1582, vec on, rebase CORRECTED 2416 2452 34 41.4 24.2 4.4X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1582, vec off, rebase EXCEPTION 11299 11345 61 8.9 113.0 1.0X +after 1582, vec off, rebase LEGACY 12489 12538 43 8.0 124.9 0.9X +after 1582, vec off, rebase CORRECTED 12542 12552 16 8.0 125.4 0.9X +after 1582, vec on, rebase EXCEPTION 2429 2471 42 41.2 24.3 
4.7X +after 1582, vec on, rebase LEGACY 2473 2500 33 40.4 24.7 4.6X +after 1582, vec on, rebase CORRECTED 2462 2483 29 40.6 24.6 4.6X +before 1582, vec off, rebase LEGACY 12815 12872 59 7.8 128.2 0.9X +before 1582, vec off, rebase CORRECTED 12553 12596 40 8.0 125.5 0.9X +before 1582, vec on, rebase LEGACY 2771 2802 29 36.1 27.7 4.1X +before 1582, vec on, rebase CORRECTED 2403 2447 39 41.6 24.0 4.7X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_INT96 to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2382 2382 0 42.0 23.8 1.0X -before 1900, noop 2331 2331 0 42.9 23.3 1.0X -after 1900, rebase EXCEPTION 12497 12497 0 8.0 125.0 0.2X -after 1900, rebase LEGACY 12285 12285 0 8.1 122.9 0.2X -after 1900, rebase CORRECTED 11954 11954 0 8.4 119.5 0.2X -before 1900, rebase LEGACY 13867 13867 0 7.2 138.7 0.2X -before 1900, rebase CORRECTED 12243 12243 0 8.2 122.4 0.2X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1900, noop 2413 2413 0 41.4 24.1 1.0X +before 1900, noop 2439 2439 0 41.0 24.4 1.0X +after 1900, rebase EXCEPTION 12548 12548 0 8.0 125.5 0.2X +after 1900, rebase LEGACY 12525 12525 0 8.0 125.2 0.2X +after 1900, rebase CORRECTED 12548 12548 0 8.0 125.5 0.2X +before 1900, rebase LEGACY 14343 14343 0 7.0 143.4 0.2X +before 1900, rebase CORRECTED 12758 12758 0 7.8 127.6 0.2X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_INT96 from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 15537 15544 6 6.4 155.4 1.0X -after 1900, vec off, rebase LEGACY 15699 15753 56 6.4 157.0 1.0X -after 1900, vec off, rebase CORRECTED 15611 15621 10 6.4 156.1 1.0X -after 1900, vec on, rebase EXCEPTION 4058 4074 14 24.6 40.6 3.8X -after 1900, vec on, rebase LEGACY 4015 4043 26 24.9 40.1 3.9X -after 1900, vec on, rebase CORRECTED 4069 4121 48 24.6 40.7 3.8X -before 1900, vec off, rebase LEGACY 18703 18729 31 5.3 187.0 0.8X -before 1900, vec off, rebase CORRECTED 15777 15814 46 6.3 157.8 1.0X -before 1900, vec on, rebase LEGACY 6290 6313 22 15.9 62.9 2.5X -before 1900, vec on, rebase CORRECTED 4056 4080 22 24.7 40.6 3.8X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1900, vec off, rebase EXCEPTION 15626 15663 37 6.4 156.3 1.0X +after 1900, vec off, rebase LEGACY 16695 16750 47 6.0 167.0 0.9X +after 1900, vec off, rebase CORRECTED 15958 16047 77 6.3 159.6 1.0X +after 1900, vec on, rebase EXCEPTION 4039 4056 20 24.8 40.4 3.9X +after 1900, vec on, rebase LEGACY 4113 4132 27 24.3 41.1 3.8X +after 1900, vec on, rebase CORRECTED 4062 4071 9 24.6 40.6 3.8X +before 1900, vec off, rebase LEGACY 18025 18067 48 5.5 180.3 0.9X +before 1900, vec off, rebase CORRECTED 16044 16064 23 6.2 160.4 1.0X +before 1900, vec on, rebase LEGACY 6302 6317 22 15.9 63.0 2.5X +before 1900, vec on, rebase CORRECTED 4041 4061 18 24.7 40.4 3.9X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MICROS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2360 2360 0 42.4 23.6 1.0X -before 1900, noop 2318 2318 0 43.1 23.2 1.0X -after 1900, rebase EXCEPTION 11233 11233 0 8.9 112.3 0.2X -after 1900, rebase LEGACY 11001 11001 0 9.1 110.0 0.2X -after 1900, rebase CORRECTED 10952 10952 0 9.1 109.5 0.2X -before 1900, rebase LEGACY 13110 13110 0 7.6 131.1 0.2X -before 1900, rebase CORRECTED 11511 11511 0 8.7 115.1 0.2X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1900, noop 2449 2449 0 40.8 24.5 1.0X +before 1900, noop 2448 2448 0 40.8 24.5 1.0X +after 1900, rebase EXCEPTION 11787 11787 0 8.5 117.9 0.2X +after 1900, rebase LEGACY 11894 11894 0 8.4 118.9 0.2X +after 1900, rebase CORRECTED 11807 11807 0 8.5 118.1 0.2X +before 1900, rebase LEGACY 13934 13934 0 7.2 139.3 0.2X +before 1900, rebase CORRECTED 11771 11771 0 8.5 117.7 0.2X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MICROS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14455 14506 67 6.9 144.5 1.0X -after 1900, vec off, rebase LEGACY 14512 14595 77 6.9 145.1 1.0X -after 1900, vec off, rebase CORRECTED 14534 14551 16 6.9 145.3 1.0X -after 1900, vec on, rebase EXCEPTION 3691 3705 13 27.1 36.9 3.9X -after 1900, vec on, rebase LEGACY 3715 3776 56 26.9 37.2 3.9X -after 1900, vec on, rebase CORRECTED 3696 3712 14 27.1 37.0 3.9X -before 1900, vec off, rebase LEGACY 16982 17027 60 5.9 169.8 0.9X -before 1900, vec off, rebase CORRECTED 14446 14480 47 6.9 144.5 1.0X -before 1900, vec on, rebase LEGACY 5700 5722 20 17.5 57.0 2.5X -before 1900, vec on, rebase CORRECTED 3766 3783 26 26.6 37.7 3.8X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1900, vec off, rebase EXCEPTION 14723 14750 35 6.8 147.2 1.0X +after 1900, vec off, rebase LEGACY 14871 14964 115 6.7 148.7 1.0X +after 1900, vec off, rebase CORRECTED 14771 14797 27 6.8 147.7 1.0X +after 1900, vec on, rebase EXCEPTION 3748 3753 8 26.7 37.5 3.9X +after 1900, vec on, rebase LEGACY 3754 3767 11 26.6 37.5 3.9X +after 1900, vec on, rebase CORRECTED 3737 3756 24 26.8 37.4 3.9X +before 1900, vec off, rebase LEGACY 17307 17328 31 5.8 173.1 0.9X +before 1900, vec off, rebase CORRECTED 14712 14757 76 6.8 147.1 1.0X +before 1900, vec on, rebase LEGACY 5700 5718 16 17.5 57.0 2.6X +before 1900, vec on, rebase CORRECTED 3734 3773 34 26.8 37.3 3.9X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MILLIS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2361 2361 0 42.4 23.6 1.0X -before 1900, noop 2303 2303 0 43.4 23.0 1.0X -after 1900, rebase EXCEPTION 10793 10793 0 9.3 107.9 0.2X -after 1900, rebase LEGACY 10804 10804 0 9.3 108.0 0.2X -after 1900, rebase CORRECTED 10894 10894 0 9.2 108.9 0.2X -before 1900, rebase LEGACY 12759 12759 0 7.8 127.6 0.2X -before 1900, rebase CORRECTED 10916 10916 0 9.2 109.2 0.2X - -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +after 1900, noop 2380 2380 0 42.0 23.8 1.0X +before 1900, noop 2378 2378 0 42.1 23.8 1.0X +after 1900, 
rebase EXCEPTION 11216 11216 0 8.9 112.2 0.2X +after 1900, rebase LEGACY 11924 11924 0 8.4 119.2 0.2X +after 1900, rebase CORRECTED 12410 12410 0 8.1 124.1 0.2X +before 1900, rebase LEGACY 13779 13779 0 7.3 137.8 0.2X +before 1900, rebase CORRECTED 11367 11367 0 8.8 113.7 0.2X + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MILLIS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14148 14183 45 7.1 141.5 1.0X -after 1900, vec off, rebase LEGACY 14408 14444 62 6.9 144.1 1.0X -after 1900, vec off, rebase CORRECTED 14462 14498 44 6.9 144.6 1.0X -after 1900, vec on, rebase EXCEPTION 4887 4908 21 20.5 48.9 2.9X -after 1900, vec on, rebase LEGACY 4465 4483 16 22.4 44.7 3.2X -after 1900, vec on, rebase CORRECTED 4880 4922 54 20.5 48.8 2.9X -before 1900, vec off, rebase LEGACY 16541 16553 17 6.0 165.4 0.9X -before 1900, vec off, rebase CORRECTED 14451 14519 63 6.9 144.5 1.0X -before 1900, vec on, rebase LEGACY 6116 6127 11 16.3 61.2 2.3X -before 1900, vec on, rebase CORRECTED 4898 4918 20 20.4 49.0 2.9X +after 1900, vec off, rebase EXCEPTION 14340 14389 75 7.0 143.4 1.0X +after 1900, vec off, rebase LEGACY 14608 14627 23 6.8 146.1 1.0X +after 1900, vec off, rebase CORRECTED 14466 14528 60 6.9 144.7 1.0X +after 1900, vec on, rebase EXCEPTION 4894 4932 38 20.4 48.9 2.9X +after 1900, vec on, rebase LEGACY 4569 4593 28 21.9 45.7 3.1X +after 1900, vec on, rebase CORRECTED 4918 4943 23 20.3 49.2 2.9X +before 1900, vec off, rebase LEGACY 16912 16926 15 5.9 169.1 0.8X +before 1900, vec off, rebase CORRECTED 14495 14499 4 6.9 144.9 1.0X +before 1900, vec on, rebase LEGACY 6265 6307 41 16.0 62.7 2.3X +before 1900, vec on, rebase CORRECTED 4892 4930 38 20.4 48.9 2.9X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save DATE to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 10993 10993 0 9.1 109.9 1.0X -before 1582, noop 6556 6556 0 15.3 65.6 1.7X -after 1582 14554 14554 0 6.9 145.5 0.8X -before 1582 10400 10400 0 9.6 104.0 1.1X +after 1582, noop 11339 11339 0 8.8 113.4 1.0X +before 1582, noop 5604 5604 0 17.8 56.0 2.0X +after 1582 15457 15457 0 6.5 154.6 0.7X +before 1582 10049 10049 0 10.0 100.5 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load DATE from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 8426 8444 15 11.9 84.3 1.0X -after 1582, vec on 2381 2404 37 42.0 23.8 3.5X -before 1582, vec off 8541 8558 20 11.7 85.4 1.0X -before 1582, vec on 2527 2538 10 39.6 25.3 3.3X +after 1582, vec off 8478 8853 595 11.8 84.8 1.0X +after 1582, vec on 2380 2428 60 42.0 
23.8 3.6X +before 1582, vec off 8570 8583 14 11.7 85.7 1.0X +before 1582, vec on 2510 2518 7 39.8 25.1 3.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2216 2216 0 45.1 22.2 1.0X -before 1900, noop 2241 2241 0 44.6 22.4 1.0X -after 1900 9421 9421 0 10.6 94.2 0.2X -before 1900 11674 11674 0 8.6 116.7 0.2X +after 1900, noop 2275 2275 0 44.0 22.7 1.0X +before 1900, noop 2288 2288 0 43.7 22.9 1.0X +after 1900 9472 9472 0 10.6 94.7 0.2X +before 1900 11470 11470 0 8.7 114.7 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off 9808 10011 312 10.2 98.1 1.0X -after 1900, vec on 3963 3972 10 25.2 39.6 2.5X -before 1900, vec off 11884 11908 22 8.4 118.8 0.8X -before 1900, vec on 5435 5449 16 18.4 54.3 1.8X +after 1900, vec off 9871 9914 39 10.1 98.7 1.0X +after 1900, vec on 4138 4153 13 24.2 41.4 2.4X +before 1900, vec off 11828 11874 53 8.5 118.3 0.8X +before 1900, vec on 5976 5984 13 16.7 59.8 1.7X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index cd6235f177889..39d679bd8b1d9 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,153 +2,153 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save DATE to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 13431 13431 0 7.4 134.3 1.0X -before 1582, noop 7898 7898 0 12.7 79.0 1.7X -after 1582, rebase EXCEPTION 22422 22422 0 4.5 224.2 0.6X -after 1582, rebase LEGACY 21993 21993 0 4.5 219.9 0.6X -after 1582, rebase CORRECTED 21855 21855 0 4.6 218.5 0.6X -before 1582, rebase LEGACY 17426 17426 0 5.7 174.3 0.8X -before 1582, rebase CORRECTED 16427 16427 0 6.1 164.3 0.8X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1582, noop 12846 12846 0 7.8 128.5 1.0X +before 1582, noop 7874 7874 0 12.7 78.7 1.6X +after 1582, rebase EXCEPTION 20968 20968 0 4.8 209.7 0.6X +after 1582, rebase LEGACY 20802 20802 0 4.8 208.0 0.6X +after 1582, rebase CORRECTED 21961 21961 0 4.6 219.6 0.6X +before 1582, rebase LEGACY 16440 16440 0 6.1 164.4 0.8X +before 1582, rebase CORRECTED 16324 16324 0 6.1 163.2 0.8X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load DATE from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase EXCEPTION 11518 11573 48 8.7 115.2 1.0X -after 1582, vec off, rebase LEGACY 11742 11775 50 8.5 117.4 1.0X -after 1582, vec off, rebase CORRECTED 11654 11718 57 8.6 116.5 1.0X -after 1582, vec on, rebase EXCEPTION 2602 2603 1 38.4 26.0 4.4X -after 1582, vec on, rebase LEGACY 2598 2607 11 38.5 26.0 4.4X -after 1582, vec on, rebase CORRECTED 2556 2577 21 39.1 25.6 4.5X -before 1582, vec off, rebase LEGACY 11959 11990 28 8.4 119.6 1.0X -before 1582, vec off, rebase CORRECTED 11738 11758 33 8.5 117.4 1.0X -before 1582, vec on, rebase LEGACY 3013 3020 10 33.2 30.1 3.8X -before 1582, vec on, rebase CORRECTED 2670 2677 6 37.4 26.7 4.3X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1582, vec off, rebase EXCEPTION 11932 12128 170 8.4 119.3 1.0X +after 1582, vec off, rebase LEGACY 11902 11976 67 8.4 119.0 1.0X +after 1582, vec off, rebase CORRECTED 11866 11900 59 8.4 118.7 1.0X +after 1582, vec on, rebase EXCEPTION 2554 2578 39 39.2 25.5 4.7X +after 1582, vec on, rebase LEGACY 2550 2599 84 39.2 25.5 4.7X +after 1582, vec on, rebase CORRECTED 2529 2548 19 39.5 25.3 4.7X +before 1582, vec off, rebase LEGACY 12073 12082 10 8.3 120.7 1.0X +before 1582, vec off, rebase CORRECTED 11835 11890 47 8.4 118.4 1.0X +before 1582, vec on, rebase LEGACY 2809 2829 19 35.6 28.1 4.2X +before 1582, vec on, rebase CORRECTED 2487 2509 21 40.2 24.9 4.8X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_INT96 to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2332 2332 0 42.9 23.3 1.0X -before 1900, noop 2316 2316 0 43.2 23.2 1.0X -after 1900, rebase EXCEPTION 12731 12731 0 7.9 127.3 0.2X -after 1900, rebase LEGACY 13684 13684 0 7.3 136.8 0.2X -after 1900, rebase CORRECTED 12925 12925 0 7.7 129.3 0.2X -before 1900, rebase LEGACY 14813 14813 0 6.8 148.1 0.2X -before 1900, rebase CORRECTED 13043 13043 0 7.7 130.4 0.2X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1900, noop 2307 2307 0 43.3 23.1 1.0X +before 1900, noop 2414 2414 0 41.4 24.1 1.0X +after 1900, rebase EXCEPTION 13251 13251 0 7.5 132.5 0.2X +after 1900, rebase LEGACY 13218 13218 0 7.6 132.2 0.2X +after 1900, rebase CORRECTED 12495 12495 0 8.0 124.9 0.2X +before 1900, rebase LEGACY 14825 14825 0 6.7 148.2 0.2X +before 1900, rebase CORRECTED 12741 12741 0 7.8 127.4 0.2X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_INT96 from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 15058 15125 62 6.6 150.6 1.0X -after 1900, vec off, rebase LEGACY 14938 14973 34 6.7 149.4 1.0X -after 1900, vec off, rebase CORRECTED 14827 14890 69 6.7 148.3 1.0X -after 1900, vec on, rebase EXCEPTION 5442 5498 51 18.4 54.4 2.8X -after 1900, vec on, rebase LEGACY 5500 5532 54 18.2 55.0 2.7X -after 1900, vec on, rebase CORRECTED 5493 5504 15 18.2 54.9 2.7X -before 1900, vec off, rebase LEGACY 19000 19002 2 5.3 190.0 0.8X -before 1900, vec off, rebase CORRECTED 16181 16225 41 6.2 161.8 0.9X -before 1900, vec 
on, rebase LEGACY 7514 7529 26 13.3 75.1 2.0X -before 1900, vec on, rebase CORRECTED 5677 5698 25 17.6 56.8 2.7X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1900, vec off, rebase EXCEPTION 14943 14997 65 6.7 149.4 1.0X +after 1900, vec off, rebase LEGACY 15155 15243 101 6.6 151.6 1.0X +after 1900, vec off, rebase CORRECTED 14988 15008 20 6.7 149.9 1.0X +after 1900, vec on, rebase EXCEPTION 5430 5440 11 18.4 54.3 2.8X +after 1900, vec on, rebase LEGACY 5446 5458 11 18.4 54.5 2.7X +after 1900, vec on, rebase CORRECTED 5409 5440 41 18.5 54.1 2.8X +before 1900, vec off, rebase LEGACY 18150 18193 46 5.5 181.5 0.8X +before 1900, vec off, rebase CORRECTED 15954 15969 16 6.3 159.5 0.9X +before 1900, vec on, rebase LEGACY 7145 7152 8 14.0 71.5 2.1X +before 1900, vec on, rebase CORRECTED 5396 5408 11 18.5 54.0 2.8X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MICROS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2356 2356 0 42.5 23.6 1.0X -before 1900, noop 2483 2483 0 40.3 24.8 0.9X -after 1900, rebase EXCEPTION 11597 11597 0 8.6 116.0 0.2X -after 1900, rebase LEGACY 11557 11557 0 8.7 115.6 0.2X -after 1900, rebase CORRECTED 11536 11536 0 8.7 115.4 0.2X -before 1900, rebase LEGACY 13606 13606 0 7.3 136.1 0.2X -before 1900, rebase CORRECTED 11738 11738 0 8.5 117.4 0.2X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1900, noop 2344 2344 0 42.7 23.4 1.0X +before 1900, noop 2360 2360 0 42.4 23.6 1.0X +after 1900, rebase EXCEPTION 11075 11075 0 9.0 110.7 0.2X +after 1900, rebase LEGACY 11018 11018 0 9.1 110.2 0.2X +after 1900, rebase CORRECTED 15681 15681 0 6.4 156.8 0.1X +before 1900, rebase LEGACY 13002 13002 0 7.7 130.0 0.2X +before 1900, rebase CORRECTED 11179 11179 0 8.9 111.8 0.2X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MICROS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14878 14912 47 6.7 148.8 1.0X -after 1900, vec off, rebase LEGACY 14937 14992 49 6.7 149.4 1.0X -after 1900, vec off, rebase CORRECTED 14880 14910 39 6.7 148.8 1.0X -after 1900, vec on, rebase EXCEPTION 3768 3779 10 26.5 37.7 3.9X -after 1900, vec on, rebase LEGACY 3855 3881 34 25.9 38.6 3.9X -after 1900, vec on, rebase CORRECTED 3832 3882 44 26.1 38.3 3.9X -before 1900, vec off, rebase LEGACY 17291 17350 53 5.8 172.9 0.9X -before 1900, vec off, rebase CORRECTED 14862 14898 33 6.7 148.6 1.0X -before 1900, vec on, rebase LEGACY 5701 5724 28 17.5 57.0 2.6X -before 1900, vec on, rebase CORRECTED 3830 3844 12 26.1 38.3 3.9X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1900, vec off, rebase EXCEPTION 15439 15460 29 6.5 154.4 1.0X +after 1900, vec off, rebase LEGACY 15345 15375 44 6.5 153.5 1.0X +after 1900, vec off, rebase CORRECTED 15418 15470 85 6.5 154.2 1.0X +after 1900, vec on, rebase EXCEPTION 3850 3895 51 26.0 38.5 4.0X +after 1900, vec on, rebase LEGACY 3862 3896 31 25.9 38.6 4.0X +after 1900, vec on, rebase CORRECTED 3827 3846 20 26.1 38.3 4.0X +before 1900, vec off, rebase LEGACY 17672 17726 48 5.7 176.7 0.9X +before 1900, vec off, 
rebase CORRECTED 15368 15407 45 6.5 153.7 1.0X +before 1900, vec on, rebase LEGACY 5715 5729 14 17.5 57.1 2.7X +before 1900, vec on, rebase CORRECTED 3809 3872 63 26.3 38.1 4.1X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP_MILLIS to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2186 2186 0 45.7 21.9 1.0X -before 1900, noop 2096 2096 0 47.7 21.0 1.0X -after 1900, rebase EXCEPTION 11031 11031 0 9.1 110.3 0.2X -after 1900, rebase LEGACY 10989 10989 0 9.1 109.9 0.2X -after 1900, rebase CORRECTED 10861 10861 0 9.2 108.6 0.2X -before 1900, rebase LEGACY 13330 13330 0 7.5 133.3 0.2X -before 1900, rebase CORRECTED 11320 11320 0 8.8 113.2 0.2X - -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +after 1900, noop 2350 2350 0 42.6 23.5 1.0X +before 1900, noop 2285 2285 0 43.8 22.8 1.0X +after 1900, rebase EXCEPTION 10977 10977 0 9.1 109.8 0.2X +after 1900, rebase LEGACY 10489 10489 0 9.5 104.9 0.2X +after 1900, rebase CORRECTED 10558 10558 0 9.5 105.6 0.2X +before 1900, rebase LEGACY 12991 12991 0 7.7 129.9 0.2X +before 1900, rebase CORRECTED 10591 10591 0 9.4 105.9 0.2X + +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP_MILLIS from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase EXCEPTION 14936 14989 88 6.7 149.4 1.0X -after 1900, vec off, rebase LEGACY 15025 15083 101 6.7 150.2 1.0X -after 1900, vec off, rebase CORRECTED 14907 14956 45 6.7 149.1 1.0X -after 1900, vec on, rebase EXCEPTION 3940 3967 28 25.4 39.4 3.8X -after 1900, vec on, rebase LEGACY 4553 4569 27 22.0 45.5 3.3X -after 1900, vec on, rebase CORRECTED 3944 3982 61 25.4 39.4 3.8X -before 1900, vec off, rebase LEGACY 17301 17340 56 5.8 173.0 0.9X -before 1900, vec off, rebase CORRECTED 14959 14998 45 6.7 149.6 1.0X -before 1900, vec on, rebase LEGACY 6380 6420 40 15.7 63.8 2.3X -before 1900, vec on, rebase CORRECTED 3963 3989 28 25.2 39.6 3.8X +after 1900, vec off, rebase EXCEPTION 15242 15326 82 6.6 152.4 1.0X +after 1900, vec off, rebase LEGACY 15273 15325 51 6.5 152.7 1.0X +after 1900, vec off, rebase CORRECTED 15273 15291 19 6.5 152.7 1.0X +after 1900, vec on, rebase EXCEPTION 3942 3964 23 25.4 39.4 3.9X +after 1900, vec on, rebase LEGACY 4558 4595 33 21.9 45.6 3.3X +after 1900, vec on, rebase CORRECTED 3912 3949 39 25.6 39.1 3.9X +before 1900, vec off, rebase LEGACY 17593 17643 44 5.7 175.9 0.9X +before 1900, vec off, rebase CORRECTED 15336 15346 11 6.5 153.4 1.0X +before 1900, vec on, rebase LEGACY 6310 6332 37 15.8 63.1 2.4X +before 1900, vec on, rebase CORRECTED 3947 3956 8 25.3 39.5 3.9X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save DATE to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 13755 13755 0 7.3 137.5 1.0X -before 1582, noop 7969 7969 0 12.5 79.7 1.7X -after 1582 17101 17101 0 5.8 171.0 0.8X -before 1582 11962 11962 0 8.4 119.6 1.1X +after 1582, noop 12923 12923 0 7.7 129.2 1.0X +before 1582, noop 7980 7980 0 12.5 79.8 1.6X +after 1582 16656 16656 0 6.0 166.6 0.8X +before 1582 11823 11823 0 8.5 118.2 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load DATE from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 8810 8852 38 11.4 88.1 1.0X -after 1582, vec on 2401 2441 39 41.7 24.0 3.7X -before 1582, vec off 8791 8812 19 11.4 87.9 1.0X -before 1582, vec on 2540 2547 7 39.4 25.4 3.5X +after 1582, vec off 8836 8854 18 11.3 88.4 1.0X +after 1582, vec on 2492 2520 24 40.1 24.9 3.5X +before 1582, vec off 8903 8931 25 11.2 89.0 1.0X +before 1582, vec on 2644 2652 9 37.8 26.4 3.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Save TIMESTAMP to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2079 2079 0 48.1 20.8 1.0X -before 1900, noop 2058 2058 0 48.6 20.6 1.0X -after 1900 9653 9653 0 10.4 96.5 0.2X -before 1900 11808 11808 0 8.5 118.1 0.2X +after 1900, noop 2310 2310 0 43.3 23.1 1.0X +before 1900, noop 2237 2237 0 44.7 22.4 1.0X +after 1900 9656 9656 0 10.4 96.6 0.2X +before 1900 11859 11859 0 8.4 118.6 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Load TIMESTAMP from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off 10416 10464 45 9.6 104.2 1.0X -after 1900, vec on 4435 4440 8 22.5 44.3 2.3X -before 1900, vec off 12357 12366 14 8.1 123.6 0.8X -before 1900, vec on 6040 6050 14 16.6 60.4 1.7X +after 1900, vec off 10102 10150 47 9.9 101.0 1.0X +after 1900, vec on 4274 4296 20 23.4 42.7 2.4X +before 1900, vec off 12042 12119 76 8.3 120.4 0.8X +before 1900, vec on 5850 5859 13 17.1 58.5 1.7X diff --git a/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..0a6164bc652e1 --- /dev/null +++ b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt @@ -0,0 +1,8 @@ +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF-32 47469 47482 19 0.2 4746.9 1.0X +UTF-16 57463 57487 35 0.2 5746.3 0.8X +UTF-8 2803 2805 3 3.6 280.3 16.9X + diff --git a/sql/core/benchmarks/EncodeBenchmark-results.txt b/sql/core/benchmarks/EncodeBenchmark-results.txt new file mode 100644 index 
0000000000000..404138db7d36d --- /dev/null +++ b/sql/core/benchmarks/EncodeBenchmark-results.txt @@ -0,0 +1,8 @@ +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF-32 31107 31205 138 0.3 3110.7 1.0X +UTF-16 47904 47934 43 0.2 4790.4 0.6X +UTF-8 2957 2978 30 3.4 295.7 10.5X + diff --git a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt index 59b806bb5d5f3..08cd0f2c47f86 100644 --- a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-jdk21-results.txt @@ -2,44 +2,44 @@ WITHOUT SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array with 100000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 2553 2566 18 40.1 24.9 1.0X -ExternalAppendOnlyUnsafeRowArray 3528 3545 24 29.0 34.5 0.7X +ArrayBuffer 2445 2451 10 41.9 23.9 1.0X +ExternalAppendOnlyUnsafeRowArray 3464 3489 36 29.6 33.8 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 5503 5512 13 47.6 21.0 1.0X -ExternalAppendOnlyUnsafeRowArray 10635 10654 26 24.6 40.6 0.5X +ArrayBuffer 5292 5328 50 49.5 20.2 1.0X +ExternalAppendOnlyUnsafeRowArray 11921 11927 9 22.0 45.5 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array with 30000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 10547 10586 55 46.6 21.5 1.0X -ExternalAppendOnlyUnsafeRowArray 16275 16422 208 30.2 33.1 0.6X +ArrayBuffer 10418 10422 6 47.2 21.2 1.0X +ExternalAppendOnlyUnsafeRowArray 16589 16692 145 29.6 33.8 0.6X ================================================================================================ WITH SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Spilling with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 8525 8546 29 30.7 32.5 1.0X -ExternalAppendOnlyUnsafeRowArray 6313 6315 3 41.5 24.1 1.4X +UnsafeExternalSorter 8507 8542 50 30.8 32.5 1.0X 
+ExternalAppendOnlyUnsafeRowArray 6301 6314 18 41.6 24.0 1.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Spilling with 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 5 5 1 32.4 30.8 1.0X -ExternalAppendOnlyUnsafeRowArray 4 4 0 40.2 24.9 1.2X +UnsafeExternalSorter 5 5 0 33.0 30.3 1.0X +ExternalAppendOnlyUnsafeRowArray 4 4 0 40.1 24.9 1.2X diff --git a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt index 8e9cc9ecbba00..10af77fdd8bb2 100644 --- a/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt +++ b/sql/core/benchmarks/ExternalAppendOnlyUnsafeRowArrayBenchmark-results.txt @@ -2,44 +2,44 @@ WITHOUT SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array with 100000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 2532 2539 10 40.4 24.7 1.0X -ExternalAppendOnlyUnsafeRowArray 3568 3577 12 28.7 34.8 0.7X +ArrayBuffer 2330 2333 4 44.0 22.8 1.0X +ExternalAppendOnlyUnsafeRowArray 3306 3317 15 31.0 32.3 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 5144 5154 13 51.0 19.6 1.0X -ExternalAppendOnlyUnsafeRowArray 10745 10770 35 24.4 41.0 0.5X +ArrayBuffer 5594 5598 6 46.9 21.3 1.0X +ExternalAppendOnlyUnsafeRowArray 12278 12332 75 21.4 46.8 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Array with 30000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ArrayBuffer 9749 9751 3 50.4 19.8 1.0X -ExternalAppendOnlyUnsafeRowArray 17484 17526 59 28.1 35.6 0.6X +ArrayBuffer 10249 10252 4 48.0 20.9 1.0X +ExternalAppendOnlyUnsafeRowArray 16386 16397 16 30.0 33.3 0.6X ================================================================================================ WITH SPILL ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Spilling with 1000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 8372 8408 50 31.3 31.9 1.0X -ExternalAppendOnlyUnsafeRowArray 6238 
6243 7 42.0 23.8 1.3X +UnsafeExternalSorter 8294 8315 30 31.6 31.6 1.0X +ExternalAppendOnlyUnsafeRowArray 6767 6797 42 38.7 25.8 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Spilling with 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UnsafeExternalSorter 5 5 0 34.7 28.9 1.0X -ExternalAppendOnlyUnsafeRowArray 4 4 0 41.6 24.0 1.2X +UnsafeExternalSorter 5 5 0 34.2 29.2 1.0X +ExternalAppendOnlyUnsafeRowArray 4 4 0 38.8 25.8 1.1X diff --git a/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt b/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt index 736518398edb6..a1c284712c3d4 100644 --- a/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ExtractBenchmark-jdk21-results.txt @@ -1,104 +1,104 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke extract for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 266 280 12 37.6 26.6 1.0X -YEAR of timestamp 749 752 4 13.3 74.9 0.4X -YEAROFWEEK of timestamp 680 684 6 14.7 68.0 0.4X -QUARTER of timestamp 698 703 6 14.3 69.8 0.4X -MONTH of timestamp 614 625 10 16.3 61.4 0.4X -WEEK of timestamp 884 888 3 11.3 88.4 0.3X -DAY of timestamp 614 621 7 16.3 61.4 0.4X -DAYOFWEEK of timestamp 781 787 8 12.8 78.1 0.3X -DOW of timestamp 795 798 2 12.6 79.5 0.3X -DOW_ISO of timestamp 748 763 22 13.4 74.8 0.4X -DAYOFWEEK_ISO of timestamp 692 698 8 14.4 69.2 0.4X -DOY of timestamp 640 643 4 15.6 64.0 0.4X -HOUR of timestamp 473 479 8 21.2 47.3 0.6X -MINUTE of timestamp 472 476 4 21.2 47.2 0.6X -SECOND of timestamp 533 535 2 18.8 53.3 0.5X +cast to timestamp 288 299 15 34.7 28.8 1.0X +YEAR of timestamp 663 672 8 15.1 66.3 0.4X +YEAROFWEEK of timestamp 638 659 25 15.7 63.8 0.5X +QUARTER of timestamp 631 633 2 15.9 63.1 0.5X +MONTH of timestamp 568 575 6 17.6 56.8 0.5X +WEEK of timestamp 858 861 4 11.7 85.8 0.3X +DAY of timestamp 573 576 2 17.4 57.3 0.5X +DAYOFWEEK of timestamp 745 748 5 13.4 74.5 0.4X +DOW of timestamp 736 747 14 13.6 73.6 0.4X +DOW_ISO of timestamp 676 681 5 14.8 67.6 0.4X +DAYOFWEEK_ISO of timestamp 680 682 3 14.7 68.0 0.4X +DOY of timestamp 591 598 6 16.9 59.1 0.5X +HOUR of timestamp 474 479 4 21.1 47.4 0.6X +MINUTE of timestamp 474 477 3 21.1 47.4 0.6X +SECOND of timestamp 534 539 6 18.7 53.4 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke date_part for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 245 250 5 40.9 24.5 1.0X -YEAR of timestamp 596 599 3 16.8 59.6 0.4X -YEAROFWEEK of timestamp 651 655 3 15.4 65.1 0.4X -QUARTER of timestamp 752 754 2 13.3 75.2 0.3X -MONTH of timestamp 608 614 10 16.4 60.8 0.4X -WEEK of timestamp 874 879 5 11.4 87.4 0.3X -DAY of timestamp 609 611 2 16.4 60.9 0.4X -DAYOFWEEK of timestamp 734 747 18 13.6 73.4 0.3X -DOW of timestamp 
739 753 19 13.5 73.9 0.3X -DOW_ISO of timestamp 685 687 1 14.6 68.5 0.4X -DAYOFWEEK_ISO of timestamp 682 689 10 14.7 68.2 0.4X -DOY of timestamp 638 641 4 15.7 63.8 0.4X -HOUR of timestamp 470 480 15 21.3 47.0 0.5X -MINUTE of timestamp 466 481 23 21.4 46.6 0.5X -SECOND of timestamp 532 533 1 18.8 53.2 0.5X +cast to timestamp 246 248 2 40.7 24.6 1.0X +YEAR of timestamp 532 533 1 18.8 53.2 0.5X +YEAROFWEEK of timestamp 613 616 3 16.3 61.3 0.4X +QUARTER of timestamp 615 620 6 16.3 61.5 0.4X +MONTH of timestamp 563 564 1 17.8 56.3 0.4X +WEEK of timestamp 851 855 5 11.8 85.1 0.3X +DAY of timestamp 567 568 1 17.6 56.7 0.4X +DAYOFWEEK of timestamp 731 738 8 13.7 73.1 0.3X +DOW of timestamp 730 734 5 13.7 73.0 0.3X +DOW_ISO of timestamp 668 668 1 15.0 66.8 0.4X +DAYOFWEEK_ISO of timestamp 666 678 10 15.0 66.6 0.4X +DOY of timestamp 586 591 5 17.1 58.6 0.4X +HOUR of timestamp 471 472 2 21.2 47.1 0.5X +MINUTE of timestamp 473 478 5 21.1 47.3 0.5X +SECOND of timestamp 533 534 2 18.8 53.3 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke extract for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 602 605 3 16.6 60.2 1.0X -YEAR of date 596 601 7 16.8 59.6 1.0X -YEAROFWEEK of date 651 657 5 15.4 65.1 0.9X -QUARTER of date 747 750 2 13.4 74.7 0.8X -MONTH of date 600 622 39 16.7 60.0 1.0X -WEEK of date 876 882 7 11.4 87.6 0.7X -DAY of date 624 630 6 16.0 62.4 1.0X -DAYOFWEEK of date 731 735 3 13.7 73.1 0.8X -DOW of date 731 735 7 13.7 73.1 0.8X -DOW_ISO of date 677 679 2 14.8 67.7 0.9X -DAYOFWEEK_ISO of date 682 682 1 14.7 68.2 0.9X -DOY of date 630 636 7 15.9 63.0 1.0X -HOUR of date 980 983 5 10.2 98.0 0.6X -MINUTE of date 982 987 4 10.2 98.2 0.6X -SECOND of date 1037 1040 5 9.6 103.7 0.6X +cast to date 511 514 3 19.6 51.1 1.0X +YEAR of date 528 535 6 18.9 52.8 1.0X +YEAROFWEEK of date 610 615 7 16.4 61.0 0.8X +QUARTER of date 609 613 6 16.4 60.9 0.8X +MONTH of date 559 560 1 17.9 55.9 0.9X +WEEK of date 849 859 10 11.8 84.9 0.6X +DAY of date 564 568 4 17.7 56.4 0.9X +DAYOFWEEK of date 696 715 19 14.4 69.6 0.7X +DOW of date 692 693 1 14.4 69.2 0.7X +DOW_ISO of date 628 634 7 15.9 62.8 0.8X +DAYOFWEEK_ISO of date 629 633 7 15.9 62.9 0.8X +DOY of date 550 556 7 18.2 55.0 0.9X +HOUR of date 952 955 3 10.5 95.2 0.5X +MINUTE of date 953 962 12 10.5 95.3 0.5X +SECOND of date 1027 1031 6 9.7 102.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke date_part for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 599 604 6 16.7 59.9 1.0X -YEAR of date 588 595 9 17.0 58.8 1.0X -YEAROFWEEK of date 648 650 2 15.4 64.8 0.9X -QUARTER of date 754 766 18 13.3 75.4 0.8X -MONTH of date 598 601 3 16.7 59.8 1.0X -WEEK of date 875 878 5 11.4 87.5 0.7X -DAY of date 604 608 3 16.5 60.4 1.0X -DAYOFWEEK of date 734 734 0 13.6 73.4 0.8X -DOW of date 730 733 5 13.7 73.0 0.8X -DOW_ISO of date 682 684 2 14.7 68.2 0.9X -DAYOFWEEK_ISO of date 678 680 2 14.8 67.8 0.9X -DOY of date 632 634 3 15.8 63.2 0.9X -HOUR of date 978 981 3 10.2 97.8 0.6X -MINUTE of date 980 984 4 10.2 98.0 
0.6X -SECOND of date 1040 1042 1 9.6 104.0 0.6X +cast to date 481 484 5 20.8 48.1 1.0X +YEAR of date 489 495 5 20.5 48.9 1.0X +YEAROFWEEK of date 569 574 5 17.6 56.9 0.8X +QUARTER of date 573 574 1 17.5 57.3 0.8X +MONTH of date 515 518 3 19.4 51.5 0.9X +WEEK of date 816 818 3 12.3 81.6 0.6X +DAY of date 528 528 0 18.9 52.8 0.9X +DAYOFWEEK of date 694 706 18 14.4 69.4 0.7X +DOW of date 692 693 2 14.4 69.2 0.7X +DOW_ISO of date 628 630 3 15.9 62.8 0.8X +DAYOFWEEK_ISO of date 628 635 7 15.9 62.8 0.8X +DOY of date 551 557 7 18.2 55.1 0.9X +HOUR of date 954 961 9 10.5 95.4 0.5X +MINUTE of date 954 955 3 10.5 95.4 0.5X +SECOND of date 1034 1039 9 9.7 103.4 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke extract for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 758 760 2 13.2 75.8 1.0X -YEAR of interval 742 747 7 13.5 74.2 1.0X -MONTH of interval 739 742 3 13.5 73.9 1.0X -DAY of interval 738 739 2 13.6 73.8 1.0X -HOUR of interval 746 748 2 13.4 74.6 1.0X -MINUTE of interval 740 745 5 13.5 74.0 1.0X -SECOND of interval 801 810 9 12.5 80.1 0.9X +cast to interval 705 710 4 14.2 70.5 1.0X +YEAR of interval 671 673 3 14.9 67.1 1.1X +MONTH of interval 679 686 9 14.7 67.9 1.0X +DAY of interval 674 678 6 14.8 67.4 1.0X +HOUR of interval 680 684 4 14.7 68.0 1.0X +MINUTE of interval 682 688 6 14.7 68.2 1.0X +SECOND of interval 736 741 5 13.6 73.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke date_part for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 753 756 2 13.3 75.3 1.0X -YEAR of interval 742 743 1 13.5 74.2 1.0X -MONTH of interval 740 741 1 13.5 74.0 1.0X -DAY of interval 736 739 3 13.6 73.6 1.0X -HOUR of interval 738 740 2 13.5 73.8 1.0X -MINUTE of interval 741 743 4 13.5 74.1 1.0X -SECOND of interval 802 803 3 12.5 80.2 0.9X +cast to interval 708 709 0 14.1 70.8 1.0X +YEAR of interval 674 677 3 14.8 67.4 1.1X +MONTH of interval 675 677 2 14.8 67.5 1.0X +DAY of interval 670 671 3 14.9 67.0 1.1X +HOUR of interval 681 683 2 14.7 68.1 1.0X +MINUTE of interval 685 686 2 14.6 68.5 1.0X +SECOND of interval 725 735 13 13.8 72.5 1.0X diff --git a/sql/core/benchmarks/ExtractBenchmark-results.txt b/sql/core/benchmarks/ExtractBenchmark-results.txt index 0c5d5f10880c4..e0c939c54947d 100644 --- a/sql/core/benchmarks/ExtractBenchmark-results.txt +++ b/sql/core/benchmarks/ExtractBenchmark-results.txt @@ -1,104 +1,104 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke extract for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 296 318 24 33.8 29.6 1.0X -YEAR of timestamp 805 806 1 12.4 80.5 0.4X -YEAROFWEEK of timestamp 856 880 26 11.7 85.6 0.3X -QUARTER of timestamp 823 829 6 12.1 82.3 0.4X -MONTH of timestamp 793 800 7 12.6 79.3 0.4X -WEEK 
of timestamp 1124 1132 7 8.9 112.4 0.3X -DAY of timestamp 802 806 5 12.5 80.2 0.4X -DAYOFWEEK of timestamp 945 948 2 10.6 94.5 0.3X -DOW of timestamp 945 948 4 10.6 94.5 0.3X -DOW_ISO of timestamp 886 893 7 11.3 88.6 0.3X -DAYOFWEEK_ISO of timestamp 890 894 5 11.2 89.0 0.3X -DOY of timestamp 831 831 1 12.0 83.1 0.4X -HOUR of timestamp 577 581 5 17.3 57.7 0.5X -MINUTE of timestamp 578 590 19 17.3 57.8 0.5X -SECOND of timestamp 659 664 5 15.2 65.9 0.4X +cast to timestamp 310 342 28 32.3 31.0 1.0X +YEAR of timestamp 786 795 11 12.7 78.6 0.4X +YEAROFWEEK of timestamp 847 891 52 11.8 84.7 0.4X +QUARTER of timestamp 795 800 8 12.6 79.5 0.4X +MONTH of timestamp 785 801 14 12.7 78.5 0.4X +WEEK of timestamp 1087 1091 4 9.2 108.7 0.3X +DAY of timestamp 783 784 0 12.8 78.3 0.4X +DAYOFWEEK of timestamp 919 921 2 10.9 91.9 0.3X +DOW of timestamp 923 925 2 10.8 92.3 0.3X +DOW_ISO of timestamp 982 991 10 10.2 98.2 0.3X +DAYOFWEEK_ISO of timestamp 988 993 5 10.1 98.8 0.3X +DOY of timestamp 791 793 2 12.6 79.1 0.4X +HOUR of timestamp 549 551 2 18.2 54.9 0.6X +MINUTE of timestamp 545 550 8 18.4 54.5 0.6X +SECOND of timestamp 648 652 4 15.4 64.8 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke date_part for timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp 268 279 13 37.3 26.8 1.0X -YEAR of timestamp 785 786 1 12.7 78.5 0.3X -YEAROFWEEK of timestamp 840 842 5 11.9 84.0 0.3X -QUARTER of timestamp 804 808 3 12.4 80.4 0.3X -MONTH of timestamp 787 789 2 12.7 78.7 0.3X -WEEK of timestamp 1122 1123 1 8.9 112.2 0.2X -DAY of timestamp 789 794 7 12.7 78.9 0.3X -DAYOFWEEK of timestamp 934 935 1 10.7 93.4 0.3X -DOW of timestamp 933 937 5 10.7 93.3 0.3X -DOW_ISO of timestamp 887 896 9 11.3 88.7 0.3X -DAYOFWEEK_ISO of timestamp 883 888 4 11.3 88.3 0.3X -DOY of timestamp 826 828 4 12.1 82.6 0.3X -HOUR of timestamp 579 584 5 17.3 57.9 0.5X -MINUTE of timestamp 575 584 12 17.4 57.5 0.5X -SECOND of timestamp 663 665 3 15.1 66.3 0.4X +cast to timestamp 248 250 2 40.4 24.8 1.0X +YEAR of timestamp 771 779 10 13.0 77.1 0.3X +YEAROFWEEK of timestamp 825 827 4 12.1 82.5 0.3X +QUARTER of timestamp 780 783 4 12.8 78.0 0.3X +MONTH of timestamp 779 785 8 12.8 77.9 0.3X +WEEK of timestamp 1075 1082 11 9.3 107.5 0.2X +DAY of timestamp 777 781 7 12.9 77.7 0.3X +DAYOFWEEK of timestamp 908 915 7 11.0 90.8 0.3X +DOW of timestamp 906 914 7 11.0 90.6 0.3X +DOW_ISO of timestamp 982 986 3 10.2 98.2 0.3X +DAYOFWEEK_ISO of timestamp 986 988 2 10.1 98.6 0.3X +DOY of timestamp 792 801 11 12.6 79.2 0.3X +HOUR of timestamp 546 549 3 18.3 54.6 0.5X +MINUTE of timestamp 553 557 4 18.1 55.3 0.4X +SECOND of timestamp 646 657 12 15.5 64.6 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke extract for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 716 721 4 14.0 71.6 1.0X -YEAR of date 782 783 1 12.8 78.2 0.9X -YEAROFWEEK of date 830 833 3 12.0 83.0 0.9X -QUARTER of date 801 805 4 12.5 80.1 0.9X -MONTH of date 782 791 11 12.8 78.2 0.9X -WEEK of date 1114 1116 1 9.0 111.4 0.6X -DAY of 
date 790 795 5 12.7 79.0 0.9X -DAYOFWEEK of date 934 940 6 10.7 93.4 0.8X -DOW of date 938 940 1 10.7 93.8 0.8X -DOW_ISO of date 879 883 4 11.4 87.9 0.8X -DAYOFWEEK_ISO of date 882 885 3 11.3 88.2 0.8X -DOY of date 825 830 7 12.1 82.5 0.9X -HOUR of date 1252 1255 4 8.0 125.2 0.6X -MINUTE of date 1241 1242 1 8.1 124.1 0.6X -SECOND of date 1405 1406 1 7.1 140.5 0.5X +cast to date 706 728 37 14.2 70.6 1.0X +YEAR of date 768 771 3 13.0 76.8 0.9X +YEAROFWEEK of date 821 826 5 12.2 82.1 0.9X +QUARTER of date 778 782 6 12.8 77.8 0.9X +MONTH of date 779 780 1 12.8 77.9 0.9X +WEEK of date 1074 1075 1 9.3 107.4 0.7X +DAY of date 773 777 3 12.9 77.3 0.9X +DAYOFWEEK of date 907 910 3 11.0 90.7 0.8X +DOW of date 907 910 3 11.0 90.7 0.8X +DOW_ISO of date 974 978 4 10.3 97.4 0.7X +DAYOFWEEK_ISO of date 978 979 1 10.2 97.8 0.7X +DOY of date 790 803 16 12.7 79.0 0.9X +HOUR of date 1188 1191 3 8.4 118.8 0.6X +MINUTE of date 1189 1192 4 8.4 118.9 0.6X +SECOND of date 1335 1340 6 7.5 133.5 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke date_part for date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date 714 718 5 14.0 71.4 1.0X -YEAR of date 783 786 5 12.8 78.3 0.9X -YEAROFWEEK of date 834 837 3 12.0 83.4 0.9X -QUARTER of date 803 806 2 12.4 80.3 0.9X -MONTH of date 780 788 11 12.8 78.0 0.9X -WEEK of date 1109 1112 3 9.0 110.9 0.6X -DAY of date 790 797 11 12.7 79.0 0.9X -DAYOFWEEK of date 931 934 3 10.7 93.1 0.8X -DOW of date 936 938 2 10.7 93.6 0.8X -DOW_ISO of date 884 894 12 11.3 88.4 0.8X -DAYOFWEEK_ISO of date 882 885 3 11.3 88.2 0.8X -DOY of date 821 837 21 12.2 82.1 0.9X -HOUR of date 1251 1251 1 8.0 125.1 0.6X -MINUTE of date 1245 1249 5 8.0 124.5 0.6X -SECOND of date 1399 1404 7 7.1 139.9 0.5X +cast to date 706 709 5 14.2 70.6 1.0X +YEAR of date 771 773 2 13.0 77.1 0.9X +YEAROFWEEK of date 820 823 4 12.2 82.0 0.9X +QUARTER of date 776 779 3 12.9 77.6 0.9X +MONTH of date 771 780 12 13.0 77.1 0.9X +WEEK of date 1075 1078 3 9.3 107.5 0.7X +DAY of date 772 774 2 13.0 77.2 0.9X +DAYOFWEEK of date 902 911 8 11.1 90.2 0.8X +DOW of date 901 912 9 11.1 90.1 0.8X +DOW_ISO of date 973 976 4 10.3 97.3 0.7X +DAYOFWEEK_ISO of date 974 976 2 10.3 97.4 0.7X +DOY of date 787 789 2 12.7 78.7 0.9X +HOUR of date 1186 1187 2 8.4 118.6 0.6X +MINUTE of date 1188 1191 3 8.4 118.8 0.6X +SECOND of date 1278 1310 51 7.8 127.8 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke extract for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 939 945 6 10.7 93.9 1.0X -YEAR of interval 918 923 4 10.9 91.8 1.0X -MONTH of interval 918 924 5 10.9 91.8 1.0X -DAY of interval 918 921 3 10.9 91.8 1.0X -HOUR of interval 934 937 3 10.7 93.4 1.0X -MINUTE of interval 936 937 1 10.7 93.6 1.0X -SECOND of interval 1083 1085 4 9.2 108.3 0.9X +cast to interval 1059 1064 8 9.4 105.9 1.0X +YEAR of interval 1054 1063 11 9.5 105.4 1.0X +MONTH of interval 1046 1047 2 9.6 104.6 1.0X +DAY of interval 1048 1052 4 9.5 104.8 1.0X +HOUR of interval 1042 1047 4 9.6 104.2 1.0X +MINUTE of interval 
1070 1075 5 9.4 107.0 1.0X +SECOND of interval 1142 1146 5 8.8 114.2 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Invoke date_part for interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to interval 943 945 2 10.6 94.3 1.0X -YEAR of interval 925 929 4 10.8 92.5 1.0X -MONTH of interval 921 925 5 10.9 92.1 1.0X -DAY of interval 927 933 10 10.8 92.7 1.0X -HOUR of interval 928 936 8 10.8 92.8 1.0X -MINUTE of interval 933 937 4 10.7 93.3 1.0X -SECOND of interval 1081 1083 1 9.2 108.1 0.9X +cast to interval 1061 1065 5 9.4 106.1 1.0X +YEAR of interval 1054 1056 4 9.5 105.4 1.0X +MONTH of interval 1049 1053 5 9.5 104.9 1.0X +DAY of interval 1057 1063 6 9.5 105.7 1.0X +HOUR of interval 1048 1051 3 9.5 104.8 1.0X +MINUTE of interval 1079 1083 5 9.3 107.9 1.0X +SECOND of interval 1137 1140 3 8.8 113.7 0.9X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt index 8ba705faddf8c..d3b677b84562e 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-jdk21-results.txt @@ -2,733 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6365 6420 67 2.5 404.7 1.0X -Parquet Vectorized (Pushdown) 315 341 24 49.9 20.0 20.2X -Native ORC Vectorized 4984 5073 69 3.2 316.9 1.3X -Native ORC Vectorized (Pushdown) 311 321 11 50.5 19.8 20.5X +Parquet Vectorized 6309 6370 57 2.5 401.1 1.0X +Parquet Vectorized (Pushdown) 294 324 23 53.4 18.7 21.4X +Native ORC Vectorized 5129 5216 60 3.1 326.1 1.2X +Native ORC Vectorized (Pushdown) 323 330 6 48.7 20.5 19.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6261 6292 25 2.5 398.1 1.0X -Parquet Vectorized (Pushdown) 281 298 12 56.1 17.8 22.3X -Native ORC Vectorized 5053 5078 22 3.1 321.3 1.2X -Native ORC Vectorized (Pushdown) 300 328 35 52.4 19.1 20.9X +Parquet Vectorized 6334 6358 22 2.5 402.7 1.0X +Parquet Vectorized (Pushdown) 270 290 12 58.2 17.2 23.4X +Native ORC Vectorized 5237 5252 12 3.0 332.9 1.2X +Native ORC Vectorized (Pushdown) 318 333 11 49.4 20.2 19.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6294 6356 90 2.5 400.1 1.0X -Parquet Vectorized (Pushdown) 270 283 14 58.3 17.1 23.3X -Native ORC Vectorized 5057 5086 20 3.1 321.5 1.2X -Native ORC Vectorized (Pushdown) 289 298 8 54.4 18.4 21.8X +Parquet Vectorized 6322 6345 15 2.5 401.9 1.0X +Parquet Vectorized (Pushdown) 256 267 7 61.4 16.3 24.7X +Native ORC Vectorized 5290 5305 9 3.0 336.3 1.2X +Native ORC Vectorized (Pushdown) 297 312 9 52.9 18.9 21.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6252 6274 17 2.5 397.5 1.0X -Parquet Vectorized (Pushdown) 256 272 15 61.4 16.3 24.4X -Native ORC Vectorized 5036 5054 12 3.1 320.2 1.2X -Native ORC Vectorized (Pushdown) 275 291 8 57.1 17.5 22.7X +Parquet Vectorized 6313 6327 11 2.5 401.4 1.0X +Parquet Vectorized (Pushdown) 256 264 7 61.4 16.3 24.6X +Native ORC Vectorized 5262 5293 33 3.0 334.6 1.2X +Native ORC Vectorized (Pushdown) 289 306 14 54.4 18.4 21.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6241 6259 11 2.5 396.8 1.0X -Parquet Vectorized (Pushdown) 257 266 12 61.3 16.3 24.3X -Native ORC Vectorized 5038 5055 20 3.1 320.3 1.2X -Native ORC Vectorized (Pushdown) 277 290 10 56.8 17.6 22.5X +Parquet Vectorized 6370 6387 11 2.5 405.0 1.0X +Parquet Vectorized (Pushdown) 254 265 11 61.9 16.2 25.1X +Native ORC Vectorized 5284 5294 7 3.0 335.9 1.2X +Native ORC Vectorized (Pushdown) 292 306 14 53.8 18.6 21.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12335 12564 207 1.3 784.2 1.0X -Parquet Vectorized (Pushdown) 12561 12587 20 1.3 798.6 1.0X -Native ORC Vectorized 11278 11295 12 1.4 717.0 1.1X -Native ORC Vectorized (Pushdown) 11398 11468 98 1.4 724.7 1.1X +Parquet Vectorized 12536 12596 56 1.3 797.0 1.0X +Parquet Vectorized (Pushdown) 12610 12645 26 1.2 801.7 1.0X +Native ORC Vectorized 11428 11444 16 1.4 726.6 1.1X +Native ORC Vectorized (Pushdown) 11524 11532 10 1.4 732.7 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6150 6175 26 2.6 391.0 1.0X -Parquet Vectorized 
(Pushdown) 252 276 25 62.4 16.0 24.4X -Native ORC Vectorized 4484 4503 12 3.5 285.1 1.4X -Native ORC Vectorized (Pushdown) 276 290 10 57.0 17.5 22.3X +Parquet Vectorized 6369 6487 151 2.5 404.9 1.0X +Parquet Vectorized (Pushdown) 292 381 77 53.9 18.5 21.8X +Native ORC Vectorized 4726 4808 85 3.3 300.5 1.3X +Native ORC Vectorized (Pushdown) 294 320 13 53.5 18.7 21.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5983 5997 10 2.6 380.4 1.0X -Parquet Vectorized (Pushdown) 258 274 18 61.1 16.4 23.2X -Native ORC Vectorized 4470 4481 9 3.5 284.2 1.3X -Native ORC Vectorized (Pushdown) 284 289 6 55.3 18.1 21.1X +Parquet Vectorized 6121 6133 16 2.6 389.2 1.0X +Parquet Vectorized (Pushdown) 257 276 25 61.3 16.3 23.8X +Native ORC Vectorized 4735 4759 20 3.3 301.0 1.3X +Native ORC Vectorized (Pushdown) 294 309 11 53.6 18.7 20.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6050 6074 27 2.6 384.6 1.0X -Parquet Vectorized (Pushdown) 251 262 15 62.7 16.0 24.1X -Native ORC Vectorized 4539 4568 29 3.5 288.6 1.3X -Native ORC Vectorized (Pushdown) 270 286 11 58.4 17.1 22.4X +Parquet Vectorized 6166 6182 16 2.6 392.0 1.0X +Parquet Vectorized (Pushdown) 249 261 13 63.2 15.8 24.8X +Native ORC Vectorized 4797 4812 14 3.3 305.0 1.3X +Native ORC Vectorized (Pushdown) 284 296 6 55.4 18.1 21.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5983 6015 26 2.6 380.4 1.0X -Parquet Vectorized (Pushdown) 242 247 4 65.0 15.4 24.7X -Native ORC Vectorized 4502 4529 34 3.5 286.2 1.3X -Native ORC Vectorized (Pushdown) 267 276 6 58.8 17.0 22.4X +Parquet Vectorized 6139 6164 17 2.6 390.3 1.0X +Parquet Vectorized (Pushdown) 241 256 16 65.3 15.3 25.5X +Native ORC Vectorized 4798 4837 59 3.3 305.1 1.3X +Native ORC Vectorized (Pushdown) 285 299 9 55.3 18.1 21.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5978 6000 14 2.6 380.1 1.0X -Parquet Vectorized (Pushdown) 243 254 12 64.8 15.4 24.6X -Native ORC Vectorized 4520 4532 12 3.5 287.4 1.3X -Native ORC Vectorized (Pushdown) 267 280 11 58.9 17.0 22.4X +Parquet Vectorized 6152 6176 26 2.6 391.2 1.0X +Parquet Vectorized (Pushdown) 
244 253 7 64.3 15.5 25.2X +Native ORC Vectorized 4789 4803 15 3.3 304.5 1.3X +Native ORC Vectorized (Pushdown) 285 294 7 55.2 18.1 21.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5992 6010 17 2.6 381.0 1.0X -Parquet Vectorized (Pushdown) 250 257 9 63.0 15.9 24.0X -Native ORC Vectorized 4525 4536 12 3.5 287.7 1.3X -Native ORC Vectorized (Pushdown) 267 276 7 58.9 17.0 22.4X +Parquet Vectorized 6146 6179 43 2.6 390.8 1.0X +Parquet Vectorized (Pushdown) 239 253 13 65.9 15.2 25.7X +Native ORC Vectorized 4791 4801 10 3.3 304.6 1.3X +Native ORC Vectorized (Pushdown) 281 293 9 55.9 17.9 21.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6624 6642 20 2.4 421.1 1.0X -Parquet Vectorized (Pushdown) 1438 1450 11 10.9 91.4 4.6X -Native ORC Vectorized 5086 5101 17 3.1 323.4 1.3X -Native ORC Vectorized (Pushdown) 1293 1303 8 12.2 82.2 5.1X +Parquet Vectorized 6748 6773 25 2.3 429.1 1.0X +Parquet Vectorized (Pushdown) 1435 1445 12 11.0 91.3 4.7X +Native ORC Vectorized 5345 5351 5 2.9 339.8 1.3X +Native ORC Vectorized (Pushdown) 1328 1335 4 11.8 84.4 5.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8735 8748 8 1.8 555.3 1.0X -Parquet Vectorized (Pushdown) 5861 5871 12 2.7 372.7 1.5X -Native ORC Vectorized 7281 7331 29 2.2 462.9 1.2X -Native ORC Vectorized (Pushdown) 5242 5250 12 3.0 333.3 1.7X +Parquet Vectorized 8886 8910 25 1.8 564.9 1.0X +Parquet Vectorized (Pushdown) 5996 6011 15 2.6 381.2 1.5X +Native ORC Vectorized 7499 7508 10 2.1 476.8 1.2X +Native ORC Vectorized (Pushdown) 5316 5330 18 3.0 338.0 1.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11127 11148 16 1.4 707.5 1.0X -Parquet Vectorized (Pushdown) 10606 10632 30 1.5 674.3 1.0X -Native ORC Vectorized 9596 9614 11 1.6 610.1 1.2X -Native ORC Vectorized (Pushdown) 9248 9257 8 1.7 588.0 1.2X +Parquet Vectorized 11048 11060 11 1.4 702.4 1.0X +Parquet Vectorized (Pushdown) 10492 10509 13 1.5 667.1 1.1X +Native ORC Vectorized 9684 9706 19 1.6 615.7 1.1X +Native ORC Vectorized (Pushdown) 9296 9311 21 1.7 591.0 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit 
Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11514 11537 32 1.4 732.0 1.0X -Parquet Vectorized (Pushdown) 11493 11514 17 1.4 730.7 1.0X -Native ORC Vectorized 10169 10181 11 1.5 646.5 1.1X -Native ORC Vectorized (Pushdown) 10303 10321 15 1.5 655.1 1.1X +Parquet Vectorized 11567 11612 50 1.4 735.4 1.0X +Parquet Vectorized (Pushdown) 11631 11642 9 1.4 739.5 1.0X +Native ORC Vectorized 10373 10388 11 1.5 659.5 1.1X +Native ORC Vectorized (Pushdown) 10450 10466 11 1.5 664.4 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11501 11520 21 1.4 731.2 1.0X -Parquet Vectorized (Pushdown) 11549 11591 40 1.4 734.3 1.0X -Native ORC Vectorized 10206 10220 15 1.5 648.9 1.1X -Native ORC Vectorized (Pushdown) 10274 10305 30 1.5 653.2 1.1X +Parquet Vectorized 11581 11600 12 1.4 736.3 1.0X +Parquet Vectorized (Pushdown) 11623 11644 18 1.4 738.9 1.0X +Native ORC Vectorized 10326 10333 9 1.5 656.5 1.1X +Native ORC Vectorized (Pushdown) 10394 10407 14 1.5 660.8 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11571 11635 49 1.4 735.7 1.0X -Parquet Vectorized (Pushdown) 11589 11620 31 1.4 736.8 1.0X -Native ORC Vectorized 10168 10209 45 1.5 646.4 1.1X -Native ORC Vectorized (Pushdown) 10245 10262 20 1.5 651.4 1.1X +Parquet Vectorized 11803 11819 20 1.3 750.4 1.0X +Parquet Vectorized (Pushdown) 11859 11868 9 1.3 754.0 1.0X +Native ORC Vectorized 10609 10614 4 1.5 674.5 1.1X +Native ORC Vectorized (Pushdown) 10681 10697 16 1.5 679.1 1.1X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5490 5539 43 2.9 349.0 1.0X -Parquet Vectorized (Pushdown) 207 222 13 75.9 13.2 26.5X -Native ORC Vectorized 6053 6083 42 2.6 384.8 0.9X -Native ORC Vectorized (Pushdown) 886 893 4 17.7 56.3 6.2X +Parquet Vectorized 5546 5587 37 2.8 352.6 1.0X +Parquet Vectorized (Pushdown) 218 235 18 72.2 13.8 25.5X +Native ORC Vectorized 6190 6206 15 2.5 393.6 0.9X +Native ORC Vectorized (Pushdown) 943 957 16 16.7 60.0 5.9X -OpenJDK 
64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5479 5491 18 2.9 348.3 1.0X -Parquet Vectorized (Pushdown) 208 222 11 75.7 13.2 26.4X -Native ORC Vectorized 6230 6242 13 2.5 396.1 0.9X -Native ORC Vectorized (Pushdown) 900 904 7 17.5 57.2 6.1X +Parquet Vectorized 5551 5567 13 2.8 352.9 1.0X +Parquet Vectorized (Pushdown) 215 224 7 73.0 13.7 25.8X +Native ORC Vectorized 6364 6372 6 2.5 404.6 0.9X +Native ORC Vectorized (Pushdown) 941 956 11 16.7 59.8 5.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5397 5434 67 2.9 343.1 1.0X -Parquet Vectorized (Pushdown) 251 273 17 62.8 15.9 21.5X -Native ORC Vectorized 6187 6225 48 2.5 393.3 0.9X -Native ORC Vectorized (Pushdown) 934 951 16 16.8 59.4 5.8X +Parquet Vectorized 5490 5498 6 2.9 349.1 1.0X +Parquet Vectorized (Pushdown) 259 271 10 60.8 16.4 21.2X +Native ORC Vectorized 6359 6377 12 2.5 404.3 0.9X +Native ORC Vectorized (Pushdown) 1003 1008 7 15.7 63.8 5.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5397 5414 10 2.9 343.1 1.0X -Parquet Vectorized (Pushdown) 271 315 68 58.1 17.2 19.9X -Native ORC Vectorized 6206 6221 16 2.5 394.6 0.9X -Native ORC Vectorized (Pushdown) 925 931 3 17.0 58.8 5.8X +Parquet Vectorized 5492 5495 4 2.9 349.2 1.0X +Parquet Vectorized (Pushdown) 256 266 8 61.5 16.3 21.5X +Native ORC Vectorized 6367 6378 16 2.5 404.8 0.9X +Native ORC Vectorized (Pushdown) 1002 1005 4 15.7 63.7 5.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5486 5497 12 2.9 348.8 1.0X -Parquet Vectorized (Pushdown) 254 264 9 62.0 16.1 21.6X -Native ORC Vectorized 6246 6272 19 2.5 397.1 0.9X -Native ORC Vectorized (Pushdown) 935 958 29 16.8 59.4 5.9X +Parquet Vectorized 5566 5582 18 2.8 353.9 1.0X +Parquet Vectorized (Pushdown) 275 278 5 57.3 17.5 20.3X +Native ORC Vectorized 6443 6451 8 2.4 409.6 0.9X +Native ORC Vectorized (Pushdown) 1003 1015 10 15.7 63.8 5.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all distinct 
string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11760 11813 38 1.3 747.7 1.0X -Parquet Vectorized (Pushdown) 11729 11761 49 1.3 745.7 1.0X -Native ORC Vectorized 13965 14180 169 1.1 887.9 0.8X -Native ORC Vectorized (Pushdown) 14779 14998 178 1.1 939.6 0.8X +Parquet Vectorized 12200 12218 15 1.3 775.6 1.0X +Parquet Vectorized (Pushdown) 12173 12206 40 1.3 773.9 1.0X +Native ORC Vectorized 13191 13208 19 1.2 838.7 0.9X +Native ORC Vectorized (Pushdown) 13378 13410 29 1.2 850.5 0.9X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6784 6853 66 2.3 431.3 1.0X -Parquet Vectorized (Pushdown) 855 910 57 18.4 54.4 7.9X -Native ORC Vectorized 5741 5816 64 2.7 365.0 1.2X -Native ORC Vectorized (Pushdown) 5678 5797 81 2.8 361.0 1.2X +Parquet Vectorized 6707 6727 17 2.3 426.4 1.0X +Parquet Vectorized (Pushdown) 846 855 12 18.6 53.8 7.9X +Native ORC Vectorized 5473 5506 67 2.9 347.9 1.2X +Native ORC Vectorized (Pushdown) 5550 5556 6 2.8 352.9 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6516 6533 12 2.4 414.3 1.0X -Parquet Vectorized (Pushdown) 254 270 17 61.9 16.2 25.6X -Native ORC Vectorized 5271 5324 40 3.0 335.1 1.2X -Native ORC Vectorized (Pushdown) 5173 5308 115 3.0 328.9 1.3X +Parquet Vectorized 6438 6452 14 2.4 409.3 1.0X +Parquet Vectorized (Pushdown) 255 262 8 61.7 16.2 25.2X +Native ORC Vectorized 5314 5320 5 3.0 337.8 1.2X +Native ORC Vectorized (Pushdown) 5404 5415 15 2.9 343.6 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6412 6431 24 2.5 407.7 1.0X -Parquet Vectorized (Pushdown) 235 242 5 66.9 14.9 27.3X -Native ORC Vectorized 5068 5206 157 3.1 322.2 1.3X -Native ORC Vectorized (Pushdown) 5139 5155 14 3.1 326.7 1.2X +Parquet Vectorized 6436 6455 11 2.4 409.2 1.0X +Parquet Vectorized (Pushdown) 260 266 7 60.5 16.5 24.8X +Native ORC Vectorized 5316 5327 10 3.0 338.0 1.2X +Native ORC Vectorized (Pushdown) 5394 5403 8 2.9 343.0 1.2X ================================================================================================ Pushdown 
benchmark for StringEndsWith ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5669 5828 139 2.8 360.4 1.0X -Parquet Vectorized (Pushdown) 395 414 18 39.8 25.1 14.4X -Native ORC Vectorized 6255 6272 19 2.5 397.7 0.9X -Native ORC Vectorized (Pushdown) 6397 6410 12 2.5 406.7 0.9X +Parquet Vectorized 5500 5593 76 2.9 349.7 1.0X +Parquet Vectorized (Pushdown) 335 339 4 46.9 21.3 16.4X +Native ORC Vectorized 6373 6395 24 2.5 405.2 0.9X +Native ORC Vectorized (Pushdown) 6573 6584 15 2.4 417.9 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5419 5429 9 2.9 344.5 1.0X -Parquet Vectorized (Pushdown) 260 276 14 60.5 16.5 20.8X -Native ORC Vectorized 6195 6223 35 2.5 393.8 0.9X -Native ORC Vectorized (Pushdown) 6519 6706 118 2.4 414.4 0.8X +Parquet Vectorized 5431 5435 5 2.9 345.3 1.0X +Parquet Vectorized (Pushdown) 246 249 3 64.0 15.6 22.1X +Native ORC Vectorized 6318 6330 9 2.5 401.7 0.9X +Native ORC Vectorized (Pushdown) 6518 6545 32 2.4 414.4 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5401 5413 8 2.9 343.4 1.0X -Parquet Vectorized (Pushdown) 249 268 23 63.3 15.8 21.7X -Native ORC Vectorized 6177 6183 7 2.5 392.7 0.9X -Native ORC Vectorized (Pushdown) 7467 7553 141 2.1 474.7 0.7X +Parquet Vectorized 5427 5436 16 2.9 345.0 1.0X +Parquet Vectorized (Pushdown) 246 249 2 63.9 15.6 22.1X +Native ORC Vectorized 6332 6345 12 2.5 402.6 0.9X +Native ORC Vectorized (Pushdown) 6501 6507 7 2.4 413.3 0.8X ================================================================================================ Pushdown benchmark for StringContains ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5852 6100 222 2.7 372.1 1.0X -Parquet Vectorized (Pushdown) 839 899 53 18.8 53.3 7.0X -Native ORC Vectorized 7383 7538 125 2.1 469.4 0.8X -Native ORC Vectorized (Pushdown) 7667 7872 159 2.1 487.5 0.8X +Parquet Vectorized 5671 5760 67 2.8 360.6 1.0X +Parquet Vectorized 
(Pushdown) 737 745 9 21.3 46.9 7.7X +Native ORC Vectorized 6477 6515 48 2.4 411.8 0.9X +Native ORC Vectorized (Pushdown) 6669 6689 19 2.4 424.0 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5501 5540 27 2.9 349.7 1.0X -Parquet Vectorized (Pushdown) 268 286 16 58.6 17.1 20.5X -Native ORC Vectorized 7027 7112 58 2.2 446.7 0.8X -Native ORC Vectorized (Pushdown) 6321 6345 26 2.5 401.8 0.9X +Parquet Vectorized 5446 5453 5 2.9 346.2 1.0X +Parquet Vectorized (Pushdown) 245 256 12 64.2 15.6 22.2X +Native ORC Vectorized 6275 6282 4 2.5 399.0 0.9X +Native ORC Vectorized (Pushdown) 6474 6482 7 2.4 411.6 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5390 5439 56 2.9 342.7 1.0X -Parquet Vectorized (Pushdown) 250 277 27 62.9 15.9 21.5X -Native ORC Vectorized 6128 6157 28 2.6 389.6 0.9X -Native ORC Vectorized (Pushdown) 6298 6322 24 2.5 400.4 0.9X +Parquet Vectorized 5438 5442 6 2.9 345.7 1.0X +Parquet Vectorized (Pushdown) 248 256 6 63.4 15.8 21.9X +Native ORC Vectorized 6266 6273 8 2.5 398.4 0.9X +Native ORC Vectorized (Pushdown) 6462 6475 13 2.4 410.8 0.8X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2399 2430 24 6.6 152.5 1.0X -Parquet Vectorized (Pushdown) 66 82 17 240.1 4.2 36.6X -Native ORC Vectorized 3131 3149 25 5.0 199.0 0.8X -Native ORC Vectorized (Pushdown) 58 66 7 273.1 3.7 41.7X +Parquet Vectorized 2417 2431 19 6.5 153.7 1.0X +Parquet Vectorized (Pushdown) 64 67 4 244.0 4.1 37.5X +Native ORC Vectorized 3441 3463 31 4.6 218.7 0.7X +Native ORC Vectorized (Pushdown) 58 62 7 272.0 3.7 41.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3517 3562 46 4.5 223.6 1.0X -Parquet Vectorized (Pushdown) 1638 1653 14 9.6 104.1 2.1X -Native ORC Vectorized 4376 4397 30 3.6 278.2 0.8X -Native ORC Vectorized (Pushdown) 1913 1927 20 8.2 121.6 1.8X +Parquet Vectorized 3560 3606 62 4.4 226.3 1.0X 
+Parquet Vectorized (Pushdown) 1678 1686 10 9.4 106.7 2.1X +Native ORC Vectorized 4686 4710 24 3.4 297.9 0.8X +Native ORC Vectorized (Pushdown) 1978 1992 21 8.0 125.7 1.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7145 7165 18 2.2 454.3 1.0X -Parquet Vectorized (Pushdown) 6858 6878 17 2.3 436.0 1.0X -Native ORC Vectorized 8642 8697 98 1.8 549.5 0.8X -Native ORC Vectorized (Pushdown) 8334 8351 17 1.9 529.9 0.9X +Parquet Vectorized 7360 7379 20 2.1 468.0 1.0X +Parquet Vectorized (Pushdown) 7055 7090 28 2.2 448.5 1.0X +Native ORC Vectorized 8850 8867 21 1.8 562.7 0.8X +Native ORC Vectorized (Pushdown) 8382 8394 10 1.9 532.9 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8037 8063 24 2.0 511.0 1.0X -Parquet Vectorized (Pushdown) 8067 8094 30 1.9 512.9 1.0X -Native ORC Vectorized 9665 9685 16 1.6 614.5 0.8X -Native ORC Vectorized (Pushdown) 9798 9849 41 1.6 623.0 0.8X +Parquet Vectorized 8369 8388 20 1.9 532.1 1.0X +Parquet Vectorized (Pushdown) 8409 8420 9 1.9 534.6 1.0X +Native ORC Vectorized 9926 9960 24 1.6 631.1 0.8X +Native ORC Vectorized (Pushdown) 9974 10012 34 1.6 634.2 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2650 2672 27 5.9 168.5 1.0X -Parquet Vectorized (Pushdown) 62 70 10 253.1 4.0 42.6X -Native ORC Vectorized 3181 3202 26 4.9 202.2 0.8X -Native ORC Vectorized (Pushdown) 55 63 6 287.6 3.5 48.5X +Parquet Vectorized 2601 2612 7 6.0 165.4 1.0X +Parquet Vectorized (Pushdown) 64 69 8 247.5 4.0 40.9X +Native ORC Vectorized 3243 3261 21 4.9 206.2 0.8X +Native ORC Vectorized (Pushdown) 56 63 5 279.6 3.6 46.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3254 3305 73 4.8 206.9 1.0X -Parquet Vectorized (Pushdown) 931 945 12 16.9 59.2 3.5X -Native ORC Vectorized 3783 3803 19 4.2 240.5 0.9X -Native ORC Vectorized (Pushdown) 954 975 24 16.5 60.7 3.4X +Parquet Vectorized 3202 3240 25 4.9 203.6 1.0X +Parquet Vectorized (Pushdown) 919 952 45 17.1 58.4 3.5X +Native ORC Vectorized 3936 3954 22 4.0 250.2 0.8X +Native ORC Vectorized (Pushdown) 1001 1009 10 15.7 63.7 3.2X -OpenJDK 
64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5649 5660 6 2.8 359.2 1.0X -Parquet Vectorized (Pushdown) 4392 4412 21 3.6 279.3 1.3X -Native ORC Vectorized 6152 6161 6 2.6 391.1 0.9X -Native ORC Vectorized (Pushdown) 4552 4568 19 3.5 289.4 1.2X +Parquet Vectorized 5598 5605 8 2.8 355.9 1.0X +Parquet Vectorized (Pushdown) 4366 4389 26 3.6 277.6 1.3X +Native ORC Vectorized 6410 6424 10 2.5 407.6 0.9X +Native ORC Vectorized (Pushdown) 4843 4852 9 3.2 307.9 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7937 7962 40 2.0 504.6 1.0X -Parquet Vectorized (Pushdown) 7661 7697 46 2.1 487.1 1.0X -Native ORC Vectorized 8633 8665 25 1.8 548.9 0.9X -Native ORC Vectorized (Pushdown) 8296 8311 14 1.9 527.4 1.0X +Parquet Vectorized 7861 7918 46 2.0 499.8 1.0X +Parquet Vectorized (Pushdown) 7637 7659 22 2.1 485.5 1.0X +Native ORC Vectorized 8921 8930 9 1.8 567.2 0.9X +Native ORC Vectorized (Pushdown) 8753 8784 26 1.8 556.5 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3699 3713 14 4.3 235.2 1.0X -Parquet Vectorized (Pushdown) 67 71 7 234.7 4.3 55.2X -Native ORC Vectorized 3177 3192 19 5.0 202.0 1.2X -Native ORC Vectorized (Pushdown) 53 57 5 296.0 3.4 69.6X +Parquet Vectorized 3766 3780 20 4.2 239.4 1.0X +Parquet Vectorized (Pushdown) 70 74 5 225.8 4.4 54.1X +Native ORC Vectorized 3250 3259 10 4.8 206.6 1.2X +Native ORC Vectorized (Pushdown) 55 61 6 285.2 3.5 68.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4523 4568 83 3.5 287.6 1.0X -Parquet Vectorized (Pushdown) 1226 1235 7 12.8 78.0 3.7X -Native ORC Vectorized 3929 3956 38 4.0 249.8 1.2X -Native ORC Vectorized (Pushdown) 1098 1106 6 14.3 69.8 4.1X +Parquet Vectorized 4596 4632 53 3.4 292.2 1.0X +Parquet Vectorized (Pushdown) 1245 1257 9 12.6 79.1 3.7X +Native ORC Vectorized 4063 4077 14 3.9 258.3 1.1X +Native ORC Vectorized (Pushdown) 1126 1133 6 14.0 71.6 4.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(38, 2) 
rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7733 7741 5 2.0 491.6 1.0X -Parquet Vectorized (Pushdown) 5937 5959 23 2.6 377.4 1.3X -Native ORC Vectorized 6909 6913 4 2.3 439.2 1.1X -Native ORC Vectorized (Pushdown) 5286 5293 8 3.0 336.1 1.5X +Parquet Vectorized 7843 7862 14 2.0 498.6 1.0X +Parquet Vectorized (Pushdown) 6005 6027 19 2.6 381.8 1.3X +Native ORC Vectorized 7116 7128 17 2.2 452.5 1.1X +Native ORC Vectorized (Pushdown) 5410 5438 19 2.9 343.9 1.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10916 10947 34 1.4 694.0 1.0X -Parquet Vectorized (Pushdown) 10577 10588 8 1.5 672.5 1.0X -Native ORC Vectorized 9997 10028 33 1.6 635.6 1.1X -Native ORC Vectorized (Pushdown) 9623 9637 9 1.6 611.8 1.1X +Parquet Vectorized 11034 11064 18 1.4 701.5 1.0X +Parquet Vectorized (Pushdown) 10676 10694 30 1.5 678.8 1.0X +Native ORC Vectorized 10144 10166 21 1.6 644.9 1.1X +Native ORC Vectorized (Pushdown) 9838 9874 32 1.6 625.5 1.1X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6258 6286 27 2.5 397.8 1.0X -Parquet Vectorized (Pushdown) 251 281 26 62.6 16.0 24.9X -Native ORC Vectorized 4559 4669 109 3.4 289.9 1.4X -Native ORC Vectorized (Pushdown) 282 302 21 55.9 17.9 22.2X +Parquet Vectorized 6250 6291 27 2.5 397.4 1.0X +Parquet Vectorized (Pushdown) 257 275 16 61.1 16.4 24.3X +Native ORC Vectorized 4812 4839 21 3.3 305.9 1.3X +Native ORC Vectorized (Pushdown) 312 320 7 50.4 19.9 20.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6098 6113 15 2.6 387.7 1.0X -Parquet Vectorized (Pushdown) 246 253 4 63.9 15.6 24.8X -Native ORC Vectorized 4549 4575 23 3.5 289.2 1.3X -Native ORC Vectorized (Pushdown) 291 300 8 54.1 18.5 21.0X +Parquet Vectorized 6162 6168 6 2.6 391.8 1.0X +Parquet Vectorized (Pushdown) 272 274 1 57.9 17.3 22.7X +Native ORC Vectorized 4794 4806 10 3.3 304.8 1.3X +Native ORC Vectorized (Pushdown) 309 316 7 50.9 19.6 19.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server 
VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6089 6112 14 2.6 387.1 1.0X -Parquet Vectorized (Pushdown) 249 254 3 63.1 15.9 24.4X -Native ORC Vectorized 4540 4550 7 3.5 288.7 1.3X -Native ORC Vectorized (Pushdown) 279 294 11 56.4 17.7 21.8X +Parquet Vectorized 6148 6156 5 2.6 390.9 1.0X +Parquet Vectorized (Pushdown) 270 271 2 58.3 17.2 22.8X +Native ORC Vectorized 4778 4786 8 3.3 303.7 1.3X +Native ORC Vectorized (Pushdown) 293 302 7 53.6 18.6 21.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6096 6122 17 2.6 387.5 1.0X -Parquet Vectorized (Pushdown) 266 271 4 59.2 16.9 22.9X -Native ORC Vectorized 4567 4578 15 3.4 290.3 1.3X -Native ORC Vectorized (Pushdown) 295 303 7 53.4 18.7 20.7X +Parquet Vectorized 6165 6182 16 2.6 391.9 1.0X +Parquet Vectorized (Pushdown) 269 280 11 58.4 17.1 22.9X +Native ORC Vectorized 4812 4821 11 3.3 306.0 1.3X +Native ORC Vectorized (Pushdown) 301 310 6 52.3 19.1 20.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6103 6131 26 2.6 388.0 1.0X -Parquet Vectorized (Pushdown) 280 282 1 56.1 17.8 21.8X -Native ORC Vectorized 4575 4589 9 3.4 290.9 1.3X -Native ORC Vectorized (Pushdown) 292 300 6 53.9 18.6 20.9X +Parquet Vectorized 6175 6178 5 2.5 392.6 1.0X +Parquet Vectorized (Pushdown) 274 281 7 57.4 17.4 22.5X +Native ORC Vectorized 4802 4830 42 3.3 305.3 1.3X +Native ORC Vectorized (Pushdown) 322 326 8 48.8 20.5 19.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6107 6115 9 2.6 388.3 1.0X -Parquet Vectorized (Pushdown) 266 270 3 59.2 16.9 23.0X -Native ORC Vectorized 4573 4583 12 3.4 290.7 1.3X -Native ORC Vectorized (Pushdown) 293 301 7 53.8 18.6 20.9X +Parquet Vectorized 6153 6187 30 2.6 391.2 1.0X +Parquet Vectorized (Pushdown) 289 289 1 54.5 18.3 21.3X +Native ORC Vectorized 4816 4819 3 3.3 306.2 1.3X +Native ORC Vectorized (Pushdown) 309 316 5 50.9 19.6 19.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 10): Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6297 6319 15 2.5 400.4 1.0X -Parquet Vectorized (Pushdown) 844 847 2 18.6 53.6 7.5X -Native ORC Vectorized 4761 4767 6 3.3 302.7 1.3X -Native ORC Vectorized (Pushdown) 382 387 4 41.1 24.3 16.5X +Parquet Vectorized 6360 6379 13 2.5 404.3 1.0X +Parquet Vectorized (Pushdown) 841 848 7 18.7 53.5 7.6X +Native ORC Vectorized 5016 5025 11 3.1 318.9 1.3X +Native ORC Vectorized (Pushdown) 429 431 2 36.6 27.3 14.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6293 6309 11 2.5 400.1 1.0X -Parquet Vectorized (Pushdown) 3247 3256 8 4.8 206.4 1.9X -Native ORC Vectorized 4785 4797 12 3.3 304.2 1.3X -Native ORC Vectorized (Pushdown) 414 420 6 38.0 26.3 15.2X +Parquet Vectorized 6363 6382 13 2.5 404.5 1.0X +Parquet Vectorized (Pushdown) 3313 3325 11 4.7 210.6 1.9X +Native ORC Vectorized 5009 5018 10 3.1 318.5 1.3X +Native ORC Vectorized (Pushdown) 438 441 3 35.9 27.8 14.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6304 6314 6 2.5 400.8 1.0X -Parquet Vectorized (Pushdown) 5548 5564 13 2.8 352.7 1.1X -Native ORC Vectorized 4768 4785 31 3.3 303.1 1.3X -Native ORC Vectorized (Pushdown) 419 421 2 37.5 26.7 15.0X +Parquet Vectorized 6370 6384 14 2.5 405.0 1.0X +Parquet Vectorized (Pushdown) 5709 5742 28 2.8 363.0 1.1X +Native ORC Vectorized 5011 5015 6 3.1 318.6 1.3X +Native ORC Vectorized (Pushdown) 433 436 2 36.3 27.5 14.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6275 6286 8 2.5 398.9 1.0X -Parquet Vectorized (Pushdown) 839 843 3 18.8 53.3 7.5X -Native ORC Vectorized 4747 4763 17 3.3 301.8 1.3X -Native ORC Vectorized (Pushdown) 474 479 5 33.2 30.1 13.2X +Parquet Vectorized 6339 6346 7 2.5 403.0 1.0X +Parquet Vectorized (Pushdown) 845 849 3 18.6 53.7 7.5X +Native ORC Vectorized 4991 4997 5 3.2 317.3 1.3X +Native ORC Vectorized (Pushdown) 513 519 6 30.7 32.6 12.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6262 6300 59 2.5 398.2 1.0X -Parquet Vectorized (Pushdown) 3230 3246 12 4.9 205.3 1.9X -Native ORC Vectorized 4739 4756 15 3.3 301.3 1.3X -Native ORC Vectorized (Pushdown) 548 550 3 28.7 34.8 11.4X +Parquet Vectorized 6328 6342 18 2.5 402.3 1.0X +Parquet Vectorized (Pushdown) 3233 3262 28 4.9 205.6 2.0X +Native ORC Vectorized 4966 4979 15 3.2 315.7 1.3X +Native ORC Vectorized (Pushdown) 566 570 3 27.8 36.0 11.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6288 6298 13 2.5 399.8 1.0X -Parquet Vectorized (Pushdown) 5416 5424 9 2.9 344.3 1.2X -Native ORC Vectorized 4761 4773 16 3.3 302.7 1.3X -Native ORC Vectorized (Pushdown) 558 561 4 28.2 35.5 11.3X +Parquet Vectorized 6319 6329 7 2.5 401.8 1.0X +Parquet Vectorized (Pushdown) 5713 5731 15 2.8 363.2 1.1X +Native ORC Vectorized 4957 4967 13 3.2 315.1 1.3X +Native ORC Vectorized (Pushdown) 576 581 5 27.3 36.6 11.0X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2764 2782 23 5.7 175.8 1.0X -Parquet Vectorized (Pushdown) 95 102 8 165.3 6.1 29.0X -Native ORC Vectorized 2102 2139 28 7.5 133.6 1.3X -Native ORC Vectorized (Pushdown) 111 121 12 141.1 7.1 24.8X +Parquet Vectorized 2771 2801 32 5.7 176.2 1.0X +Parquet Vectorized (Pushdown) 100 112 15 157.7 6.3 27.8X +Native ORC Vectorized 2143 2161 28 7.3 136.3 1.3X +Native ORC Vectorized (Pushdown) 115 120 4 136.8 7.3 24.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3252 3280 38 4.8 206.7 1.0X -Parquet Vectorized (Pushdown) 855 878 20 18.4 54.4 3.8X -Native ORC Vectorized 2592 2595 5 6.1 164.8 1.3X -Native ORC Vectorized (Pushdown) 782 784 1 20.1 49.7 4.2X +Parquet Vectorized 3243 3289 62 4.9 206.2 1.0X +Parquet Vectorized (Pushdown) 851 863 14 18.5 54.1 3.8X +Native ORC Vectorized 2632 2648 21 6.0 167.3 1.2X +Native ORC Vectorized (Pushdown) 791 794 3 19.9 50.3 4.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% tinyint rows (value < 
CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5499 5507 7 2.9 349.6 1.0X -Parquet Vectorized (Pushdown) 4165 4183 16 3.8 264.8 1.3X -Native ORC Vectorized 4726 4747 25 3.3 300.4 1.2X -Native ORC Vectorized (Pushdown) 3729 3733 2 4.2 237.1 1.5X +Parquet Vectorized 5304 5315 17 3.0 337.2 1.0X +Parquet Vectorized (Pushdown) 4024 4034 15 3.9 255.8 1.3X +Native ORC Vectorized 4663 4675 11 3.4 296.5 1.1X +Native ORC Vectorized (Pushdown) 3665 3675 10 4.3 233.0 1.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7817 7832 14 2.0 497.0 1.0X -Parquet Vectorized (Pushdown) 7581 7590 6 2.1 482.0 1.0X -Native ORC Vectorized 7074 7097 40 2.2 449.7 1.1X -Native ORC Vectorized (Pushdown) 6884 6889 4 2.3 437.7 1.1X +Parquet Vectorized 7478 7496 14 2.1 475.4 1.0X +Parquet Vectorized (Pushdown) 7250 7261 10 2.2 461.0 1.0X +Native ORC Vectorized 6712 6716 3 2.3 426.7 1.1X +Native ORC Vectorized (Pushdown) 6527 6538 18 2.4 415.0 1.1X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3115 3124 9 5.0 198.1 1.0X -Parquet Vectorized (Pushdown) 3101 3110 8 5.1 197.2 1.0X -Native ORC Vectorized 1984 2002 27 7.9 126.1 1.6X -Native ORC Vectorized (Pushdown) 38 42 6 413.3 2.4 81.9X +Parquet Vectorized 3103 3115 9 5.1 197.3 1.0X +Parquet Vectorized (Pushdown) 3107 3123 9 5.1 197.6 1.0X +Native ORC Vectorized 1978 1991 12 7.9 125.8 1.6X +Native ORC Vectorized (Pushdown) 40 44 5 390.9 2.6 77.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3673 3688 19 4.3 233.5 1.0X -Parquet Vectorized (Pushdown) 3651 3674 20 4.3 232.1 1.0X -Native ORC Vectorized 2497 2508 16 6.3 158.8 1.5X -Native ORC Vectorized (Pushdown) 751 771 17 20.9 47.7 4.9X +Parquet Vectorized 3651 3670 20 4.3 232.1 1.0X +Parquet Vectorized (Pushdown) 3652 3660 7 4.3 232.2 1.0X +Native ORC Vectorized 2524 2535 14 6.2 160.5 1.4X +Native ORC Vectorized (Pushdown) 786 793 9 20.0 50.0 4.6X -OpenJDK 
64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6070 6082 14 2.6 385.9 1.0X -Parquet Vectorized (Pushdown) 6083 6098 13 2.6 386.7 1.0X -Native ORC Vectorized 4669 4686 20 3.4 296.9 1.3X -Native ORC Vectorized (Pushdown) 3649 3671 13 4.3 232.0 1.7X +Parquet Vectorized 5953 5959 7 2.6 378.5 1.0X +Parquet Vectorized (Pushdown) 5959 5970 12 2.6 378.9 1.0X +Native ORC Vectorized 4855 4868 9 3.2 308.7 1.2X +Native ORC Vectorized (Pushdown) 3818 3823 4 4.1 242.7 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8292 8309 14 1.9 527.2 1.0X -Parquet Vectorized (Pushdown) 8311 8323 20 1.9 528.4 1.0X -Native ORC Vectorized 6958 6983 31 2.3 442.4 1.2X -Native ORC Vectorized (Pushdown) 6779 6794 9 2.3 431.0 1.2X +Parquet Vectorized 8418 8436 18 1.9 535.2 1.0X +Parquet Vectorized (Pushdown) 8407 8431 25 1.9 534.5 1.0X +Native ORC Vectorized 7012 7030 17 2.2 445.8 1.2X +Native ORC Vectorized (Pushdown) 6794 6849 89 2.3 431.9 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2593 2606 14 6.1 164.8 1.0X -Parquet Vectorized (Pushdown) 62 68 8 251.9 4.0 41.5X -Native ORC Vectorized 1989 1996 4 7.9 126.5 1.3X -Native ORC Vectorized (Pushdown) 39 45 6 401.1 2.5 66.1X +Parquet Vectorized 2583 2596 17 6.1 164.2 1.0X +Parquet Vectorized (Pushdown) 63 66 3 248.8 4.0 40.9X +Native ORC Vectorized 1970 1976 4 8.0 125.2 1.3X +Native ORC Vectorized (Pushdown) 39 45 7 401.1 2.5 65.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3152 3157 6 5.0 200.4 1.0X -Parquet Vectorized (Pushdown) 888 894 9 17.7 56.5 3.5X -Native ORC Vectorized 2537 2544 5 6.2 161.3 1.2X -Native ORC Vectorized (Pushdown) 780 790 10 20.2 49.6 4.0X +Parquet Vectorized 3127 3140 8 5.0 198.8 1.0X +Parquet Vectorized (Pushdown) 885 890 4 17.8 56.3 3.5X +Native ORC Vectorized 2595 
2599 6 6.1 165.0 1.2X +Native ORC Vectorized (Pushdown) 794 799 7 19.8 50.5 3.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5491 5505 11 2.9 349.1 1.0X -Parquet Vectorized (Pushdown) 4252 4262 6 3.7 270.4 1.3X -Native ORC Vectorized 4765 4782 17 3.3 302.9 1.2X -Native ORC Vectorized (Pushdown) 3732 3746 12 4.2 237.3 1.5X +Parquet Vectorized 5492 5514 36 2.9 349.2 1.0X +Parquet Vectorized (Pushdown) 4254 4266 7 3.7 270.5 1.3X +Native ORC Vectorized 4680 4690 9 3.4 297.5 1.2X +Native ORC Vectorized (Pushdown) 3612 3618 4 4.4 229.7 1.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7482 7496 13 2.1 475.7 1.0X -Parquet Vectorized (Pushdown) 7264 7275 7 2.2 461.8 1.0X -Native ORC Vectorized 6919 6949 30 2.3 439.9 1.1X -Native ORC Vectorized (Pushdown) 6742 6748 7 2.3 428.6 1.1X +Parquet Vectorized 7631 7642 10 2.1 485.2 1.0X +Parquet Vectorized (Pushdown) 7408 7430 26 2.1 471.0 1.0X +Native ORC Vectorized 7062 7081 14 2.2 449.0 1.1X +Native ORC Vectorized (Pushdown) 6872 6890 10 2.3 436.9 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2763 2780 29 5.7 175.6 1.0X -Parquet Vectorized (Pushdown) 64 66 3 247.6 4.0 43.5X -Native ORC Vectorized 1992 1999 14 7.9 126.6 1.4X -Native ORC Vectorized (Pushdown) 40 42 4 397.4 2.5 69.8X +Parquet Vectorized 2739 2747 9 5.7 174.1 1.0X +Parquet Vectorized (Pushdown) 64 67 5 247.5 4.0 43.1X +Native ORC Vectorized 1962 1969 8 8.0 124.8 1.4X +Native ORC Vectorized (Pushdown) 39 43 4 400.8 2.5 69.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3300 3313 9 4.8 209.8 1.0X -Parquet Vectorized (Pushdown) 891 899 11 17.7 56.6 3.7X -Native ORC Vectorized 2545 2562 15 6.2 161.8 1.3X -Native ORC Vectorized (Pushdown) 783 788 4 20.1 49.8 4.2X 
+Parquet Vectorized 3295 3309 16 4.8 209.5 1.0X +Parquet Vectorized (Pushdown) 899 910 17 17.5 57.1 3.7X +Native ORC Vectorized 2589 2598 7 6.1 164.6 1.3X +Native ORC Vectorized (Pushdown) 793 800 9 19.8 50.4 4.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5618 5627 8 2.8 357.2 1.0X -Parquet Vectorized (Pushdown) 4299 4316 18 3.7 273.3 1.3X -Native ORC Vectorized 4718 4732 13 3.3 300.0 1.2X -Native ORC Vectorized (Pushdown) 3689 3738 95 4.3 234.6 1.5X +Parquet Vectorized 5633 5660 28 2.8 358.2 1.0X +Parquet Vectorized (Pushdown) 4358 4401 53 3.6 277.1 1.3X +Native ORC Vectorized 4621 4640 18 3.4 293.8 1.2X +Native ORC Vectorized (Pushdown) 3619 3638 18 4.3 230.1 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8017 8025 8 2.0 509.7 1.0X -Parquet Vectorized (Pushdown) 7756 7769 10 2.0 493.1 1.0X -Native ORC Vectorized 6879 6900 16 2.3 437.4 1.2X -Native ORC Vectorized (Pushdown) 6700 6725 25 2.3 426.0 1.2X +Parquet Vectorized 7895 7935 54 2.0 501.9 1.0X +Parquet Vectorized (Pushdown) 7662 7702 35 2.1 487.1 1.0X +Native ORC Vectorized 7068 7091 18 2.2 449.4 1.1X +Native ORC Vectorized (Pushdown) 6868 6889 24 2.3 436.7 1.1X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 63 84 20 0.0 63428969.0 1.0X -Parquet Vectorized (Pushdown) 60 64 7 0.0 60435812.0 1.0X -Native ORC Vectorized 55 57 3 0.0 54999306.0 1.2X -Native ORC Vectorized (Pushdown) 56 58 4 0.0 55584526.0 1.1X +Parquet Vectorized 62 80 21 0.0 62474384.0 1.0X +Parquet Vectorized (Pushdown) 63 65 3 0.0 62745406.0 1.0X +Native ORC Vectorized 56 59 5 0.0 56291738.0 1.1X +Native ORC Vectorized (Pushdown) 58 60 2 0.0 57939662.0 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 371 376 6 0.0 
370725520.0 1.0X -Parquet Vectorized (Pushdown) 379 387 8 0.0 379310488.0 1.0X -Native ORC Vectorized 361 365 3 0.0 361340424.0 1.0X -Native ORC Vectorized (Pushdown) 366 373 5 0.0 366314223.0 1.0X +Parquet Vectorized 360 373 10 0.0 360453365.0 1.0X +Parquet Vectorized (Pushdown) 366 374 8 0.0 366449891.0 1.0X +Native ORC Vectorized 353 356 3 0.0 352735496.0 1.0X +Native ORC Vectorized (Pushdown) 359 369 8 0.0 358660716.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1975 2018 60 0.0 1974703729.0 1.0X -Parquet Vectorized (Pushdown) 1980 2024 53 0.0 1979551499.0 1.0X -Native ORC Vectorized 1965 2012 57 0.0 1964990387.0 1.0X -Native ORC Vectorized (Pushdown) 1968 1998 30 0.0 1967786641.0 1.0X +Parquet Vectorized 1895 1916 27 0.0 1895172425.0 1.0X +Parquet Vectorized (Pushdown) 1918 1962 56 0.0 1918148217.0 1.0X +Native ORC Vectorized 1889 1921 49 0.0 1888761721.0 1.0X +Native ORC Vectorized (Pushdown) 1903 1913 9 0.0 1902514400.0 1.0X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index d62d646a6564f..ef89bc72e4576 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -2,733 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6742 6808 59 2.3 428.6 1.0X -Parquet Vectorized (Pushdown) 307 319 12 51.3 19.5 22.0X -Native ORC Vectorized 5062 5183 92 3.1 321.8 1.3X -Native ORC Vectorized (Pushdown) 307 327 18 51.2 19.5 21.9X +Parquet Vectorized 6787 6843 55 2.3 431.5 1.0X +Parquet Vectorized (Pushdown) 298 322 20 52.8 18.9 22.8X +Native ORC Vectorized 5201 5298 95 3.0 330.7 1.3X +Native ORC Vectorized (Pushdown) 328 337 8 47.9 20.9 20.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6849 6872 28 2.3 435.5 1.0X -Parquet Vectorized (Pushdown) 274 293 14 57.4 17.4 25.0X -Native ORC Vectorized 5195 5211 22 3.0 330.3 1.3X -Native ORC Vectorized (Pushdown) 309 345 47 51.0 19.6 22.2X +Parquet Vectorized 6854 6878 15 2.3 435.8 1.0X +Parquet Vectorized (Pushdown) 289 299 10 54.4 18.4 23.7X +Native ORC Vectorized 5278 5297 22 3.0 335.6 1.3X +Native ORC Vectorized (Pushdown) 331 362 34 47.5 21.0 20.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6824 6840 16 2.3 433.8 1.0X -Parquet Vectorized (Pushdown) 264 267 2 59.7 16.8 25.9X -Native ORC Vectorized 5108 5127 24 3.1 324.8 1.3X -Native ORC Vectorized (Pushdown) 291 303 8 54.1 18.5 23.5X +Parquet Vectorized 6829 6844 12 2.3 434.2 1.0X +Parquet Vectorized (Pushdown) 266 281 14 59.1 16.9 25.7X +Native ORC Vectorized 5262 5283 14 3.0 334.5 1.3X +Native ORC Vectorized (Pushdown) 308 322 9 51.1 19.6 22.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6848 6862 12 2.3 435.4 1.0X -Parquet Vectorized (Pushdown) 257 261 4 61.2 16.3 26.7X -Native ORC Vectorized 5099 5118 15 3.1 324.2 1.3X -Native ORC Vectorized (Pushdown) 281 288 6 56.0 17.9 24.4X +Parquet Vectorized 6819 6838 13 2.3 433.6 1.0X +Parquet Vectorized (Pushdown) 261 274 11 60.3 16.6 26.1X +Native ORC Vectorized 5231 5251 13 3.0 332.6 1.3X +Native ORC Vectorized (Pushdown) 305 316 10 51.6 19.4 22.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6831 6850 15 2.3 434.3 1.0X -Parquet Vectorized (Pushdown) 257 262 5 61.2 16.3 26.6X -Native ORC Vectorized 5122 5147 21 3.1 325.7 1.3X -Native ORC Vectorized (Pushdown) 287 298 9 54.8 18.2 23.8X +Parquet Vectorized 6859 6869 6 2.3 436.1 1.0X +Parquet Vectorized (Pushdown) 266 274 10 59.2 16.9 25.8X +Native ORC Vectorized 5284 5296 9 3.0 336.0 1.3X +Native ORC Vectorized (Pushdown) 308 330 20 51.0 19.6 22.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12806 12858 41 1.2 814.2 1.0X -Parquet Vectorized (Pushdown) 12798 12824 24 1.2 813.7 1.0X -Native ORC Vectorized 11297 11328 31 1.4 718.2 1.1X -Native ORC Vectorized (Pushdown) 11377 11394 16 1.4 723.3 1.1X +Parquet Vectorized 12624 12760 116 1.2 802.6 1.0X +Parquet Vectorized (Pushdown) 12621 12631 11 1.2 802.4 1.0X +Native ORC Vectorized 11074 11089 10 1.4 704.1 1.1X +Native ORC Vectorized (Pushdown) 11168 11188 15 1.4 710.0 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6428 6440 13 2.4 408.7 1.0X -Parquet Vectorized (Pushdown) 242 256 11 65.1 15.4 26.6X -Native ORC Vectorized 4596 4620 24 3.4 292.2 1.4X -Native ORC Vectorized (Pushdown) 272 279 6 57.9 17.3 23.7X +Parquet Vectorized 6439 6458 16 2.4 409.4 1.0X +Parquet Vectorized (Pushdown) 250 257 7 63.0 15.9 25.8X +Native ORC Vectorized 4759 4770 8 3.3 302.6 1.4X +Native ORC Vectorized (Pushdown) 294 304 14 53.6 18.7 21.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6427 6445 20 2.4 408.6 1.0X -Parquet Vectorized (Pushdown) 247 270 22 63.7 15.7 26.0X -Native ORC Vectorized 4602 4611 8 3.4 292.6 1.4X -Native ORC Vectorized (Pushdown) 273 282 8 57.7 17.3 23.6X +Parquet Vectorized 6422 6457 38 2.4 408.3 1.0X +Parquet Vectorized (Pushdown) 254 266 15 61.9 16.2 25.3X +Native ORC Vectorized 4755 4771 20 3.3 302.3 1.4X +Native ORC Vectorized (Pushdown) 300 306 4 52.4 19.1 21.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6477 6499 31 2.4 411.8 1.0X -Parquet Vectorized (Pushdown) 247 255 6 63.6 15.7 26.2X -Native ORC Vectorized 4664 4668 5 3.4 296.5 1.4X -Native ORC Vectorized (Pushdown) 276 282 8 57.0 17.5 23.5X +Parquet Vectorized 6477 6498 32 2.4 411.8 1.0X +Parquet Vectorized (Pushdown) 253 263 12 62.2 16.1 25.6X +Native ORC Vectorized 4799 4806 7 3.3 305.1 1.3X +Native ORC Vectorized (Pushdown) 295 300 3 53.3 18.8 21.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6476 6484 10 2.4 411.7 1.0X -Parquet Vectorized (Pushdown) 243 255 8 64.7 15.4 26.7X -Native ORC Vectorized 4644 4663 19 3.4 295.3 1.4X -Native ORC Vectorized (Pushdown) 270 279 8 58.3 17.2 24.0X +Parquet Vectorized 6457 6483 21 2.4 410.5 1.0X +Parquet Vectorized (Pushdown) 251 261 8 62.7 16.0 25.7X +Native ORC Vectorized 4787 4807 18 3.3 304.3 1.3X +Native ORC Vectorized (Pushdown) 292 302 11 53.9 18.6 22.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6461 6481 18 2.4 410.8 1.0X -Parquet Vectorized (Pushdown) 247 252 4 63.6 15.7 26.1X -Native 
ORC Vectorized 4644 4660 12 3.4 295.3 1.4X -Native ORC Vectorized (Pushdown) 268 280 8 58.7 17.0 24.1X +Parquet Vectorized 6460 6475 13 2.4 410.7 1.0X +Parquet Vectorized (Pushdown) 251 256 6 62.6 16.0 25.7X +Native ORC Vectorized 4830 4840 10 3.3 307.1 1.3X +Native ORC Vectorized (Pushdown) 292 297 4 53.8 18.6 22.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6461 6475 9 2.4 410.8 1.0X -Parquet Vectorized (Pushdown) 243 259 9 64.6 15.5 26.6X -Native ORC Vectorized 4650 4663 13 3.4 295.7 1.4X -Native ORC Vectorized (Pushdown) 271 275 3 58.1 17.2 23.9X +Parquet Vectorized 6464 6478 12 2.4 411.0 1.0X +Parquet Vectorized (Pushdown) 250 262 6 63.0 15.9 25.9X +Native ORC Vectorized 4803 4822 12 3.3 305.4 1.3X +Native ORC Vectorized (Pushdown) 292 298 5 53.8 18.6 22.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7051 7060 9 2.2 448.3 1.0X -Parquet Vectorized (Pushdown) 1434 1446 14 11.0 91.2 4.9X -Native ORC Vectorized 5204 5216 12 3.0 330.9 1.4X -Native ORC Vectorized (Pushdown) 1265 1280 13 12.4 80.4 5.6X +Parquet Vectorized 7055 7068 19 2.2 448.6 1.0X +Parquet Vectorized (Pushdown) 1462 1468 6 10.8 92.9 4.8X +Native ORC Vectorized 5388 5401 10 2.9 342.5 1.3X +Native ORC Vectorized (Pushdown) 1324 1328 4 11.9 84.2 5.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9069 9075 6 1.7 576.6 1.0X -Parquet Vectorized (Pushdown) 5971 5977 6 2.6 379.6 1.5X -Native ORC Vectorized 7341 7359 15 2.1 466.7 1.2X -Native ORC Vectorized (Pushdown) 5167 5178 12 3.0 328.5 1.8X +Parquet Vectorized 9195 9226 24 1.7 584.6 1.0X +Parquet Vectorized (Pushdown) 6115 6134 14 2.6 388.8 1.5X +Native ORC Vectorized 7526 7553 19 2.1 478.5 1.2X +Native ORC Vectorized (Pushdown) 5330 5338 10 3.0 338.9 1.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11273 11286 19 1.4 716.7 1.0X -Parquet Vectorized (Pushdown) 10641 10665 27 1.5 676.5 1.1X -Native ORC Vectorized 9236 9282 61 1.7 587.2 1.2X -Native ORC Vectorized (Pushdown) 8872 8909 49 1.8 564.0 1.3X +Parquet Vectorized 11369 11390 32 1.4 722.8 1.0X +Parquet Vectorized (Pushdown) 10766 10775 10 1.5 684.5 1.1X +Native ORC 
Vectorized 9650 9667 12 1.6 613.6 1.2X +Native ORC Vectorized (Pushdown) 9267 9305 56 1.7 589.2 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11624 11629 5 1.4 739.0 1.0X -Parquet Vectorized (Pushdown) 11665 11692 25 1.3 741.6 1.0X -Native ORC Vectorized 9822 9837 14 1.6 624.5 1.2X -Native ORC Vectorized (Pushdown) 9875 9883 5 1.6 627.8 1.2X +Parquet Vectorized 11931 11947 17 1.3 758.6 1.0X +Parquet Vectorized (Pushdown) 11917 11938 15 1.3 757.7 1.0X +Native ORC Vectorized 10059 10078 25 1.6 639.6 1.2X +Native ORC Vectorized (Pushdown) 10127 10148 16 1.6 643.8 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11543 11563 13 1.4 733.9 1.0X -Parquet Vectorized (Pushdown) 11588 11594 4 1.4 736.8 1.0X -Native ORC Vectorized 9784 9807 29 1.6 622.0 1.2X -Native ORC Vectorized (Pushdown) 9848 9876 17 1.6 626.1 1.2X +Parquet Vectorized 11818 11836 12 1.3 751.4 1.0X +Parquet Vectorized (Pushdown) 11904 11917 14 1.3 756.9 1.0X +Native ORC Vectorized 10241 10264 20 1.5 651.1 1.2X +Native ORC Vectorized (Pushdown) 10308 10332 21 1.5 655.4 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11727 11740 8 1.3 745.6 1.0X -Parquet Vectorized (Pushdown) 11781 11791 11 1.3 749.0 1.0X -Native ORC Vectorized 9776 9803 23 1.6 621.5 1.2X -Native ORC Vectorized (Pushdown) 9858 9874 19 1.6 626.8 1.2X +Parquet Vectorized 11863 11867 3 1.3 754.2 1.0X +Parquet Vectorized (Pushdown) 11916 11937 14 1.3 757.6 1.0X +Native ORC Vectorized 10068 10109 43 1.6 640.1 1.2X +Native ORC Vectorized (Pushdown) 10156 10185 37 1.5 645.7 1.2X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5752 5769 18 2.7 365.7 1.0X -Parquet Vectorized (Pushdown) 207 213 9 75.9 13.2 27.8X -Native ORC Vectorized 6475 6485 8 2.4 411.7 0.9X -Native ORC Vectorized (Pushdown) 910 916 6 17.3 57.8 6.3X +Parquet Vectorized 5712 5727 29 2.8 363.2 1.0X +Parquet 
Vectorized (Pushdown) 209 215 5 75.2 13.3 27.3X +Native ORC Vectorized 6631 6661 21 2.4 421.6 0.9X +Native ORC Vectorized (Pushdown) 970 974 4 16.2 61.6 5.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5869 5872 3 2.7 373.1 1.0X -Parquet Vectorized (Pushdown) 209 217 7 75.1 13.3 28.0X -Native ORC Vectorized 6684 6699 12 2.4 424.9 0.9X -Native ORC Vectorized (Pushdown) 921 929 13 17.1 58.6 6.4X +Parquet Vectorized 5888 5896 9 2.7 374.4 1.0X +Parquet Vectorized (Pushdown) 212 222 12 74.1 13.5 27.7X +Native ORC Vectorized 6813 6820 5 2.3 433.2 0.9X +Native ORC Vectorized (Pushdown) 971 978 5 16.2 61.8 6.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5871 5881 21 2.7 373.3 1.0X -Parquet Vectorized (Pushdown) 254 262 8 62.0 16.1 23.1X -Native ORC Vectorized 6618 6632 16 2.4 420.8 0.9X -Native ORC Vectorized (Pushdown) 958 971 12 16.4 60.9 6.1X +Parquet Vectorized 5826 5838 15 2.7 370.4 1.0X +Parquet Vectorized (Pushdown) 261 263 2 60.4 16.6 22.4X +Native ORC Vectorized 6763 6785 18 2.3 430.0 0.9X +Native ORC Vectorized (Pushdown) 1019 1031 19 15.4 64.8 5.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5859 5870 18 2.7 372.5 1.0X -Parquet Vectorized (Pushdown) 252 257 7 62.3 16.0 23.2X -Native ORC Vectorized 6619 6630 15 2.4 420.8 0.9X -Native ORC Vectorized (Pushdown) 956 965 9 16.4 60.8 6.1X +Parquet Vectorized 5823 5836 13 2.7 370.2 1.0X +Parquet Vectorized (Pushdown) 258 264 6 60.9 16.4 22.6X +Native ORC Vectorized 6715 6737 28 2.3 426.9 0.9X +Native ORC Vectorized (Pushdown) 1017 1029 14 15.5 64.6 5.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5927 5932 6 2.7 376.8 1.0X -Parquet Vectorized (Pushdown) 257 260 4 61.3 16.3 23.1X -Native ORC Vectorized 6684 6698 10 2.4 425.0 0.9X -Native ORC Vectorized (Pushdown) 963 972 9 16.3 61.2 6.2X +Parquet Vectorized 5911 5924 12 2.7 375.8 1.0X +Parquet Vectorized (Pushdown) 260 262 1 60.4 16.6 22.7X +Native ORC Vectorized 6793 6830 45 2.3 431.9 0.9X +Native ORC Vectorized (Pushdown) 1023 1032 7 15.4 65.0 5.8X -OpenJDK 64-Bit 
Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12265 12294 35 1.3 779.8 1.0X -Parquet Vectorized (Pushdown) 12344 12375 34 1.3 784.8 1.0X -Native ORC Vectorized 13187 13225 26 1.2 838.4 0.9X -Native ORC Vectorized (Pushdown) 13400 13431 30 1.2 852.0 0.9X +Parquet Vectorized 12245 12263 13 1.3 778.5 1.0X +Parquet Vectorized (Pushdown) 12336 12360 28 1.3 784.3 1.0X +Native ORC Vectorized 13260 13282 14 1.2 843.0 0.9X +Native ORC Vectorized (Pushdown) 13461 13478 20 1.2 855.8 0.9X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7041 7147 129 2.2 447.7 1.0X -Parquet Vectorized (Pushdown) 888 903 20 17.7 56.4 7.9X -Native ORC Vectorized 5292 5341 45 3.0 336.4 1.3X -Native ORC Vectorized (Pushdown) 5381 5416 33 2.9 342.1 1.3X +Parquet Vectorized 7051 7097 66 2.2 448.3 1.0X +Parquet Vectorized (Pushdown) 892 908 14 17.6 56.7 7.9X +Native ORC Vectorized 5370 5406 28 2.9 341.4 1.3X +Native ORC Vectorized (Pushdown) 5447 5458 10 2.9 346.3 1.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7185 7196 11 2.2 456.8 1.0X -Parquet Vectorized (Pushdown) 245 250 5 64.2 15.6 29.3X -Native ORC Vectorized 5172 5187 12 3.0 328.8 1.4X -Native ORC Vectorized (Pushdown) 5237 5264 23 3.0 333.0 1.4X +Parquet Vectorized 6897 6908 14 2.3 438.5 1.0X +Parquet Vectorized (Pushdown) 247 252 4 63.6 15.7 27.9X +Native ORC Vectorized 5237 5242 4 3.0 333.0 1.3X +Native ORC Vectorized (Pushdown) 5307 5329 35 3.0 337.4 1.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7134 7161 27 2.2 453.6 1.0X -Parquet Vectorized (Pushdown) 239 247 9 65.9 15.2 29.9X -Native ORC Vectorized 5107 5116 9 3.1 324.7 1.4X -Native ORC Vectorized (Pushdown) 5180 5189 8 3.0 329.3 1.4X +Parquet Vectorized 6893 6917 26 2.3 438.3 1.0X +Parquet Vectorized (Pushdown) 241 246 8 65.4 15.3 28.7X +Native ORC Vectorized 5240 5249 9 3.0 333.1 1.3X 
+Native ORC Vectorized (Pushdown) 5305 5317 14 3.0 337.3 1.3X ================================================================================================ Pushdown benchmark for StringEndsWith ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5816 5829 9 2.7 369.8 1.0X -Parquet Vectorized (Pushdown) 336 345 14 46.9 21.3 17.3X -Native ORC Vectorized 6601 6617 13 2.4 419.7 0.9X -Native ORC Vectorized (Pushdown) 6768 6799 18 2.3 430.3 0.9X +Parquet Vectorized 5852 5870 15 2.7 372.1 1.0X +Parquet Vectorized (Pushdown) 338 345 9 46.6 21.5 17.3X +Native ORC Vectorized 6740 6753 12 2.3 428.5 0.9X +Native ORC Vectorized (Pushdown) 6932 6955 18 2.3 440.7 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5783 5803 25 2.7 367.7 1.0X -Parquet Vectorized (Pushdown) 243 248 4 64.8 15.4 23.8X -Native ORC Vectorized 6557 6576 14 2.4 416.9 0.9X -Native ORC Vectorized (Pushdown) 6730 6748 13 2.3 427.9 0.9X +Parquet Vectorized 5787 5812 17 2.7 367.9 1.0X +Parquet Vectorized (Pushdown) 244 252 8 64.5 15.5 23.7X +Native ORC Vectorized 6697 6716 11 2.3 425.8 0.9X +Native ORC Vectorized (Pushdown) 6884 6900 12 2.3 437.7 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5762 5781 11 2.7 366.4 1.0X -Parquet Vectorized (Pushdown) 241 244 2 65.4 15.3 24.0X -Native ORC Vectorized 6567 6579 20 2.4 417.5 0.9X -Native ORC Vectorized (Pushdown) 6757 6766 11 2.3 429.6 0.9X +Parquet Vectorized 5784 5800 26 2.7 367.8 1.0X +Parquet Vectorized (Pushdown) 242 247 4 64.9 15.4 23.9X +Native ORC Vectorized 6704 6717 10 2.3 426.2 0.9X +Native ORC Vectorized (Pushdown) 6895 6911 17 2.3 438.4 0.8X ================================================================================================ Pushdown benchmark for StringContains ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5964 5980 16 2.6 379.2 1.0X -Parquet Vectorized (Pushdown) 741 745 6 21.2 47.1 8.1X -Native ORC 
Vectorized 6709 6731 13 2.3 426.6 0.9X -Native ORC Vectorized (Pushdown) 6933 6941 11 2.3 440.8 0.9X +Parquet Vectorized 6010 6037 20 2.6 382.1 1.0X +Parquet Vectorized (Pushdown) 755 758 4 20.8 48.0 8.0X +Native ORC Vectorized 6902 6918 11 2.3 438.8 0.9X +Native ORC Vectorized (Pushdown) 7113 7128 10 2.2 452.2 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5788 5802 12 2.7 368.0 1.0X -Parquet Vectorized (Pushdown) 241 244 2 65.3 15.3 24.0X -Native ORC Vectorized 6538 6559 16 2.4 415.7 0.9X -Native ORC Vectorized (Pushdown) 6714 6734 14 2.3 426.9 0.9X +Parquet Vectorized 5791 5799 8 2.7 368.2 1.0X +Parquet Vectorized (Pushdown) 246 247 1 64.0 15.6 23.6X +Native ORC Vectorized 6700 6712 12 2.3 426.0 0.9X +Native ORC Vectorized (Pushdown) 6906 6923 18 2.3 439.1 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5786 5795 7 2.7 367.9 1.0X -Parquet Vectorized (Pushdown) 243 245 2 64.7 15.4 23.8X -Native ORC Vectorized 6558 6580 26 2.4 417.0 0.9X -Native ORC Vectorized (Pushdown) 6735 6751 13 2.3 428.2 0.9X +Parquet Vectorized 5803 5821 27 2.7 368.9 1.0X +Parquet Vectorized (Pushdown) 243 248 9 64.6 15.5 23.8X +Native ORC Vectorized 6709 6730 18 2.3 426.5 0.9X +Native ORC Vectorized (Pushdown) 6910 6921 10 2.3 439.3 0.8X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2818 2835 11 5.6 179.2 1.0X -Parquet Vectorized (Pushdown) 64 66 4 246.6 4.1 44.2X -Native ORC Vectorized 3385 3402 20 4.6 215.2 0.8X -Native ORC Vectorized (Pushdown) 56 60 4 280.6 3.6 50.3X +Parquet Vectorized 2857 2879 29 5.5 181.6 1.0X +Parquet Vectorized (Pushdown) 65 68 5 241.6 4.1 43.9X +Native ORC Vectorized 3410 3429 17 4.6 216.8 0.8X +Native ORC Vectorized (Pushdown) 59 62 3 265.7 3.8 48.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4033 4046 10 3.9 256.4 1.0X -Parquet Vectorized (Pushdown) 1791 1799 9 8.8 
113.9 2.3X -Native ORC Vectorized 4672 4695 33 3.4 297.0 0.9X -Native ORC Vectorized (Pushdown) 1931 1940 20 8.1 122.8 2.1X +Parquet Vectorized 4091 4111 24 3.8 260.1 1.0X +Parquet Vectorized (Pushdown) 1849 1853 5 8.5 117.6 2.2X +Native ORC Vectorized 4743 4778 46 3.3 301.6 0.9X +Native ORC Vectorized (Pushdown) 1983 1991 7 7.9 126.1 2.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7745 7749 3 2.0 492.4 1.0X -Parquet Vectorized (Pushdown) 7436 7451 17 2.1 472.8 1.0X -Native ORC Vectorized 8812 8828 11 1.8 560.3 0.9X -Native ORC Vectorized (Pushdown) 8370 8398 27 1.9 532.2 0.9X +Parquet Vectorized 8000 8018 20 2.0 508.6 1.0X +Parquet Vectorized (Pushdown) 7633 7645 7 2.1 485.3 1.0X +Native ORC Vectorized 8981 8991 9 1.8 571.0 0.9X +Native ORC Vectorized (Pushdown) 8512 8613 192 1.8 541.1 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8527 8541 19 1.8 542.2 1.0X -Parquet Vectorized (Pushdown) 8546 8565 23 1.8 543.3 1.0X -Native ORC Vectorized 9948 10010 73 1.6 632.5 0.9X -Native ORC Vectorized (Pushdown) 9983 9991 9 1.6 634.7 0.9X +Parquet Vectorized 8941 8969 32 1.8 568.5 1.0X +Parquet Vectorized (Pushdown) 8968 9032 115 1.8 570.2 1.0X +Native ORC Vectorized 10136 10159 28 1.6 644.4 0.9X +Native ORC Vectorized (Pushdown) 10162 10196 33 1.5 646.1 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2968 3005 22 5.3 188.7 1.0X -Parquet Vectorized (Pushdown) 62 65 4 253.1 4.0 47.8X -Native ORC Vectorized 3458 3464 8 4.5 219.9 0.9X -Native ORC Vectorized (Pushdown) 53 56 4 296.9 3.4 56.0X +Parquet Vectorized 2995 3006 9 5.3 190.4 1.0X +Parquet Vectorized (Pushdown) 63 67 4 248.3 4.0 47.3X +Native ORC Vectorized 3431 3441 13 4.6 218.1 0.9X +Native ORC Vectorized (Pushdown) 56 60 3 279.2 3.6 53.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3608 3617 12 4.4 229.4 1.0X -Parquet Vectorized (Pushdown) 917 927 14 17.2 58.3 3.9X -Native ORC Vectorized 4100 4110 9 3.8 260.6 0.9X -Native ORC Vectorized (Pushdown) 1005 1007 2 15.7 63.9 3.6X +Parquet Vectorized 3718 3727 17 4.2 236.4 1.0X 
+Parquet Vectorized (Pushdown) 1012 1016 2 15.5 64.4 3.7X +Native ORC Vectorized 4142 4181 26 3.8 263.4 0.9X +Native ORC Vectorized (Pushdown) 1056 1063 7 14.9 67.1 3.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5916 5933 15 2.7 376.1 1.0X -Parquet Vectorized (Pushdown) 4418 4425 6 3.6 280.9 1.3X -Native ORC Vectorized 6642 6683 49 2.4 422.3 0.9X -Native ORC Vectorized (Pushdown) 4902 4908 4 3.2 311.6 1.2X +Parquet Vectorized 6331 6343 17 2.5 402.5 1.0X +Parquet Vectorized (Pushdown) 4843 4855 10 3.2 307.9 1.3X +Native ORC Vectorized 6859 6864 10 2.3 436.1 0.9X +Native ORC Vectorized (Pushdown) 5112 5123 10 3.1 325.0 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8185 8206 28 1.9 520.4 1.0X -Parquet Vectorized (Pushdown) 7907 7920 11 2.0 502.7 1.0X -Native ORC Vectorized 9169 9182 15 1.7 582.9 0.9X -Native ORC Vectorized (Pushdown) 8834 8848 10 1.8 561.7 0.9X +Parquet Vectorized 8837 8842 4 1.8 561.8 1.0X +Parquet Vectorized (Pushdown) 8563 8571 6 1.8 544.4 1.0X +Native ORC Vectorized 9504 9540 31 1.7 604.2 0.9X +Native ORC Vectorized (Pushdown) 9203 9209 7 1.7 585.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4245 4253 12 3.7 269.9 1.0X -Parquet Vectorized (Pushdown) 68 70 3 230.2 4.3 62.1X -Native ORC Vectorized 3501 3524 16 4.5 222.6 1.2X -Native ORC Vectorized (Pushdown) 53 55 3 298.7 3.3 80.6X +Parquet Vectorized 4207 4222 14 3.7 267.5 1.0X +Parquet Vectorized (Pushdown) 69 72 3 227.7 4.4 60.9X +Native ORC Vectorized 3482 3502 19 4.5 221.4 1.2X +Native ORC Vectorized (Pushdown) 56 58 2 283.2 3.5 75.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5051 5064 11 3.1 321.1 1.0X -Parquet Vectorized (Pushdown) 1272 1276 2 12.4 80.9 4.0X -Native ORC Vectorized 4282 4293 18 3.7 272.3 1.2X -Native ORC Vectorized (Pushdown) 1150 1155 6 13.7 73.1 4.4X +Parquet Vectorized 5073 5086 12 3.1 322.5 1.0X +Parquet Vectorized (Pushdown) 1319 1322 3 11.9 83.9 3.8X +Native ORC Vectorized 4294 4312 24 3.7 273.0 1.2X +Native ORC Vectorized (Pushdown) 1158 1164 5 13.6 73.7 4.4X -OpenJDK 
64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8224 8255 60 1.9 522.9 1.0X -Parquet Vectorized (Pushdown) 6149 6159 9 2.6 391.0 1.3X -Native ORC Vectorized 7372 7389 23 2.1 468.7 1.1X -Native ORC Vectorized (Pushdown) 5581 5589 10 2.8 354.9 1.5X +Parquet Vectorized 8420 8438 11 1.9 535.3 1.0X +Parquet Vectorized (Pushdown) 6373 6378 5 2.5 405.2 1.3X +Native ORC Vectorized 7440 7449 13 2.1 473.0 1.1X +Native ORC Vectorized (Pushdown) 5638 5662 14 2.8 358.5 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11359 11379 15 1.4 722.2 1.0X -Parquet Vectorized (Pushdown) 10971 10975 2 1.4 697.5 1.0X -Native ORC Vectorized 10366 10385 26 1.5 659.1 1.1X -Native ORC Vectorized (Pushdown) 9993 10016 19 1.6 635.3 1.1X +Parquet Vectorized 11729 11741 9 1.3 745.7 1.0X +Parquet Vectorized (Pushdown) 11358 11369 8 1.4 722.1 1.0X +Native ORC Vectorized 10556 10591 25 1.5 671.1 1.1X +Native ORC Vectorized (Pushdown) 10164 10192 18 1.5 646.2 1.2X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6439 6534 103 2.4 409.4 1.0X -Parquet Vectorized (Pushdown) 250 257 8 62.9 15.9 25.7X -Native ORC Vectorized 4639 4653 12 3.4 294.9 1.4X -Native ORC Vectorized (Pushdown) 279 285 5 56.3 17.7 23.1X +Parquet Vectorized 6436 6463 34 2.4 409.2 1.0X +Parquet Vectorized (Pushdown) 255 260 4 61.6 16.2 25.2X +Native ORC Vectorized 4805 4811 6 3.3 305.5 1.3X +Native ORC Vectorized (Pushdown) 296 304 5 53.2 18.8 21.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6443 6456 16 2.4 409.7 1.0X -Parquet Vectorized (Pushdown) 251 255 2 62.6 16.0 25.6X -Native ORC Vectorized 4648 4672 29 3.4 295.5 1.4X -Native ORC Vectorized (Pushdown) 280 286 5 56.3 17.8 23.0X +Parquet Vectorized 6437 6443 6 2.4 409.2 1.0X +Parquet Vectorized (Pushdown) 254 262 7 61.9 16.2 25.3X +Native ORC 
Vectorized 4803 4813 14 3.3 305.3 1.3X +Native ORC Vectorized (Pushdown) 299 305 6 52.6 19.0 21.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6446 6452 4 2.4 409.8 1.0X -Parquet Vectorized (Pushdown) 254 260 7 62.0 16.1 25.4X -Native ORC Vectorized 4637 4643 4 3.4 294.8 1.4X -Native ORC Vectorized (Pushdown) 279 285 7 56.4 17.7 23.1X +Parquet Vectorized 6431 6444 8 2.4 408.9 1.0X +Parquet Vectorized (Pushdown) 255 259 3 61.8 16.2 25.2X +Native ORC Vectorized 4802 4814 12 3.3 305.3 1.3X +Native ORC Vectorized (Pushdown) 296 300 3 53.1 18.8 21.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6455 6467 10 2.4 410.4 1.0X -Parquet Vectorized (Pushdown) 271 274 3 58.1 17.2 23.8X -Native ORC Vectorized 4651 4666 17 3.4 295.7 1.4X -Native ORC Vectorized (Pushdown) 292 297 4 53.9 18.5 22.1X +Parquet Vectorized 6453 6463 12 2.4 410.2 1.0X +Parquet Vectorized (Pushdown) 275 279 4 57.1 17.5 23.4X +Native ORC Vectorized 4794 4807 13 3.3 304.8 1.3X +Native ORC Vectorized (Pushdown) 310 314 3 50.8 19.7 20.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6451 6460 6 2.4 410.2 1.0X -Parquet Vectorized (Pushdown) 270 277 8 58.3 17.2 23.9X -Native ORC Vectorized 4646 4656 11 3.4 295.4 1.4X -Native ORC Vectorized (Pushdown) 296 300 3 53.2 18.8 21.8X +Parquet Vectorized 6445 6456 7 2.4 409.8 1.0X +Parquet Vectorized (Pushdown) 271 282 13 58.1 17.2 23.8X +Native ORC Vectorized 4809 4828 19 3.3 305.8 1.3X +Native ORC Vectorized (Pushdown) 314 317 2 50.1 20.0 20.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6462 6471 9 2.4 410.8 1.0X -Parquet Vectorized (Pushdown) 270 280 9 58.3 17.2 23.9X -Native ORC Vectorized 4648 4655 6 3.4 295.5 1.4X -Native ORC Vectorized (Pushdown) 293 298 5 53.7 18.6 22.1X +Parquet Vectorized 6457 6464 13 2.4 410.5 1.0X +Parquet Vectorized (Pushdown) 272 276 3 57.8 17.3 23.7X +Native ORC Vectorized 4811 4837 17 3.3 305.9 1.3X +Native ORC Vectorized (Pushdown) 309 314 3 50.9 19.6 20.9X -OpenJDK 64-Bit Server VM 
17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6665 6681 17 2.4 423.8 1.0X -Parquet Vectorized (Pushdown) 875 882 6 18.0 55.6 7.6X -Native ORC Vectorized 4869 4883 13 3.2 309.6 1.4X -Native ORC Vectorized (Pushdown) 395 397 2 39.8 25.1 16.9X +Parquet Vectorized 6651 6690 50 2.4 422.8 1.0X +Parquet Vectorized (Pushdown) 855 858 5 18.4 54.4 7.8X +Native ORC Vectorized 5035 5071 26 3.1 320.1 1.3X +Native ORC Vectorized (Pushdown) 414 417 4 38.0 26.3 16.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6659 6670 10 2.4 423.4 1.0X -Parquet Vectorized (Pushdown) 3157 3199 50 5.0 200.7 2.1X -Native ORC Vectorized 4859 4874 14 3.2 308.9 1.4X -Native ORC Vectorized (Pushdown) 424 425 1 37.1 27.0 15.7X +Parquet Vectorized 6652 6657 5 2.4 422.9 1.0X +Parquet Vectorized (Pushdown) 3465 3482 15 4.5 220.3 1.9X +Native ORC Vectorized 5025 5033 5 3.1 319.5 1.3X +Native ORC Vectorized (Pushdown) 441 442 2 35.7 28.0 15.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6667 6679 18 2.4 423.9 1.0X -Parquet Vectorized (Pushdown) 5982 5991 7 2.6 380.3 1.1X -Native ORC Vectorized 4879 4887 12 3.2 310.2 1.4X -Native ORC Vectorized (Pushdown) 423 426 2 37.2 26.9 15.8X +Parquet Vectorized 6656 6671 12 2.4 423.2 1.0X +Parquet Vectorized (Pushdown) 5728 5738 11 2.7 364.2 1.2X +Native ORC Vectorized 5040 5055 10 3.1 320.5 1.3X +Native ORC Vectorized (Pushdown) 444 447 4 35.4 28.3 15.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6599 6606 10 2.4 419.5 1.0X -Parquet Vectorized (Pushdown) 887 891 5 17.7 56.4 7.4X -Native ORC Vectorized 4798 4809 7 3.3 305.1 1.4X -Native ORC Vectorized (Pushdown) 482 486 6 32.7 30.6 13.7X +Parquet Vectorized 6596 6605 8 2.4 419.4 1.0X +Parquet Vectorized (Pushdown) 887 890 3 17.7 56.4 7.4X +Native ORC Vectorized 4967 4974 7 3.2 315.8 1.3X +Native ORC Vectorized (Pushdown) 511 515 4 30.8 32.5 12.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 
64-Core Processor InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6611 6628 16 2.4 420.3 1.0X -Parquet Vectorized (Pushdown) 3286 3292 10 4.8 208.9 2.0X -Native ORC Vectorized 4801 4806 5 3.3 305.2 1.4X -Native ORC Vectorized (Pushdown) 558 564 6 28.2 35.4 11.9X +Parquet Vectorized 6618 6626 7 2.4 420.8 1.0X +Parquet Vectorized (Pushdown) 3375 3385 8 4.7 214.6 2.0X +Native ORC Vectorized 4988 4995 9 3.2 317.1 1.3X +Native ORC Vectorized (Pushdown) 587 591 7 26.8 37.3 11.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6616 6630 20 2.4 420.6 1.0X -Parquet Vectorized (Pushdown) 5983 5988 3 2.6 380.4 1.1X -Native ORC Vectorized 4808 4815 7 3.3 305.7 1.4X -Native ORC Vectorized (Pushdown) 564 567 2 27.9 35.8 11.7X +Parquet Vectorized 6604 6626 19 2.4 419.9 1.0X +Parquet Vectorized (Pushdown) 5909 5920 8 2.7 375.7 1.1X +Native ORC Vectorized 4987 4991 4 3.2 317.0 1.3X +Native ORC Vectorized (Pushdown) 601 604 2 26.2 38.2 11.0X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3117 3209 82 5.0 198.2 1.0X -Parquet Vectorized (Pushdown) 100 102 5 157.7 6.3 31.3X -Native ORC Vectorized 2264 2271 9 6.9 144.0 1.4X -Native ORC Vectorized (Pushdown) 112 115 3 140.3 7.1 27.8X +Parquet Vectorized 3095 3112 16 5.1 196.8 1.0X +Parquet Vectorized (Pushdown) 101 103 3 156.4 6.4 30.8X +Native ORC Vectorized 2387 2394 5 6.6 151.8 1.3X +Native ORC Vectorized (Pushdown) 119 121 3 132.5 7.5 26.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3624 3632 9 4.3 230.4 1.0X -Parquet Vectorized (Pushdown) 880 881 1 17.9 55.9 4.1X -Native ORC Vectorized 2736 2742 4 5.7 173.9 1.3X -Native ORC Vectorized (Pushdown) 775 783 11 20.3 49.3 4.7X +Parquet Vectorized 3619 3637 15 4.3 230.1 1.0X +Parquet Vectorized (Pushdown) 879 886 10 17.9 55.9 4.1X +Native ORC Vectorized 2876 2885 5 5.5 182.9 1.3X +Native ORC Vectorized (Pushdown) 808 809 2 19.5 51.4 4.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5679 5688 10 2.8 361.0 1.0X -Parquet Vectorized (Pushdown) 4159 4167 7 3.8 264.4 1.4X -Native ORC Vectorized 4832 4846 18 3.3 307.2 1.2X -Native ORC Vectorized (Pushdown) 3744 3753 12 4.2 238.0 1.5X +Parquet Vectorized 5707 5724 18 2.8 362.8 1.0X +Parquet Vectorized (Pushdown) 4215 4219 3 3.7 268.0 1.4X +Native ORC Vectorized 4873 4884 9 3.2 309.8 1.2X +Native ORC Vectorized (Pushdown) 3724 3748 14 4.2 236.8 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7666 7681 14 2.1 487.4 1.0X -Parquet Vectorized (Pushdown) 7380 7403 34 2.1 469.2 1.0X -Native ORC Vectorized 6779 6792 12 2.3 431.0 1.1X -Native ORC Vectorized (Pushdown) 6594 6606 23 2.4 419.3 1.2X +Parquet Vectorized 7806 7815 9 2.0 496.3 1.0X +Parquet Vectorized (Pushdown) 7548 7552 3 2.1 479.9 1.0X +Native ORC Vectorized 7017 7034 30 2.2 446.1 1.1X +Native ORC Vectorized (Pushdown) 6797 6808 13 2.3 432.2 1.1X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3202 3216 9 4.9 203.6 1.0X -Parquet Vectorized (Pushdown) 3218 3226 9 4.9 204.6 1.0X -Native ORC Vectorized 2223 2233 6 7.1 141.4 1.4X -Native ORC Vectorized (Pushdown) 38 41 4 413.1 2.4 84.1X +Parquet Vectorized 3215 3233 16 4.9 204.4 1.0X +Parquet Vectorized (Pushdown) 3202 3213 11 4.9 203.6 1.0X +Native ORC Vectorized 2269 2275 6 6.9 144.2 1.4X +Native ORC Vectorized (Pushdown) 40 43 3 392.1 2.6 80.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3746 3770 19 4.2 238.2 1.0X -Parquet Vectorized (Pushdown) 3744 3760 18 4.2 238.0 1.0X -Native ORC Vectorized 2757 2768 14 5.7 175.3 1.4X -Native ORC Vectorized (Pushdown) 782 786 4 20.1 49.7 4.8X +Parquet Vectorized 3776 3799 32 4.2 240.1 1.0X +Parquet Vectorized 
(Pushdown) 3783 3786 4 4.2 240.5 1.0X +Native ORC Vectorized 2818 2837 17 5.6 179.1 1.3X +Native ORC Vectorized (Pushdown) 807 812 3 19.5 51.3 4.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5956 5971 15 2.6 378.7 1.0X -Parquet Vectorized (Pushdown) 5948 5964 19 2.6 378.1 1.0X -Native ORC Vectorized 4949 4980 44 3.2 314.6 1.2X -Native ORC Vectorized (Pushdown) 3785 3790 5 4.2 240.6 1.6X +Parquet Vectorized 6080 6082 2 2.6 386.6 1.0X +Parquet Vectorized (Pushdown) 6072 6103 30 2.6 386.0 1.0X +Native ORC Vectorized 5071 5086 18 3.1 322.4 1.2X +Native ORC Vectorized (Pushdown) 3911 3920 6 4.0 248.6 1.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8129 8135 8 1.9 516.8 1.0X -Parquet Vectorized (Pushdown) 8126 8143 28 1.9 516.6 1.0X -Native ORC Vectorized 7117 7153 25 2.2 452.5 1.1X -Native ORC Vectorized (Pushdown) 6898 6936 28 2.3 438.5 1.2X +Parquet Vectorized 8437 8445 7 1.9 536.4 1.0X +Parquet Vectorized (Pushdown) 8452 8458 4 1.9 537.4 1.0X +Native ORC Vectorized 7424 7504 90 2.1 472.0 1.1X +Native ORC Vectorized (Pushdown) 7202 7230 25 2.2 457.9 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 2990 2994 3 5.3 190.1 1.0X -Parquet Vectorized (Pushdown) 61 64 4 256.0 3.9 48.7X -Native ORC Vectorized 2229 2236 8 7.1 141.7 1.3X -Native ORC Vectorized (Pushdown) 38 41 4 416.9 2.4 79.3X +Parquet Vectorized 2975 2983 9 5.3 189.1 1.0X +Parquet Vectorized (Pushdown) 63 68 6 247.8 4.0 46.9X +Native ORC Vectorized 2264 2282 35 6.9 144.0 1.3X +Native ORC Vectorized (Pushdown) 40 43 4 392.7 2.5 74.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3529 3538 14 4.5 224.3 1.0X -Parquet Vectorized (Pushdown) 900 906 11 17.5 57.2 3.9X -Native ORC Vectorized 2754 2757 3 5.7 175.1 1.3X -Native ORC Vectorized 
(Pushdown) 783 784 2 20.1 49.8 4.5X +Parquet Vectorized 3552 3568 25 4.4 225.8 1.0X +Parquet Vectorized (Pushdown) 914 917 2 17.2 58.1 3.9X +Native ORC Vectorized 2827 2832 7 5.6 179.8 1.3X +Native ORC Vectorized (Pushdown) 813 816 2 19.3 51.7 4.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5727 5733 11 2.7 364.1 1.0X -Parquet Vectorized (Pushdown) 4263 4269 5 3.7 271.0 1.3X -Native ORC Vectorized 4941 4979 67 3.2 314.1 1.2X -Native ORC Vectorized (Pushdown) 3776 3792 19 4.2 240.1 1.5X +Parquet Vectorized 5843 5849 6 2.7 371.5 1.0X +Parquet Vectorized (Pushdown) 4410 4440 54 3.6 280.4 1.3X +Native ORC Vectorized 5066 5081 16 3.1 322.1 1.2X +Native ORC Vectorized (Pushdown) 3908 3916 6 4.0 248.5 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7900 7907 6 2.0 502.3 1.0X -Parquet Vectorized (Pushdown) 7632 7636 5 2.1 485.2 1.0X -Native ORC Vectorized 7124 7140 19 2.2 452.9 1.1X -Native ORC Vectorized (Pushdown) 6902 6917 10 2.3 438.8 1.1X +Parquet Vectorized 8214 8218 4 1.9 522.2 1.0X +Parquet Vectorized (Pushdown) 7953 8002 63 2.0 505.6 1.0X +Native ORC Vectorized 7436 7453 25 2.1 472.8 1.1X +Native ORC Vectorized (Pushdown) 7215 7248 46 2.2 458.7 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3008 3060 72 5.2 191.3 1.0X -Parquet Vectorized (Pushdown) 62 63 2 255.5 3.9 48.9X -Native ORC Vectorized 2227 2238 15 7.1 141.6 1.4X -Native ORC Vectorized (Pushdown) 38 42 5 418.2 2.4 80.0X +Parquet Vectorized 3003 3009 7 5.2 190.9 1.0X +Parquet Vectorized (Pushdown) 63 67 4 248.8 4.0 47.5X +Native ORC Vectorized 2258 2262 4 7.0 143.6 1.3X +Native ORC Vectorized (Pushdown) 40 42 3 395.7 2.5 75.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3551 3559 7 4.4 225.7 
1.0X -Parquet Vectorized (Pushdown) 901 907 5 17.5 57.3 3.9X -Native ORC Vectorized 2758 2762 4 5.7 175.3 1.3X -Native ORC Vectorized (Pushdown) 780 783 3 20.2 49.6 4.5X +Parquet Vectorized 3586 3599 11 4.4 228.0 1.0X +Parquet Vectorized (Pushdown) 916 919 3 17.2 58.2 3.9X +Native ORC Vectorized 2836 2850 19 5.5 180.3 1.3X +Native ORC Vectorized (Pushdown) 811 815 3 19.4 51.6 4.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5737 5751 19 2.7 364.8 1.0X -Parquet Vectorized (Pushdown) 4269 4273 4 3.7 271.4 1.3X -Native ORC Vectorized 4939 4943 3 3.2 314.0 1.2X -Native ORC Vectorized (Pushdown) 3784 3787 2 4.2 240.6 1.5X +Parquet Vectorized 5866 5881 27 2.7 372.9 1.0X +Parquet Vectorized (Pushdown) 4410 4417 8 3.6 280.4 1.3X +Native ORC Vectorized 5077 5089 19 3.1 322.8 1.2X +Native ORC Vectorized (Pushdown) 3911 3919 8 4.0 248.6 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7918 7942 20 2.0 503.4 1.0X -Parquet Vectorized (Pushdown) 7645 7650 5 2.1 486.1 1.0X -Native ORC Vectorized 7120 7138 19 2.2 452.7 1.1X -Native ORC Vectorized (Pushdown) 6908 6931 17 2.3 439.2 1.1X +Parquet Vectorized 8221 8229 7 1.9 522.7 1.0X +Parquet Vectorized (Pushdown) 7970 7981 9 2.0 506.7 1.0X +Native ORC Vectorized 7426 7442 23 2.1 472.1 1.1X +Native ORC Vectorized (Pushdown) 7193 7204 9 2.2 457.3 1.1X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 61 63 3 0.0 60809084.0 1.0X -Parquet Vectorized (Pushdown) 62 65 4 0.0 62249744.0 1.0X -Native ORC Vectorized 55 58 3 0.0 54905858.0 1.1X -Native ORC Vectorized (Pushdown) 56 61 5 0.0 56310471.0 1.1X +Parquet Vectorized 63 65 3 0.0 63314623.0 1.0X +Parquet Vectorized (Pushdown) 64 66 3 0.0 64051869.0 1.0X +Native ORC Vectorized 58 59 2 0.0 57520375.0 1.1X +Native ORC Vectorized (Pushdown) 60 64 8 0.0 59737469.0 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 377 386 7 0.0 377453808.0 1.0X -Parquet Vectorized (Pushdown) 388 398 10 0.0 387673530.0 1.0X -Native ORC Vectorized 373 376 3 0.0 372643927.0 1.0X -Native ORC Vectorized (Pushdown) 377 388 8 0.0 377351458.0 1.0X +Parquet Vectorized 423 431 11 0.0 422883307.0 1.0X +Parquet Vectorized (Pushdown) 427 431 2 0.0 427230106.0 1.0X +Native ORC Vectorized 407 416 8 0.0 406712827.0 1.0X +Native ORC Vectorized (Pushdown) 418 423 4 0.0 418468099.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 2008 2050 47 0.0 2007664554.0 1.0X -Parquet Vectorized (Pushdown) 2028 2059 34 0.0 2028080865.0 1.0X -Native ORC Vectorized 2007 2015 8 0.0 2007404672.0 1.0X -Native ORC Vectorized (Pushdown) 2022 2072 56 0.0 2021625278.0 1.0X +Parquet Vectorized 2351 2379 30 0.0 2351073582.0 1.0X +Parquet Vectorized (Pushdown) 2359 2383 14 0.0 2358892376.0 1.0X +Native ORC Vectorized 2349 2376 19 0.0 2348555337.0 1.0X +Native ORC Vectorized (Pushdown) 2351 2372 17 0.0 2350854713.0 1.0X diff --git a/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt b/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt index 5f36f7900c51e..c2e7f658a4dc6 100644 --- a/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/GenerateExecBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ GenerateExec benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor GenerateExec Benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -GenerateExec Benchmark wholestage off 70956 71507 779 1.4 709.6 1.0X -GenerateExec Benchmark wholestage on 20836 20862 25 4.8 208.4 3.4X +GenerateExec Benchmark wholestage off 88754 89024 381 1.1 887.5 1.0X +GenerateExec Benchmark wholestage on 26904 27017 173 3.7 269.0 3.3X diff --git a/sql/core/benchmarks/GenerateExecBenchmark-results.txt b/sql/core/benchmarks/GenerateExecBenchmark-results.txt index 309b7691c7e57..8398bfdefbb40 100644 --- a/sql/core/benchmarks/GenerateExecBenchmark-results.txt +++ b/sql/core/benchmarks/GenerateExecBenchmark-results.txt @@ -2,11 +2,11 @@ GenerateExec benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor GenerateExec Benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -GenerateExec Benchmark wholestage off 68787 69054 378 1.5 687.9 1.0X -GenerateExec Benchmark wholestage on 22124 22203 69 4.5 221.2 3.1X +GenerateExec Benchmark wholestage off 
85096 85378 398 1.2 851.0 1.0X +GenerateExec Benchmark wholestage on 25729 25905 115 3.9 257.3 3.3X diff --git a/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt b/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt index 22ebdc256a5c2..b60eba694717c 100644 --- a/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/HashedRelationMetricsBenchmark-jdk21-results.txt @@ -2,10 +2,10 @@ LongToUnsafeRowMap metrics ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LongToUnsafeRowMap metrics: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -LongToUnsafeRowMap 261 266 6 1.9 522.3 1.0X +LongToUnsafeRowMap 361 363 2 1.4 721.9 1.0X diff --git a/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt b/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt index 791e6d3af1c0e..d66030f047958 100644 --- a/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt +++ b/sql/core/benchmarks/HashedRelationMetricsBenchmark-results.txt @@ -2,10 +2,10 @@ LongToUnsafeRowMap metrics ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor LongToUnsafeRowMap metrics: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -LongToUnsafeRowMap 252 256 4 2.0 503.3 1.0X +LongToUnsafeRowMap 380 390 7 1.3 760.6 1.0X diff --git a/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt b/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt index 28468f39d226a..02eef14d6c991 100644 --- a/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/InExpressionBenchmark-jdk21-results.txt @@ -2,739 +2,739 @@ In Expression Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 50 67 12 198.3 5.0 1.0X -InSet expression 33 41 5 300.1 3.3 1.5X +In expression 28 34 7 358.3 2.8 1.0X +InSet expression 51 65 15 194.7 5.1 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 50 55 5 199.0 5.0 1.0X -InSet expression 35 41 7 289.5 3.5 1.5X +In expression 31 35 5 326.8 3.1 1.0X +InSet expression 50 54 3 200.1 5.0 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 82 88 7 122.3 8.2 1.0X -InSet expression 45 48 3 223.8 4.5 1.8X +In expression 59 62 2 168.2 5.9 1.0X +InSet expression 66 75 11 150.9 6.6 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 136 140 3 73.3 13.6 1.0X -InSet expression 64 67 4 157.3 6.4 2.1X +In expression 97 103 4 103.0 9.7 1.0X +InSet expression 79 83 3 126.7 7.9 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 247 250 3 40.6 24.7 1.0X -InSet expression 105 107 2 95.3 10.5 2.4X +In expression 171 176 4 58.4 17.1 1.0X +InSet expression 107 112 4 93.5 10.7 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 410 416 5 24.4 41.0 1.0X -InSet expression 183 188 3 54.7 18.3 2.2X +In expression 387 403 13 25.8 38.7 1.0X +InSet expression 188 196 8 53.3 18.8 2.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 26 30 5 380.3 2.6 1.0X -InSet expression 20 22 4 508.5 2.0 1.3X +In expression 26 31 4 379.5 2.6 1.0X +InSet expression 73 78 4 137.2 7.3 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 39 4 290.0 3.4 1.0X -InSet expression 21 25 4 472.2 2.1 1.6X +In expression 36 40 4 278.6 3.6 1.0X +InSet expression 87 91 3 114.3 8.7 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 57 60 4 175.0 5.7 1.0X -InSet expression 21 25 5 466.1 2.1 2.7X +In expression 59 63 6 170.4 5.9 
1.0X +InSet expression 87 91 3 115.0 8.7 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 101 106 8 98.6 10.1 1.0X -InSet expression 21 24 3 465.5 2.1 4.7X +In expression 99 103 3 101.2 9.9 1.0X +InSet expression 119 125 9 84.2 11.9 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 183 186 3 54.6 18.3 1.0X -InSet expression 22 24 3 458.7 2.2 8.4X +In expression 182 186 4 54.8 18.2 1.0X +InSet expression 105 111 4 95.2 10.5 1.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 364 370 4 27.4 36.4 1.0X -InSet expression 23 26 4 427.7 2.3 15.6X +In expression 343 356 14 29.1 34.3 1.0X +InSet expression 117 123 3 85.1 11.7 2.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 886 902 12 11.3 88.6 1.0X -InSet expression 25 27 3 398.9 2.5 35.4X +In expression 523 540 10 19.1 52.3 1.0X +InSet expression 129 136 4 77.6 12.9 4.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 656 707 61 15.2 65.6 1.0X -InSet expression 26 30 4 383.1 2.6 25.1X +In expression 683 709 21 14.6 68.3 1.0X +InSet expression 142 149 6 70.7 14.2 4.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 800 802 3 12.5 80.0 1.0X -InSet expression 193 195 4 51.9 19.3 4.2X +In expression 1370 1396 18 7.3 137.0 1.0X +InSet expression 161 167 4 62.1 16.1 8.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 23 25 3 435.4 2.3 1.0X -InSet expression 25 27 3 404.5 2.5 0.9X +In expression 25 28 3 394.1 2.5 1.0X +InSet expression 69 72 3 145.8 6.9 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 33 6 339.8 2.9 1.0X -InSet expression 27 30 6 368.3 2.7 1.1X +In expression 31 35 5 318.9 3.1 1.0X +InSet expression 89 92 3 112.2 8.9 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 61 65 4 163.6 6.1 1.0X -InSet expression 34 37 5 296.4 3.4 1.8X +In expression 57 61 4 176.3 5.7 1.0X +InSet expression 112 116 6 89.6 11.2 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 106 109 4 94.7 10.6 1.0X -InSet expression 35 38 3 282.5 3.5 3.0X +In expression 101 104 2 99.0 10.1 1.0X +InSet expression 122 128 3 81.8 12.2 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 200 202 3 50.0 20.0 1.0X -InSet expression 41 43 3 245.9 4.1 4.9X +In expression 178 183 5 56.3 17.8 1.0X +InSet expression 109 114 3 91.7 10.9 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 380 383 4 26.3 38.0 1.0X -InSet expression 46 51 6 215.3 4.6 8.2X +In expression 315 323 9 31.7 31.5 1.0X +InSet expression 115 119 2 86.8 11.5 2.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 565 567 4 17.7 56.5 1.0X -InSet expression 50 54 5 200.4 5.0 11.3X +In expression 478 483 4 20.9 47.8 1.0X +InSet expression 125 131 4 80.3 12.5 3.8X -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 756 757 1 13.2 75.6 1.0X -InSet expression 54 56 3 185.5 5.4 14.0X +In expression 628 649 12 15.9 62.8 1.0X +InSet expression 140 145 8 71.6 14.0 4.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 883 885 2 11.3 88.3 1.0X -InSet expression 191 194 2 52.5 19.1 4.6X +In expression 999 1005 5 10.0 99.9 1.0X +InSet expression 153 157 4 65.2 15.3 6.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 19 20 2 527.4 1.9 1.0X -InSet expression 16 18 3 635.6 1.6 1.2X +In expression 23 25 3 427.9 2.3 1.0X +InSet expression 72 76 2 138.3 7.2 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 26 30 3 390.3 2.6 1.0X -InSet expression 16 18 3 637.0 1.6 1.6X +In expression 31 34 4 324.0 3.1 1.0X +InSet expression 88 92 3 113.2 8.8 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 53 59 3 189.4 5.3 1.0X -InSet expression 16 17 2 639.7 1.6 3.4X +In expression 62 66 3 161.3 6.2 1.0X +InSet expression 88 92 3 113.7 8.8 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 98 103 3 102.1 9.8 1.0X -InSet expression 16 18 3 626.5 1.6 6.1X +In expression 104 108 3 96.2 10.4 1.0X +InSet expression 123 127 3 81.5 12.3 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 171 173 3 58.5 17.1 1.0X -InSet 
expression 17 18 2 602.2 1.7 10.3X +In expression 179 184 4 55.9 17.9 1.0X +InSet expression 105 111 8 95.4 10.5 1.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 328 332 5 30.4 32.8 1.0X -InSet expression 19 22 3 516.2 1.9 17.0X +In expression 327 330 3 30.6 32.7 1.0X +InSet expression 113 118 3 88.2 11.3 2.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 486 491 5 20.6 48.6 1.0X -InSet expression 20 23 3 490.2 2.0 23.8X +In expression 473 477 4 21.1 47.3 1.0X +InSet expression 123 129 4 81.0 12.3 3.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 643 650 5 15.6 64.3 1.0X -InSet expression 23 25 4 442.7 2.3 28.5X +In expression 614 635 19 16.3 61.4 1.0X +InSet expression 137 142 3 72.8 13.7 4.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 801 804 6 12.5 80.1 1.0X -InSet expression 168 171 3 59.6 16.8 4.8X +In expression 780 788 9 12.8 78.0 1.0X +InSet expression 151 157 3 66.2 15.1 5.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 14 16 2 707.0 1.4 1.0X -InSet expression 11 12 2 897.1 1.1 1.3X +In expression 19 21 3 530.8 1.9 1.0X +InSet expression 70 74 2 143.5 7.0 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 22 24 2 447.6 2.2 1.0X -InSet expression 21 23 3 465.2 2.1 1.0X +In expression 26 29 4 387.9 2.6 1.0X +InSet expression 90 93 2 111.5 9.0 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 47 49 3 212.5 4.7 1.0X -InSet expression 18 20 3 544.5 1.8 2.6X +In expression 52 56 4 193.0 5.2 1.0X +InSet expression 99 103 2 100.8 9.9 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 87 88 2 115.0 8.7 1.0X -InSet expression 19 21 3 527.5 1.9 4.6X +In expression 92 96 3 108.4 9.2 1.0X +InSet expression 121 125 4 82.9 12.1 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 168 169 3 59.7 16.8 1.0X -InSet expression 19 21 3 516.5 1.9 8.7X +In expression 168 174 5 59.5 16.8 1.0X +InSet expression 110 115 3 90.8 11.0 1.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 323 328 4 30.9 32.3 1.0X -InSet expression 21 22 3 480.7 2.1 15.6X +In expression 318 329 8 31.4 31.8 1.0X +InSet expression 114 118 3 87.6 11.4 2.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 487 490 5 20.5 48.7 1.0X -InSet expression 25 27 3 394.8 2.5 19.2X +In expression 467 473 5 21.4 46.7 1.0X +InSet expression 124 128 3 80.7 12.4 3.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 644 647 2 15.5 64.4 1.0X -InSet expression 25 27 3 404.5 2.5 26.0X +In expression 620 628 5 16.1 62.0 1.0X +InSet expression 135 143 7 74.1 13.5 4.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 800 837 69 12.5 80.0 1.0X -InSet expression 167 172 5 60.0 16.7 4.8X +In expression 772 804 60 12.9 77.2 1.0X +InSet expression 147 153 4 68.1 14.7 5.3X -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 18 19 2 564.4 1.8 1.0X -InSet expression 81 83 2 123.3 8.1 0.2X +In expression 17 19 3 576.6 1.7 1.0X +InSet expression 67 70 2 149.6 6.7 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 24 26 2 414.0 2.4 1.0X -InSet expression 96 100 5 104.3 9.6 0.3X +In expression 27 29 3 377.2 2.7 1.0X +InSet expression 85 88 2 117.5 8.5 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 50 52 3 201.2 5.0 1.0X -InSet expression 97 100 2 102.8 9.7 0.5X +In expression 47 51 3 212.0 4.7 1.0X +InSet expression 83 87 3 120.8 8.3 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 89 93 3 112.8 8.9 1.0X -InSet expression 129 132 2 77.6 12.9 0.7X +In expression 86 90 3 116.7 8.6 1.0X +InSet expression 117 122 2 85.4 11.7 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 168 175 5 59.7 16.8 1.0X -InSet expression 115 117 2 86.8 11.5 1.5X +In expression 160 166 3 62.4 16.0 1.0X +InSet expression 100 104 3 100.4 10.0 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 328 332 6 30.5 32.8 1.0X -InSet expression 123 125 2 81.5 12.3 2.7X +In expression 309 321 10 32.3 30.9 1.0X +InSet expression 106 109 2 94.6 10.6 2.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 35 39 4 284.2 3.5 1.0X -InSet expression 104 106 3 96.5 
10.4 0.3X +In expression 34 37 3 294.2 3.4 1.0X +InSet expression 70 74 2 142.0 7.0 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 59 62 3 168.5 5.9 1.0X -InSet expression 133 135 2 75.2 13.3 0.4X +In expression 55 59 3 180.6 5.5 1.0X +InSet expression 89 94 3 112.4 8.9 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 147 149 3 68.1 14.7 1.0X -InSet expression 130 132 2 76.8 13.0 1.1X +In expression 135 138 2 74.2 13.5 1.0X +InSet expression 91 96 3 110.1 9.1 1.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 226 229 4 44.3 22.6 1.0X -InSet expression 186 189 3 53.8 18.6 1.2X +In expression 225 228 3 44.4 22.5 1.0X +InSet expression 131 137 5 76.4 13.1 1.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 380 385 4 26.3 38.0 1.0X -InSet expression 146 148 3 68.7 14.6 2.6X +In expression 378 387 8 26.5 37.8 1.0X +InSet expression 108 112 3 92.8 10.8 3.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 2090 2139 28 4.8 209.0 1.0X -InSet expression 150 152 1 66.6 15.0 13.9X +In expression 1764 1808 77 5.7 176.4 1.0X +InSet expression 110 113 2 91.0 11.0 16.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 33 35 3 306.5 3.3 1.0X -InSet expression 117 119 1 85.5 11.7 0.3X +In expression 31 34 3 318.3 3.1 1.0X +InSet expression 73 78 6 137.3 7.3 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 60 63 3 167.6 6.0 1.0X -InSet expression 145 146 1 68.9 14.5 0.4X +In expression 57 61 3 175.1 5.7 1.0X +InSet expression 91 96 3 109.3 9.1 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 150 153 3 66.5 15.0 1.0X -InSet expression 141 143 2 71.0 14.1 1.1X +In expression 138 143 4 72.4 13.8 1.0X +InSet expression 92 95 3 108.5 9.2 1.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 226 229 3 44.2 22.6 1.0X -InSet expression 193 195 2 51.7 19.3 1.2X +In expression 211 215 4 47.4 21.1 1.0X +InSet expression 136 140 4 73.3 13.6 1.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 383 387 4 26.1 38.3 1.0X -InSet expression 157 158 1 63.7 15.7 2.4X +In expression 366 374 7 27.3 36.6 1.0X +InSet expression 111 115 2 90.1 11.1 3.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 2213 2250 73 4.5 221.3 1.0X -InSet expression 159 162 2 62.8 15.9 13.9X +In expression 2083 2171 91 4.8 208.3 1.0X +InSet expression 111 116 3 90.2 11.1 18.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 19 20 1 52.4 19.1 1.0X -InSet expression 50 52 2 20.1 49.7 0.4X +In expression 18 20 2 55.8 17.9 1.0X +InSet expression 53 56 2 18.9 52.9 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 25 26 1 40.4 24.7 1.0X -InSet expression 51 54 5 19.4 51.5 0.5X +In expression 23 25 2 42.9 23.3 1.0X +InSet expression 55 57 2 18.3 54.7 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server 
VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 39 41 1 25.6 39.1 1.0X -InSet expression 52 55 1 19.1 52.5 0.7X +In expression 37 39 2 27.0 37.0 1.0X +InSet expression 56 58 2 17.9 55.9 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 77 78 1 13.0 76.8 1.0X -InSet expression 58 60 1 17.2 58.1 1.3X +In expression 73 78 3 13.7 73.0 1.0X +InSet expression 61 64 3 16.5 60.6 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 240 241 1 4.2 239.5 1.0X -InSet expression 56 58 1 17.8 56.0 4.3X +In expression 224 227 3 4.5 223.7 1.0X +InSet expression 59 61 1 16.9 59.1 3.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 570 585 31 1.8 569.9 1.0X -InSet expression 57 59 1 17.5 57.2 10.0X +In expression 538 545 4 1.9 538.4 1.0X +InSet expression 60 63 2 16.5 60.5 8.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 4 6 2 222.2 4.5 1.0X -InSet expression 4 5 2 227.8 4.4 1.0X +In expression 5 6 2 218.3 4.6 1.0X +InSet expression 4 6 2 224.6 4.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 2 217.8 4.6 1.0X -InSet expression 4 5 2 225.6 4.4 1.0X +In expression 5 6 2 219.5 4.6 1.0X +InSet expression 5 6 2 220.7 4.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 2 204.1 4.9 1.0X -InSet expression 5 6 2 201.2 5.0 1.0X +In expression 5 6 2 201.5 
5.0 1.0X +InSet expression 5 6 2 203.5 4.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 6 7 2 172.2 5.8 1.0X -InSet expression 6 7 2 174.6 5.7 1.0X +In expression 6 7 2 174.7 5.7 1.0X +InSet expression 6 6 2 175.7 5.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 8 9 2 133.3 7.5 1.0X -InSet expression 8 9 2 131.1 7.6 1.0X +In expression 7 9 2 137.7 7.3 1.0X +InSet expression 7 8 2 137.6 7.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 11 12 2 91.7 10.9 1.0X -InSet expression 11 13 2 91.9 10.9 1.0X +In expression 11 12 2 93.7 10.7 1.0X +InSet expression 11 12 1 94.2 10.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 28 29 1 35.5 28.2 1.0X -InSet expression 39 40 1 25.5 39.2 0.7X +In expression 27 29 3 36.9 27.1 1.0X +InSet expression 37 39 1 26.8 37.4 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 33 35 3 30.3 33.0 1.0X -InSet expression 42 43 1 23.9 41.9 0.8X +In expression 32 33 1 31.4 31.9 1.0X +InSet expression 40 42 2 25.0 40.0 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 42 45 4 23.7 42.3 1.0X -InSet expression 46 48 1 21.6 46.2 0.9X +In expression 40 42 2 25.1 39.9 1.0X +InSet expression 44 46 1 22.5 44.5 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In 
expression 57 59 1 17.4 57.4 1.0X -InSet expression 51 52 1 19.8 50.6 1.1X +In expression 54 56 2 18.4 54.3 1.0X +InSet expression 47 49 1 21.2 47.3 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 88 89 1 11.4 87.7 1.0X -InSet expression 47 49 1 21.2 47.2 1.9X +In expression 102 109 4 9.8 102.3 1.0X +InSet expression 44 47 3 22.5 44.4 2.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 378 379 1 2.6 377.6 1.0X -InSet expression 48 49 1 20.9 47.8 7.9X +In expression 367 372 5 2.7 366.5 1.0X +InSet expression 45 46 2 22.4 44.7 8.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 17 19 2 585.1 1.7 1.0X -InSet expression 90 92 2 111.0 9.0 0.2X +In expression 15 17 2 647.4 1.5 1.0X +InSet expression 79 81 2 127.2 7.9 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 23 24 2 439.2 2.3 1.0X -InSet expression 101 103 1 99.5 10.1 0.2X +In expression 21 22 2 482.1 2.1 1.0X +InSet expression 89 91 2 112.7 8.9 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 48 50 3 210.5 4.8 1.0X -InSet expression 129 132 3 77.5 12.9 0.4X +In expression 44 48 4 227.0 4.4 1.0X +InSet expression 114 118 3 87.5 11.4 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 81 83 2 123.0 8.1 1.0X -InSet expression 153 156 2 65.2 15.3 0.5X +In expression 75 77 2 133.7 7.5 1.0X +InSet expression 138 141 2 72.2 13.8 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 153 154 2 65.5 15.3 1.0X -InSet expression 134 135 1 74.7 13.4 1.1X +In expression 140 143 3 71.5 14.0 1.0X +InSet expression 121 128 5 82.4 12.1 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 295 307 11 33.9 29.5 1.0X -InSet expression 132 134 1 75.7 13.2 2.2X +In expression 286 296 9 35.0 28.6 1.0X +InSet expression 120 124 3 83.3 12.0 2.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 376 377 2 26.6 37.6 1.0X -InSet expression 374 379 3 26.7 37.4 1.0X +In expression 297 300 3 33.6 29.7 1.0X +InSet expression 299 303 6 33.5 29.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 374 377 2 26.7 37.4 1.0X -InSet expression 373 377 4 26.8 37.3 1.0X +In expression 302 306 3 33.1 30.2 1.0X +InSet expression 296 301 5 33.8 29.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 406 413 5 24.6 40.6 1.0X -InSet expression 373 376 2 26.8 37.3 1.1X +In expression 329 344 18 30.4 32.9 1.0X +InSet expression 294 300 4 34.0 29.4 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 477 482 3 21.0 47.7 1.0X -InSet expression 371 376 4 27.0 37.1 1.3X +In expression 375 378 2 26.7 37.5 1.0X +InSet expression 297 306 5 33.7 29.7 1.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 564 568 3 17.7 56.4 1.0X -InSet expression 373 377 3 26.8 37.3 1.5X +In expression 461 467 6 21.7 46.1 1.0X +InSet expression 294 307 8 34.1 29.4 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit 
Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 716 722 6 14.0 71.6 1.0X -InSet expression 378 380 2 26.4 37.8 1.9X +In expression 609 622 9 16.4 60.9 1.0X +InSet expression 296 300 4 33.8 29.6 2.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 877 880 2 11.4 87.7 1.0X -InSet expression 378 381 3 26.5 37.8 2.3X +In expression 775 785 8 12.9 77.5 1.0X +InSet expression 296 299 3 33.8 29.6 2.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1033 1039 6 9.7 103.3 1.0X -InSet expression 382 384 2 26.2 38.2 2.7X +In expression 925 940 14 10.8 92.5 1.0X +InSet expression 300 308 6 33.3 30.0 3.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1198 1203 4 8.4 119.8 1.0X -InSet expression 468 469 1 21.4 46.8 2.6X +In expression 1079 1103 15 9.3 107.9 1.0X +InSet expression 396 401 5 25.3 39.6 2.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 25 27 2 39.5 25.3 1.0X -InSet expression 58 61 5 17.3 57.9 0.4X +In expression 23 25 1 42.8 23.3 1.0X +InSet expression 53 56 2 18.8 53.3 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 40 42 2 25.2 39.7 1.0X -InSet expression 85 87 2 11.7 85.4 0.5X +In expression 37 38 2 27.4 36.5 1.0X +InSet expression 80 83 2 12.4 80.4 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 173 174 2 5.8 172.8 1.0X -InSet expression 100 102 1 10.0 100.2 1.7X +In expression 158 164 4 
6.3 157.9 1.0X +InSet expression 95 99 3 10.5 95.4 1.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 377 383 5 2.6 377.4 1.0X -InSet expression 130 132 2 7.7 129.6 2.9X +In expression 355 361 5 2.8 355.2 1.0X +InSet expression 123 128 4 8.1 123.1 2.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 818 821 3 1.2 818.0 1.0X -InSet expression 145 147 2 6.9 144.6 5.7X +In expression 763 766 3 1.3 763.2 1.0X +InSet expression 139 141 2 7.2 138.8 5.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1689 1871 325 0.6 1689.0 1.0X -InSet expression 162 164 2 6.2 162.0 10.4X +In expression 1623 1803 325 0.6 1623.3 1.0X +InSet expression 155 163 6 6.5 154.8 10.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 17 18 2 60.4 16.6 1.0X -InSet expression 87 88 2 11.5 86.9 0.2X +In expression 15 17 2 65.4 15.3 1.0X +InSet expression 76 81 2 13.2 76.0 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 22 25 2 44.5 22.5 1.0X -InSet expression 134 136 2 7.5 133.6 0.2X +In expression 21 22 1 47.3 21.1 1.0X +InSet expression 115 121 3 8.7 115.0 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 71 73 4 14.1 71.0 1.0X -InSet expression 158 160 2 6.3 157.7 0.5X +In expression 65 72 3 15.5 64.6 1.0X +InSet expression 134 143 6 7.4 134.3 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 217 219 2 4.6 217.4 1.0X -InSet expression 205 210 6 4.9 204.9 1.1X +In expression 195 197 2 5.1 194.6 1.0X +InSet expression 179 189 10 5.6 179.2 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 554 555 2 1.8 553.7 1.0X -InSet expression 229 232 2 4.4 229.4 2.4X +In expression 516 523 6 1.9 516.2 1.0X +InSet expression 204 209 3 4.9 203.5 2.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1378 1482 221 0.7 1378.1 1.0X -InSet expression 266 270 4 3.8 266.2 5.2X +In expression 1322 1432 221 0.8 1321.9 1.0X +InSet expression 225 230 4 4.5 224.7 5.9X diff --git a/sql/core/benchmarks/InExpressionBenchmark-results.txt b/sql/core/benchmarks/InExpressionBenchmark-results.txt index 8a432b1657356..e3529cd7f9cda 100644 --- a/sql/core/benchmarks/InExpressionBenchmark-results.txt +++ b/sql/core/benchmarks/InExpressionBenchmark-results.txt @@ -2,739 +2,739 @@ In Expression Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 49 71 13 205.7 4.9 1.0X -InSet expression 33 40 5 298.9 3.3 1.5X +In expression 30 36 6 334.4 3.0 1.0X +InSet expression 69 73 5 145.1 6.9 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 45 52 9 222.3 4.5 1.0X -InSet expression 34 40 6 292.1 3.4 1.3X +In expression 34 37 4 295.6 3.4 1.0X +InSet expression 67 71 3 148.6 6.7 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 82 85 2 122.6 8.2 1.0X -InSet expression 42 46 3 235.8 4.2 1.9X +In expression 63 65 2 159.9 6.3 1.0X +InSet expression 75 79 3 133.0 7.5 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 bytes: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 140 143 2 71.2 14.0 1.0X -InSet expression 62 65 3 160.4 6.2 2.3X +In expression 103 108 4 96.8 10.3 1.0X +InSet expression 83 85 2 120.5 8.3 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 249 251 1 40.1 24.9 1.0X -InSet expression 104 107 2 96.1 10.4 2.4X +In expression 192 195 3 52.0 19.2 1.0X +InSet expression 98 99 2 102.5 9.8 2.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 bytes: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 410 412 3 24.4 41.0 1.0X -InSet expression 179 182 2 55.9 17.9 2.3X +In expression 396 402 7 25.2 39.6 1.0X +InSet expression 205 211 5 48.9 20.5 1.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 25 29 4 392.7 2.5 1.0X -InSet expression 21 23 2 469.8 2.1 1.2X +In expression 29 31 2 346.8 2.9 1.0X +InSet expression 95 98 2 105.0 9.5 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 37 4 290.8 3.4 1.0X -InSet expression 21 24 4 467.3 2.1 1.6X +In expression 35 38 2 282.3 3.5 1.0X +InSet expression 109 112 2 91.5 10.9 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 57 59 3 176.2 5.7 1.0X -InSet expression 21 25 5 466.0 2.1 2.6X +In expression 61 63 3 164.9 6.1 1.0X +InSet expression 109 113 2 91.5 10.9 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 98 100 2 102.1 9.8 1.0X -InSet expression 22 24 4 461.8 2.2 4.5X +In expression 106 109 4 94.5 10.6 1.0X +InSet expression 141 144 3 70.9 14.1 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 
64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 192 193 2 52.0 19.2 1.0X -InSet expression 22 24 2 448.5 2.2 8.6X +In expression 185 188 5 54.1 18.5 1.0X +InSet expression 127 130 3 78.8 12.7 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 367 370 2 27.2 36.7 1.0X -InSet expression 23 25 3 437.4 2.3 16.1X +In expression 365 366 1 27.4 36.5 1.0X +InSet expression 138 141 3 72.2 13.8 2.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 545 549 4 18.4 54.5 1.0X -InSet expression 26 28 3 387.9 2.6 21.1X +In expression 549 553 5 18.2 54.9 1.0X +InSet expression 154 156 3 65.1 15.4 3.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 709 714 4 14.1 70.9 1.0X -InSet expression 26 30 4 382.2 2.6 27.1X +In expression 669 676 4 15.0 66.9 1.0X +InSet expression 167 170 3 60.0 16.7 4.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 shorts: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 882 886 5 11.3 88.2 1.0X -InSet expression 163 165 1 61.4 16.3 5.4X +In expression 931 932 2 10.7 93.1 1.0X +InSet expression 182 184 1 55.0 18.2 5.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 23 24 2 435.1 2.3 1.0X -InSet expression 24 27 5 416.2 2.4 1.0X +In expression 28 31 3 355.0 2.8 1.0X +InSet expression 93 95 2 107.8 9.3 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 32 35 2 308.7 3.2 1.0X -InSet expression 29 31 3 
346.1 2.9 1.1X +In expression 34 36 2 290.2 3.4 1.0X +InSet expression 110 112 2 90.9 11.0 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 57 59 1 174.2 5.7 1.0X -InSet expression 32 34 2 311.2 3.2 1.8X +In expression 63 65 2 157.6 6.3 1.0X +InSet expression 131 132 1 76.6 13.1 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 99 100 1 101.0 9.9 1.0X -InSet expression 39 40 2 257.5 3.9 2.5X +In expression 115 117 2 86.8 11.5 1.0X +InSet expression 147 149 1 68.0 14.7 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 187 190 6 53.3 18.7 1.0X -InSet expression 45 47 4 224.0 4.5 4.2X +In expression 195 198 6 51.3 19.5 1.0X +InSet expression 130 133 2 76.8 13.0 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 366 367 1 27.4 36.6 1.0X -InSet expression 46 47 2 219.6 4.6 8.0X +In expression 354 356 2 28.3 35.4 1.0X +InSet expression 135 138 2 73.9 13.5 2.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 539 542 3 18.6 53.9 1.0X -InSet expression 51 52 2 197.6 5.1 10.7X +In expression 501 509 10 19.9 50.1 1.0X +InSet expression 151 153 2 66.4 15.1 3.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 shorts (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 713 716 3 14.0 71.3 1.0X -InSet expression 56 58 2 177.6 5.6 12.7X +In expression 666 669 4 15.0 66.6 1.0X +InSet expression 165 167 3 60.7 16.5 4.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 shorts (non-compact): Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1355 1373 20 7.4 135.5 1.0X -InSet expression 163 165 1 61.3 16.3 8.3X +In expression 858 861 4 11.7 85.8 1.0X +InSet expression 178 181 3 56.2 17.8 4.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 22 24 1 448.3 2.2 1.0X -InSet expression 19 20 1 529.9 1.9 1.2X +In expression 25 27 3 397.2 2.5 1.0X +InSet expression 94 97 4 106.1 9.4 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 30 1 350.0 2.9 1.0X -InSet expression 19 20 2 527.6 1.9 1.5X +In expression 34 35 2 294.5 3.4 1.0X +InSet expression 109 111 1 91.4 10.9 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 54 58 3 183.9 5.4 1.0X -InSet expression 19 20 2 528.7 1.9 2.9X +In expression 60 61 1 166.1 6.0 1.0X +InSet expression 112 114 1 89.2 11.2 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 92 99 4 108.1 9.2 1.0X -InSet expression 20 22 4 511.6 2.0 4.7X +In expression 114 115 1 87.6 11.4 1.0X +InSet expression 144 146 1 69.2 14.4 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 175 176 2 57.3 17.5 1.0X -InSet expression 20 21 2 499.8 2.0 8.7X +In expression 194 195 1 51.5 19.4 1.0X +InSet expression 126 128 1 79.5 12.6 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 329 341 27 30.4 32.9 1.0X -InSet expression 21 23 2 466.6 2.1 15.3X +In expression 353 357 6 28.3 35.3 1.0X +InSet expression 136 138 2 73.6 13.6 2.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 486 487 2 20.6 48.6 1.0X -InSet expression 23 25 3 441.1 2.3 21.4X +In expression 511 513 3 19.6 51.1 1.0X +InSet expression 149 151 2 67.2 14.9 3.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 644 651 6 15.5 64.4 1.0X -InSet expression 24 26 3 420.0 2.4 27.0X +In expression 660 677 25 15.2 66.0 1.0X +InSet expression 162 165 3 61.6 16.2 4.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 800 827 56 12.5 80.0 1.0X -InSet expression 156 159 3 63.9 15.6 5.1X +In expression 842 849 10 11.9 84.2 1.0X +InSet expression 175 178 2 57.2 17.5 4.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 14 16 2 702.0 1.4 1.0X -InSet expression 12 14 2 819.4 1.2 1.2X +In expression 22 24 2 459.4 2.2 1.0X +InSet expression 91 94 2 109.4 9.1 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 23 24 1 441.9 2.3 1.0X -InSet expression 18 19 2 545.1 1.8 1.2X +In expression 28 29 2 354.7 2.8 1.0X +InSet expression 107 109 1 93.5 10.7 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 47 48 1 212.6 4.7 1.0X -InSet expression 16 17 2 644.2 1.6 3.0X +In expression 56 57 2 179.5 5.6 1.0X +InSet expression 122 124 1 82.0 12.2 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 88 90 1 113.2 8.8 1.0X -InSet 
expression 17 19 3 575.2 1.7 5.1X +In expression 100 102 2 99.8 10.0 1.0X +InSet expression 142 144 1 70.4 14.2 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 168 168 1 59.6 16.8 1.0X -InSet expression 20 20 1 511.4 2.0 8.6X +In expression 180 182 3 55.6 18.0 1.0X +InSet expression 129 132 3 77.5 12.9 1.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 325 328 2 30.7 32.5 1.0X -InSet expression 21 22 2 474.6 2.1 15.4X +In expression 346 348 2 28.9 34.6 1.0X +InSet expression 134 138 4 74.4 13.4 2.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 488 490 3 20.5 48.8 1.0X -InSet expression 26 27 1 391.7 2.6 19.1X +In expression 507 508 1 19.7 50.7 1.0X +InSet expression 148 150 1 67.7 14.8 3.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 648 651 2 15.4 64.8 1.0X -InSet expression 30 32 3 332.4 3.0 21.5X +In expression 666 669 2 15.0 66.6 1.0X +InSet expression 161 163 1 62.1 16.1 4.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 ints (non-compact): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 800 832 64 12.5 80.0 1.0X -InSet expression 155 157 1 64.3 15.5 5.1X +In expression 824 861 78 12.1 82.4 1.0X +InSet expression 173 176 2 57.7 17.3 4.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 20 21 1 506.8 2.0 1.0X -InSet expression 88 90 2 114.1 8.8 0.2X +In expression 21 22 2 486.8 2.1 1.0X +InSet expression 86 88 2 115.8 8.6 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 longs: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 28 29 1 361.9 2.8 1.0X -InSet expression 102 104 1 98.2 10.2 0.3X +In expression 28 30 2 350.9 2.8 1.0X +InSet expression 101 104 4 98.7 10.1 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 55 69 21 181.0 5.5 1.0X -InSet expression 105 106 1 95.6 10.5 0.5X +In expression 53 55 2 188.7 5.3 1.0X +InSet expression 104 106 2 96.3 10.4 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 95 96 2 105.3 9.5 1.0X -InSet expression 134 136 1 74.6 13.4 0.7X +In expression 92 95 2 108.7 9.2 1.0X +InSet expression 134 136 2 74.9 13.4 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 170 171 1 58.7 17.0 1.0X -InSet expression 116 118 1 86.4 11.6 1.5X +In expression 172 174 4 58.2 17.2 1.0X +InSet expression 117 119 2 85.5 11.7 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 328 342 13 30.5 32.8 1.0X -InSet expression 127 128 1 78.9 12.7 2.6X +In expression 329 334 5 30.4 32.9 1.0X +InSet expression 126 127 1 79.5 12.6 2.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 35 36 1 285.5 3.5 1.0X -InSet expression 95 97 2 105.4 9.5 0.4X +In expression 35 36 1 282.0 3.5 1.0X +InSet expression 98 100 2 102.5 9.8 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 62 64 2 162.3 6.2 1.0X -InSet expression 114 116 1 87.6 11.4 0.5X +In expression 64 66 5 157.0 6.4 1.0X +InSet expression 118 119 1 85.1 11.8 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 126 127 1 79.4 12.6 1.0X -InSet expression 116 117 2 86.6 11.6 1.1X +In expression 127 129 2 78.7 12.7 1.0X +InSet expression 118 120 1 84.7 11.8 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 205 206 1 48.8 20.5 1.0X -InSet expression 157 161 4 63.6 15.7 1.3X +In expression 209 211 2 47.8 20.9 1.0X +InSet expression 159 161 4 63.0 15.9 1.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 365 367 5 27.4 36.5 1.0X -InSet expression 128 130 3 78.2 12.8 2.9X +In expression 366 379 26 27.3 36.6 1.0X +InSet expression 132 135 4 75.8 13.2 2.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 floats: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1676 1879 169 6.0 167.6 1.0X -InSet expression 133 135 1 74.9 13.3 12.6X +In expression 1675 1744 125 6.0 167.5 1.0X +InSet expression 135 137 3 74.3 13.5 12.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 36 3 292.7 3.4 1.0X -InSet expression 96 97 2 104.5 9.6 0.4X +In expression 39 41 2 254.3 3.9 1.0X +InSet expression 114 116 1 87.6 11.4 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 61 62 1 164.3 6.1 1.0X -InSet expression 116 118 2 86.6 11.6 0.5X +In expression 62 63 1 161.3 6.2 1.0X +InSet expression 143 147 8 69.9 14.3 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 127 129 1 78.5 12.7 1.0X -InSet expression 118 120 2 
84.7 11.8 1.1X +In expression 128 129 2 78.0 12.8 1.0X +InSet expression 143 147 6 69.8 14.3 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 207 208 1 48.2 20.7 1.0X -InSet expression 157 160 2 63.5 15.7 1.3X +In expression 209 211 3 47.9 20.9 1.0X +InSet expression 195 197 3 51.2 19.5 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 365 365 1 27.4 36.5 1.0X -InSet expression 128 129 1 78.3 12.8 2.9X +In expression 366 368 2 27.3 36.6 1.0X +InSet expression 152 154 1 65.8 15.2 2.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 doubles: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1961 2009 82 5.1 196.1 1.0X -InSet expression 136 138 1 73.6 13.6 14.4X +In expression 1922 1995 100 5.2 192.2 1.0X +InSet expression 155 157 1 64.7 15.5 12.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 19 20 1 52.1 19.2 1.0X -InSet expression 55 57 2 18.0 55.4 0.3X +In expression 20 21 1 50.0 20.0 1.0X +InSet expression 60 61 1 16.7 59.7 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 24 25 2 41.4 24.2 1.0X -InSet expression 57 59 1 17.4 57.4 0.4X +In expression 25 28 3 39.3 25.5 1.0X +InSet expression 62 64 1 16.2 61.9 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 39 40 2 25.7 38.9 1.0X -InSet expression 58 59 1 17.3 57.9 0.7X +In expression 40 42 3 24.7 40.4 1.0X +InSet expression 62 65 3 16.1 62.0 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -In expression 74 76 3 13.4 74.5 1.0X -InSet expression 62 64 2 16.1 62.1 1.2X +In expression 78 80 1 12.8 78.3 1.0X +InSet expression 67 69 1 14.9 67.0 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 236 238 1 4.2 236.3 1.0X -InSet expression 61 62 1 16.5 60.5 3.9X +In expression 236 333 252 4.2 235.9 1.0X +InSet expression 66 70 7 15.1 66.3 3.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 small decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 548 605 122 1.8 547.8 1.0X -InSet expression 63 65 1 15.9 62.8 8.7X +In expression 558 613 121 1.8 558.2 1.0X +InSet expression 67 69 1 15.0 66.8 8.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 1 213.0 4.7 1.0X -InSet expression 5 5 1 220.4 4.5 1.0X +In expression 5 6 1 186.8 5.4 1.0X +InSet expression 5 6 1 195.2 5.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 5 1 216.1 4.6 1.0X -InSet expression 5 5 1 216.8 4.6 1.0X +In expression 5 6 1 192.0 5.2 1.0X +InSet expression 5 6 1 191.5 5.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 5 6 1 196.3 5.1 1.0X -InSet expression 5 5 1 197.0 5.1 1.0X +In expression 6 6 1 176.0 5.7 1.0X +InSet expression 6 6 1 178.0 5.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 6 7 1 168.4 5.9 1.0X -InSet expression 6 6 1 167.7 6.0 1.0X +In expression 6 7 1 155.1 6.4 1.0X +InSet expression 6 7 1 155.7 6.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on 
Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 8 8 1 130.3 7.7 1.0X -InSet expression 8 8 1 130.9 7.6 1.0X +In expression 8 9 1 121.9 8.2 1.0X +InSet expression 8 9 1 122.1 8.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 large decimals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 11 12 1 89.4 11.2 1.0X -InSet expression 11 12 1 88.5 11.3 1.0X +In expression 12 12 1 86.0 11.6 1.0X +InSet expression 12 12 1 86.3 11.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 30 1 34.7 28.9 1.0X -InSet expression 42 44 1 23.8 42.1 0.7X +In expression 29 30 1 34.6 28.9 1.0X +InSet expression 43 44 1 23.2 43.0 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 34 35 1 29.1 34.3 1.0X -InSet expression 44 45 1 22.6 44.2 0.8X +In expression 34 35 1 29.2 34.3 1.0X +InSet expression 46 47 2 22.0 45.5 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 42 44 1 23.8 42.1 1.0X -InSet expression 48 50 1 20.6 48.4 0.9X +In expression 43 44 1 23.2 43.1 1.0X +InSet expression 50 52 4 20.0 50.0 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 57 61 8 17.7 56.6 1.0X -InSet expression 53 54 2 19.0 52.6 1.1X +In expression 58 59 1 17.3 57.7 1.0X +InSet expression 54 55 1 18.6 53.8 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 89 90 2 11.3 88.8 1.0X -InSet expression 50 52 3 20.1 49.7 1.8X +In expression 89 90 2 11.3 88.6 1.0X +InSet expression 52 53 2 19.4 51.7 
1.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 390 392 2 2.6 390.0 1.0X -InSet expression 50 52 1 19.8 50.4 7.7X +In expression 383 387 3 2.6 383.3 1.0X +InSet expression 51 54 2 19.5 51.3 7.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 15 16 1 646.9 1.5 1.0X -InSet expression 90 91 1 111.7 9.0 0.2X +In expression 16 17 1 636.1 1.6 1.0X +InSet expression 92 94 2 108.7 9.2 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 22 23 2 457.4 2.2 1.0X -InSet expression 99 100 1 101.0 9.9 0.2X +In expression 22 23 1 447.6 2.2 1.0X +InSet expression 102 103 1 98.4 10.2 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 45 46 1 221.5 4.5 1.0X -InSet expression 125 127 1 79.9 12.5 0.4X +In expression 46 47 1 218.8 4.6 1.0X +InSet expression 127 130 1 78.5 12.7 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 81 82 1 123.8 8.1 1.0X -InSet expression 153 155 3 65.5 15.3 0.5X +In expression 81 83 2 123.6 8.1 1.0X +InSet expression 151 153 2 66.1 15.1 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 153 155 3 65.4 15.3 1.0X -InSet expression 131 140 19 76.2 13.1 1.2X +In expression 153 155 5 65.4 15.3 1.0X +InSet expression 132 135 1 75.5 13.2 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-In expression 297 307 10 33.7 29.7 1.0X -InSet expression 127 130 1 78.4 12.7 2.3X +In expression 297 307 10 33.6 29.7 1.0X +InSet expression 131 132 2 76.6 13.1 2.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 452 455 5 22.1 45.2 1.0X -InSet expression 455 458 5 22.0 45.5 1.0X +In expression 450 455 6 22.2 45.0 1.0X +InSet expression 445 447 2 22.5 44.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 460 461 1 21.7 46.0 1.0X -InSet expression 448 449 1 22.3 44.8 1.0X +In expression 451 456 4 22.2 45.1 1.0X +InSet expression 445 449 3 22.5 44.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 485 487 3 20.6 48.5 1.0X -InSet expression 452 456 4 22.1 45.2 1.1X +In expression 484 487 2 20.6 48.4 1.0X +InSet expression 445 447 2 22.5 44.5 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 544 548 6 18.4 54.4 1.0X -InSet expression 452 454 2 22.1 45.2 1.2X +In expression 533 536 3 18.7 53.3 1.0X +InSet expression 449 450 2 22.3 44.9 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 622 624 2 16.1 62.2 1.0X -InSet expression 454 457 4 22.0 45.4 1.4X +In expression 619 623 6 16.2 61.9 1.0X +InSet expression 447 448 1 22.4 44.7 1.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 783 788 8 12.8 78.3 1.0X -InSet expression 450 455 6 22.2 45.0 1.7X +In expression 779 790 12 12.8 77.9 1.0X +InSet expression 447 454 10 22.3 44.7 1.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 300 dates: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 949 952 3 10.5 94.9 1.0X -InSet expression 455 457 4 22.0 45.5 2.1X +In expression 944 948 4 10.6 94.4 1.0X +InSet expression 455 456 1 22.0 45.5 2.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 400 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1115 1119 5 9.0 111.5 1.0X -InSet expression 462 463 1 21.6 46.2 2.4X +In expression 1105 1107 2 9.0 110.5 1.0X +InSet expression 451 454 3 22.2 45.1 2.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 500 dates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1279 1284 3 7.8 127.9 1.0X -InSet expression 543 544 1 18.4 54.3 2.4X +In expression 1279 1289 6 7.8 127.9 1.0X +InSet expression 542 544 3 18.4 54.2 2.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ In expression 25 26 2 39.7 25.2 1.0X -InSet expression 56 57 1 17.9 55.8 0.5X +InSet expression 57 58 2 17.6 56.7 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 40 41 1 25.3 39.5 1.0X -InSet expression 84 85 1 11.9 83.8 0.5X +In expression 40 41 1 25.0 40.0 1.0X +InSet expression 85 87 3 11.7 85.1 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 148 151 2 6.7 148.5 1.0X -InSet expression 99 101 4 10.1 98.5 1.5X +In expression 158 161 3 6.3 158.3 1.0X +InSet expression 100 103 4 10.0 99.9 1.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 368 370 2 2.7 367.9 1.0X -InSet expression 126 129 4 7.9 126.5 2.9X +In expression 400 401 1 2.5 400.1 1.0X +InSet expression 128 131 2 7.8 128.3 3.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 
17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 753 759 7 1.3 753.5 1.0X -InSet expression 143 145 1 7.0 142.9 5.3X +In expression 751 754 4 1.3 751.0 1.0X +InSet expression 145 147 2 6.9 144.5 5.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 200 arrays: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 1611 1858 317 0.6 1610.9 1.0X -InSet expression 159 161 1 6.3 159.3 10.1X +In expression 1672 1862 278 0.6 1671.8 1.0X +InSet expression 162 164 2 6.2 162.4 10.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 5 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 20 22 2 49.2 20.3 1.0X -InSet expression 81 83 2 12.4 80.8 0.3X +In expression 20 22 2 48.9 20.5 1.0X +InSet expression 81 84 2 12.3 81.3 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 10 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 29 31 2 34.2 29.3 1.0X -InSet expression 122 125 1 8.2 122.5 0.2X +In expression 30 32 3 32.9 30.4 1.0X +InSet expression 124 126 2 8.0 124.5 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 25 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 69 71 3 14.5 69.2 1.0X -InSet expression 144 147 3 6.9 143.9 0.5X +In expression 69 71 1 14.4 69.4 1.0X +InSet expression 146 148 1 6.8 146.5 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 50 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 205 207 3 4.9 204.6 1.0X -InSet expression 186 192 9 5.4 186.2 1.1X +In expression 212 214 3 4.7 212.1 1.0X +InSet expression 190 193 3 5.3 189.9 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 100 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -In expression 501 503 2 2.0 501.2 1.0X -InSet expression 211 212 2 4.7 211.3 2.4X +In expression 
482 492 6 2.1 482.2 1.0X
+InSet expression 214 216 1 4.7 213.8 2.3X

-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
200 structs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-In expression 1209 1412 252 0.8 1208.5 1.0X
-InSet expression 245 246 2 4.1 244.6 4.9X
+In expression 1209 1433 281 0.8 1209.2 1.0X
+InSet expression 245 247 2 4.1 245.0 4.9X

diff --git a/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt b/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt
index bb16529424567..a6cadf2a57c1a 100644
--- a/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/InMemoryColumnarBenchmark-jdk21-results.txt
@@ -2,11 +2,11 @@ Int In-memory with 1000000 rows
================================================================================================
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
-columnar deserialization + columnar-to-row 175 200 28 5.7 175.0 1.0X
-row-based deserialization 132 142 9 7.6 132.3 1.3X
+columnar deserialization + columnar-to-row 156 169 18 6.4 155.6 1.0X
+row-based deserialization 125 176 46 8.0 125.1 1.2X

diff --git a/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt b/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt
index 69ef98c2e5f25..1774d114da136 100644
--- a/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt
+++ b/sql/core/benchmarks/InMemoryColumnarBenchmark-results.txt
@@ -2,11 +2,11 @@ Int In-memory with 1000000 rows
================================================================================================
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
-columnar deserialization + columnar-to-row 177 218 64 5.7 176.6 1.0X
-row-based deserialization 132 190 81 7.6 132.2 1.3X
+columnar deserialization + columnar-to-row 195 199 8 5.1 194.6 1.0X
+row-based deserialization 127 128 1 7.9 127.3 1.5X

diff --git a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt
index bf27e9ce51ead..b2a1034782f1d 100644
--- a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-jdk21-results.txt
@@ -1,8 +1,8 @@
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
dynamic insert table benchmark, totalRows = 200000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
-one partition column, 100 partitions 8818 8868 71 0.0 44088.8 1.0X
-two partition columns, 500 partitions 24601 24662 86 0.0 123006.4 0.4X
-three partition columns, 2000 partitions 66361 66397 51 0.0 331804.1 0.1X
+one partition column, 100 partitions 9816 9873 80 0.0 49079.3 1.0X
+two partition columns, 500 partitions 26057 26309 355 0.0 130285.9 0.4X
+three partition columns, 2000 partitions 72728 72816 124 0.0 363640.3 0.1X

diff --git a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt
index cc4a7661dd8da..b360d3f5d5270 100644
--- a/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt
+++ b/sql/core/benchmarks/InsertTableWithDynamicPartitionsBenchmark-results.txt
@@ -1,8 +1,8 @@
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
dynamic insert table benchmark, totalRows = 200000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
-one partition column, 100 partitions 8858 8907 70 0.0 44289.2 1.0X
-two partition columns, 500 partitions 24244 25085 1189 0.0 121220.9 0.4X
-three partition columns, 2000 partitions 65616 67508 2676 0.0 328079.4 0.1X
+one partition column, 100 partitions 9772 9823 71 0.0 48861.7 1.0X
+two partition columns, 500 partitions 25719 25897 252 0.0 128594.9 0.4X
+three partition columns, 2000 partitions 72019 72199 254 0.0 360097.0 0.1X

diff --git a/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt b/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt
index 2384e04fd5647..06f16fec2065d 100644
--- a/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/IntervalBenchmark-jdk21-results.txt
@@ -1,40 +1,40 @@
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-prepare string w/ interval 415 449 38 2.4 414.7 1.0X
-prepare string w/o interval 376 383 9 2.7 376.3 1.1X
-1 units w/ interval 326 338 15 3.1 326.1 1.3X
-1 units w/o interval 320 321 2 3.1 319.9 1.3X
-2 units w/ interval 474 483 9 2.1 474.2 0.9X
-2 units w/o interval 474 474 1 2.1 473.5 0.9X
-3 units w/ interval 1035 1045 9 1.0 1035.4 0.4X
-3 units w/o interval 1035 1041 8 1.0 1035.2 0.4X
-4 units w/ interval 1287 1292 6 0.8 1287.1 0.3X
-4 units w/o interval 1297 1305 13 0.8 1296.8 0.3X
-5 units w/ interval 1447 1455 7 0.7 1446.8 0.3X
-5 units w/o interval 1445 1451 6 0.7 1445.3 0.3X
-6 units w/ interval 1600 1604 5 0.6 1600.3 0.3X
-6 units w/o interval 1596 1600 7 0.6 1595.9 0.3X
-7 units w/ interval 1839 1848 9 0.5 1838.6 0.2X
-7 units w/o interval 1821 1826 8 0.5 1821.0 0.2X
-8 units w/ interval 2021 2038 26 0.5 2021.3 0.2X
-8 units w/o interval 2095 2109 20 0.5 2095.5 0.2X
-9 units w/ interval 2273 2290 19 0.4 2272.6 0.2X
-9 units w/o interval 2286 2326 47 0.4 2285.8 0.2X
-10 units w/ interval 2884 2899 14 0.3 2884.4 0.1X
-10 units w/o interval 2882 2889 10 0.3 2882.1 0.1X
-11 units w/ interval 3155 3190 50 0.3 3155.2 0.1X
-11 units w/o interval 3032 3076 49 0.3 3032.3 0.1X
+prepare string w/ interval 399 402 5 2.5 399.2 1.0X
+prepare string w/o interval 390 397 13 2.6 389.7 1.0X
+1 units w/ interval 319 320 2 3.1 318.8 1.3X
+1 units w/o interval 327 329 2 3.1 326.8 1.2X
+2 units w/ interval 503 509 6 2.0 503.5 0.8X
+2 units w/o interval 498 500 2 2.0 497.7 0.8X
+3 units w/ interval 1080 1089 15 0.9 1079.9 0.4X
+3 units w/o interval 1110 1113 5 0.9 1109.8 0.4X
+4 units w/ interval 1369 1371 4 0.7 1368.5 0.3X
+4 units w/o interval 1378 1386 12 0.7 1377.7 0.3X
+5 units w/ interval 1529 1531 2 0.7 1528.8 0.3X
+5 units w/o interval 1545 1549 5 0.6 1545.2 0.3X
+6 units w/ interval 1698 1706 7 0.6 1698.0 0.2X
+6 units w/o interval 1700 1707 7 0.6 1700.2 0.2X
+7 units w/ interval 2028 2040 11 0.5 2027.5 0.2X
+7 units w/o interval 2044 2046 3 0.5 2043.9 0.2X
+8 units w/ interval 2261 2271 12 0.4 2260.7 0.2X
+8 units w/o interval 2249 2258 9 0.4 2249.3 0.2X
+9 units w/ interval 2705 2710 4 0.4 2705.2 0.1X
+9 units w/o interval 2713 2722 8 0.4 2713.3 0.1X
+10 units w/ interval 2784 2789 7 0.4 2784.0 0.1X
+10 units w/o interval 2785 2790 5 0.4 2784.9 0.1X
+11 units w/ interval 3123 3148 31 0.3 3122.7 0.1X
+11 units w/o interval 3136 3144 10 0.3 3136.2 0.1X

-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
make_interval(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
-prepare make_interval() 323 329 6 3.1 323.5 1.0X
-make_interval(0, 1, 2, 3, 4, 5, 50.123456) 37 39 2 26.8 37.3 8.7X
-make_interval(*, *, 2, 3, 4, 5, 50.123456) 59 64 4 16.9 59.2 5.5X
-make_interval(0, 1, *, *, 4, 5, 50.123456) 60 63 5 16.7 59.8 5.4X
-make_interval(0, 1, 2, 3, *, *, *) 308 313 8 3.2 308.0 1.1X
-make_interval(*, *, *, *, *, *, *) 335 343 7 3.0 334.7 1.0X
+prepare make_interval() 346 352 5 2.9 346.3 1.0X
+make_interval(0, 1, 2, 3, 4, 5, 50.123456) 39 43 4 25.3 39.5 8.8X
+make_interval(*, *, 2, 3, 4, 5, 50.123456) 51 57 9 19.5 51.3 6.7X
+make_interval(0, 1, *, *, 4, 5, 50.123456) 57 59 3 17.7 56.6 6.1X
+make_interval(0, 1, 2, 3, *, *, *) 356 358 2 2.8 355.9 1.0X
+make_interval(*, *, *, *, *, *, *) 344 347 4 2.9 344.4 1.0X

diff --git a/sql/core/benchmarks/IntervalBenchmark-results.txt b/sql/core/benchmarks/IntervalBenchmark-results.txt
index 73ccbdbfaac23..86cfcdf96dbd7 100644
--- a/sql/core/benchmarks/IntervalBenchmark-results.txt
+++ b/sql/core/benchmarks/IntervalBenchmark-results.txt
@@ -1,40 +1,40 @@
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-prepare string w/ interval 396 448 74 2.5 396.2 1.0X
-prepare string w/o interval 373 383 9 2.7 372.6 1.1X
-1 units w/ interval 332 341 11 3.0 331.8 1.2X
-1 units w/o interval 371 377 7 2.7 371.0 1.1X
-2 units w/ interval 499 502 3 2.0 499.3 0.8X
-2 units w/o interval 474 481 7 2.1 474.4 0.8X
-3 units w/ interval 1122 1124 4 0.9 1122.0 0.4X
-3 units w/o interval 1105 1118 12 0.9 1104.6 0.4X
-4 units w/ interval 1418 1425 8 0.7 1418.1 0.3X
-4 units w/o interval 1397 1401 4 0.7 1397.1 0.3X -5 units w/ interval 1568 1576 10 0.6 1568.0 0.3X -5 units w/o interval 1564 1566 2 0.6 1563.7 0.3X -6 units w/ interval 1748 1754 6 0.6 1748.5 0.2X -6 units w/o interval 1736 1737 1 0.6 1735.7 0.2X -7 units w/ interval 2148 2159 10 0.5 2147.5 0.2X -7 units w/o interval 2176 2176 0 0.5 2175.7 0.2X -8 units w/ interval 2404 2412 7 0.4 2404.5 0.2X -8 units w/o interval 2381 2387 6 0.4 2381.1 0.2X -9 units w/ interval 2632 2649 15 0.4 2631.9 0.2X -9 units w/o interval 2636 2648 16 0.4 2635.7 0.2X -10 units w/ interval 2849 2852 4 0.4 2849.3 0.1X -10 units w/o interval 2836 2848 20 0.4 2835.8 0.1X -11 units w/ interval 3049 3058 16 0.3 3048.6 0.1X -11 units w/o interval 3052 3062 9 0.3 3051.8 0.1X +prepare string w/ interval 375 379 4 2.7 375.4 1.0X +prepare string w/o interval 365 367 2 2.7 364.8 1.0X +1 units w/ interval 321 329 8 3.1 321.1 1.2X +1 units w/o interval 291 302 12 3.4 291.2 1.3X +2 units w/ interval 435 441 7 2.3 434.9 0.9X +2 units w/o interval 416 418 2 2.4 415.7 0.9X +3 units w/ interval 1019 1024 4 1.0 1019.3 0.4X +3 units w/o interval 1000 1006 9 1.0 1000.1 0.4X +4 units w/ interval 1319 1326 5 0.8 1319.4 0.3X +4 units w/o interval 1317 1321 4 0.8 1317.3 0.3X +5 units w/ interval 1457 1467 9 0.7 1457.4 0.3X +5 units w/o interval 1461 1467 5 0.7 1461.2 0.3X +6 units w/ interval 1631 1635 4 0.6 1630.7 0.2X +6 units w/o interval 1614 1620 6 0.6 1614.4 0.2X +7 units w/ interval 2270 2282 12 0.4 2270.2 0.2X +7 units w/o interval 2252 2255 3 0.4 2252.0 0.2X +8 units w/ interval 2238 2247 13 0.4 2237.6 0.2X +8 units w/o interval 2237 2239 3 0.4 2236.6 0.2X +9 units w/ interval 2478 2484 7 0.4 2478.2 0.2X +9 units w/o interval 2455 2458 4 0.4 2455.2 0.2X +10 units w/ interval 2628 2635 6 0.4 2628.5 0.1X +10 units w/o interval 2618 2633 13 0.4 2618.4 0.1X +11 units w/ interval 2907 2915 8 0.3 2907.3 0.1X +11 units w/o interval 2905 2914 10 0.3 2905.1 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor make_interval(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -prepare make_interval() 344 348 4 2.9 344.4 1.0X -make_interval(0, 1, 2, 3, 4, 5, 50.123456) 45 50 8 22.0 45.4 7.6X -make_interval(*, *, 2, 3, 4, 5, 50.123456) 55 56 1 18.1 55.3 6.2X -make_interval(0, 1, *, *, 4, 5, 50.123456) 56 59 4 17.8 56.2 6.1X -make_interval(0, 1, 2, 3, *, *, *) 329 331 3 3.0 328.7 1.0X -make_interval(*, *, *, *, *, *, *) 340 343 3 2.9 339.9 1.0X +prepare make_interval() 344 347 3 2.9 344.3 1.0X +make_interval(0, 1, 2, 3, 4, 5, 50.123456) 44 45 1 22.8 44.0 7.8X +make_interval(*, *, 2, 3, 4, 5, 50.123456) 51 51 1 19.7 50.8 6.8X +make_interval(0, 1, *, *, 4, 5, 50.123456) 55 60 9 18.2 54.9 6.3X +make_interval(0, 1, 2, 3, *, *, *) 340 341 1 2.9 340.0 1.0X +make_interval(*, *, *, *, *, *, *) 333 335 2 3.0 333.3 1.0X diff --git a/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt b/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt index 02744d00178b8..d7bb196bb7144 100644 --- a/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/JoinBenchmark-jdk21-results.txt @@ -2,81 +2,81 @@ Join Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit 
Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long wholestage off 2064 2065 2 10.2 98.4 1.0X -Join w long wholestage on 930 961 29 22.6 44.3 2.2X +Join w long wholestage off 2147 2166 27 9.8 102.4 1.0X +Join w long wholestage on 944 990 40 22.2 45.0 2.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w long duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long duplicated wholestage off 2006 2018 18 10.5 95.7 1.0X -Join w long duplicated wholestage on 917 932 27 22.9 43.7 2.2X +Join w long duplicated wholestage off 2214 2248 48 9.5 105.6 1.0X +Join w long duplicated wholestage on 996 1005 10 21.1 47.5 2.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w 2 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 ints wholestage off 147152 147206 77 0.1 7016.7 1.0X -Join w 2 ints wholestage on 105544 105604 67 0.2 5032.7 1.4X +Join w 2 ints wholestage off 148982 149062 112 0.1 7104.0 1.0X +Join w 2 ints wholestage on 105434 105515 63 0.2 5027.5 1.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w 2 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs wholestage off 3414 3420 9 6.1 162.8 1.0X -Join w 2 longs wholestage on 2117 2154 32 9.9 100.9 1.6X +Join w 2 longs wholestage off 3442 3459 23 6.1 164.1 1.0X +Join w 2 longs wholestage on 2179 2191 11 9.6 103.9 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w 2 longs duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs duplicated wholestage off 8996 9037 58 2.3 429.0 1.0X -Join w 2 longs duplicated wholestage on 5567 5636 95 3.8 265.5 1.6X +Join w 2 longs duplicated wholestage off 10326 10385 84 2.0 492.4 1.0X +Join w 2 longs duplicated wholestage on 6246 6271 22 3.4 297.8 1.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor outer join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -outer join w long wholestage off 1605 1633 39 13.1 76.5 1.0X -outer join w long wholestage on 983 993 9 21.3 46.9 1.6X +outer join w long wholestage off 1711 1713 3 
12.3 81.6 1.0X +outer join w long wholestage on 1045 1056 8 20.1 49.8 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor semi join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -semi join w long wholestage off 1062 1075 19 19.8 50.6 1.0X -semi join w long wholestage on 558 580 14 37.6 26.6 1.9X +semi join w long wholestage off 1207 1210 4 17.4 57.6 1.0X +semi join w long wholestage on 682 701 14 30.7 32.5 1.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sort merge join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort merge join wholestage off 517 527 15 4.1 246.5 1.0X -sort merge join wholestage on 446 464 12 4.7 212.4 1.2X +sort merge join wholestage off 548 573 35 3.8 261.3 1.0X +sort merge join wholestage on 489 520 28 4.3 233.1 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sort merge join with duplicates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -sort merge join with duplicates wholestage off 1010 1020 14 2.1 481.6 1.0X -sort merge join with duplicates wholestage on 904 927 17 2.3 431.2 1.1X +sort merge join with duplicates wholestage off 1054 1091 52 2.0 502.8 1.0X +sort merge join with duplicates wholestage on 934 961 24 2.2 445.2 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor shuffle hash join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -shuffle hash join wholestage off 516 524 11 8.1 123.1 1.0X -shuffle hash join wholestage on 377 419 34 11.1 89.9 1.4X +shuffle hash join wholestage off 501 514 18 8.4 119.4 1.0X +shuffle hash join wholestage on 427 469 32 9.8 101.7 1.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -broadcast nested loop join wholestage off 25325 25376 73 0.8 1207.6 1.0X -broadcast nested loop join wholestage on 19304 19559 319 1.1 920.5 1.3X +broadcast nested loop join wholestage off 26497 26499 2 0.8 1263.5 1.0X +broadcast nested loop join wholestage on 18614 18698 61 1.1 887.6 1.4X diff --git a/sql/core/benchmarks/JoinBenchmark-results.txt b/sql/core/benchmarks/JoinBenchmark-results.txt index a3c026eb02fab..a8e057371664b 100644 --- a/sql/core/benchmarks/JoinBenchmark-results.txt +++ b/sql/core/benchmarks/JoinBenchmark-results.txt @@ -2,81 +2,81 @@ Join Benchmark 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long wholestage off 2121 2143 31 9.9 101.2 1.0X -Join w long wholestage on 943 963 17 22.2 45.0 2.2X +Join w long wholestage off 2138 2142 6 9.8 101.9 1.0X +Join w long wholestage on 944 958 15 22.2 45.0 2.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w long duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w long duplicated wholestage off 2151 2153 3 9.7 102.6 1.0X -Join w long duplicated wholestage on 882 907 23 23.8 42.1 2.4X +Join w long duplicated wholestage off 2278 2326 68 9.2 108.6 1.0X +Join w long duplicated wholestage on 1080 1084 2 19.4 51.5 2.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w 2 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 ints wholestage off 144107 144246 197 0.1 6871.6 1.0X -Join w 2 ints wholestage on 108436 109016 343 0.2 5170.6 1.3X +Join w 2 ints wholestage off 149192 149209 24 0.1 7114.0 1.0X +Join w 2 ints wholestage on 111484 111555 82 0.2 5316.0 1.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w 2 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs wholestage off 3146 3165 27 6.7 150.0 1.0X -Join w 2 longs wholestage on 2023 2035 14 10.4 96.4 1.6X +Join w 2 longs wholestage off 3463 3507 62 6.1 165.1 1.0X +Join w 2 longs wholestage on 2116 2133 25 9.9 100.9 1.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Join w 2 longs duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Join w 2 longs duplicated wholestage off 11530 11630 142 1.8 549.8 1.0X -Join w 2 longs duplicated wholestage on 5228 5248 24 4.0 249.3 2.2X +Join w 2 longs duplicated wholestage off 10496 10500 6 2.0 500.5 1.0X +Join w 2 longs duplicated wholestage on 6203 6227 32 3.4 295.8 1.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor outer join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -outer join w long wholestage off 1760 1771 16 11.9 83.9 1.0X -outer join w long wholestage on 931 951 17 22.5 44.4 1.9X +outer join w long wholestage off 1834 1840 9 11.4 87.5 1.0X +outer join w long wholestage on 1078 1083 4 19.5 51.4 1.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor semi join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -semi join w long wholestage off 1158 1161 4 18.1 55.2 1.0X -semi join w long wholestage on 527 548 24 39.8 25.1 2.2X +semi join w long wholestage off 1326 1332 8 15.8 63.2 1.0X +semi join w long wholestage on 711 716 7 29.5 33.9 1.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sort merge join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort merge join wholestage off 510 522 17 4.1 243.1 1.0X -sort merge join wholestage on 461 478 13 4.6 219.6 1.1X +sort merge join wholestage off 541 551 14 3.9 258.2 1.0X +sort merge join wholestage on 487 493 4 4.3 232.2 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sort merge join with duplicates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -sort merge join with duplicates wholestage off 1020 1022 3 2.1 486.2 1.0X -sort merge join with duplicates wholestage on 915 921 9 2.3 436.1 1.1X +sort merge join with duplicates wholestage off 1092 1096 7 1.9 520.6 1.0X +sort merge join with duplicates wholestage on 965 977 14 2.2 460.3 1.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor shuffle hash join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -shuffle hash join wholestage off 524 528 6 8.0 124.8 1.0X -shuffle hash join wholestage on 372 378 4 11.3 88.7 1.4X +shuffle hash join wholestage off 551 558 10 7.6 131.4 1.0X +shuffle hash join wholestage on 397 404 6 10.6 94.7 1.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -broadcast nested loop join wholestage off 28415 28455 57 0.7 1354.9 1.0X -broadcast nested loop join wholestage on 17648 17700 36 1.2 841.5 1.6X +broadcast nested loop join wholestage off 27651 27728 109 0.8 1318.5 1.0X +broadcast nested loop join wholestage on 19162 19202 33 1.1 913.7 1.4X diff --git 
a/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt index 4cb72c9b02fed..80448f80df486 100644 --- a/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-jdk21-results.txt @@ -3,128 +3,128 @@ Benchmark for performance of JSON parsing ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2293 2316 37 2.2 458.6 1.0X -UTF-8 is set 3389 3399 14 1.5 677.8 0.7X +No encoding 2459 2482 39 2.0 491.9 1.0X +UTF-8 is set 3337 3360 20 1.5 667.4 0.7X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 1900 1931 36 2.6 380.1 1.0X -UTF-8 is set 3049 3055 6 1.6 609.7 0.6X +No encoding 2195 2205 11 2.3 439.1 1.0X +UTF-8 is set 3159 3169 9 1.6 631.7 0.7X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 4662 4674 20 0.2 4661.7 1.0X -UTF-8 is set 4492 4508 20 0.2 4491.8 1.0X +No encoding 4837 4914 116 0.2 4837.1 1.0X +UTF-8 is set 4384 4417 30 0.2 4383.6 1.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 9989 10251 226 0.0 199788.0 1.0X -UTF-8 is set 10872 10943 93 0.0 217437.4 0.9X +No encoding 9775 9911 129 0.0 195491.4 1.0X +UTF-8 is set 10824 10845 31 0.0 216478.6 0.9X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 1724 1740 17 0.6 1723.6 1.0X -Select 1 column 1345 1349 7 0.7 1344.6 1.3X +Select 10 columns 1606 1614 8 0.6 1606.2 1.0X +Select 1 column 1334 1341 7 0.7 1333.7 1.2X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 633 640 9 1.6 632.9 1.0X -Short column with UTF-8 872 886 22 1.1 872.1 0.7X -Wide column without encoding 5266 5277 12 0.2 5266.2 0.1X -Wide column with UTF-8 6953 6959 8 0.1 6953.0 0.1X +Short column without encoding 595 596 2 1.7 594.9 1.0X +Short column with UTF-8 819 828 10 1.2 819.2 0.7X +Wide column without encoding 5442 5464 28 0.2 5442.1 0.1X +Wide column with UTF-8 6442 6454 12 0.2 6442.0 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 58 61 4 17.2 58.2 1.0X -from_json 1235 1257 21 0.8 1235.4 0.0X -json_tuple 1101 1110 10 0.9 1100.5 0.1X -get_json_object wholestage off 1063 1068 6 0.9 1062.6 0.1X -get_json_object wholestage on 989 989 1 1.0 988.7 0.1X +Text read 55 56 1 18.2 55.0 1.0X +from_json 1152 1156 3 0.9 1152.1 0.0X +json_tuple 1185 1188 4 0.8 1185.0 0.0X +get_json_object wholestage off 1093 1099 10 0.9 1093.3 0.1X +get_json_object wholestage on 1017 1019 1 1.0 1017.3 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 230 236 8 21.7 46.1 1.0X -schema inferring 1914 1921 11 2.6 382.7 0.1X -parsing 2849 2856 9 1.8 569.8 0.1X +Text read 236 238 2 21.2 47.2 1.0X +schema inferring 2018 2025 8 2.5 403.6 0.1X +parsing 2730 2737 10 1.8 546.1 0.1X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 536 548 14 9.3 107.1 1.0X -Schema inferring 2366 2374 7 2.1 473.2 0.2X -Parsing without charset 2908 2911 3 1.7 581.6 0.2X -Parsing with UTF-8 4059 4064 8 1.2 811.8 0.1X +Text read 549 552 4 9.1 109.9 1.0X +Schema inferring 2522 2525 4 2.0 504.4 0.2X +Parsing without charset 2921 2933 17 1.7 584.2 0.2X +Parsing with UTF-8 3873 3881 13 1.3 774.7 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 106 112 9 9.4 105.9 1.0X -to_json(timestamp) 744 747 4 1.3 743.6 0.1X -write timestamps to files 633 637 4 1.6 633.4 0.2X -Create a dataset of dates 124 128 5 8.1 123.8 0.9X -to_json(date) 560 561 1 1.8 559.9 0.2X -write dates to files 453 466 12 2.2 452.7 0.2X +Create a dataset of timestamps 103 107 7 9.7 103.1 1.0X +to_json(timestamp) 737 742 5 1.4 736.5 0.1X +write timestamps to files 644 646 2 1.6 643.9 0.2X +Create a dataset of dates 111 117 6 9.0 110.7 0.9X +to_json(date) 557 562 6 1.8 556.6 0.2X +write dates to files 434 436 2 2.3 434.1 0.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 146 154 8 6.8 146.2 1.0X -read timestamps from files 1079 1084 5 0.9 1079.0 0.1X -infer timestamps from files 1977 1986 11 0.5 1976.8 0.1X -read date text from files 142 144 2 7.0 142.4 1.0X -read date from files 718 725 6 1.4 718.1 0.2X -timestamp strings 137 138 1 7.3 137.2 1.1X -parse timestamps from Dataset[String] 1258 1275 14 0.8 1258.0 0.1X -infer timestamps from Dataset[String] 2182 2186 6 0.5 2182.4 0.1X -date strings 196 201 5 5.1 195.5 0.7X -parse dates from Dataset[String] 1016 1025 7 1.0 1016.3 0.1X -from_json(timestamp) 1924 1953 38 0.5 1924.2 0.1X -from_json(date) 1644 1696 74 0.6 1644.1 0.1X -infer error timestamps from Dataset[String] with default format 1463 1473 9 0.7 1463.1 0.1X -infer error timestamps from Dataset[String] with user-provided format 1451 1459 12 0.7 1450.6 0.1X -infer error timestamps from Dataset[String] with legacy format 1486 1494 8 0.7 1486.3 0.1X +read timestamp text from files 151 157 8 6.6 150.7 1.0X +read timestamps from files 1071 1086 13 0.9 1071.1 0.1X +infer timestamps from files 2021 2025 5 0.5 2020.8 0.1X +read date text from files 137 147 11 7.3 136.5 1.1X +read date from files 699 705 9 1.4 698.7 0.2X +timestamp strings 143 149 5 7.0 143.4 1.1X +parse timestamps from Dataset[String] 1251 1255 3 0.8 1251.1 0.1X +infer timestamps from Dataset[String] 2181 2186 5 0.5 2181.1 0.1X +date strings 226 234 13 4.4 225.7 0.7X +parse 
dates from Dataset[String] 974 977 4 1.0 973.8 0.2X +from_json(timestamp) 1758 1764 9 0.6 1758.2 0.1X +from_json(date) 1470 1473 3 0.7 1469.7 0.1X +infer error timestamps from Dataset[String] with default format 1436 1438 3 0.7 1436.1 0.1X +infer error timestamps from Dataset[String] with user-provided format 1437 1444 8 0.7 1437.4 0.1X +infer error timestamps from Dataset[String] with legacy format 1448 1450 3 0.7 1448.2 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 5708 5724 17 0.0 57078.7 1.0X -pushdown disabled 5625 5646 20 0.0 56254.5 1.0X -w/ filters 742 770 38 0.1 7418.9 7.7X +w/o filters 5891 5911 22 0.0 58911.2 1.0X +pushdown disabled 5547 5560 11 0.0 55470.8 1.1X +w/ filters 618 626 10 0.2 6177.6 9.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Partial JSON results: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -parse invalid JSON 2551 2628 90 0.0 255064.5 1.0X +parse invalid JSON 2319 2338 26 0.0 231898.9 1.0X diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt b/sql/core/benchmarks/JsonBenchmark-results.txt index 20388dc756fb8..3f4b9e435b06d 100644 --- a/sql/core/benchmarks/JsonBenchmark-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-results.txt @@ -3,128 +3,128 @@ Benchmark for performance of JSON parsing ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2456 2513 59 2.0 491.2 1.0X -UTF-8 is set 3355 3365 15 1.5 671.1 0.7X +No encoding 2406 2422 16 2.1 481.1 1.0X +UTF-8 is set 3323 3335 10 1.5 664.5 0.7X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2162 2201 34 2.3 432.4 1.0X -UTF-8 is set 3168 3178 17 1.6 633.5 0.7X +No encoding 2005 2037 29 2.5 401.0 1.0X +UTF-8 is set 3138 3143 5 1.6 627.6 0.6X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 3185 3258 122 0.3 3185.0 1.0X -UTF-8 is set 4058 4093 42 0.2 4058.1 0.8X +No encoding 3337 3406 64 0.3 3336.8 1.0X +UTF-8 is set 4383 4411 27 0.2 4383.4 0.8X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 9244 9334 132 0.0 184884.5 1.0X -UTF-8 is set 10249 10258 10 0.0 204988.0 0.9X +No encoding 9364 9394 35 0.0 187287.2 1.0X +UTF-8 is set 10402 10439 42 0.0 208036.3 0.9X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 1641 1650 7 0.6 1641.4 1.0X -Select 1 column 1118 1120 3 0.9 1117.9 1.5X +Select 10 columns 1657 1663 5 0.6 1657.2 1.0X +Select 1 column 1328 1331 4 0.8 1327.7 1.2X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 627 635 7 1.6 626.7 1.0X -Short column with UTF-8 819 834 15 1.2 819.5 0.8X -Wide column without encoding 5191 5227 39 0.2 5191.4 0.1X -Wide column with UTF-8 6490 6506 17 0.2 6489.9 0.1X +Short column without encoding 656 656 1 1.5 655.8 1.0X +Short column with UTF-8 844 858 23 1.2 843.5 0.8X +Wide column without encoding 5501 5529 26 0.2 5501.5 0.1X +Wide column with UTF-8 6440 6460 25 0.2 6440.0 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 57 58 0 17.4 57.4 1.0X -from_json 1105 1118 17 0.9 1105.2 0.1X -json_tuple 1151 1152 1 0.9 1151.2 0.0X -get_json_object wholestage off 1080 1081 2 0.9 1079.8 0.1X -get_json_object wholestage on 1018 1024 7 1.0 1018.3 0.1X +Text read 51 52 1 19.8 50.6 1.0X +from_json 1134 1142 7 0.9 1134.4 0.0X +json_tuple 1117 1121 4 0.9 1116.9 0.0X +get_json_object wholestage off 1036 1042 7 1.0 1036.3 0.0X +get_json_object wholestage on 944 945 1 1.1 944.3 0.1X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 255 257 1 19.6 51.1 1.0X -schema inferring 1775 1776 2 2.8 355.0 0.1X -parsing 2833 2835 3 1.8 566.6 0.1X +Text read 227 230 3 22.0 45.5 1.0X +schema inferring 1835 1836 1 2.7 367.1 0.1X +parsing 2831 2843 15 1.8 566.3 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 581 583 2 8.6 116.2 1.0X -Schema inferring 2391 2397 6 2.1 478.2 0.2X -Parsing without charset 2973 2975 3 1.7 594.6 0.2X -Parsing with UTF-8 3956 3969 17 1.3 791.2 0.1X +Text read 588 594 6 8.5 117.5 1.0X +Schema inferring 2444 2449 5 2.0 488.8 0.2X +Parsing without charset 3046 3052 5 1.6 609.2 0.2X +Parsing with UTF-8 3937 3940 4 1.3 787.4 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 109 119 13 9.2 109.3 1.0X -to_json(timestamp) 795 800 6 1.3 794.8 0.1X -write timestamps to files 730 734 3 1.4 730.1 0.1X -Create a dataset of dates 133 143 8 7.5 133.3 0.8X -to_json(date) 598 601 4 1.7 598.1 0.2X -write dates to files 475 478 3 2.1 474.6 0.2X +Create a dataset of timestamps 100 105 6 10.0 100.5 1.0X +to_json(timestamp) 815 820 4 1.2 815.2 0.1X +write timestamps to files 734 745 14 1.4 733.6 0.1X +Create a dataset of dates 112 118 6 8.9 111.9 0.9X +to_json(date) 606 608 3 1.6 606.3 0.2X +write dates to files 472 480 8 2.1 472.3 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 149 153 4 6.7 148.5 1.0X -read timestamps from files 1049 1057 8 1.0 1048.6 0.1X -infer timestamps from files 1942 1969 37 0.5 1942.4 0.1X -read date text from files 141 143 3 7.1 140.7 1.1X -read date from files 712 718 7 1.4 712.4 0.2X -timestamp strings 142 158 14 7.0 142.4 1.0X -parse timestamps from Dataset[String] 1286 1291 5 0.8 1285.9 0.1X -infer timestamps from Dataset[String] 2139 2145 6 0.5 2138.6 0.1X -date strings 209 210 1 4.8 209.0 0.7X -parse dates from Dataset[String] 1019 1026 6 1.0 1019.0 0.1X -from_json(timestamp) 1738 1741 5 0.6 1737.8 0.1X -from_json(date) 1477 1482 6 0.7 1477.3 0.1X -infer error timestamps from Dataset[String] with default format 1380 1387 6 0.7 1380.4 0.1X -infer error timestamps from 
Dataset[String] with user-provided format 1380 1388 7 0.7 1380.5 0.1X -infer error timestamps from Dataset[String] with legacy format 1442 1450 9 0.7 1442.3 0.1X +read timestamp text from files 154 156 3 6.5 153.9 1.0X +read timestamps from files 1048 1055 6 1.0 1048.1 0.1X +infer timestamps from files 1962 1967 6 0.5 1961.6 0.1X +read date text from files 139 148 12 7.2 139.1 1.1X +read date from files 728 736 7 1.4 727.8 0.2X +timestamp strings 135 140 5 7.4 134.7 1.1X +parse timestamps from Dataset[String] 1193 1197 3 0.8 1192.9 0.1X +infer timestamps from Dataset[String] 2042 2046 4 0.5 2042.3 0.1X +date strings 203 204 1 4.9 203.1 0.8X +parse dates from Dataset[String] 991 996 8 1.0 990.6 0.2X +from_json(timestamp) 1670 1679 11 0.6 1669.9 0.1X +from_json(date) 1459 1460 1 0.7 1458.6 0.1X +infer error timestamps from Dataset[String] with default format 1393 1400 7 0.7 1392.6 0.1X +infer error timestamps from Dataset[String] with user-provided format 1384 1388 5 0.7 1383.6 0.1X +infer error timestamps from Dataset[String] with legacy format 1418 1419 2 0.7 1418.3 0.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 5828 5845 19 0.0 58278.4 1.0X -pushdown disabled 5515 5536 32 0.0 55146.3 1.1X -w/ filters 685 691 7 0.1 6845.1 8.5X +w/o filters 5964 5972 10 0.0 59641.7 1.0X +pushdown disabled 5780 5785 6 0.0 57798.8 1.0X +w/ filters 701 702 1 0.1 7010.4 8.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Partial JSON results: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -parse invalid JSON 2346 2495 227 0.0 234637.6 1.0X +parse invalid JSON 2429 2545 138 0.0 242888.8 1.0X diff --git a/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt b/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt index 558c1887d63fe..7ff49dc2c3d73 100644 --- a/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MakeDateTimeBenchmark-jdk21-results.txt @@ -1,22 +1,22 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor make_date(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -prepare make_date() 1981 2010 32 50.5 19.8 1.0X -make_date(2019, 9, 16) 1833 1849 14 54.5 18.3 1.1X -make_date(*, *, *) 4011 4035 26 24.9 40.1 0.5X +prepare make_date() 2324 2379 51 43.0 23.2 1.0X +make_date(2019, 9, 16) 1859 1869 11 53.8 18.6 1.3X +make_date(*, *, *) 2884 2914 29 34.7 28.8 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor make_timestamp(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -prepare make_timestamp() 351 361 11 2.8 351.3 1.0X -make_timestamp(2019, 1, 2, 3, 4, 50.123456) 42 55 12 23.9 41.9 8.4X -make_timestamp(2019, 1, 2, 3, 4, 60.000000) 36 40 4 28.0 35.8 9.8X -make_timestamp(2019, 12, 31, 23, 59, 60.00) 34 43 12 29.7 33.7 10.4X -make_timestamp(*, *, *, 3, 4, 50.123456) 164 166 2 6.1 163.9 2.1X -make_timestamp(*, *, *, *, *, 0) 101 108 10 9.9 101.3 3.5X -make_timestamp(*, *, *, *, *, 60.0) 145 147 2 6.9 145.5 2.4X -make_timestamp(2019, 1, 2, *, *, *) 460 462 2 2.2 460.3 0.8X -make_timestamp(*, *, *, *, *, *) 477 480 4 2.1 476.5 0.7X +prepare make_timestamp() 312 318 6 3.2 312.2 1.0X +make_timestamp(2019, 1, 2, 3, 4, 50.123456) 48 49 1 20.7 48.3 6.5X +make_timestamp(2019, 1, 2, 3, 4, 60.000000) 33 38 5 30.0 33.3 9.4X +make_timestamp(2019, 12, 31, 23, 59, 60.00) 32 35 3 30.9 32.3 9.7X +make_timestamp(*, *, *, 3, 4, 50.123456) 165 169 6 6.1 164.7 1.9X +make_timestamp(*, *, *, *, *, 0) 107 110 2 9.3 107.3 2.9X +make_timestamp(*, *, *, *, *, 60.0) 149 159 14 6.7 149.2 2.1X +make_timestamp(2019, 1, 2, *, *, *) 476 477 1 2.1 475.8 0.7X +make_timestamp(*, *, *, *, *, *) 495 503 9 2.0 495.5 0.6X diff --git a/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt b/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt index 8f53cd7815be2..02c43e519ff81 100644 --- a/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/MakeDateTimeBenchmark-results.txt @@ -1,22 +1,22 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor make_date(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -prepare make_date() 2009 2031 35 49.8 20.1 1.0X -make_date(2019, 9, 16) 1855 1866 10 53.9 18.6 1.1X -make_date(*, *, *) 3980 4011 27 25.1 39.8 0.5X +prepare make_date() 2144 2161 29 46.6 21.4 1.0X +make_date(2019, 9, 16) 1807 1812 5 55.3 18.1 1.2X +make_date(*, *, *) 2860 2876 22 35.0 28.6 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor make_timestamp(): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -prepare make_timestamp() 362 369 8 2.8 361.7 1.0X -make_timestamp(2019, 1, 2, 3, 4, 50.123456) 45 49 5 22.0 45.4 8.0X -make_timestamp(2019, 1, 2, 3, 4, 60.000000) 38 40 2 26.0 38.5 9.4X -make_timestamp(2019, 12, 31, 23, 59, 60.00) 42 50 9 23.8 42.0 8.6X -make_timestamp(*, *, *, 3, 4, 50.123456) 158 162 6 6.3 158.4 2.3X -make_timestamp(*, *, *, *, *, 0) 106 113 8 9.5 105.5 3.4X -make_timestamp(*, *, *, *, *, 60.0) 144 146 2 6.9 144.1 2.5X -make_timestamp(2019, 1, 2, *, *, *) 471 473 3 2.1 470.9 0.8X -make_timestamp(*, *, *, *, *, *) 450 455 6 2.2 449.8 0.8X +prepare make_timestamp() 327 329 3 3.1 326.9 1.0X +make_timestamp(2019, 1, 2, 3, 4, 50.123456) 34 34 1 29.8 33.6 9.7X +make_timestamp(2019, 1, 2, 3, 4, 60.000000) 34 40 5 29.4 34.1 9.6X +make_timestamp(2019, 12, 31, 23, 59, 60.00) 34 38 4 29.6 33.8 9.7X +make_timestamp(*, *, *, 3, 4, 50.123456) 171 176 5 5.9 170.5 1.9X +make_timestamp(*, *, *, *, *, 0) 101 108 10 9.9 101.0 3.2X 
+make_timestamp(*, *, *, *, *, 60.0) 144 146 3 7.0 143.6 2.3X +make_timestamp(2019, 1, 2, *, *, *) 429 430 1 2.3 428.8 0.8X +make_timestamp(*, *, *, *, *, *) 481 488 6 2.1 481.2 0.7X diff --git a/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt b/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt index be07b82eba912..0d974239430df 100644 --- a/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MetadataStructBenchmark-jdk21-results.txt @@ -2,45 +2,45 @@ Metadata Struct Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Vectorized Parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 378 399 21 13.2 75.6 1.0X -_metadata.file_path 476 507 15 10.5 95.3 0.8X -_metadata.file_name 490 504 8 10.2 97.9 0.8X -_metadata.file_size 390 417 8 12.8 78.0 1.0X -_metadata.file_block_start 406 417 10 12.3 81.1 0.9X -_metadata.file_block_length 384 413 10 13.0 76.9 1.0X -_metadata.file_modification_time 404 416 6 12.4 80.7 0.9X -_metadata.row_index 440 468 10 11.4 88.0 0.9X -_metadata 727 744 14 6.9 145.5 0.5X +no metadata columns 631 647 9 7.9 126.3 1.0X +_metadata.file_path 704 737 10 7.1 140.9 0.9X +_metadata.file_name 700 739 15 7.1 139.9 0.9X +_metadata.file_size 623 666 15 8.0 124.7 1.0X +_metadata.file_block_start 630 665 12 7.9 126.0 1.0X +_metadata.file_block_length 622 661 17 8.0 124.3 1.0X +_metadata.file_modification_time 629 664 13 7.9 125.8 1.0X +_metadata.row_index 669 713 17 7.5 133.8 0.9X +_metadata 961 993 20 5.2 192.2 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parquet-mr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 1702 1724 15 2.9 340.5 1.0X -_metadata.file_path 2177 2218 32 2.3 435.4 0.8X -_metadata.file_name 2200 2226 28 2.3 440.0 0.8X -_metadata.file_size 1995 2037 19 2.5 399.0 0.9X -_metadata.file_block_start 2015 2044 17 2.5 403.0 0.8X -_metadata.file_block_length 2021 2044 14 2.5 404.2 0.8X -_metadata.file_modification_time 2000 2042 20 2.5 399.9 0.9X -_metadata.row_index 2095 2136 17 2.4 418.9 0.8X -_metadata 3039 3088 32 1.6 607.9 0.6X +no metadata columns 2687 2714 20 1.9 537.3 1.0X +_metadata.file_path 3372 3402 21 1.5 674.4 0.8X +_metadata.file_name 3370 3402 23 1.5 673.9 0.8X +_metadata.file_size 3227 3256 61 1.5 645.4 0.8X +_metadata.file_block_start 3196 3225 21 1.6 639.1 0.8X +_metadata.file_block_length 3198 3246 30 1.6 639.7 0.8X +_metadata.file_modification_time 3189 3239 19 1.6 637.9 0.8X +_metadata.row_index 3643 3686 25 1.4 728.5 0.7X +_metadata 4684 4710 20 1.1 936.9 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor JSON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 
4770 4852 44 1.0 953.9 1.0X -_metadata.file_path 5356 5374 10 0.9 1071.1 0.9X -_metadata.file_name 5372 5397 21 0.9 1074.4 0.9X -_metadata.file_size 5130 5167 13 1.0 1025.9 0.9X -_metadata.file_block_start 5143 5165 11 1.0 1028.7 0.9X -_metadata.file_block_length 5136 5162 25 1.0 1027.2 0.9X -_metadata.file_modification_time 5146 5158 8 1.0 1029.1 0.9X -_metadata 5864 5886 15 0.9 1172.8 0.8X +no metadata columns 6920 6947 27 0.7 1384.1 1.0X +_metadata.file_path 7691 7716 17 0.7 1538.2 0.9X +_metadata.file_name 7694 7728 32 0.6 1538.8 0.9X +_metadata.file_size 7502 7538 26 0.7 1500.5 0.9X +_metadata.file_block_start 7513 7536 19 0.7 1502.6 0.9X +_metadata.file_block_length 7504 7525 13 0.7 1500.9 0.9X +_metadata.file_modification_time 7501 7520 11 0.7 1500.2 0.9X +_metadata 8293 8310 10 0.6 1658.6 0.8X diff --git a/sql/core/benchmarks/MetadataStructBenchmark-results.txt b/sql/core/benchmarks/MetadataStructBenchmark-results.txt index 2be20cea649de..b74cc469b2c8d 100644 --- a/sql/core/benchmarks/MetadataStructBenchmark-results.txt +++ b/sql/core/benchmarks/MetadataStructBenchmark-results.txt @@ -2,45 +2,45 @@ Metadata Struct Benchmark ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Vectorized Parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 380 405 22 13.1 76.0 1.0X -_metadata.file_path 489 503 7 10.2 97.8 0.8X -_metadata.file_name 487 495 7 10.3 97.5 0.8X -_metadata.file_size 410 418 6 12.2 82.1 0.9X -_metadata.file_block_start 403 409 4 12.4 80.6 0.9X -_metadata.file_block_length 406 412 5 12.3 81.2 0.9X -_metadata.file_modification_time 406 414 8 12.3 81.1 0.9X -_metadata.row_index 451 458 8 11.1 90.2 0.8X -_metadata 764 773 6 6.5 152.8 0.5X +no metadata columns 649 669 23 7.7 129.8 1.0X +_metadata.file_path 750 765 10 6.7 150.0 0.9X +_metadata.file_name 776 793 20 6.4 155.2 0.8X +_metadata.file_size 687 706 24 7.3 137.5 0.9X +_metadata.file_block_start 684 697 11 7.3 136.7 0.9X +_metadata.file_block_length 686 703 13 7.3 137.3 0.9X +_metadata.file_modification_time 686 702 15 7.3 137.2 0.9X +_metadata.row_index 732 757 28 6.8 146.4 0.9X +_metadata 1048 1065 16 4.8 209.5 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Parquet-mr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 2084 2104 15 2.4 416.7 1.0X -_metadata.file_path 2577 2611 34 1.9 515.3 0.8X -_metadata.file_name 2596 2624 29 1.9 519.3 0.8X -_metadata.file_size 2430 2468 61 2.1 486.0 0.9X -_metadata.file_block_start 2383 2407 31 2.1 476.7 0.9X -_metadata.file_block_length 2430 2457 21 2.1 486.1 0.9X -_metadata.file_modification_time 2427 2456 30 2.1 485.4 0.9X -_metadata.row_index 2898 2926 22 1.7 579.6 0.7X -_metadata 3924 3965 26 1.3 784.8 0.5X +no metadata columns 2565 2605 42 1.9 512.9 1.0X +_metadata.file_path 3396 3431 32 1.5 679.3 0.8X +_metadata.file_name 3391 3429 30 1.5 678.2 0.8X +_metadata.file_size 3174 3205 25 1.6 634.8 0.8X +_metadata.file_block_start 3187 3243 
61 1.6 637.4 0.8X +_metadata.file_block_length 3228 3266 39 1.5 645.6 0.8X +_metadata.file_modification_time 3220 3261 31 1.6 644.0 0.8X +_metadata.row_index 3609 3644 23 1.4 721.7 0.7X +_metadata 4854 4910 27 1.0 970.8 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor JSON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -no metadata columns 5252 5270 18 1.0 1050.3 1.0X -_metadata.file_path 5861 5880 15 0.9 1172.2 0.9X -_metadata.file_name 5877 5899 11 0.9 1175.5 0.9X -_metadata.file_size 5610 5631 13 0.9 1122.1 0.9X -_metadata.file_block_start 5590 5619 26 0.9 1118.0 0.9X -_metadata.file_block_length 5600 5617 11 0.9 1120.1 0.9X -_metadata.file_modification_time 5598 5618 17 0.9 1119.6 0.9X -_metadata 6512 6555 29 0.8 1302.3 0.8X +no metadata columns 6808 6932 123 0.7 1361.5 1.0X +_metadata.file_path 7560 7591 18 0.7 1512.0 0.9X +_metadata.file_name 7594 7619 42 0.7 1518.9 0.9X +_metadata.file_size 7372 7392 10 0.7 1474.3 0.9X +_metadata.file_block_start 7369 7393 18 0.7 1473.8 0.9X +_metadata.file_block_length 7370 7389 15 0.7 1474.1 0.9X +_metadata.file_modification_time 7376 7393 13 0.7 1475.1 0.9X +_metadata 8188 8211 23 0.6 1637.5 0.8X diff --git a/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt b/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt index c97c0059404ef..6cc4668711283 100644 --- a/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MetricsAggregationBenchmark-jdk21-results.txt @@ -1,12 +1,12 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor metrics aggregation (50 metrics, 100000 tasks per stage): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -1 stage(s) 681 733 49 0.0 680971074.0 1.0X -2 stage(s) 1345 1443 139 0.0 1345060505.0 0.5X -3 stage(s) 1872 1995 174 0.0 1871758987.0 0.4X +1 stage(s) 757 787 33 0.0 756863015.0 1.0X +2 stage(s) 1448 1611 230 0.0 1447967154.0 0.5X +3 stage(s) 2313 2394 115 0.0 2312633108.0 0.3X Stage Count Stage Proc. Time Aggreg. 
Time - 1 388 66 - 2 395 159 - 3 384 229 + 1 375 73 + 2 348 230 + 3 393 279 diff --git a/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt b/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt index 2b2d73d682521..14203a6d527e4 100644 --- a/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt +++ b/sql/core/benchmarks/MetricsAggregationBenchmark-results.txt @@ -1,12 +1,12 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor metrics aggregation (50 metrics, 100000 tasks per stage): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -1 stage(s) 614 653 49 0.0 613999403.0 1.0X -2 stage(s) 1306 1324 25 0.0 1306014655.0 0.5X -3 stage(s) 1977 2002 35 0.0 1977274301.0 0.3X +1 stage(s) 759 804 53 0.0 759264298.0 1.0X +2 stage(s) 1571 1604 48 0.0 1570666996.0 0.5X +3 stage(s) 2364 2417 75 0.0 2363843200.0 0.3X Stage Count Stage Proc. Time Aggreg. Time - 1 394 65 - 2 385 168 - 3 374 251 + 1 481 66 + 2 403 198 + 3 443 292 diff --git a/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt b/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt index 20e23bb049b39..d48562a67ddfe 100644 --- a/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/MiscBenchmark-jdk21-results.txt @@ -2,126 +2,126 @@ filter & aggregate without group ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor range/filter/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/filter/sum wholestage off 34570 35516 1338 60.7 16.5 1.0X -range/filter/sum wholestage on 2343 2443 192 895.2 1.1 14.8X +range/filter/sum wholestage off 39354 39673 451 53.3 18.8 1.0X +range/filter/sum wholestage on 3472 3578 64 604.1 1.7 11.3X ================================================================================================ range/limit/sum ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor range/limit/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/limit/sum wholestage off 66 72 10 8003.4 0.1 1.0X -range/limit/sum wholestage on 54 65 7 9624.5 0.1 1.2X +range/limit/sum wholestage off 59 62 4 8881.1 0.1 1.0X +range/limit/sum wholestage on 60 67 7 8666.1 0.1 1.0X ================================================================================================ sample ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sample with replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -sample with replacement wholestage off 7862 8003 199 16.7 60.0 1.0X -sample with replacement wholestage on 5047 5075 18 26.0 38.5 1.6X +sample with replacement wholestage off 8093 8118 35 16.2 61.7 1.0X +sample with replacement wholestage on 5100 5122 18 25.7 38.9 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sample without replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -sample without replacement wholestage off 1844 1854 14 71.1 14.1 1.0X -sample without replacement wholestage on 637 649 10 205.8 4.9 2.9X +sample without replacement wholestage off 2716 2724 12 48.3 20.7 1.0X +sample without replacement wholestage on 652 659 8 200.9 5.0 4.2X ================================================================================================ collect ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor collect: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -collect 1 million 161 230 72 6.5 153.9 1.0X -collect 2 millions 356 427 54 2.9 339.4 0.5X -collect 4 millions 766 822 65 1.4 730.4 0.2X +collect 1 million 149 227 52 7.0 142.0 1.0X +collect 2 millions 288 413 105 3.6 274.5 0.5X +collect 4 millions 780 831 85 1.3 743.8 0.2X ================================================================================================ collect limit ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor collect limit: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -collect limit 1 million 153 228 82 6.8 146.0 1.0X -collect limit 2 millions 300 417 128 3.5 285.9 0.5X +collect limit 1 million 156 222 55 6.7 148.8 1.0X +collect limit 2 millions 322 441 83 3.3 307.4 0.5X ================================================================================================ generate explode ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate explode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode array wholestage off 13081 13094 19 1.3 779.7 1.0X -generate explode array wholestage on 3028 3137 75 5.5 180.5 4.3X +generate explode array wholestage off 12087 12259 244 1.4 720.4 1.0X +generate explode array wholestage on 3313 3473 99 5.1 197.5 3.6X -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate explode map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode map wholestage off 24054 24141 123 0.7 1433.7 1.0X -generate explode map wholestage on 9148 9370 220 1.8 545.2 2.6X +generate explode map wholestage off 24473 24489 22 0.7 1458.7 1.0X +generate explode map wholestage on 9589 9743 163 1.7 571.6 2.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate posexplode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate posexplode array wholestage off 13193 13206 18 1.3 786.4 1.0X -generate posexplode array wholestage on 2898 2982 73 5.8 172.8 4.6X +generate posexplode array wholestage off 12779 12830 72 1.3 761.7 1.0X +generate posexplode array wholestage on 3340 3492 89 5.0 199.1 3.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate inline array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate inline array wholestage off 6456 6502 65 2.6 384.8 1.0X -generate inline array wholestage on 2340 2491 86 7.2 139.5 2.8X +generate inline array wholestage off 7039 7047 12 2.4 419.5 1.0X +generate inline array wholestage on 2712 2806 80 6.2 161.6 2.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate big struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate big struct array wholestage off 192 198 9 0.3 3198.6 1.0X -generate big struct array wholestage on 151 157 6 0.4 2518.7 1.3X +generate big struct array wholestage off 188 196 11 0.3 3127.5 1.0X +generate big struct array wholestage on 149 169 16 0.4 2484.0 1.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate big nested struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -generate big nested struct array wholestage off 18698 20695 2824 0.0 311636.5 1.0X -generate big nested struct array wholestage on 147 172 22 0.4 2449.1 127.2X +generate big nested struct array wholestage off 21562 21565 4 0.0 359373.5 1.0X +generate big nested struct array wholestage on 143 161 17 0.4 2378.5 151.1X ================================================================================================ generate regular generator ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate stack: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate stack wholestage off 13964 14158 274 1.2 832.3 1.0X -generate stack wholestage on 3062 3085 21 5.5 182.5 4.6X +generate stack wholestage off 13383 13385 2 1.3 797.7 1.0X +generate stack wholestage on 3139 3149 11 5.3 187.1 4.3X diff --git a/sql/core/benchmarks/MiscBenchmark-results.txt b/sql/core/benchmarks/MiscBenchmark-results.txt index 113839ab5cfba..bc6376495bc1d 100644 --- a/sql/core/benchmarks/MiscBenchmark-results.txt +++ b/sql/core/benchmarks/MiscBenchmark-results.txt @@ -2,126 +2,126 @@ filter & aggregate without group ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor range/filter/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/filter/sum wholestage off 33468 36665 4522 62.7 16.0 1.0X -range/filter/sum wholestage on 2352 2437 160 891.6 1.1 14.2X +range/filter/sum wholestage off 39427 39448 31 53.2 18.8 1.0X +range/filter/sum wholestage on 3452 3698 149 607.6 1.6 11.4X ================================================================================================ range/limit/sum ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor range/limit/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -range/limit/sum wholestage off 72 79 10 7264.4 0.1 1.0X -range/limit/sum wholestage on 64 69 6 8179.1 0.1 1.1X +range/limit/sum wholestage off 91 106 22 5753.7 0.2 1.0X +range/limit/sum wholestage on 75 84 10 6966.3 0.1 1.2X ================================================================================================ sample ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sample with replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sample with replacement wholestage off 8125 8280 219 16.1 62.0 1.0X -sample with replacement wholestage on 4990 5012 30 26.3 38.1 1.6X +sample with replacement wholestage off 8030 8041 16 16.3 61.3 1.0X +sample with replacement wholestage on 4992 5008 26 26.3 38.1 1.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor sample without replacement: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -sample without replacement wholestage off 1912 1920 12 68.6 14.6 1.0X -sample without replacement wholestage on 646 652 6 202.9 4.9 3.0X +sample without replacement wholestage off 3093 3100 9 42.4 23.6 1.0X +sample without replacement wholestage on 630 660 28 208.1 4.8 4.9X ================================================================================================ collect ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor collect: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -collect 1 million 165 211 64 6.4 157.2 1.0X -collect 2 millions 284 401 112 3.7 271.1 0.6X -collect 4 millions 696 772 122 1.5 664.0 0.2X +collect 1 million 147 194 72 7.2 139.8 1.0X +collect 2 millions 274 394 87 3.8 261.2 0.5X +collect 4 millions 758 817 60 1.4 722.7 0.2X ================================================================================================ collect limit ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor collect limit: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -collect limit 1 million 153 211 70 6.9 145.5 1.0X -collect limit 2 millions 302 430 85 3.5 287.8 0.5X +collect limit 1 million 149 216 81 7.0 142.0 1.0X +collect limit 2 millions 283 397 87 3.7 269.6 0.5X ================================================================================================ generate explode ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate explode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode array wholestage off 13376 13504 181 1.3 797.3 1.0X -generate explode array wholestage on 2842 2931 70 5.9 169.4 4.7X +generate explode array wholestage off 13755 13765 14 1.2 819.9 1.0X +generate explode array wholestage on 2847 2938 80 5.9 169.7 4.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate explode map: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate explode map wholestage off 24432 24452 28 0.7 1456.2 1.0X -generate explode map wholestage on 9206 9290 93 1.8 548.7 2.7X +generate explode map wholestage off 24921 24985 91 0.7 1485.4 1.0X +generate explode map wholestage on 9283 9399 83 1.8 553.3 2.7X -OpenJDK 64-Bit Server VM 
17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate posexplode array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate posexplode array wholestage off 13900 13921 29 1.2 828.5 1.0X -generate posexplode array wholestage on 2891 3015 71 5.8 172.3 4.8X +generate posexplode array wholestage off 14332 14431 139 1.2 854.3 1.0X +generate posexplode array wholestage on 2909 3002 52 5.8 173.4 4.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate inline array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate inline array wholestage off 6752 6784 45 2.5 402.5 1.0X -generate inline array wholestage on 2495 2563 86 6.7 148.7 2.7X +generate inline array wholestage off 7138 7205 96 2.4 425.5 1.0X +generate inline array wholestage on 2369 2489 116 7.1 141.2 3.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate big struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate big struct array wholestage off 227 235 13 0.3 3776.5 1.0X -generate big struct array wholestage on 167 185 26 0.4 2791.6 1.4X +generate big struct array wholestage off 194 211 25 0.3 3229.0 1.0X +generate big struct array wholestage on 165 173 10 0.4 2750.7 1.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate big nested struct array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -generate big nested struct array wholestage off 18438 20607 3067 0.0 307296.7 1.0X -generate big nested struct array wholestage on 167 178 9 0.4 2787.1 110.3X +generate big nested struct array wholestage off 17295 17757 654 0.0 288246.5 1.0X +generate big nested struct array wholestage on 163 174 11 0.4 2709.5 106.4X ================================================================================================ generate regular generator ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor generate stack: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -generate stack wholestage off 15293 15312 28 1.1 911.5 1.0X -generate stack wholestage on 3068 3096 26 5.5 182.9 5.0X +generate stack wholestage off 14910 14980 99 1.1 888.7 1.0X +generate stack wholestage on 3115 3129 14 5.4 185.7 4.8X diff --git a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt index 41107864c1ad9..e5f8398d72d7a 100644 --- a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-jdk21-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v1 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 60 78 12 16.6 60.2 1.0X -Nested column 56 64 6 17.7 56.4 1.1X -Nested column in array 164 171 5 6.1 164.3 0.4X +Top-level column 49 60 11 20.5 48.8 1.0X +Nested column 51 55 5 19.8 50.6 1.0X +Nested column in array 159 165 5 6.3 159.4 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 245 256 10 4.1 244.8 1.0X -Nested column 239 255 13 4.2 238.5 1.0X -Nested column in array 525 543 14 1.9 524.5 0.5X +Top-level column 242 260 19 4.1 241.8 1.0X +Nested column 230 250 18 4.3 230.0 1.1X +Nested column in array 498 543 22 2.0 497.5 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 216 231 20 4.6 216.4 1.0X -Nested column 216 227 11 4.6 215.8 1.0X -Nested column in array 472 482 7 2.1 472.5 0.5X +Top-level column 208 219 9 4.8 207.9 1.0X +Nested column 214 218 6 4.7 214.1 1.0X +Nested column in array 477 492 10 2.1 476.6 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 210 220 6 4.8 210.0 1.0X -Nested column 233 245 9 4.3 233.0 0.9X -Nested column in array 501 509 5 2.0 501.0 0.4X +Top-level column 207 219 8 4.8 206.8 1.0X +Nested column 236 252 22 4.2 236.2 0.9X +Nested column in array 498 513 20 2.0 497.9 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 65 75 8 15.4 65.1 1.0X -Nested column 73 91 16 13.8 72.6 0.9X -Nested column in array 205 245 54 4.9 205.0 0.3X +Top-level column 63 73 9 15.9 62.8 1.0X +Nested column 70 83 13 14.3 70.1 0.9X +Nested column in array 200 
224 17 5.0 200.3 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 297 311 21 3.4 296.8 1.0X -Nested column 351 369 25 2.8 351.0 0.8X -Nested column in array 719 783 47 1.4 718.7 0.4X +Top-level column 288 302 14 3.5 287.6 1.0X +Nested column 344 356 15 2.9 344.0 0.8X +Nested column in array 712 760 44 1.4 711.8 0.4X diff --git a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt index e8e79b7b32039..f9a3f229b7ff5 100644 --- a/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/OrcNestedSchemaPruningBenchmark-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v1 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 70 90 12 14.2 70.3 1.0X -Nested column 60 68 6 16.7 60.0 1.2X -Nested column in array 162 169 5 6.2 161.9 0.4X +Top-level column 46 54 8 21.8 45.9 1.0X +Nested column 49 55 5 20.2 49.5 0.9X +Nested column in array 152 157 7 6.6 151.6 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 240 258 12 4.2 240.2 1.0X -Nested column 245 256 7 4.1 244.8 1.0X -Nested column in array 510 537 21 2.0 509.6 0.5X +Top-level column 243 257 21 4.1 243.3 1.0X +Nested column 238 263 15 4.2 238.3 1.0X +Nested column in array 508 531 18 2.0 507.9 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 216 223 4 4.6 216.2 1.0X -Nested column 221 230 5 4.5 220.8 1.0X -Nested column in array 475 479 5 2.1 474.6 0.5X +Top-level column 218 226 7 4.6 217.7 1.0X +Nested column 222 227 3 4.5 221.6 1.0X +Nested column in array 471 479 6 2.1 471.2 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 214 222 7 4.7 213.7 1.0X -Nested column 237 250 11 4.2 236.9 0.9X -Nested column in array 506 512 5 2.0 505.8 0.4X +Top-level column 216 
223 3 4.6 215.9 1.0X +Nested column 242 248 11 4.1 242.0 0.9X +Nested column in array 508 517 17 2.0 508.1 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 69 80 9 14.6 68.6 1.0X -Nested column 77 92 24 13.0 76.7 0.9X -Nested column in array 203 231 18 4.9 203.4 0.3X +Top-level column 56 73 13 17.7 56.5 1.0X +Nested column 68 82 17 14.8 67.6 0.8X +Nested column in array 216 226 9 4.6 216.3 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 311 324 11 3.2 311.0 1.0X -Nested column 359 369 7 2.8 359.0 0.9X -Nested column in array 715 727 13 1.4 714.7 0.4X +Top-level column 307 344 41 3.3 306.7 1.0X +Nested column 361 401 28 2.8 361.3 0.8X +Nested column in array 771 793 15 1.3 771.2 0.4X diff --git a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt index b29fc02ad669d..2a303225d57b9 100644 --- a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-jdk21-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v2 ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 67 81 12 15.0 66.6 1.0X -Nested column 59 65 4 17.1 58.6 1.1X -Nested column in array 167 178 9 6.0 167.1 0.4X +Top-level column 61 83 14 16.3 61.3 1.0X +Nested column 59 67 7 17.1 58.6 1.0X +Nested column in array 169 176 4 5.9 169.4 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 242 253 8 4.1 241.7 1.0X -Nested column 234 258 17 4.3 234.5 1.0X -Nested column in array 522 536 9 1.9 522.1 0.5X +Top-level column 242 263 16 4.1 242.4 1.0X +Nested column 237 260 10 4.2 236.6 1.0X +Nested column in array 513 535 14 1.9 513.5 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 212 225 9 4.7 212.0 1.0X -Nested 
column 218 228 9 4.6 217.5 1.0X -Nested column in array 473 480 5 2.1 473.4 0.4X +Top-level column 217 229 9 4.6 216.6 1.0X +Nested column 222 236 8 4.5 222.1 1.0X +Nested column in array 477 484 7 2.1 477.2 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 214 222 8 4.7 213.8 1.0X -Nested column 235 250 14 4.3 235.0 0.9X -Nested column in array 502 508 3 2.0 501.6 0.4X +Top-level column 214 225 9 4.7 214.3 1.0X +Nested column 240 254 14 4.2 239.8 0.9X +Nested column in array 508 520 8 2.0 508.4 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 64 79 13 15.7 63.7 1.0X -Nested column 75 92 14 13.3 75.2 0.8X -Nested column in array 217 254 54 4.6 216.5 0.3X +Top-level column 56 71 8 18.0 55.7 1.0X +Nested column 73 92 18 13.7 73.2 0.8X +Nested column in array 209 237 22 4.8 208.6 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 294 309 21 3.4 293.7 1.0X -Nested column 348 363 19 2.9 348.2 0.8X -Nested column in array 719 808 33 1.4 718.6 0.4X +Top-level column 300 315 21 3.3 299.8 1.0X +Nested column 353 366 17 2.8 353.4 0.8X +Nested column in array 809 838 18 1.2 809.3 0.4X diff --git a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt index 8824e3aaa0d2c..80519cb6b28bc 100644 --- a/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/OrcV2NestedSchemaPruningBenchmark-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For ORC v2 ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 66 85 10 15.1 66.1 1.0X -Nested column 59 67 6 17.0 58.9 1.1X -Nested column in array 165 170 4 6.1 165.2 0.4X +Top-level column 80 103 13 12.5 79.8 1.0X +Nested column 70 81 9 14.3 69.9 1.1X +Nested column in array 183 193 6 5.5 182.6 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Top-level column 233 259 29 4.3 232.7 1.0X -Nested column 251 267 21 4.0 250.6 0.9X -Nested column in array 506 531 17 2.0 505.5 0.5X +Top-level column 269 283 10 3.7 268.9 1.0X +Nested column 259 282 16 3.9 259.4 1.0X +Nested column in array 567 581 15 1.8 567.0 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 214 220 7 4.7 214.3 1.0X -Nested column 219 225 6 4.6 218.6 1.0X -Nested column in array 470 476 4 2.1 470.4 0.5X +Top-level column 242 247 5 4.1 242.0 1.0X +Nested column 245 254 7 4.1 245.0 1.0X +Nested column in array 517 523 4 1.9 516.9 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 212 220 9 4.7 212.0 1.0X -Nested column 237 244 5 4.2 236.9 0.9X -Nested column in array 505 512 4 2.0 505.0 0.4X +Top-level column 234 246 10 4.3 234.2 1.0X +Nested column 259 270 7 3.9 258.7 0.9X +Nested column in array 547 559 9 1.8 546.8 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 69 84 18 14.4 69.3 1.0X -Nested column 76 92 15 13.2 76.0 0.9X -Nested column in array 206 229 16 4.8 206.2 0.3X +Top-level column 78 89 16 12.8 78.0 1.0X +Nested column 86 96 8 11.6 86.0 0.9X +Nested column in array 225 263 18 4.4 224.8 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 302 314 6 3.3 302.3 1.0X -Nested column 361 377 22 2.8 361.2 0.8X -Nested column in array 718 743 32 1.4 718.3 0.4X +Top-level column 344 359 10 2.9 344.1 1.0X +Nested column 385 409 14 2.6 384.6 0.9X +Nested column in array 769 781 10 1.3 768.9 0.4X diff --git a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt index cd071f54b6e4d..2b5f80423b41d 100644 --- a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk21-results.txt @@ -1,21 +1,21 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Can skip all row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 6478 6539 38 16.2 61.8 1.0X -With nested predicate Pushdown 54 74 20 1929.3 0.5 119.2X +Without nested predicate Pushdown 6507 6573 42 16.1 62.1 1.0X +With nested predicate Pushdown 51 66 15 2074.7 0.5 128.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Can skip some row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 6966 7033 67 15.1 66.4 1.0X -With nested predicate Pushdown 46 59 9 2286.9 0.4 151.9X +Without nested predicate Pushdown 6994 7049 34 15.0 66.7 1.0X +With nested predicate Pushdown 41 58 13 2582.3 0.4 172.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Can skip no row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 13440 13517 51 7.8 128.2 1.0X -With nested predicate Pushdown 13430 13499 67 7.8 128.1 1.0X +Without nested predicate Pushdown 13276 13334 59 7.9 126.6 1.0X +With nested predicate Pushdown 13267 13393 117 7.9 126.5 1.0X diff --git a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt index 556da31a7c424..b2a9464e8f4d6 100644 --- a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt +++ b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt @@ -1,21 +1,21 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Can skip all row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 7164 7228 37 14.6 68.3 1.0X -With nested predicate Pushdown 67 87 15 1560.0 0.6 106.6X +Without nested predicate Pushdown 7218 7266 26 14.5 68.8 1.0X +With nested predicate Pushdown 49 72 14 2136.4 0.5 147.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Can skip some row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 7767 7799 36 13.5 74.1 1.0X -With nested predicate Pushdown 46 60 10 2287.6 0.4 169.5X +Without nested predicate Pushdown 7799 7864 49 13.4 74.4 1.0X +With nested predicate Pushdown 48 60 9 2194.5 0.5 163.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Can skip no row groups: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Without nested predicate Pushdown 14168 14233 35 7.4 135.1 1.0X -With nested predicate Pushdown 14156 14242 84 7.4 135.0 1.0X +Without nested predicate Pushdown 14137 14228 42 7.4 134.8 1.0X +With nested predicate Pushdown 14176 14233 49 7.4 135.2 1.0X diff --git a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt index efe69ca7b5e2d..f3a5ff49939b7 100644 --- a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-jdk21-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For Parquet ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 67 78 12 15.0 66.7 1.0X -Nested column 66 79 10 15.2 66.0 1.0X -Nested column in array 222 229 6 4.5 222.3 0.3X +Top-level column 64 77 8 15.6 64.1 1.0X +Nested column 65 74 9 15.3 65.3 1.0X +Nested column in array 245 251 6 4.1 244.6 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 243 264 17 4.1 242.5 1.0X -Nested column 245 271 18 4.1 244.8 1.0X -Nested column in array 565 614 28 1.8 565.4 0.4X +Top-level column 232 252 12 4.3 231.7 1.0X +Nested column 242 259 16 4.1 242.2 1.0X +Nested column in array 578 609 23 1.7 578.0 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 221 233 13 4.5 220.7 1.0X -Nested column 222 238 13 4.5 222.4 1.0X -Nested column in array 535 544 6 1.9 535.4 0.4X +Top-level column 214 226 9 4.7 213.5 1.0X +Nested column 219 229 9 4.6 219.0 1.0X +Nested column in array 540 551 9 1.9 540.2 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 213 222 8 4.7 212.6 1.0X -Nested column 236 255 11 4.2 236.1 0.9X -Nested column in array 559 590 24 1.8 559.4 0.4X +Top-level column 212 225 8 4.7 211.6 1.0X +Nested column 236 245 6 4.2 235.8 0.9X +Nested column in array 576 586 10 1.7 576.0 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 70 85 12 14.2 70.5 1.0X -Nested column 72 87 10 13.8 72.3 1.0X -Nested column in array 259 307 28 3.9 259.2 0.3X +Top-level column 75 85 11 13.4 74.7 1.0X +Nested column 77 100 17 12.9 77.2 1.0X +Nested column in array 283 300 10 3.5 283.1 0.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 305 317 15 3.3 305.1 1.0X -Nested column 359 384 37 2.8 359.0 0.8X -Nested column in array 771 822 34 1.3 770.7 0.4X +Top-level column 303 318 18 3.3 302.5 1.0X +Nested column 352 373 20 2.8 352.1 0.9X +Nested column in array 813 850 33 1.2 812.8 0.4X diff --git a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt index 51e2fb1081aa5..e30af4fa62c56 100644 --- a/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt +++ b/sql/core/benchmarks/ParquetNestedSchemaPruningBenchmark-results.txt @@ -2,52 +2,52 @@ Nested Schema Pruning Benchmark For Parquet ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Selection: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 68 84 11 14.7 68.1 1.0X -Nested column 68 75 5 14.8 67.5 1.0X -Nested column in array 222 230 7 4.5 221.5 0.3X +Top-level column 75 87 10 13.3 74.9 1.0X +Nested column 72 80 8 13.9 72.0 1.0X +Nested column in array 243 248 5 4.1 242.7 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Limiting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 245 262 14 4.1 244.6 1.0X -Nested column 241 267 23 4.2 240.8 1.0X -Nested column in array 558 582 20 1.8 557.8 0.4X +Top-level column 251 283 41 4.0 250.7 1.0X +Nested column 258 275 11 3.9 258.5 1.0X +Nested column in array 586 622 33 1.7 586.3 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 219 224 4 4.6 218.9 1.0X -Nested column 224 231 5 4.5 224.4 1.0X -Nested column in array 523 528 5 1.9 523.2 0.4X +Top-level column 226 234 6 4.4 226.4 1.0X +Nested column 231 240 8 4.3 230.7 1.0X +Nested column in array 550 560 18 1.8 549.8 0.4X -OpenJDK 64-Bit Server VM 
17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repartitioning by exprs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 216 220 3 4.6 215.7 1.0X -Nested column 238 245 5 4.2 238.4 0.9X -Nested column in array 552 562 6 1.8 552.3 0.4X +Top-level column 224 229 6 4.5 224.1 1.0X +Nested column 252 259 5 4.0 252.0 0.9X +Nested column in array 595 604 9 1.7 595.3 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sample: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 72 81 5 13.8 72.5 1.0X -Nested column 79 96 12 12.7 78.8 0.9X -Nested column in array 258 276 19 3.9 258.2 0.3X +Top-level column 75 86 9 13.4 74.8 1.0X +Nested column 76 98 21 13.1 76.4 1.0X +Nested column in array 286 300 11 3.5 286.1 0.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Sorting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Top-level column 307 312 3 3.3 306.9 1.0X -Nested column 351 359 6 2.9 350.6 0.9X -Nested column in array 767 777 12 1.3 766.6 0.4X +Top-level column 319 333 14 3.1 318.6 1.0X +Nested column 376 388 10 2.7 375.6 0.8X +Nested column in array 820 824 5 1.2 819.7 0.4X diff --git a/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt b/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt index 6c4ad608aa760..42f2d9349d24a 100644 --- a/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/PrimitiveArrayBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ Write primitive arrays in dataset ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write an array in Dataset: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 225 250 20 37.2 26.9 1.0X -Double 279 291 8 30.0 33.3 0.8X +Int 168 186 15 49.9 20.0 1.0X +Double 269 286 13 31.2 32.1 0.6X diff --git a/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt b/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt index 918ea9b063ea0..eb5e87109dabc 100644 --- a/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt +++ b/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt @@ -2,11 +2,11 @@ Write primitive arrays in dataset ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write an array in Dataset: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Int 259 276 13 32.4 30.8 1.0X -Double 301 313 10 27.9 35.9 0.9X +Int 157 173 10 53.4 18.7 1.0X +Double 248 269 13 33.8 29.6 0.6X diff --git a/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt b/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt index 041e51290affc..16f19bd9bce4c 100644 --- a/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/RangeBenchmark-jdk21-results.txt @@ -2,14 +2,14 @@ range ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor range: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -full scan 10045 10154 180 52.2 19.2 1.0X -limit after range 40 43 4 13223.6 0.1 253.4X -filter after range 1004 1028 16 522.3 1.9 10.0X -count after range 35 39 5 15096.5 0.1 289.2X -count after limit after range 40 41 1 13172.1 0.1 252.4X +full scan 12772 12920 156 41.1 24.4 1.0X +limit after range 19 19 0 27834.0 0.0 678.0X +filter after range 1020 1042 34 514.1 1.9 12.5X +count after range 334 339 6 1572.0 0.6 38.3X +count after limit after range 28 31 3 18729.5 0.1 456.2X diff --git a/sql/core/benchmarks/RangeBenchmark-results.txt b/sql/core/benchmarks/RangeBenchmark-results.txt index 0fb11bf11525f..d6a426e804efd 100644 --- a/sql/core/benchmarks/RangeBenchmark-results.txt +++ b/sql/core/benchmarks/RangeBenchmark-results.txt @@ -2,14 +2,14 @@ range ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor range: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -full scan 9808 10162 239 53.5 18.7 1.0X -limit after range 47 48 1 11171.4 0.1 209.0X -filter after range 1000 1016 20 524.1 1.9 9.8X -count after range 35 39 4 15014.2 0.1 280.9X -count after limit after range 34 39 5 15248.4 0.1 285.3X +full scan 13257 13466 272 39.5 25.3 1.0X +limit after range 16 17 1 31952.5 0.0 808.0X +filter after range 1029 1044 23 509.6 2.0 12.9X +count after range 187 191 2 2803.4 0.4 70.9X +count after limit after range 27 32 4 19467.1 0.1 492.3X diff --git a/sql/core/benchmarks/SortBenchmark-jdk21-results.txt b/sql/core/benchmarks/SortBenchmark-jdk21-results.txt index 1f7f55a5c5b81..0950e46adcf30 100644 --- a/sql/core/benchmarks/SortBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/SortBenchmark-jdk21-results.txt @@ -2,15 +2,15 @@ radix sort ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor radix sort 25000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -reference TimSort key prefix array 8388 8420 45 3.0 335.5 1.0X 
-reference Arrays.sort 2044 2069 35 12.2 81.8 4.1X -radix sort one byte 66 69 3 379.7 2.6 127.4X -radix sort two bytes 123 126 3 204.0 4.9 68.4X -radix sort eight bytes 467 482 14 53.6 18.7 18.0X -radix sort key prefix array 556 561 6 45.0 22.2 15.1X +reference TimSort key prefix array 8459 8536 108 3.0 338.4 1.0X +reference Arrays.sort 2076 2089 17 12.0 83.1 4.1X +radix sort one byte 67 70 2 372.3 2.7 126.0X +radix sort two bytes 125 130 4 200.1 5.0 67.7X +radix sort eight bytes 470 481 6 53.1 18.8 18.0X +radix sort key prefix array 566 572 4 44.2 22.6 14.9X diff --git a/sql/core/benchmarks/SortBenchmark-results.txt b/sql/core/benchmarks/SortBenchmark-results.txt index 7d6805987bcc9..68f6eed9b84cc 100644 --- a/sql/core/benchmarks/SortBenchmark-results.txt +++ b/sql/core/benchmarks/SortBenchmark-results.txt @@ -2,15 +2,15 @@ radix sort ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor radix sort 25000000: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -reference TimSort key prefix array 8372 8465 132 3.0 334.9 1.0X -reference Arrays.sort 2043 2071 40 12.2 81.7 4.1X -radix sort one byte 64 72 5 390.2 2.6 130.7X -radix sort two bytes 119 130 8 209.6 4.8 70.2X -radix sort eight bytes 476 508 27 52.5 19.0 17.6X -radix sort key prefix array 913 917 5 27.4 36.5 9.2X +reference TimSort key prefix array 8170 8294 175 3.1 326.8 1.0X +reference Arrays.sort 2059 2085 37 12.1 82.4 4.0X +radix sort one byte 64 71 5 391.0 2.6 127.8X +radix sort two bytes 117 126 5 213.2 4.7 69.7X +radix sort eight bytes 477 493 9 52.4 19.1 17.1X +radix sort key prefix array 912 920 7 27.4 36.5 9.0X diff --git a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt index 0317e61163752..6a42c7b283b7e 100644 --- a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-jdk21-results.txt @@ -2,141 +2,143 @@ put rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 1 1.1 936.2 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4068.9 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1500.4 0.6X +In-memory 10 11 1 1.0 968.0 1.0X +RocksDB (trackTotalNumberOfRows: true) 40 42 2 0.2 4033.5 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1502.0 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 11 1 1.1 929.8 1.0X -RocksDB (trackTotalNumberOfRows: true) 40 41 1 0.3 3955.7 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1497.3 0.6X +In-memory 9 11 1 1.1 943.4 1.0X +RocksDB (trackTotalNumberOfRows: true) 48 50 1 0.2 4817.3 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1499.9 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 1 1.1 907.5 1.0X -RocksDB (trackTotalNumberOfRows: true) 39 40 1 0.3 3886.5 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1497.2 0.6X +In-memory 9 10 1 1.1 906.1 1.0X +RocksDB (trackTotalNumberOfRows: true) 54 56 1 0.2 5418.7 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1535.8 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 1 1.1 904.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 39 40 1 0.3 3859.8 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1497.2 0.6X +In-memory 10 11 1 1.1 951.0 1.0X +RocksDB (trackTotalNumberOfRows: true) 57 58 1 0.2 5680.0 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 16 1 0.6 1563.7 0.6X ================================================================================================ merge rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 519 533 7 0.0 51916.6 1.0X -RocksDB (trackTotalNumberOfRows: false) 171 177 3 0.1 17083.9 3.0X +RocksDB (trackTotalNumberOfRows: true) 531 550 7 0.0 53076.7 1.0X +RocksDB (trackTotalNumberOfRows: false) 175 183 3 0.1 17475.3 3.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 506 521 7 0.0 50644.0 1.0X -RocksDB (trackTotalNumberOfRows: false) 170 176 3 0.1 17022.0 3.0X +RocksDB (trackTotalNumberOfRows: true) 522 538 8 0.0 52183.0 1.0X 
+RocksDB (trackTotalNumberOfRows: false) 171 177 3 0.1 17100.7 3.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 493 508 6 0.0 49319.3 1.0X -RocksDB (trackTotalNumberOfRows: false) 169 175 3 0.1 16897.6 2.9X +RocksDB (trackTotalNumberOfRows: true) 518 534 7 0.0 51827.6 1.0X +RocksDB (trackTotalNumberOfRows: false) 174 179 2 0.1 17358.9 3.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 495 508 6 0.0 49462.5 1.0X -RocksDB (trackTotalNumberOfRows: false) 169 175 3 0.1 16896.6 2.9X +RocksDB (trackTotalNumberOfRows: true) 522 537 6 0.0 52162.9 1.0X +RocksDB (trackTotalNumberOfRows: false) 173 181 3 0.1 17259.8 3.0X ================================================================================================ delete rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(10000 rows are non-existing - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 1 0 26.3 38.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 39 41 1 0.3 3942.0 0.0X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1529.2 0.0X +In-memory 1 1 0 17.5 57.2 1.0X +RocksDB (trackTotalNumberOfRows: true) 56 58 1 0.2 5647.4 0.0X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1525.5 0.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(5000 rows are non-existing - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 1 1.3 790.4 1.0X -RocksDB (trackTotalNumberOfRows: true) 40 41 1 0.2 4036.7 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1536.9 0.5X +In-memory 8 9 0 1.2 826.2 1.0X +RocksDB (trackTotalNumberOfRows: true) 50 51 1 0.2 4955.2 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1533.6 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(1000 rows 
are non-existing - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 10 1 1.2 847.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4099.8 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 16 0 0.6 1563.3 0.5X +In-memory 9 11 1 1.1 892.6 1.0X +RocksDB (trackTotalNumberOfRows: true) 44 45 1 0.2 4351.5 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1526.5 0.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(0 rows are non-existing - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 10 1 1.2 859.4 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4118.9 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1507.8 0.6X +In-memory 9 10 1 1.1 894.3 1.0X +RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4142.6 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1509.7 0.6X ================================================================================================ evict rows ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 10000 rows (maxTimestampToEvictInMillis: 9999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 1 1.2 831.0 1.0X -RocksDB (trackTotalNumberOfRows: true) 40 40 1 0.3 3956.6 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 16 0 0.6 1571.3 0.5X +In-memory 9 9 1 1.2 851.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 40 41 1 0.2 4030.3 0.2X +RocksDB (trackTotalNumberOfRows: false) 16 17 1 0.6 1632.1 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 5000 rows (maxTimestampToEvictInMillis: 4999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------ -In-memory 8 8 1 1.3 787.6 1.0X -RocksDB (trackTotalNumberOfRows: true) 21 22 0 0.5 2112.6 0.4X -RocksDB (trackTotalNumberOfRows: false) 9 9 0 1.1 932.9 0.8X +In-memory 8 9 1 1.2 807.1 1.0X +RocksDB (trackTotalNumberOfRows: true) 21 22 1 0.5 2124.6 0.4X +RocksDB (trackTotalNumberOfRows: false) 9 10 0 1.1 940.9 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 1000 rows (maxTimestampToEvictInMillis: 999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 7 8 0 1.4 715.7 1.0X -RocksDB (trackTotalNumberOfRows: true) 7 7 0 1.5 676.3 1.1X -RocksDB (trackTotalNumberOfRows: false) 4 5 0 2.3 442.3 1.6X +In-memory 7 8 1 1.4 739.1 1.0X +RocksDB (trackTotalNumberOfRows: true) 7 7 0 1.4 697.1 1.1X +RocksDB (trackTotalNumberOfRows: false) 5 5 0 2.2 460.0 1.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 0 rows (maxTimestampToEvictInMillis: -1) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 0 0 23.8 41.9 1.0X -RocksDB (trackTotalNumberOfRows: true) 3 3 0 3.2 309.5 0.1X -RocksDB (trackTotalNumberOfRows: false) 3 3 0 3.2 309.9 0.1X +In-memory 0 1 0 23.9 41.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 3 3 0 3.0 328.1 0.1X +RocksDB (trackTotalNumberOfRows: false) 3 3 0 3.0 329.5 0.1X + + diff --git a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt index d2aa646d5ec1d..9c99d86e3bfd1 100644 --- a/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt +++ b/sql/core/benchmarks/StateStoreBasicOperationsBenchmark-results.txt @@ -2,141 +2,143 @@ put rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -In-memory 10 12 1 1.0 960.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 42 43 2 0.2 4173.9 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 16 1 0.6 1551.6 0.6X +In-memory 9 10 0 1.1 927.6 1.0X +RocksDB (trackTotalNumberOfRows: true) 41 42 2 0.2 4063.3 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1500.5 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -In-memory 10 12 1 1.0 970.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4095.8 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 17 1 0.6 1544.6 0.6X +In-memory 9 10 1 1.1 926.5 1.0X +RocksDB (trackTotalNumberOfRows: true) 49 50 1 0.2 4853.7 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1495.9 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 11 1 1.1 933.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 40 41 1 0.3 3966.2 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.6 1540.2 0.6X +In-memory 9 10 0 1.1 900.9 1.0X +RocksDB (trackTotalNumberOfRows: true) 54 55 1 0.2 5359.6 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 15 1 0.7 1491.9 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor putting 10000 rows (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -In-memory 9 11 1 1.1 936.1 1.0X -RocksDB (trackTotalNumberOfRows: true) 39 41 1 0.3 3942.4 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1530.1 0.6X +In-memory 9 10 0 1.1 899.6 1.0X +RocksDB (trackTotalNumberOfRows: true) 55 56 1 0.2 5500.9 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1493.8 0.6X ================================================================================================ merge rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (10000 rows to overwrite - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 525 538 6 0.0 52516.4 1.0X -RocksDB (trackTotalNumberOfRows: false) 170 177 4 0.1 16960.4 3.1X +RocksDB (trackTotalNumberOfRows: true) 515 526 6 0.0 51507.8 1.0X +RocksDB (trackTotalNumberOfRows: false) 167 175 3 0.1 16747.6 3.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (5000 rows to overwrite - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 514 528 6 0.0 51351.9 1.0X -RocksDB (trackTotalNumberOfRows: false) 168 174 4 0.1 16794.0 3.1X +RocksDB (trackTotalNumberOfRows: true) 516 526 4 0.0 51588.3 1.0X +RocksDB (trackTotalNumberOfRows: false) 166 171 3 0.1 16579.3 3.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (1000 rows to overwrite - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -RocksDB (trackTotalNumberOfRows: true) 500 513 6 0.0 49955.1 1.0X -RocksDB (trackTotalNumberOfRows: false) 169 174 2 0.1 16867.1 3.0X +RocksDB (trackTotalNumberOfRows: true) 513 523 5 0.0 51287.0 1.0X 
+RocksDB (trackTotalNumberOfRows: false) 165 170 2 0.1 16532.2 3.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor merging 10000 rows with 10 values per key (0 rows to overwrite - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------- -RocksDB (trackTotalNumberOfRows: true) 492 508 8 0.0 49225.8 1.0X -RocksDB (trackTotalNumberOfRows: false) 168 173 3 0.1 16757.2 2.9X +RocksDB (trackTotalNumberOfRows: true) 513 521 4 0.0 51288.3 1.0X +RocksDB (trackTotalNumberOfRows: false) 165 169 2 0.1 16482.6 3.1X ================================================================================================ delete rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(10000 rows are non-existing - rate 100): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 1 0 26.1 38.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 38 40 1 0.3 3835.6 0.0X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1455.7 0.0X +In-memory 0 0 0 27.9 35.8 1.0X +RocksDB (trackTotalNumberOfRows: true) 54 56 1 0.2 5448.6 0.0X +RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1458.7 0.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(5000 rows are non-existing - rate 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 1 1.3 793.9 1.0X -RocksDB (trackTotalNumberOfRows: true) 40 41 1 0.2 4018.1 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.7 1505.6 0.5X +In-memory 8 8 0 1.3 772.5 1.0X +RocksDB (trackTotalNumberOfRows: true) 48 49 1 0.2 4773.0 0.2X +RocksDB (trackTotalNumberOfRows: false) 14 15 0 0.7 1445.6 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor trying to delete 10000 rows from 10000 rows(1000 rows are non-existing - rate 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 10 1 1.2 837.2 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4073.9 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 16 1 0.7 1470.6 0.6X +In-memory 8 9 0 1.2 826.1 1.0X +RocksDB (trackTotalNumberOfRows: true) 42 43 1 0.2 4198.7 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1460.1 0.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 
64-Core Processor trying to delete 10000 rows from 10000 rows(0 rows are non-existing - rate 0): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 0 1.2 843.6 1.0X -RocksDB (trackTotalNumberOfRows: true) 41 42 1 0.2 4088.7 0.2X -RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1466.1 0.6X +In-memory 8 9 0 1.2 833.2 1.0X +RocksDB (trackTotalNumberOfRows: true) 40 41 1 0.2 4043.6 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 15 0 0.7 1457.1 0.6X ================================================================================================ evict rows ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 10000 rows (maxTimestampToEvictInMillis: 9999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 8 9 0 1.2 833.5 1.0X -RocksDB (trackTotalNumberOfRows: true) 40 41 0 0.3 3976.5 0.2X -RocksDB (trackTotalNumberOfRows: false) 16 16 0 0.6 1588.1 0.5X +In-memory 8 9 0 1.2 835.1 1.0X +RocksDB (trackTotalNumberOfRows: true) 40 40 0 0.3 3972.7 0.2X +RocksDB (trackTotalNumberOfRows: false) 15 16 0 0.6 1547.2 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 5000 rows (maxTimestampToEvictInMillis: 4999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------ -In-memory 8 8 0 1.3 784.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 22 22 0 0.5 2155.1 0.4X -RocksDB (trackTotalNumberOfRows: false) 10 10 0 1.0 986.9 0.8X +In-memory 8 8 0 1.3 775.3 1.0X +RocksDB (trackTotalNumberOfRows: true) 21 22 1 0.5 2130.5 0.4X +RocksDB (trackTotalNumberOfRows: false) 10 10 0 1.0 973.2 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 1000 rows (maxTimestampToEvictInMillis: 999) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 7 8 0 1.4 722.3 1.0X -RocksDB (trackTotalNumberOfRows: true) 7 7 0 1.4 718.8 1.0X -RocksDB (trackTotalNumberOfRows: false) 5 5 0 2.0 488.7 1.5X +In-memory 7 8 0 1.4 704.9 1.0X +RocksDB (trackTotalNumberOfRows: true) 7 7 0 1.4 717.5 1.0X +RocksDB (trackTotalNumberOfRows: false) 5 5 0 2.1 482.3 1.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure AMD EPYC 7763 64-Core Processor evicting 0 rows (maxTimestampToEvictInMillis: -1) from 10000 rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------- -In-memory 0 1 0 21.3 46.9 1.0X -RocksDB (trackTotalNumberOfRows: true) 4 4 0 2.8 358.9 0.1X -RocksDB (trackTotalNumberOfRows: false) 4 4 0 2.8 358.7 0.1X +In-memory 0 0 0 23.0 43.4 1.0X +RocksDB (trackTotalNumberOfRows: true) 4 4 0 2.8 352.8 0.1X +RocksDB (trackTotalNumberOfRows: false) 4 4 0 2.8 353.7 0.1X + + diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt index 1ecc3156431a3..73165e6568854 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt @@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 6301 6360 57 0.0 63006990.7 1.0X -subExprElimination false, codegen: false 6059 6228 248 0.0 60587697.9 1.0X -subExprElimination true, codegen: true 1194 1265 92 0.0 11936424.6 5.3X -subExprElimination true, codegen: false 1184 1244 62 0.0 11839767.3 5.3X +subExprElimination false, codegen: true 5718 5952 306 0.0 57180602.0 1.0X +subExprElimination false, codegen: false 5691 5724 36 0.0 56912726.3 1.0X +subExprElimination true, codegen: true 1296 1353 56 0.0 12955974.7 4.4X +subExprElimination true, codegen: false 1195 1274 71 0.0 11946584.3 4.8X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 6555 6609 65 0.0 65552511.3 1.0X -subExprElimination false, codegen: false 6432 6501 92 0.0 64321921.4 1.0X -subExprElimination true, codegen: true 1871 1890 27 0.0 18708460.4 3.5X -subExprElimination true, codegen: false 1853 1894 37 0.0 18527264.1 3.5X +subExprElimination false, codegen: true 6036 6207 176 0.0 60362284.0 1.0X +subExprElimination false, codegen: false 6027 6111 106 0.0 60270452.3 1.0X +subExprElimination true, codegen: true 1975 2005 50 0.0 19751387.6 3.1X +subExprElimination true, codegen: false 1844 1969 108 0.0 18442635.2 3.3X diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt index 0ad0b3fdcc6cc..42f2df1de0337 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt @@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination ================================================================================================ Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 6634 6827 265 0.0 66342414.3 1.0X -subExprElimination false, codegen: false 6492 6677 172 0.0 64915975.3 1.0X -subExprElimination true, codegen: true 1306 1328 31 0.0 13062245.2 5.1X -subExprElimination true, codegen: false 1253 1292 39 0.0 12527565.4 5.3X +subExprElimination false, codegen: true 6332 6606 239 0.0 63318653.1 1.0X +subExprElimination false, codegen: false 6178 6270 117 0.0 61782941.5 1.0X +subExprElimination true, codegen: true 1438 1497 64 0.0 14383249.6 4.4X +subExprElimination true, codegen: false 1382 1415 48 0.0 13817508.7 4.6X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 6880 7088 206 0.0 68799261.9 1.0X -subExprElimination false, codegen: false 6806 6929 123 0.0 68063401.4 1.0X -subExprElimination true, codegen: true 1838 1928 93 0.0 18380916.3 3.7X -subExprElimination true, codegen: false 1847 1920 64 0.0 18467889.5 3.7X +subExprElimination false, codegen: true 6539 6660 105 0.0 65387594.7 1.0X +subExprElimination false, codegen: false 6548 6584 49 0.0 65477566.0 1.0X +subExprElimination true, codegen: true 2032 2093 66 0.0 20323994.4 3.2X +subExprElimination true, codegen: false 2016 2078 69 0.0 20155395.9 3.2X diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt index 614381dc3e578..0d13b70e5682e 100644 --- a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk21-results.txt @@ -1,810 +1,810 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q1 550 699 210 0.8 1191.6 1.0X +q1 405 522 130 1.1 878.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q2 792 809 24 2.8 355.0 1.0X +q2 676 726 47 3.3 302.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q3 221 261 43 13.5 74.3 1.0X +q3 
184 201 10 16.1 62.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q4 4185 4574 550 1.2 802.9 1.0X +q4 4172 4480 435 1.2 800.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5 1066 1325 366 5.3 189.4 1.0X +q5 1050 1064 20 5.4 186.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6 1063 1110 67 2.9 340.5 1.0X +q6 989 1051 88 3.2 316.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q7 549 573 26 8.9 112.2 1.0X +q7 498 515 25 9.8 101.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q8 460 515 49 6.8 148.1 1.0X +q8 402 426 20 7.7 129.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q9 863 882 25 0.0 24667420.2 1.0X +q9 872 873 1 0.0 24921608.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10 1814 1934 170 1.1 875.8 1.0X +q10 1859 1959 140 1.1 897.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11 1614 1820 290 2.3 428.0 1.0X +q11 1675 1908 330 2.3 444.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD 
EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12 163 199 47 5.0 200.9 1.0X +q12 143 167 24 5.7 176.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q13 726 762 51 6.8 147.2 1.0X +q13 724 762 48 6.8 146.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a 5344 5424 114 1.0 1041.7 1.0X +q14a 4892 5087 277 1.0 953.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14b 3893 4053 226 1.3 758.8 1.0X +q14b 3769 3856 123 1.4 734.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q15 408 443 27 4.1 245.5 1.0X +q15 404 441 43 4.1 242.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q16 564 627 42 2.8 360.8 1.0X +q16 587 645 53 2.7 375.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q17 1246 1299 74 3.8 265.1 1.0X +q17 1372 1376 5 3.4 292.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18 944 1043 140 3.8 262.2 1.0X +q18 926 1142 304 3.9 257.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q19 292 321 31 10.7 93.5 1.0X +q19 279 304 36 11.2 89.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20 173 187 21 8.9 112.8 1.0X +q20 160 183 25 9.6 104.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q21 661 672 10 17.9 55.8 1.0X +q21 623 650 28 19.0 52.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22 3223 3302 112 3.7 272.3 1.0X +q22 3106 3138 46 3.8 262.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23a 5979 6048 98 0.9 1143.3 1.0X +q23a 6166 6171 6 0.8 1179.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23b 6074 6126 74 0.9 1161.4 1.0X +q23b 6289 6440 214 0.8 1202.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24a 154 248 48 21.7 46.1 1.0X +q24a 206 235 24 16.2 61.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24b 208 249 30 16.0 62.5 1.0X +q24b 158 232 46 21.1 47.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q25 1401 1419 26 3.4 298.1 1.0X +q25 
1285 1317 46 3.7 273.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q26 300 332 35 11.5 86.8 1.0X +q26 292 314 23 11.8 84.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27 477 504 28 10.2 97.6 1.0X +q27 516 553 38 9.5 105.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q28 1136 1207 100 2.5 394.5 1.0X +q28 1176 1179 5 2.4 408.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q29 1244 1413 239 3.8 264.7 1.0X +q29 1292 1294 2 3.6 275.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q30 388 406 33 0.8 1315.6 1.0X +q30 387 415 33 0.8 1313.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q31 749 775 31 5.0 201.3 1.0X +q31 740 843 125 5.0 198.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q32 183 218 34 8.3 119.8 1.0X +q32 183 198 20 8.3 119.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q33 379 421 47 13.7 73.2 1.0X +q33 401 432 25 12.9 77.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 
64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34 341 379 50 9.0 111.4 1.0X +q34 336 375 32 9.1 109.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35 1145 1212 94 1.8 552.9 1.0X +q35 1231 1240 13 1.7 594.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36 499 531 29 5.9 168.1 1.0X +q36 494 531 31 6.0 166.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q37 817 864 44 16.3 61.5 1.0X +q37 793 811 24 16.7 59.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q38 634 690 52 8.2 121.7 1.0X +q38 645 690 43 8.1 123.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39a 1401 1529 180 8.4 118.4 1.0X +q39a 1410 1507 136 8.4 119.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39b 1356 1362 8 8.7 114.5 1.0X +q39b 1362 1375 19 8.7 115.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q40 288 338 43 5.8 172.2 1.0X +q40 269 296 29 6.2 160.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q41 151 168 19 0.1 8388.1 1.0X +q41 147 167 20 0.1 8166.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q42 142 156 24 20.9 48.0 1.0X +q42 146 164 23 20.4 49.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q43 275 300 24 10.8 93.0 1.0X +q43 283 305 26 10.4 96.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q44 302 338 39 9.6 104.2 1.0X +q44 329 380 62 8.8 113.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q45 182 212 34 5.3 189.9 1.0X +q45 167 201 23 5.7 174.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q46 418 447 42 7.4 134.3 1.0X +q46 472 504 27 6.6 151.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47 1518 1551 48 2.0 510.8 1.0X +q47 1488 1654 235 2.0 501.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q48 874 891 19 5.6 177.4 1.0X +q48 877 906 27 5.6 178.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49 521 632 105 10.8 92.8 1.0X +q49 647 693 58 8.7 115.2 
1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q50 565 599 27 5.7 174.4 1.0X +q50 643 676 28 5.0 198.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51 2469 2610 199 1.5 672.5 1.0X +q51 2677 2903 319 1.4 729.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q52 140 160 21 21.2 47.1 1.0X +q52 145 157 16 20.5 48.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q53 276 300 23 10.8 92.8 1.0X +q53 276 292 14 10.8 92.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q54 1168 1204 50 4.5 221.2 1.0X +q54 1223 1256 47 4.3 231.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q55 139 151 17 21.3 46.9 1.0X +q55 145 168 24 20.5 48.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q56 415 431 20 12.5 80.2 1.0X +q56 397 426 29 13.0 76.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57 772 822 73 2.0 504.5 1.0X +q57 752 812 62 2.0 491.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q58 394 426 35 13.0 76.8 1.0X +q58 396 437 38 13.0 77.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q59 583 626 41 5.1 197.4 1.0X +q59 582 633 61 5.1 197.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q60 383 440 55 13.5 74.0 1.0X +q60 405 467 59 12.8 78.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q61 530 642 191 5.9 169.8 1.0X +q61 567 695 169 5.5 181.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q62 162 181 23 4.9 204.9 1.0X +q62 166 185 25 4.8 210.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q63 269 305 27 11.1 90.4 1.0X +q63 267 308 25 11.1 90.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64 2129 2334 290 3.3 307.6 1.0X +q64 2323 2600 392 3.0 335.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q65 614 646 39 4.8 206.6 1.0X +q65 680 704 35 4.4 228.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q66 509 575 60 4.6 219.4 1.0X +q66 518 579 65 4.5 223.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67 5168 5274 149 0.6 1739.7 1.0X +q67 5487 5527 57 0.5 1847.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q68 416 455 40 7.5 133.6 1.0X +q68 485 505 22 6.4 155.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q69 1601 1617 22 1.3 773.3 1.0X +q69 1592 1605 17 1.3 768.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70 548 569 25 5.4 185.6 1.0X +q70 557 594 34 5.3 188.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q71 345 377 38 15.1 66.1 1.0X +q71 361 389 31 14.5 69.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72 115881 116420 762 0.1 7550.3 1.0X +q72 111274 114140 4054 0.1 7250.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q73 304 342 30 10.1 99.3 1.0X +q73 315 352 36 9.7 103.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74 1079 1503 600 3.5 286.2 1.0X 
+q74 1104 1493 550 3.4 292.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75 1369 1575 292 4.1 242.9 1.0X +q75 1291 1479 266 4.4 229.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q76 269 305 42 19.1 52.5 1.0X +q76 286 304 25 18.0 55.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77 546 755 182 10.3 97.3 1.0X +q77 452 545 83 12.4 80.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78 2326 2453 179 2.4 414.3 1.0X +q78 1995 2312 450 2.8 355.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q79 403 432 33 7.6 131.7 1.0X +q79 427 454 38 7.2 139.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80 1534 1569 49 3.7 271.7 1.0X +q80 1196 1286 127 4.7 211.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q81 293 359 43 1.3 798.5 1.0X +q81 335 364 39 1.1 914.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q82 1067 1080 19 13.8 72.5 1.0X +q82 1066 1075 13 13.8 72.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure 
AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q83 232 285 56 2.6 389.5 1.0X +q83 239 267 18 2.5 401.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q84 640 688 70 3.7 270.5 1.0X +q84 629 657 32 3.8 265.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q85 1729 2002 386 1.6 610.1 1.0X +q85 1810 2045 333 1.6 638.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86 178 207 30 4.6 219.3 1.0X +q86 176 194 23 4.6 217.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q87 660 717 61 7.9 126.7 1.0X +q87 650 705 54 8.0 124.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q88 1121 1286 234 2.7 376.9 1.0X +q88 1182 1328 205 2.5 397.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q89 292 335 35 10.2 98.4 1.0X +q89 284 314 33 10.5 95.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q90 110 130 16 7.4 135.4 1.0X +q90 118 143 23 6.9 145.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q91 304 339 35 7.6 132.4 1.0X +q91 324 346 20 7.1 141.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q92 133 156 23 6.1 163.8 1.0X +q92 125 151 24 6.5 154.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q93 424 453 39 7.5 134.0 1.0X +q93 380 405 28 8.3 120.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q94 296 345 53 2.8 352.1 1.0X +q94 312 341 38 2.7 371.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q95 5098 5262 232 0.2 6054.1 1.0X +q95 5317 5518 285 0.2 6314.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q96 155 167 15 19.1 52.2 1.0X +q96 163 181 24 18.3 54.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q97 1147 1178 44 3.8 261.1 1.0X +q97 1110 1176 93 4.0 252.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98 257 297 48 11.6 86.4 1.0X +q98 260 277 24 11.4 87.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q99 240 259 24 6.3 158.5 1.0X +q99 245 265 24 6.2 
161.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5a-v2.7 968 1233 375 5.8 172.0 1.0X +q5a-v2.7 1019 1185 234 5.5 181.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6-v2.7 917 940 33 3.4 294.0 1.0X +q6-v2.7 909 937 26 3.4 291.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10a-v2.7 1654 1690 51 1.3 798.9 1.0X +q10a-v2.7 1665 1721 79 1.2 803.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11-v2.7 1472 1902 607 2.6 390.3 1.0X +q11-v2.7 1547 1844 419 2.4 410.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12-v2.7 124 139 21 6.5 153.3 1.0X +q12-v2.7 125 139 21 6.5 153.9 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14-v2.7 3734 3804 99 1.4 728.0 1.0X +q14-v2.7 3522 3706 261 1.5 686.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a-v2.7 7026 7251 319 0.7 1369.6 1.0X +q14a-v2.7 7188 7299 156 0.7 1401.2 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18a-v2.7 1874 2095 312 1.9 520.2 1.0X +q18a-v2.7 1854 1861 10 1.9 514.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 
6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20-v2.7 150 170 25 10.2 98.2 1.0X +q20-v2.7 149 172 35 10.3 97.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22-v2.7 12490 12574 120 0.9 1055.2 1.0X +q22-v2.7 12659 12738 112 0.9 1069.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22a-v2.7 1930 1938 12 6.1 163.0 1.0X +q22a-v2.7 1850 1901 72 6.4 156.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24-v2.7 101 230 75 33.2 30.2 1.0X +q24-v2.7 197 239 29 17.0 59.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27a-v2.7 1145 1236 129 4.3 234.2 1.0X +q27a-v2.7 1122 1150 40 4.4 229.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34-v2.7 311 348 35 9.8 101.6 1.0X +q34-v2.7 338 357 30 9.1 110.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35-v2.7 1202 1244 58 1.7 580.6 1.0X +q35-v2.7 1221 1238 23 1.7 589.8 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35a-v2.7 1154 1161 10 1.8 557.5 1.0X +q35a-v2.7 1173 1208 49 1.8 566.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36a-v2.7 454 483 42 6.5 152.7 1.0X +q36a-v2.7 462 493 42 6.4 155.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47-v2.7 1474 1651 250 2.0 496.2 1.0X +q47-v2.7 1492 1637 205 2.0 502.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49-v2.7 513 572 99 10.9 91.4 1.0X +q49-v2.7 541 612 77 10.4 96.4 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51a-v2.7 15031 15419 550 0.2 4093.6 1.0X +q51a-v2.7 14021 14459 619 0.3 3818.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57-v2.7 748 836 127 2.0 488.5 1.0X +q57-v2.7 749 822 81 2.0 489.0 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64-v2.7 2164 2350 262 3.2 312.7 1.0X +q64-v2.7 2085 2377 413 3.3 301.3 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67a-v2.7 6595 6800 289 0.5 2219.9 1.0X +q67a-v2.7 6711 7040 464 0.4 2259.1 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70a-v2.7 618 630 13 4.8 209.4 1.0X +q70a-v2.7 598 643 49 4.9 202.7 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q72-v2.7 107039 107499 650 0.1 6974.1 1.0X
+q72-v2.7 112833 114390 2202 0.1 7351.7 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q74-v2.7 977 1034 81 3.9 259.0 1.0X
+q74-v2.7 1067 1068 0 3.5 283.0 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q75-v2.7 1130 1379 352 5.0 200.6 1.0X
+q75-v2.7 1182 1411 325 4.8 209.8 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q77a-v2.7 656 906 366 8.6 116.7 1.0X
+q77a-v2.7 1302 1356 77 4.3 231.8 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q78-v2.7 2148 2283 191 2.6 382.5 1.0X
+q78-v2.7 1772 2057 403 3.2 315.5 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q80a-v2.7 1388 1606 308 4.1 245.9 1.0X
+q80a-v2.7 1412 1593 256 4.0 250.2 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q86a-v2.7 222 269 49 3.6 274.3 1.0X
+q86a-v2.7 218 251 35 3.7 268.8 1.0X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q98-v2.7 246 263 11 12.1 82.7 1.0X
+q98-v2.7 258 276 22 11.5 86.7 1.0X
diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt
index 4b8893f4ab7c4..d66ea7b619588 100644
--- a/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt
+++ b/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt
@@ -1,810 +1,810 @@
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q1 625 722 101 0.7 1354.8 1.0X
+q1 338 403 38 1.4 733.2 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q2 794 870 67 2.8 355.6 1.0X
+q2 768 824 74 2.9 344.2 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q3 256 278 23 11.6 86.0 1.0X
+q3 214 232 16 13.9 71.9 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q4 4141 4545 571 1.3 794.6 1.0X
+q4 4002 4315 442 1.3 767.9 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q5 1061 1329 380 5.3 188.5 1.0X
+q5 1343 1497 218 4.2 238.6 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q6 1069 1091 31 2.9 342.6 1.0X
+q6 953 1004 72 3.3 305.4 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q7 578 640 66 8.5 118.2 1.0X
+q7 554 568 14 8.8 113.3 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure
AMD EPYC 7763 64-Core Processor
TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-q8 535 584 36 5.8 172.4 1.0X
+q8 475 498 28 6.5 153.1 1.0X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM
17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q9 1052 1072 29 0.0 30044514.0 1.0X +q9 818 930 101 0.0 23364476.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10 1818 1929 157 1.1 877.7 1.0X +q10 1842 2015 245 1.1 889.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11 1881 2011 185 2.0 498.6 1.0X +q11 1749 1996 349 2.2 463.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12 204 255 28 4.0 252.2 1.0X +q12 196 226 17 4.1 241.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q13 775 830 63 6.4 157.2 1.0X +q13 752 814 54 6.6 152.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a 4566 5052 687 1.1 890.2 1.0X +q14a 4993 5656 938 1.0 973.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14b 3764 3924 226 1.4 733.8 1.0X +q14b 3721 3867 207 1.4 725.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q15 391 409 13 4.3 235.0 1.0X +q15 411 441 20 4.0 247.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -q16 618 691 83 2.5 395.1 1.0X +q16 552 635 94 2.8 353.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q17 1306 1367 86 3.6 278.0 1.0X +q17 1419 1443 33 3.3 302.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18 1161 1278 167 3.1 322.2 1.0X +q18 1100 1167 95 3.3 305.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q19 345 380 29 9.1 110.4 1.0X +q19 327 343 18 9.6 104.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20 212 244 17 7.2 138.6 1.0X +q20 195 238 31 7.8 127.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q21 700 717 22 16.9 59.1 1.0X +q21 567 600 28 20.9 47.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22 3286 3429 202 3.6 277.6 1.0X +q22 3145 3283 195 3.8 265.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23a 5545 5893 493 0.9 1060.2 1.0X +q23a 5658 5959 425 0.9 1081.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23b 5399 5526 179 1.0 
1032.4 1.0X +q23b 5790 5814 34 0.9 1107.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24a 161 264 75 20.7 48.4 1.0X +q24a 105 224 54 31.7 31.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24b 285 310 20 11.7 85.4 1.0X +q24b 218 277 40 15.3 65.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q25 1326 1436 155 3.5 282.3 1.0X +q25 1298 1321 32 3.6 276.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q26 403 434 26 8.6 116.7 1.0X +q26 335 362 26 10.3 97.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27 553 565 17 8.8 113.0 1.0X +q27 523 550 26 9.4 106.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q28 1506 1533 39 1.9 522.9 1.0X +q28 1210 1300 126 2.4 420.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q29 1385 1392 9 3.4 294.8 1.0X +q29 1229 1245 23 3.8 261.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q30 458 484 21 0.6 1554.1 1.0X +q30 437 476 28 0.7 1481.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on 
Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q31 967 1188 313 3.8 259.9 1.0X +q31 963 1225 371 3.9 258.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q32 241 292 57 6.3 157.7 1.0X +q32 226 248 13 6.8 147.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q33 506 537 29 10.2 97.8 1.0X +q33 395 461 49 13.1 76.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34 371 412 39 8.2 121.3 1.0X +q34 327 350 26 9.4 106.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35 1388 1409 29 1.5 670.5 1.0X +q35 1228 1234 7 1.7 593.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36 542 560 21 5.5 182.3 1.0X +q36 549 558 13 5.4 184.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q37 917 921 7 14.5 69.1 1.0X +q37 772 813 37 17.2 58.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q38 790 794 3 6.6 151.7 1.0X +q38 731 907 202 7.1 140.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q39a 1582 1743 228 7.5 133.6 1.0X +q39a 1287 1442 219 9.2 108.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39b 1571 1599 39 7.5 132.7 1.0X +q39b 1247 1289 60 9.5 105.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q40 344 355 14 4.9 205.3 1.0X +q40 323 348 23 5.2 192.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q41 172 204 22 0.1 9531.6 1.0X +q41 171 193 17 0.1 9511.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q42 155 189 25 19.2 52.2 1.0X +q42 171 187 14 17.3 57.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q43 317 333 16 9.3 107.2 1.0X +q43 302 323 19 9.8 102.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q44 382 406 21 7.6 131.8 1.0X +q44 313 335 19 9.3 108.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q45 235 264 23 4.1 244.6 1.0X +q45 189 233 32 5.1 196.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q46 506 512 6 6.1 162.6 1.0X +q46 498 
512 10 6.2 160.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47 1561 1780 310 1.9 525.4 1.0X +q47 1538 1758 310 1.9 517.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q48 834 860 26 5.9 169.4 1.0X +q48 847 873 23 5.8 172.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49 629 670 50 8.9 112.0 1.0X +q49 672 741 59 8.4 119.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q50 650 710 59 5.0 200.4 1.0X +q50 668 686 24 4.9 206.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51 2555 2704 211 1.4 695.8 1.0X +q51 2662 2825 231 1.4 725.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q52 149 178 25 19.9 50.3 1.0X +q52 151 173 12 19.7 50.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q53 338 350 11 8.8 113.6 1.0X +q53 255 287 33 11.6 85.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q54 1365 1368 4 3.9 258.6 1.0X +q54 1366 1388 31 3.9 258.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 
7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q55 160 182 8 18.6 53.9 1.0X +q55 153 170 17 19.5 51.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q56 485 529 45 10.7 93.7 1.0X +q56 445 501 50 11.6 85.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57 890 948 73 1.7 581.1 1.0X +q57 744 798 47 2.1 486.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q58 489 527 46 10.5 95.4 1.0X +q58 474 529 60 10.8 92.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q59 657 692 43 4.5 222.4 1.0X +q59 636 673 36 4.6 215.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q60 563 577 12 9.2 108.8 1.0X +q60 537 627 141 9.6 103.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q61 614 646 24 5.1 196.8 1.0X +q61 612 630 17 5.1 196.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q62 169 184 13 4.7 213.5 1.0X +q62 185 204 11 4.3 233.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q63 272 289 12 10.9 91.7 1.0X +q63 256 289 49 11.6 86.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64 2573 2662 126 2.7 371.8 1.0X +q64 2327 2744 590 3.0 336.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q65 625 655 40 4.8 210.2 1.0X +q65 588 606 30 5.1 197.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q66 591 720 148 3.9 254.8 1.0X +q66 510 538 34 4.5 220.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67 5192 5284 130 0.6 1747.6 1.0X +q67 5165 5225 84 0.6 1738.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q68 476 514 49 6.5 153.1 1.0X +q68 480 504 21 6.5 154.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q69 1523 1543 28 1.4 735.4 1.0X +q69 1648 1648 1 1.3 795.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70 533 561 35 5.5 180.6 1.0X +q70 621 634 10 4.8 210.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q71 409 426 16 12.8 78.4 1.0X +q71 425 
446 21 12.3 81.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72 91047 91388 482 0.2 5932.2 1.0X +q72 109565 110431 1224 0.1 7138.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q73 372 395 24 8.2 121.4 1.0X +q73 371 384 11 8.2 121.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74 1321 1672 496 2.9 350.3 1.0X +q74 1243 1519 390 3.0 329.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75 1578 1746 237 3.6 280.2 1.0X +q75 1365 1649 402 4.1 242.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q76 332 349 18 15.5 64.7 1.0X +q76 320 344 23 16.0 62.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77 528 794 237 10.6 94.0 1.0X +q77 530 827 277 10.6 94.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78 2017 2368 497 2.8 359.1 1.0X +q78 1870 2048 251 3.0 333.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q79 430 448 14 7.1 140.6 1.0X +q79 431 443 11 7.1 140.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80 1481 1583 145 3.8 262.4 1.0X +q80 1172 1306 191 4.8 207.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q81 371 391 19 1.0 1012.4 1.0X +q81 366 386 25 1.0 997.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q82 1193 1197 6 12.3 81.1 1.0X +q82 1096 1113 25 13.4 74.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q83 303 354 34 2.0 509.2 1.0X +q83 265 288 24 2.2 444.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q84 759 769 10 3.1 320.6 1.0X +q84 763 782 24 3.1 322.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q85 2197 2337 198 1.3 775.0 1.0X +q85 1875 2089 301 1.5 661.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86 200 220 10 4.0 247.5 1.0X +q86 195 220 13 4.2 240.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q87 733 756 23 7.1 140.7 1.0X +q87 745 772 34 7.0 142.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q88 1342 1562 310 2.2 451.5 1.0X +q88 1239 1371 187 2.4 416.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q89 294 318 22 10.1 99.0 1.0X +q89 324 363 34 9.2 109.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q90 150 169 18 5.4 184.2 1.0X +q90 128 144 16 6.4 157.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q91 347 367 17 6.6 151.4 1.0X +q91 303 327 20 7.6 131.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q92 147 185 23 5.5 180.9 1.0X +q92 125 149 22 6.5 154.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q93 429 438 14 7.4 135.4 1.0X +q93 365 375 10 8.7 115.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q94 373 397 14 2.3 442.8 1.0X +q94 288 299 6 2.9 342.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q95 5202 5314 158 0.2 6178.1 1.0X +q95 5528 5648 169 0.2 6565.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q96 170 191 14 17.4 57.3 1.0X +q96 168 194 
22 17.7 56.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q97 1149 1159 14 3.8 261.6 1.0X +q97 1164 1192 39 3.8 265.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98 297 314 11 10.0 100.0 1.0X +q98 289 311 12 10.3 97.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q99 283 310 14 5.3 187.4 1.0X +q99 257 280 14 5.9 169.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5a-v2.7 1412 1633 312 4.0 250.9 1.0X +q5a-v2.7 1275 1493 309 4.4 226.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6-v2.7 988 995 8 3.2 316.6 1.0X +q6-v2.7 1008 1012 4 3.1 323.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10a-v2.7 1815 1823 11 1.1 876.3 1.0X +q10a-v2.7 1754 1765 16 1.2 846.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11-v2.7 1614 1887 387 2.3 427.9 1.0X +q11-v2.7 1667 1798 186 2.3 442.0 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12-v2.7 145 163 12 5.6 179.5 1.0X +q12-v2.7 140 165 17 5.8 172.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server 
VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14-v2.7 3903 4064 226 1.3 761.0 1.0X +q14-v2.7 3930 4153 315 1.3 766.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a-v2.7 7517 7870 499 0.7 1465.3 1.0X +q14a-v2.7 6341 6841 707 0.8 1236.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18a-v2.7 1889 2147 365 1.9 524.4 1.0X +q18a-v2.7 1690 1928 336 2.1 469.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20-v2.7 178 204 24 8.6 116.2 1.0X +q20-v2.7 149 163 17 10.2 97.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22-v2.7 12979 13026 66 0.9 1096.6 1.0X +q22-v2.7 13001 13080 111 0.9 1098.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22a-v2.7 2017 2153 192 5.9 170.4 1.0X +q22a-v2.7 1890 1968 110 6.3 159.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24-v2.7 144 270 77 23.1 43.3 1.0X +q24-v2.7 200 228 25 16.7 59.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27a-v2.7 1470 1847 533 3.3 300.5 1.0X +q27a-v2.7 1274 1474 283 3.8 260.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD 
EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34-v2.7 365 402 30 8.4 119.4 1.0X +q34-v2.7 362 377 16 8.4 118.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35-v2.7 1297 1305 12 1.6 626.2 1.0X +q35-v2.7 1324 1346 31 1.6 639.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35a-v2.7 1265 1294 41 1.6 611.0 1.0X +q35a-v2.7 1271 1288 24 1.6 613.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36a-v2.7 543 550 11 5.5 182.8 1.0X +q36a-v2.7 535 543 5 5.6 180.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47-v2.7 1448 1619 241 2.1 487.4 1.0X +q47-v2.7 1562 1638 107 1.9 525.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49-v2.7 592 637 83 9.5 105.4 1.0X +q49-v2.7 614 651 52 9.1 109.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51a-v2.7 13982 14156 247 0.3 3807.9 1.0X +q51a-v2.7 14597 14929 469 0.3 3975.4 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57-v2.7 839 868 49 1.8 548.0 1.0X +q57-v2.7 738 777 61 2.1 482.1 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64-v2.7 2523 2775 357 2.7 364.5 1.0X +q64-v2.7 2239 2674 615 3.1 323.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67a-v2.7 6557 6809 356 0.5 2207.3 1.0X +q67a-v2.7 7426 7591 233 0.4 2499.5 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70a-v2.7 674 689 13 4.4 228.3 1.0X +q70a-v2.7 652 711 60 4.5 220.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72-v2.7 88701 89378 957 0.2 5779.4 1.0X +q72-v2.7 108642 111301 3761 0.1 7078.6 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74-v2.7 1198 1493 416 3.1 317.7 1.0X +q74-v2.7 1000 1158 223 3.8 265.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75-v2.7 1386 1697 441 4.1 246.0 1.0X +q75-v2.7 1435 1692 363 3.9 254.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77a-v2.7 831 1151 454 6.8 147.9 1.0X +q77a-v2.7 803 813 17 7.0 142.9 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78-v2.7 2326 2451 176 2.4 414.3 1.0X +q78-v2.7 2051 2419 520 2.7 365.2 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q80a-v2.7 1772 1791 26 3.2 313.9 1.0X +q80a-v2.7 1670 1814 204 3.4 295.8 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86a-v2.7 262 284 16 3.1 324.1 1.0X +q86a-v2.7 258 282 16 3.1 318.7 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TPCDS: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98-v2.7 284 300 13 10.5 95.6 1.0X +q98-v2.7 288 302 16 10.3 96.8 1.0X diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt index 257288226675f..c725476b53778 100644 --- a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk21-results.txt @@ -2,11 +2,11 @@ TakeOrderedAndProject ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -TakeOrderedAndProject with SMJ for doExecute 213 227 20 0.0 21332.0 1.0X -TakeOrderedAndProject with SMJ for executeCollect 94 109 17 0.1 9447.1 2.3X +TakeOrderedAndProject with SMJ for doExecute 87 91 4 0.1 8677.0 1.0X +TakeOrderedAndProject with SMJ for executeCollect 63 70 8 0.2 6290.5 1.4X diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt index 7e8a7436320d3..d3b09bc5d8958 100644 --- a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt +++ b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt @@ -2,11 +2,11 @@ TakeOrderedAndProject ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -TakeOrderedAndProject with SMJ for doExecute 245 276 31 0.0 24456.5 1.0X -TakeOrderedAndProject with SMJ for executeCollect 113 121 8 0.1 11305.6 2.2X +TakeOrderedAndProject with SMJ for doExecute 107 108 1 0.1 10711.2 1.0X +TakeOrderedAndProject with SMJ for executeCollect 76 80 5 0.1 7647.4 1.4X diff --git a/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt b/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt index c0d786b8f8f04..edd607e86e0f4 100644 --- 
a/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/TopKBenchmark-jdk21-results.txt @@ -2,21 +2,21 @@ Top-K Computation ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark Top-K: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 8527 8649 123 2.5 406.6 1.0X -ROW_NUMBER (PARTITION: , WindowGroupLimit: true) 1687 1769 57 12.4 80.5 5.1X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11123 11209 57 1.9 530.4 0.8X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4977 5016 30 4.2 237.3 1.7X -RANK (PARTITION: , WindowGroupLimit: false) 9299 9573 159 2.3 443.4 0.9X -RANK (PARTITION: , WindowGroupLimit: true) 1794 1953 123 11.7 85.5 4.8X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11622 11881 149 1.8 554.2 0.7X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4864 5029 68 4.3 232.0 1.8X -DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9101 9293 72 2.3 434.0 0.9X -DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1796 1939 117 11.7 85.7 4.7X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11532 11581 38 1.8 549.9 0.7X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4955 4997 39 4.2 236.3 1.7X +ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 9338 9444 88 2.2 445.3 1.0X +ROW_NUMBER (PARTITION: , WindowGroupLimit: true) 1602 1622 12 13.1 76.4 5.8X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11523 11814 140 1.8 549.5 0.8X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4612 4824 102 4.5 219.9 2.0X +RANK (PARTITION: , WindowGroupLimit: false) 9780 9938 73 2.1 466.4 1.0X +RANK (PARTITION: , WindowGroupLimit: true) 1780 1937 122 11.8 84.9 5.2X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11823 12111 147 1.8 563.8 0.8X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4739 4857 78 4.4 226.0 2.0X +DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9565 9822 134 2.2 456.1 1.0X +DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1765 1937 116 11.9 84.1 5.3X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11830 12062 157 1.8 564.1 0.8X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4751 4899 67 4.4 226.5 2.0X diff --git a/sql/core/benchmarks/TopKBenchmark-results.txt b/sql/core/benchmarks/TopKBenchmark-results.txt index 8b77fd0a90051..8df7b646b3a69 100644 --- a/sql/core/benchmarks/TopKBenchmark-results.txt +++ b/sql/core/benchmarks/TopKBenchmark-results.txt @@ -2,21 +2,21 @@ Top-K Computation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Benchmark Top-K: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------- -ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 8973 9156 258 2.3 427.9 1.0X -ROW_NUMBER (PARTITION: , 
WindowGroupLimit: true) 1686 1695 8 12.4 80.4 5.3X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 10830 10981 154 1.9 516.4 0.8X -ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4550 4673 79 4.6 217.0 2.0X -RANK (PARTITION: , WindowGroupLimit: false) 9397 9624 166 2.2 448.1 1.0X -RANK (PARTITION: , WindowGroupLimit: true) 1778 1905 97 11.8 84.8 5.0X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11454 11605 132 1.8 546.2 0.8X -RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4569 4698 83 4.6 217.9 2.0X -DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9465 9531 50 2.2 451.3 0.9X -DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1804 1920 79 11.6 86.0 5.0X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 11416 11592 185 1.8 544.4 0.8X -DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4549 4693 105 4.6 216.9 2.0X +ROW_NUMBER (PARTITION: , WindowGroupLimit: false) 9300 9429 180 2.3 443.5 1.0X +ROW_NUMBER (PARTITION: , WindowGroupLimit: true) 1665 1676 11 12.6 79.4 5.6X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12100 12186 61 1.7 577.0 0.8X +ROW_NUMBER (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4751 4805 36 4.4 226.6 2.0X +RANK (PARTITION: , WindowGroupLimit: false) 9883 9971 74 2.1 471.3 0.9X +RANK (PARTITION: , WindowGroupLimit: true) 1919 1960 31 10.9 91.5 4.8X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12946 13013 36 1.6 617.3 0.7X +RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4751 4809 45 4.4 226.5 2.0X +DENSE_RANK (PARTITION: , WindowGroupLimit: false) 9882 9953 57 2.1 471.2 0.9X +DENSE_RANK (PARTITION: , WindowGroupLimit: true) 1932 1974 47 10.9 92.1 4.8X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: false) 12891 12989 143 1.6 614.7 0.7X +DENSE_RANK (PARTITION: PARTITION BY b, WindowGroupLimit: true) 4773 4812 23 4.4 227.6 1.9X diff --git a/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt b/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt index 2c7fd8805fddd..c4126410e8f38 100644 --- a/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/UDFBenchmark-jdk21-results.txt @@ -2,58 +2,58 @@ UDF with mixed input types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to string wholestage off 108 135 38 0.9 1082.5 1.0X -long/nullable int/string to string wholestage on 69 79 9 1.5 689.4 1.6X +long/nullable int/string to string wholestage off 29 30 1 3.4 290.3 1.0X +long/nullable int/string to string wholestage on 31 34 5 3.3 305.7 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to option wholestage off 44 47 4 2.3 441.4 1.0X -long/nullable int/string to option wholestage on 38 41 
3 2.6 378.1 1.2X +long/nullable int/string to option wholestage off 22 28 8 4.5 221.8 1.0X +long/nullable int/string to option wholestage on 23 33 7 4.3 230.5 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -long/nullable int/string to primitive wholestage off 30 31 2 3.3 298.8 1.0X -long/nullable int/string to primitive wholestage on 28 31 2 3.6 281.4 1.1X +long/nullable int/string to primitive wholestage off 16 18 3 6.3 158.0 1.0X +long/nullable int/string to primitive wholestage on 17 18 2 6.0 165.8 1.0X ================================================================================================ UDF with primitive types ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to string wholestage off 37 39 3 2.7 370.0 1.0X -long/nullable int to string wholestage on 31 40 8 3.2 311.5 1.2X +long/nullable int to string wholestage off 21 21 0 4.8 207.8 1.0X +long/nullable int to string wholestage on 24 29 5 4.3 235.0 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to option wholestage off 22 26 6 4.6 216.7 1.0X -long/nullable int to option wholestage on 22 23 0 4.5 224.6 1.0X +long/nullable int to option wholestage off 14 15 1 6.9 144.0 1.0X +long/nullable int to option wholestage on 15 17 2 6.8 146.6 1.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -long/nullable int to primitive wholestage off 21 22 1 4.7 214.3 1.0X -long/nullable int to primitive wholestage on 18 19 0 5.4 184.0 1.2X +long/nullable int to primitive wholestage off 11 15 6 9.5 105.3 1.0X +long/nullable int to primitive wholestage on 12 13 2 8.3 120.6 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor UDF identity overhead: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Baseline 13 19 8 7.8 128.7 1.0X -With identity UDF 16 17 1 6.2 161.5 0.8X +Baseline 7 7 0 14.0 71.3 1.0X +With identity 
UDF 10 12 2 10.1 99.4 0.7X diff --git a/sql/core/benchmarks/UDFBenchmark-results.txt b/sql/core/benchmarks/UDFBenchmark-results.txt index 84ea01c12a80b..3208259e29b3f 100644 --- a/sql/core/benchmarks/UDFBenchmark-results.txt +++ b/sql/core/benchmarks/UDFBenchmark-results.txt @@ -2,58 +2,58 @@ UDF with mixed input types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to string wholestage off 108 138 43 0.9 1075.9 1.0X -long/nullable int/string to string wholestage on 68 82 9 1.5 679.2 1.6X +long/nullable int/string to string wholestage off 32 35 4 3.1 318.8 1.0X +long/nullable int/string to string wholestage on 31 41 8 3.2 314.3 1.0X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -long/nullable int/string to option wholestage off 53 54 1 1.9 532.8 1.0X -long/nullable int/string to option wholestage on 35 41 5 2.8 354.3 1.5X +long/nullable int/string to option wholestage off 23 27 6 4.4 226.1 1.0X +long/nullable int/string to option wholestage on 27 35 6 3.7 272.8 0.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int/string to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -long/nullable int/string to primitive wholestage off 38 39 2 2.6 378.4 1.0X -long/nullable int/string to primitive wholestage on 29 32 3 3.5 288.1 1.3X +long/nullable int/string to primitive wholestage off 18 18 0 5.5 181.5 1.0X +long/nullable int/string to primitive wholestage on 20 21 2 5.1 196.4 0.9X ================================================================================================ UDF with primitive types ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int to string: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to string wholestage off 28 28 0 3.6 279.2 1.0X -long/nullable int to string wholestage on 30 39 7 3.4 296.0 0.9X +long/nullable int to string wholestage off 22 22 1 4.6 218.7 1.0X +long/nullable int to string wholestage on 23 23 0 4.3 232.0 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor 
long/nullable int to option: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -long/nullable int to option wholestage off 20 20 0 5.1 196.0 1.0X -long/nullable int to option wholestage on 22 23 1 4.5 224.4 0.9X +long/nullable int to option wholestage off 14 15 0 7.0 143.4 1.0X +long/nullable int to option wholestage on 15 16 2 6.5 153.0 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor long/nullable int to primitive: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -long/nullable int to primitive wholestage off 16 16 0 6.3 159.8 1.0X -long/nullable int to primitive wholestage on 17 18 0 5.7 174.6 0.9X +long/nullable int to primitive wholestage off 12 12 0 8.3 121.0 1.0X +long/nullable int to primitive wholestage on 13 13 1 7.7 129.5 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor UDF identity overhead: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Baseline 11 15 7 8.8 114.3 1.0X -With identity UDF 13 15 2 7.5 134.0 0.9X +Baseline 8 8 0 13.1 76.3 1.0X +With identity UDF 11 11 0 9.0 110.7 0.7X diff --git a/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt b/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt index 8b70ff9fd40c4..5283d13abce43 100644 --- a/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/UnsafeArrayDataBenchmark-jdk21-results.txt @@ -2,32 +2,32 @@ Benchmark UnsafeArrayData ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 80 80 0 2100.7 0.5 1.0X -Double 158 158 0 1061.9 0.9 0.5X +Int 73 74 1 2292.6 0.4 1.0X +Double 158 158 0 1063.2 0.9 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 12 13 1 1813.0 0.6 1.0X -Double 32 37 3 662.4 1.5 0.4X +Int 13 15 2 1608.6 0.6 1.0X +Double 30 34 4 696.9 1.4 0.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Get primitive array from UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 19 26 9 3238.9 0.3 
1.0X -Double 40 52 19 1578.9 0.6 0.5X +Int 20 23 2 3090.1 0.3 1.0X +Double 42 47 2 1508.0 0.7 0.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Create UnsafeArrayData from primitive array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Int 20 21 1 3178.0 0.3 1.0X -Double 42 43 1 1502.2 0.7 0.5X +Int 22 24 2 2892.7 0.3 1.0X +Double 44 47 2 1425.5 0.7 0.5X diff --git a/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt index 9c677148c4f57..af96712ae368f 100644 --- a/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt +++ b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt @@ -2,32 +2,32 @@ Benchmark UnsafeArrayData ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Read UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 76 76 0 2216.5 0.5 1.0X -Double 158 158 0 1063.8 0.9 0.5X +Int 73 73 0 2313.3 0.4 1.0X +Double 152 156 2 1106.9 0.9 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Write UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 12 13 1 1822.0 0.5 1.0X -Double 29 34 3 724.2 1.4 0.4X +Int 12 14 2 1744.6 0.6 1.0X +Double 28 33 3 738.4 1.4 0.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Get primitive array from UnsafeArrayData: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Int 19 21 1 3317.8 0.3 1.0X -Double 37 42 1 1686.1 0.6 0.5X +Int 19 22 2 3335.4 0.3 1.0X +Double 39 44 2 1594.2 0.6 0.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Create UnsafeArrayData from primitive array: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Int 19 21 1 3263.0 0.3 1.0X -Double 43 46 2 1474.1 0.7 0.5X +Int 20 22 2 3128.0 0.3 1.0X +Double 42 46 2 1481.7 0.7 0.5X diff --git a/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt b/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt index 71d5b1fe490fe..e289715a15dc9 100644 --- a/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/UpdateFieldsBenchmark-jdk21-results.txt @@ -2,25 +2,25 @@ Add 2 columns and drop 2 columns at 3 different depths of nesting 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Add 2 columns and drop 2 columns at 3 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 1 2 1 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1 1 0 0.0 Infinity 1.2X -To non-nullable StructTypes using non-performant method 18 19 2 0.0 Infinity 0.1X -To nullable StructTypes using non-performant method 800 877 68 0.0 Infinity 0.0X +To non-nullable StructTypes using performant method 2 3 1 0.0 Infinity 1.0X +To nullable StructTypes using performant method 1 1 0 0.0 Infinity 1.4X +To non-nullable StructTypes using non-performant method 18 18 1 0.0 Infinity 0.1X +To nullable StructTypes using non-performant method 789 799 16 0.0 Infinity 0.0X ================================================================================================ Add 50 columns and drop 50 columns at 100 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Add 50 columns and drop 50 columns at 100 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 1390 1390 0 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1336 1381 64 0.0 Infinity 1.0X +To non-nullable StructTypes using performant method 1182 1264 115 0.0 Infinity 1.0X +To nullable StructTypes using performant method 1280 1280 1 0.0 Infinity 0.9X diff --git a/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt b/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt index 75e0f806b0aae..b05804969c1b6 100644 --- a/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt +++ b/sql/core/benchmarks/UpdateFieldsBenchmark-results.txt @@ -2,25 +2,25 @@ Add 2 columns and drop 2 columns at 3 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Add 2 columns and drop 2 columns at 3 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------- To non-nullable StructTypes using performant method 2 3 1 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1 1 0 0.0 Infinity 1.4X -To non-nullable StructTypes using non-performant method 19 19 1 0.0 Infinity 0.1X -To nullable StructTypes using non-performant method 796 852 54 0.0 Infinity 0.0X +To nullable StructTypes using performant method 1 2 0 0.0 Infinity 1.3X +To non-nullable StructTypes using 
non-performant method 19 20 2 0.0 Infinity 0.1X +To nullable StructTypes using non-performant method 867 899 30 0.0 Infinity 0.0X ================================================================================================ Add 50 columns and drop 50 columns at 100 different depths of nesting ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Add 50 columns and drop 50 columns at 100 different depths of nesting: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -To non-nullable StructTypes using performant method 1643 1656 18 0.0 Infinity 1.0X -To nullable StructTypes using performant method 1502 1623 171 0.0 Infinity 1.1X +To non-nullable StructTypes using performant method 1554 1575 30 0.0 Infinity 1.0X +To nullable StructTypes using performant method 1666 1704 54 0.0 Infinity 0.9X diff --git a/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt b/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt index 8904d36b19a56..def6739a917fa 100644 --- a/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/V2FunctionBenchmark-jdk21-results.txt @@ -1,44 +1,44 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -native_long_add 9467 9517 55 52.8 18.9 1.0X -java_long_add_default 21990 22037 72 22.7 44.0 0.4X -java_long_add_magic 11660 11741 102 42.9 23.3 0.8X -java_long_add_static_magic 11334 11348 18 44.1 22.7 0.8X -scala_long_add_default 22748 23088 305 22.0 45.5 0.4X -scala_long_add_magic 11839 11875 44 42.2 23.7 0.8X +native_long_add 9638 9709 87 51.9 19.3 1.0X +java_long_add_default 27727 27753 27 18.0 55.5 0.3X +java_long_add_magic 11740 11767 31 42.6 23.5 0.8X +java_long_add_static_magic 11578 11647 85 43.2 23.2 0.8X +scala_long_add_default 23241 23295 73 21.5 46.5 0.4X +scala_long_add_magic 11729 11805 107 42.6 23.5 0.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 10114 10166 54 49.4 20.2 1.0X -java_long_add_default 22353 22379 28 22.4 44.7 0.5X -java_long_add_magic 11948 11985 51 41.8 23.9 0.8X -java_long_add_static_magic 10076 10102 26 49.6 20.2 1.0X -scala_long_add_default 22141 22150 9 22.6 44.3 0.5X -scala_long_add_magic 11858 11897 50 42.2 23.7 0.9X +native_long_add 10259 10290 34 48.7 20.5 1.0X +java_long_add_default 22285 22378 127 22.4 44.6 0.5X +java_long_add_magic 11725 11813 83 42.6 23.5 0.9X +java_long_add_static_magic 
9877 9966 116 50.6 19.8 1.0X +scala_long_add_default 22320 22495 187 22.4 44.6 0.5X +scala_long_add_magic 11742 11827 77 42.6 23.5 0.9X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 22606 22631 37 22.1 45.2 1.0X -java_long_add_default 28087 28189 102 17.8 56.2 0.8X -java_long_add_magic 32639 32846 298 15.3 65.3 0.7X -java_long_add_static_magic 30810 31179 628 16.2 61.6 0.7X -scala_long_add_default 26433 26511 106 18.9 52.9 0.9X -scala_long_add_magic 32777 32875 99 15.3 65.6 0.7X +native_long_add 22577 22649 123 22.1 45.2 1.0X +java_long_add_default 27897 27935 59 17.9 55.8 0.8X +java_long_add_magic 32443 32564 110 15.4 64.9 0.7X +java_long_add_static_magic 31297 31408 107 16.0 62.6 0.7X +scala_long_add_default 26280 26438 200 19.0 52.6 0.9X +scala_long_add_magic 32608 32625 17 15.3 65.2 0.7X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 22794 22845 68 21.9 45.6 1.0X -java_long_add_default 26502 26650 148 18.9 53.0 0.9X -java_long_add_magic 32876 32962 80 15.2 65.8 0.7X -java_long_add_static_magic 30909 31054 168 16.2 61.8 0.7X -scala_long_add_default 26483 26489 5 18.9 53.0 0.9X -scala_long_add_magic 32883 32899 15 15.2 65.8 0.7X +native_long_add 21616 21652 43 23.1 43.2 1.0X +java_long_add_default 25274 25397 209 19.8 50.5 0.9X +java_long_add_magic 31544 31592 53 15.9 63.1 0.7X +java_long_add_static_magic 30400 30965 492 16.4 60.8 0.7X +scala_long_add_default 25277 25394 138 19.8 50.6 0.9X +scala_long_add_magic 31560 31711 261 15.8 63.1 0.7X diff --git a/sql/core/benchmarks/V2FunctionBenchmark-results.txt b/sql/core/benchmarks/V2FunctionBenchmark-results.txt index 2663912b4769b..69bcb6ca79de0 100644 --- a/sql/core/benchmarks/V2FunctionBenchmark-results.txt +++ b/sql/core/benchmarks/V2FunctionBenchmark-results.txt @@ -1,44 +1,44 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------ -native_long_add 9352 9422 68 53.5 18.7 1.0X -java_long_add_default 21996 22071 125 22.7 44.0 0.4X -java_long_add_magic 10666 10693 24 46.9 21.3 0.9X -java_long_add_static_magic 10534 10585 45 47.5 21.1 0.9X -scala_long_add_default 22996 23055 91 21.7 46.0 0.4X -scala_long_add_magic 10698 10765 68 46.7 21.4 0.9X +native_long_add 9469 10166 728 52.8 18.9 1.0X +java_long_add_default 22104 22180 123 22.6 44.2 0.4X +java_long_add_magic 10681 
10726 53 46.8 21.4 0.9X +java_long_add_static_magic 10526 10622 84 47.5 21.1 0.9X +scala_long_add_default 22671 23034 438 22.1 45.3 0.4X +scala_long_add_magic 10662 10703 39 46.9 21.3 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = true: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 9897 9931 29 50.5 19.8 1.0X -java_long_add_default 21890 21944 49 22.8 43.8 0.5X -java_long_add_magic 10699 10803 173 46.7 21.4 0.9X -java_long_add_static_magic 9882 10183 464 50.6 19.8 1.0X -scala_long_add_default 21844 21920 103 22.9 43.7 0.5X -scala_long_add_magic 10715 10722 6 46.7 21.4 0.9X +native_long_add 9914 9941 27 50.4 19.8 1.0X +java_long_add_default 21984 22016 45 22.7 44.0 0.5X +java_long_add_magic 10683 10700 25 46.8 21.4 0.9X +java_long_add_static_magic 9884 9941 60 50.6 19.8 1.0X +scala_long_add_default 21936 22057 180 22.8 43.9 0.5X +scala_long_add_magic 10677 10997 538 46.8 21.4 0.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = true codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 22708 22769 69 22.0 45.4 1.0X -java_long_add_default 25849 26143 389 19.3 51.7 0.9X -java_long_add_magic 32210 32379 256 15.5 64.4 0.7X -java_long_add_static_magic 31705 31755 62 15.8 63.4 0.7X -scala_long_add_default 26389 26548 170 18.9 52.8 0.9X -scala_long_add_magic 32369 32429 63 15.4 64.7 0.7X +native_long_add 22579 22718 163 22.1 45.2 1.0X +java_long_add_default 25854 25927 124 19.3 51.7 0.9X +java_long_add_magic 32272 32342 69 15.5 64.5 0.7X +java_long_add_static_magic 30215 30835 987 16.5 60.4 0.7X +scala_long_add_default 26500 26616 161 18.9 53.0 0.9X +scala_long_add_magic 32366 32583 317 15.4 64.7 0.7X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor scalar function (long + long) -> long, result_nullable = false codegen = false: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -native_long_add 21779 21955 211 23.0 43.6 1.0X -java_long_add_default 25989 26116 206 19.2 52.0 0.8X -java_long_add_magic 31604 31648 48 15.8 63.2 0.7X -java_long_add_static_magic 31273 31340 109 16.0 62.5 0.7X -scala_long_add_default 25860 25913 48 19.3 51.7 0.8X -scala_long_add_magic 31568 31669 90 15.8 63.1 0.7X +native_long_add 21710 21832 159 23.0 43.4 1.0X +java_long_add_default 25610 25663 49 19.5 51.2 0.8X +java_long_add_magic 31550 31580 45 15.8 63.1 0.7X +java_long_add_static_magic 29780 29820 49 16.8 59.6 0.7X +scala_long_add_default 25753 26613 1063 19.4 51.5 0.8X +scala_long_add_magic 31546 31702 184 15.8 63.1 0.7X diff --git 
a/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt b/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt index 712436e4c1353..f8b9e3744bf27 100644 --- a/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/WideSchemaBenchmark-jdk21-results.txt @@ -2,157 +2,157 @@ parsing large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor parsing large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 select expressions 1 2 1 0.0 1010700.0 1.0X -100 select expressions 2 3 1 0.0 2025948.0 0.5X -2500 select expressions 39 42 4 0.0 39031401.0 0.0X +1 select expressions 1 1 0 0.0 618123.0 1.0X +100 select expressions 2 3 1 0.0 2251962.0 0.3X +2500 select expressions 46 48 4 0.0 46311762.0 0.0X ================================================================================================ optimize large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor optimize large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -100 columns 5 5 1 0.0 4506568.0 1.0X -1000 columns 31 33 2 0.0 31376169.0 0.1X -10000 columns 328 344 14 0.0 327627136.0 0.0X +100 columns 5 5 1 0.0 4594183.0 1.0X +1000 columns 34 35 4 0.0 33513952.0 0.1X +10000 columns 359 388 38 0.0 359145545.0 0.0X ================================================================================================ many column field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor many column field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 cols x 100000 rows (read in-mem) 15 21 5 6.7 149.8 1.0X -1 cols x 100000 rows (exec in-mem) 15 20 5 6.7 149.0 1.0X -1 cols x 100000 rows (read parquet) 26 33 8 3.8 261.7 0.6X -1 cols x 100000 rows (write parquet) 91 100 7 1.1 906.8 0.2X -100 cols x 1000 rows (read in-mem) 12 16 4 8.4 118.8 1.3X -100 cols x 1000 rows (exec in-mem) 16 21 6 6.2 160.5 0.9X -100 cols x 1000 rows (read parquet) 22 28 6 4.5 223.4 0.7X -100 cols x 1000 rows (write parquet) 89 96 9 1.1 894.8 0.2X -2500 cols x 40 rows (read in-mem) 71 75 8 1.4 708.2 0.2X -2500 cols x 40 rows (exec in-mem) 130 135 7 0.8 1297.2 0.1X -2500 cols x 40 rows (read parquet) 308 318 7 0.3 3084.1 0.0X -2500 cols x 40 rows (write parquet) 144 149 4 0.7 1441.9 0.1X +1 cols x 100000 rows (read in-mem) 13 18 5 7.5 134.2 1.0X +1 cols x 100000 rows (exec in-mem) 14 17 4 7.4 135.4 1.0X +1 cols x 100000 rows (read parquet) 25 34 9 4.1 246.6 0.5X +1 cols x 100000 rows (write parquet) 95 106 13 1.1 950.3 0.1X +100 cols x 1000 
rows (read in-mem) 12 16 5 8.4 118.8 1.1X +100 cols x 1000 rows (exec in-mem) 16 19 5 6.3 158.4 0.8X +100 cols x 1000 rows (read parquet) 22 28 8 4.6 217.5 0.6X +100 cols x 1000 rows (write parquet) 93 102 15 1.1 934.8 0.1X +2500 cols x 40 rows (read in-mem) 74 83 11 1.4 739.5 0.2X +2500 cols x 40 rows (exec in-mem) 132 150 15 0.8 1324.5 0.1X +2500 cols x 40 rows (read parquet) 289 318 32 0.3 2894.9 0.0X +2500 cols x 40 rows (write parquet) 152 176 26 0.7 1522.8 0.1X ================================================================================================ wide shallowly nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor wide shallowly nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 19 24 6 5.3 187.3 1.0X -1 wide x 100000 rows (exec in-mem) 20 24 5 4.9 204.1 0.9X -1 wide x 100000 rows (read parquet) 21 26 6 4.8 208.0 0.9X -1 wide x 100000 rows (write parquet) 95 103 9 1.1 952.3 0.2X -100 wide x 1000 rows (read in-mem) 14 17 4 7.0 143.3 1.3X -100 wide x 1000 rows (exec in-mem) 22 25 4 4.5 220.4 0.8X -100 wide x 1000 rows (read parquet) 21 23 5 4.8 206.5 0.9X -100 wide x 1000 rows (write parquet) 90 96 7 1.1 900.7 0.2X -2500 wide x 40 rows (read in-mem) 20 23 4 5.0 201.5 0.9X -2500 wide x 40 rows (exec in-mem) 203 213 10 0.5 2027.7 0.1X -2500 wide x 40 rows (read parquet) 63 67 5 1.6 627.6 0.3X -2500 wide x 40 rows (write parquet) 96 102 5 1.0 964.9 0.2X +1 wide x 100000 rows (read in-mem) 19 24 7 5.4 186.4 1.0X +1 wide x 100000 rows (exec in-mem) 20 25 7 4.9 204.7 0.9X +1 wide x 100000 rows (read parquet) 21 25 6 4.8 207.1 0.9X +1 wide x 100000 rows (write parquet) 100 108 11 1.0 1000.3 0.2X +100 wide x 1000 rows (read in-mem) 14 17 5 7.0 143.8 1.3X +100 wide x 1000 rows (exec in-mem) 23 28 7 4.3 230.8 0.8X +100 wide x 1000 rows (read parquet) 21 25 6 4.8 206.8 0.9X +100 wide x 1000 rows (write parquet) 96 101 7 1.0 961.9 0.2X +2500 wide x 40 rows (read in-mem) 21 24 5 4.8 210.2 0.9X +2500 wide x 40 rows (exec in-mem) 233 254 23 0.4 2325.6 0.1X +2500 wide x 40 rows (read parquet) 62 68 9 1.6 617.9 0.3X +2500 wide x 40 rows (write parquet) 102 111 19 1.0 1022.9 0.2X ================================================================================================ deeply nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 deep x 100000 rows (read in-mem) 15 19 5 6.8 147.7 1.0X -1 deep x 100000 rows (exec in-mem) 17 21 5 5.9 169.5 0.9X -1 deep x 100000 rows (read parquet) 17 20 5 5.9 169.7 0.9X -1 deep x 100000 rows (write parquet) 90 96 7 1.1 902.1 0.2X -100 deep x 1000 rows (read in-mem) 42 44 4 2.4 415.3 0.4X -100 deep x 1000 rows (exec in-mem) 463 464 1 0.2 4629.9 0.0X -100 deep x 1000 rows 
(read parquet) 440 447 6 0.2 4402.6 0.0X -100 deep x 1000 rows (write parquet) 118 122 4 0.8 1182.7 0.1X -250 deep x 400 rows (read in-mem) 190 193 3 0.5 1898.3 0.1X -250 deep x 400 rows (exec in-mem) 2955 2961 9 0.0 29549.9 0.0X -250 deep x 400 rows (read parquet) 2657 2661 6 0.0 26566.2 0.0X -250 deep x 400 rows (write parquet) 266 272 6 0.4 2655.3 0.1X +1 deep x 100000 rows (read in-mem) 15 19 6 6.5 154.6 1.0X +1 deep x 100000 rows (exec in-mem) 17 20 5 5.7 174.1 0.9X +1 deep x 100000 rows (read parquet) 17 23 7 5.7 174.6 0.9X +1 deep x 100000 rows (write parquet) 96 106 14 1.0 961.6 0.2X +100 deep x 1000 rows (read in-mem) 47 54 9 2.1 466.7 0.3X +100 deep x 1000 rows (exec in-mem) 521 538 16 0.2 5211.5 0.0X +100 deep x 1000 rows (read parquet) 500 510 13 0.2 5001.6 0.0X +100 deep x 1000 rows (write parquet) 128 134 6 0.8 1278.6 0.1X +250 deep x 400 rows (read in-mem) 221 231 8 0.5 2210.9 0.1X +250 deep x 400 rows (exec in-mem) 3301 3306 7 0.0 33011.6 0.0X +250 deep x 400 rows (read parquet) 3049 3073 34 0.0 30491.4 0.0X +250 deep x 400 rows (write parquet) 298 307 11 0.3 2982.8 0.1X ================================================================================================ bushy struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor bushy struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -1 x 1 deep x 100000 rows (read in-mem) 13 15 3 7.8 127.9 1.0X -1 x 1 deep x 100000 rows (exec in-mem) 14 16 4 7.1 141.1 0.9X -1 x 1 deep x 100000 rows (read parquet) 16 19 5 6.2 160.4 0.8X -1 x 1 deep x 100000 rows (write parquet) 89 92 5 1.1 888.1 0.1X -128 x 8 deep x 1000 rows (read in-mem) 13 14 3 7.8 128.4 1.0X -128 x 8 deep x 1000 rows (exec in-mem) 25 28 4 4.0 249.1 0.5X -128 x 8 deep x 1000 rows (read parquet) 20 22 5 5.1 197.6 0.6X -128 x 8 deep x 1000 rows (write parquet) 87 94 8 1.1 873.8 0.1X -1024 x 11 deep x 100 rows (read in-mem) 18 20 3 5.6 178.4 0.7X -1024 x 11 deep x 100 rows (exec in-mem) 138 143 8 0.7 1375.0 0.1X -1024 x 11 deep x 100 rows (read parquet) 33 36 5 3.0 334.0 0.4X -1024 x 11 deep x 100 rows (write parquet) 93 97 7 1.1 925.2 0.1X +1 x 1 deep x 100000 rows (read in-mem) 13 17 5 7.6 131.7 1.0X +1 x 1 deep x 100000 rows (exec in-mem) 15 19 5 6.7 148.2 0.9X +1 x 1 deep x 100000 rows (read parquet) 16 19 5 6.1 164.3 0.8X +1 x 1 deep x 100000 rows (write parquet) 94 99 6 1.1 935.0 0.1X +128 x 8 deep x 1000 rows (read in-mem) 13 15 4 7.6 131.4 1.0X +128 x 8 deep x 1000 rows (exec in-mem) 27 31 5 3.7 269.8 0.5X +128 x 8 deep x 1000 rows (read parquet) 20 22 4 4.9 202.9 0.6X +128 x 8 deep x 1000 rows (write parquet) 93 99 8 1.1 933.8 0.1X +1024 x 11 deep x 100 rows (read in-mem) 18 20 4 5.6 179.2 0.7X +1024 x 11 deep x 100 rows (exec in-mem) 143 154 10 0.7 1429.6 0.1X +1024 x 11 deep x 100 rows (read parquet) 34 37 5 2.9 344.1 0.4X +1024 x 11 deep x 100 rows (write parquet) 98 102 4 1.0 977.9 0.1X ================================================================================================ wide array field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure 
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor wide array field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 14 16 4 7.1 141.0 1.0X -1 wide x 100000 rows (exec in-mem) 16 19 4 6.2 162.5 0.9X -1 wide x 100000 rows (read parquet) 16 20 5 6.1 164.8 0.9X -1 wide x 100000 rows (write parquet) 89 95 6 1.1 888.9 0.2X -100 wide x 1000 rows (read in-mem) 11 13 4 9.3 107.3 1.3X -100 wide x 1000 rows (exec in-mem) 12 14 4 8.3 119.9 1.2X -100 wide x 1000 rows (read parquet) 16 19 6 6.3 157.7 0.9X -100 wide x 1000 rows (write parquet) 86 93 9 1.2 857.0 0.2X -2500 wide x 40 rows (read in-mem) 11 12 3 9.5 105.1 1.3X -2500 wide x 40 rows (exec in-mem) 12 13 3 8.3 120.3 1.2X -2500 wide x 40 rows (read parquet) 16 18 4 6.3 158.2 0.9X -2500 wide x 40 rows (write parquet) 85 92 7 1.2 854.1 0.2X +1 wide x 100000 rows (read in-mem) 15 19 5 6.6 151.2 1.0X +1 wide x 100000 rows (exec in-mem) 17 20 5 5.8 172.4 0.9X +1 wide x 100000 rows (read parquet) 17 19 5 5.8 171.0 0.9X +1 wide x 100000 rows (write parquet) 95 105 6 1.0 952.9 0.2X +100 wide x 1000 rows (read in-mem) 11 13 4 8.9 112.9 1.3X +100 wide x 1000 rows (exec in-mem) 13 15 4 7.8 128.6 1.2X +100 wide x 1000 rows (read parquet) 17 20 5 6.0 166.7 0.9X +100 wide x 1000 rows (write parquet) 92 101 7 1.1 920.3 0.2X +2500 wide x 40 rows (read in-mem) 11 13 4 9.0 110.9 1.4X +2500 wide x 40 rows (exec in-mem) 13 14 3 7.9 127.4 1.2X +2500 wide x 40 rows (read parquet) 16 19 4 6.1 164.4 0.9X +2500 wide x 40 rows (write parquet) 91 98 6 1.1 909.1 0.2X ================================================================================================ wide map field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor wide map field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 11 12 3 8.9 112.0 1.0X -1 wide x 100000 rows (exec in-mem) 14 15 3 7.1 140.1 0.8X -1 wide x 100000 rows (read parquet) 20 21 4 5.1 197.2 0.6X -1 wide x 100000 rows (write parquet) 86 90 5 1.2 858.3 0.1X -100 wide x 1000 rows (read in-mem) 7 8 2 15.0 66.6 1.7X -100 wide x 1000 rows (exec in-mem) 9 11 3 11.1 90.3 1.2X -100 wide x 1000 rows (read parquet) 17 20 5 5.8 172.7 0.6X -100 wide x 1000 rows (write parquet) 82 85 5 1.2 815.3 0.1X -2500 wide x 40 rows (read in-mem) 9 10 2 11.5 86.9 1.3X -2500 wide x 40 rows (exec in-mem) 11 12 3 9.4 106.7 1.0X -2500 wide x 40 rows (read parquet) 17 19 4 5.8 172.4 0.6X -2500 wide x 40 rows (write parquet) 84 91 7 1.2 840.4 0.1X +1 wide x 100000 rows (read in-mem) 12 14 3 8.5 117.9 1.0X +1 wide x 100000 rows (exec in-mem) 15 17 2 6.5 154.8 0.8X +1 wide x 100000 rows (read parquet) 20 23 5 4.9 202.6 0.6X +1 wide x 100000 rows (write parquet) 92 97 5 1.1 918.0 0.1X +100 wide x 1000 rows (read in-mem) 7 8 2 13.5 74.1 1.6X +100 wide x 1000 rows (exec in-mem) 9 10 2 10.7 93.1 1.3X +100 wide x 1000 rows (read parquet) 18 21 5 5.5 181.3 0.7X +100 wide x 1000 rows (write parquet) 88 91 2 1.1 881.6 0.1X +2500 wide x 40 rows (read in-mem) 9 
10 2 10.9 91.3 1.3X +2500 wide x 40 rows (exec in-mem) 11 12 2 9.1 109.7 1.1X +2500 wide x 40 rows (read parquet) 18 20 4 5.6 179.7 0.7X +2500 wide x 40 rows (write parquet) 89 97 6 1.1 892.1 0.1X diff --git a/sql/core/benchmarks/WideSchemaBenchmark-results.txt b/sql/core/benchmarks/WideSchemaBenchmark-results.txt index 51a5792330a13..3272e7a72fcc4 100644 --- a/sql/core/benchmarks/WideSchemaBenchmark-results.txt +++ b/sql/core/benchmarks/WideSchemaBenchmark-results.txt @@ -2,157 +2,157 @@ parsing large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor parsing large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 select expressions 1 2 1 0.0 1037637.0 1.0X -100 select expressions 2 3 1 0.0 2276460.0 0.5X -2500 select expressions 44 48 4 0.0 44445446.0 0.0X +1 select expressions 1 1 0 0.0 665640.0 1.0X +100 select expressions 3 3 1 0.0 2542608.0 0.3X +2500 select expressions 53 56 4 0.0 53485744.0 0.0X ================================================================================================ optimize large select expressions ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor optimize large select: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -100 columns 5 6 1 0.0 4963948.0 1.0X -1000 columns 35 36 1 0.0 35350989.0 0.1X -10000 columns 348 367 21 0.0 348345246.0 0.0X +100 columns 5 6 1 0.0 5225216.0 1.0X +1000 columns 38 42 4 0.0 37975149.0 0.1X +10000 columns 395 424 30 0.0 394705382.0 0.0X ================================================================================================ many column field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor many column field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 cols x 100000 rows (read in-mem) 17 23 3 5.7 174.0 1.0X -1 cols x 100000 rows (exec in-mem) 14 18 3 7.1 140.4 1.2X -1 cols x 100000 rows (read parquet) 27 34 5 3.7 270.5 0.6X -1 cols x 100000 rows (write parquet) 90 97 5 1.1 896.4 0.2X -100 cols x 1000 rows (read in-mem) 11 15 3 8.7 114.9 1.5X -100 cols x 1000 rows (exec in-mem) 16 20 4 6.4 155.7 1.1X -100 cols x 1000 rows (read parquet) 21 26 4 4.7 212.8 0.8X -100 cols x 1000 rows (write parquet) 85 92 6 1.2 854.2 0.2X -2500 cols x 40 rows (read in-mem) 72 76 6 1.4 716.3 0.2X -2500 cols x 40 rows (exec in-mem) 132 138 6 0.8 1319.7 0.1X -2500 cols x 40 rows (read parquet) 273 282 8 0.4 2734.5 0.1X -2500 cols x 40 rows (write parquet) 142 155 9 0.7 1418.2 0.1X +1 cols x 100000 rows (read in-mem) 15 22 5 6.8 146.6 1.0X +1 cols x 100000 
rows (exec in-mem) 17 25 6 5.8 171.7 0.9X +1 cols x 100000 rows (read parquet) 28 38 7 3.6 275.8 0.5X +1 cols x 100000 rows (write parquet) 103 123 12 1.0 1034.1 0.1X +100 cols x 1000 rows (read in-mem) 14 21 5 7.1 141.6 1.0X +100 cols x 1000 rows (exec in-mem) 18 25 6 5.6 178.2 0.8X +100 cols x 1000 rows (read parquet) 24 34 7 4.1 243.6 0.6X +100 cols x 1000 rows (write parquet) 106 129 14 0.9 1064.4 0.1X +2500 cols x 40 rows (read in-mem) 84 106 10 1.2 842.2 0.2X +2500 cols x 40 rows (exec in-mem) 155 170 15 0.6 1546.3 0.1X +2500 cols x 40 rows (read parquet) 295 328 41 0.3 2946.7 0.0X +2500 cols x 40 rows (write parquet) 165 183 18 0.6 1651.4 0.1X ================================================================================================ wide shallowly nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor wide shallowly nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 18 21 3 5.5 180.4 1.0X -1 wide x 100000 rows (exec in-mem) 19 23 3 5.2 192.4 0.9X -1 wide x 100000 rows (read parquet) 19 23 3 5.2 194.0 0.9X -1 wide x 100000 rows (write parquet) 91 97 5 1.1 905.2 0.2X -100 wide x 1000 rows (read in-mem) 14 17 3 7.3 136.6 1.3X -100 wide x 1000 rows (exec in-mem) 21 23 3 4.8 208.6 0.9X -100 wide x 1000 rows (read parquet) 19 21 3 5.2 194.1 0.9X -100 wide x 1000 rows (write parquet) 87 91 6 1.2 867.4 0.2X -2500 wide x 40 rows (read in-mem) 19 22 3 5.2 194.1 0.9X -2500 wide x 40 rows (exec in-mem) 199 207 7 0.5 1994.6 0.1X -2500 wide x 40 rows (read parquet) 60 63 3 1.7 604.7 0.3X -2500 wide x 40 rows (write parquet) 91 97 4 1.1 914.6 0.2X +1 wide x 100000 rows (read in-mem) 21 28 6 4.7 214.5 1.0X +1 wide x 100000 rows (exec in-mem) 23 30 6 4.4 229.7 0.9X +1 wide x 100000 rows (read parquet) 21 25 4 4.7 210.8 1.0X +1 wide x 100000 rows (write parquet) 104 110 5 1.0 1036.3 0.2X +100 wide x 1000 rows (read in-mem) 15 18 3 6.6 151.7 1.4X +100 wide x 1000 rows (exec in-mem) 23 27 4 4.3 233.9 0.9X +100 wide x 1000 rows (read parquet) 21 24 3 4.7 211.7 1.0X +100 wide x 1000 rows (write parquet) 98 101 4 1.0 979.6 0.2X +2500 wide x 40 rows (read in-mem) 21 24 3 4.7 212.7 1.0X +2500 wide x 40 rows (exec in-mem) 223 233 7 0.4 2227.4 0.1X +2500 wide x 40 rows (read parquet) 65 69 3 1.5 654.4 0.3X +2500 wide x 40 rows (write parquet) 104 108 4 1.0 1035.7 0.2X ================================================================================================ deeply nested struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 deep x 100000 rows (read in-mem) 14 16 2 7.2 139.0 1.0X -1 deep x 100000 rows (exec in-mem) 16 18 3 6.3 158.7 0.9X -1 deep x 100000 rows (read parquet) 16 18 3 6.2 162.4 0.9X -1 deep x 100000 
rows (write parquet) 86 90 4 1.2 859.6 0.2X -100 deep x 1000 rows (read in-mem) 43 45 2 2.3 429.0 0.3X -100 deep x 1000 rows (exec in-mem) 519 528 6 0.2 5188.6 0.0X -100 deep x 1000 rows (read parquet) 507 517 12 0.2 5068.3 0.0X -100 deep x 1000 rows (write parquet) 116 121 5 0.9 1163.8 0.1X -250 deep x 400 rows (read in-mem) 196 203 6 0.5 1963.7 0.1X -250 deep x 400 rows (exec in-mem) 3290 3294 6 0.0 32897.5 0.0X -250 deep x 400 rows (read parquet) 3044 3044 1 0.0 30435.7 0.0X -250 deep x 400 rows (write parquet) 269 277 7 0.4 2688.8 0.1X +1 deep x 100000 rows (read in-mem) 16 18 3 6.2 161.8 1.0X +1 deep x 100000 rows (exec in-mem) 18 22 3 5.4 183.6 0.9X +1 deep x 100000 rows (read parquet) 19 22 3 5.2 192.1 0.8X +1 deep x 100000 rows (write parquet) 99 103 4 1.0 992.0 0.2X +100 deep x 1000 rows (read in-mem) 31 34 4 3.2 314.9 0.5X +100 deep x 1000 rows (exec in-mem) 479 486 8 0.2 4794.2 0.0X +100 deep x 1000 rows (read parquet) 464 469 5 0.2 4643.8 0.0X +100 deep x 1000 rows (write parquet) 115 119 3 0.9 1146.1 0.1X +250 deep x 400 rows (read in-mem) 122 125 2 0.8 1219.8 0.1X +250 deep x 400 rows (exec in-mem) 3018 3025 11 0.0 30175.6 0.0X +250 deep x 400 rows (read parquet) 2818 2822 6 0.0 28178.4 0.0X +250 deep x 400 rows (write parquet) 204 211 5 0.5 2042.2 0.1X ================================================================================================ bushy struct field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor bushy struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -1 x 1 deep x 100000 rows (read in-mem) 12 14 3 8.5 117.6 1.0X -1 x 1 deep x 100000 rows (exec in-mem) 13 16 4 7.5 133.5 0.9X -1 x 1 deep x 100000 rows (read parquet) 15 17 3 6.6 151.2 0.8X -1 x 1 deep x 100000 rows (write parquet) 85 88 3 1.2 848.9 0.1X -128 x 8 deep x 1000 rows (read in-mem) 12 14 2 8.2 121.3 1.0X -128 x 8 deep x 1000 rows (exec in-mem) 23 26 2 4.3 231.4 0.5X -128 x 8 deep x 1000 rows (read parquet) 19 21 3 5.3 189.3 0.6X -128 x 8 deep x 1000 rows (write parquet) 84 87 5 1.2 838.3 0.1X -1024 x 11 deep x 100 rows (read in-mem) 17 18 2 5.9 168.3 0.7X -1024 x 11 deep x 100 rows (exec in-mem) 131 139 5 0.8 1308.0 0.1X -1024 x 11 deep x 100 rows (read parquet) 33 37 4 3.0 329.1 0.4X -1024 x 11 deep x 100 rows (write parquet) 89 93 3 1.1 888.8 0.1X +1 x 1 deep x 100000 rows (read in-mem) 14 15 2 7.2 138.1 1.0X +1 x 1 deep x 100000 rows (exec in-mem) 16 19 3 6.4 155.6 0.9X +1 x 1 deep x 100000 rows (read parquet) 17 19 3 5.9 169.3 0.8X +1 x 1 deep x 100000 rows (write parquet) 95 99 5 1.1 950.2 0.1X +128 x 8 deep x 1000 rows (read in-mem) 14 15 3 7.4 135.2 1.0X +128 x 8 deep x 1000 rows (exec in-mem) 27 30 4 3.6 274.2 0.5X +128 x 8 deep x 1000 rows (read parquet) 21 23 3 4.9 205.5 0.7X +128 x 8 deep x 1000 rows (write parquet) 95 99 3 1.1 950.6 0.1X +1024 x 11 deep x 100 rows (read in-mem) 18 20 3 5.6 178.7 0.8X +1024 x 11 deep x 100 rows (exec in-mem) 152 161 6 0.7 1518.3 0.1X +1024 x 11 deep x 100 rows (read parquet) 35 37 3 2.9 345.0 0.4X +1024 x 11 deep x 100 rows (write parquet) 100 106 4 1.0 1003.5 0.1X ================================================================================================ wide array field 
read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor wide array field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 14 15 3 7.3 136.2 1.0X -1 wide x 100000 rows (exec in-mem) 16 17 2 6.4 155.5 0.9X -1 wide x 100000 rows (read parquet) 15 17 3 6.5 153.4 0.9X -1 wide x 100000 rows (write parquet) 86 93 6 1.2 860.3 0.2X -100 wide x 1000 rows (read in-mem) 10 12 2 10.0 99.7 1.4X -100 wide x 1000 rows (exec in-mem) 11 13 2 8.8 113.8 1.2X -100 wide x 1000 rows (read parquet) 15 17 3 6.6 151.1 0.9X -100 wide x 1000 rows (write parquet) 83 88 7 1.2 828.2 0.2X -2500 wide x 40 rows (read in-mem) 10 11 2 10.1 98.6 1.4X -2500 wide x 40 rows (exec in-mem) 11 12 2 8.8 113.6 1.2X -2500 wide x 40 rows (read parquet) 15 16 3 6.8 147.9 0.9X -2500 wide x 40 rows (write parquet) 82 84 2 1.2 816.2 0.2X +1 wide x 100000 rows (read in-mem) 16 19 4 6.3 159.2 1.0X +1 wide x 100000 rows (exec in-mem) 18 21 5 5.6 179.4 0.9X +1 wide x 100000 rows (read parquet) 18 21 4 5.5 180.5 0.9X +1 wide x 100000 rows (write parquet) 99 105 6 1.0 990.6 0.2X +100 wide x 1000 rows (read in-mem) 13 14 2 7.9 127.3 1.3X +100 wide x 1000 rows (exec in-mem) 14 16 3 7.4 135.0 1.2X +100 wide x 1000 rows (read parquet) 17 19 3 5.7 174.5 0.9X +100 wide x 1000 rows (write parquet) 96 101 4 1.0 957.0 0.2X +2500 wide x 40 rows (read in-mem) 12 13 2 8.4 118.5 1.3X +2500 wide x 40 rows (exec in-mem) 13 14 2 7.7 130.2 1.2X +2500 wide x 40 rows (read parquet) 17 20 3 5.8 173.7 0.9X +2500 wide x 40 rows (write parquet) 94 99 3 1.1 935.0 0.2X ================================================================================================ wide map field read and write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor wide map field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -1 wide x 100000 rows (read in-mem) 11 12 2 9.1 109.6 1.0X -1 wide x 100000 rows (exec in-mem) 14 15 1 7.4 135.2 0.8X -1 wide x 100000 rows (read parquet) 19 22 4 5.4 185.9 0.6X -1 wide x 100000 rows (write parquet) 82 86 5 1.2 815.2 0.1X -100 wide x 1000 rows (read in-mem) 7 7 1 15.1 66.2 1.7X -100 wide x 1000 rows (exec in-mem) 9 10 2 11.7 85.6 1.3X -100 wide x 1000 rows (read parquet) 16 18 3 6.1 164.4 0.7X -100 wide x 1000 rows (write parquet) 78 82 4 1.3 776.2 0.1X -2500 wide x 40 rows (read in-mem) 8 9 1 12.2 82.2 1.3X -2500 wide x 40 rows (exec in-mem) 10 11 2 10.0 100.4 1.1X -2500 wide x 40 rows (read parquet) 16 18 3 6.1 164.9 0.7X -2500 wide x 40 rows (write parquet) 79 84 4 1.3 790.9 0.1X +1 wide x 100000 rows (read in-mem) 12 14 2 8.0 124.2 1.0X +1 wide x 100000 rows (exec in-mem) 16 17 2 6.2 160.1 0.8X +1 wide x 100000 rows (read parquet) 21 24 5 4.8 207.7 0.6X +1 wide x 100000 rows (write parquet) 97 104 7 1.0 970.5 0.1X +100 wide x 1000 rows (read in-mem) 8 9 2 12.9 77.5 1.6X +100 wide x 1000 rows (exec in-mem) 
10 12 2 10.0 99.7 1.2X +100 wide x 1000 rows (read parquet) 19 21 3 5.2 191.2 0.6X +100 wide x 1000 rows (write parquet) 91 95 3 1.1 911.0 0.1X +2500 wide x 40 rows (read in-mem) 10 11 1 10.2 98.5 1.3X +2500 wide x 40 rows (exec in-mem) 12 13 1 8.2 121.6 1.0X +2500 wide x 40 rows (read parquet) 19 21 3 5.2 190.7 0.7X +2500 wide x 40 rows (write parquet) 93 99 5 1.1 929.2 0.1X diff --git a/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt b/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt index 16d601d2f623e..b9cfa3a8bd0b4 100644 --- a/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/WideTableBenchmark-jdk21-results.txt @@ -2,16 +2,16 @@ projection on wide table ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor projection on wide table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -split threshold 10 2615 2687 80 0.4 2494.2 1.0X -split threshold 100 2150 2169 18 0.5 2050.1 1.2X -split threshold 1024 1635 1650 15 0.6 1559.3 1.6X -split threshold 2048 1611 1625 13 0.7 1536.3 1.6X -split threshold 4096 1699 1738 24 0.6 1620.2 1.5X -split threshold 8192 2319 2336 18 0.5 2211.3 1.1X -split threshold 65536 20666 21392 489 0.1 19709.0 0.1X +split threshold 10 2580 2601 18 0.4 2460.7 1.0X +split threshold 100 2137 2154 19 0.5 2038.2 1.2X +split threshold 1024 1652 1660 11 0.6 1575.0 1.6X +split threshold 2048 1586 1601 12 0.7 1512.4 1.6X +split threshold 4096 1715 1727 7 0.6 1635.7 1.5X +split threshold 8192 2359 2366 6 0.4 2250.0 1.1X +split threshold 65536 20935 21321 294 0.1 19964.9 0.1X diff --git a/sql/core/benchmarks/WideTableBenchmark-results.txt b/sql/core/benchmarks/WideTableBenchmark-results.txt index 0ae395b755163..5dc6dde967ec1 100644 --- a/sql/core/benchmarks/WideTableBenchmark-results.txt +++ b/sql/core/benchmarks/WideTableBenchmark-results.txt @@ -2,16 +2,16 @@ projection on wide table ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor projection on wide table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -split threshold 10 2531 2589 69 0.4 2413.7 1.0X -split threshold 100 2053 2069 18 0.5 1957.9 1.2X -split threshold 1024 1654 1668 13 0.6 1577.2 1.5X -split threshold 2048 1597 1630 19 0.7 1523.2 1.6X -split threshold 4096 1673 1683 10 0.6 1595.4 1.5X -split threshold 8192 2122 2146 21 0.5 2023.7 1.2X -split threshold 65536 21606 21831 217 0.0 20604.8 0.1X +split threshold 10 2477 2481 4 0.4 2362.0 1.0X +split threshold 100 1985 1996 9 0.5 1892.6 1.2X +split threshold 1024 1610 1628 14 0.7 1535.2 1.5X +split threshold 2048 1582 1596 12 0.7 1508.8 1.6X +split threshold 4096 1664 1672 9 0.6 1587.1 1.5X +split threshold 8192 2126 2131 6 0.5 2027.4 1.2X +split threshold 65536 21950 22285 234 0.0 20932.7 0.1X diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 05f906206e5e2..59d798e6e62fe 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml 
@@ -109,7 +109,7 @@ ${orc.classifier} - org.apache.hive + ${hive.group} hive-storage-api diff --git a/sql/core/src/main/java/org/apache/parquet/filter2/predicate/SparkFilterApi.java b/sql/core/src/main/java/org/apache/parquet/filter2/predicate/SparkFilterApi.java index 884042c824046..f25ae38437014 100644 --- a/sql/core/src/main/java/org/apache/parquet/filter2/predicate/SparkFilterApi.java +++ b/sql/core/src/main/java/org/apache/parquet/filter2/predicate/SparkFilterApi.java @@ -1,20 +1,18 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package org.apache.parquet.filter2.predicate; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 2bb0b02d4c9c4..1882d990bef55 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -218,7 +218,9 @@ public byte[] getBytes(int rowId, int count) { Platform.copyMemory(null, data + rowId, array, Platform.BYTE_ARRAY_OFFSET, count); } else { for (int i = 0; i < count; i++) { - array[i] = (byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = (byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -279,7 +281,9 @@ public short[] getShorts(int rowId, int count) { Platform.copyMemory(null, data + rowId * 2L, array, Platform.SHORT_ARRAY_OFFSET, count * 2L); } else { for (int i = 0; i < count; i++) { - array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -345,7 +349,9 @@ public int[] getInts(int rowId, int count) { Platform.copyMemory(null, data + rowId * 4L, array, Platform.INT_ARRAY_OFFSET, count * 4L); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -423,7 +429,9 @@ public long[] getLongs(int rowId, int count) { Platform.copyMemory(null, data + rowId * 8L, array, Platform.LONG_ARRAY_OFFSET, count * 8L); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToLong(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToLong(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -487,7 +495,9 @@ public float[] getFloats(int rowId, int count) { Platform.copyMemory(null, data + rowId * 4L, array, Platform.FLOAT_ARRAY_OFFSET, count * 4L); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToFloat(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToFloat(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -553,7 +563,9 @@ public double[] getDoubles(int rowId, int count) { count * 8L); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToDouble(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToDouble(dictionaryIds.getDictId(rowId + i)); + } } } return array; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 2bf2b8d08fcea..1908b511269a6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -216,7 +216,9 @@ public byte[] getBytes(int rowId, int count) { System.arraycopy(byteData, rowId, array, 0, count); } else { for (int i = 0; i < count; i++) { - array[i] = (byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = 
(byte) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -276,7 +278,9 @@ public short[] getShorts(int rowId, int count) { System.arraycopy(shortData, rowId, array, 0, count); } else { for (int i = 0; i < count; i++) { - array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = (short) dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -337,7 +341,9 @@ public int[] getInts(int rowId, int count) { System.arraycopy(intData, rowId, array, 0, count); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToInt(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -409,7 +415,9 @@ public long[] getLongs(int rowId, int count) { System.arraycopy(longData, rowId, array, 0, count); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToLong(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToLong(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -466,7 +474,9 @@ public float[] getFloats(int rowId, int count) { System.arraycopy(floatData, rowId, array, 0, count); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToFloat(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToFloat(dictionaryIds.getDictId(rowId + i)); + } } } return array; @@ -525,7 +535,9 @@ public double[] getDoubles(int rowId, int count) { System.arraycopy(doubleData, rowId, array, 0, count); } else { for (int i = 0; i < count; i++) { - array[i] = dictionary.decodeToDouble(dictionaryIds.getDictId(rowId + i)); + if (!isNullAt(rowId + i)) { + array[i] = dictionary.decodeToDouble(dictionaryIds.getDictId(rowId + i)); + } } } return array; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 22c09c51c2376..2f383f45f1f2e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql import scala.jdk.CollectionConverters._ import org.apache.spark.annotation.Stable -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{LEFT_EXPR, RIGHT_EXPR} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions._ @@ -171,29 +172,6 @@ class Column(val expr: Expression) extends Logging { Column.fn(name, this, lit(other)) } - /** - * A version of the `fn` method specifically designed for binary operations in PySpark - * that require logging information. - * This method is used when the operation involves another Column. - * - * @param name The name of the operation to be performed. - * @param other The value to be used in the operation, which will be converted to a - * Column if not already one. - * @param pysparkFragment A string representing the 'fragment' of the PySpark error context, - * typically indicates the name of PySpark function. - * @param pysparkCallSite A string representing the 'callSite' of the PySpark error context, - * providing the exact location within the PySpark code where the - * operation originated. 
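
The OffHeapColumnVector and OnHeapColumnVector hunks above wrap every dictionary decode in the batch getters (getBytes, getShorts, getInts, getLongs, getFloats, getDoubles) in an isNullAt guard, so null slots are no longer decoded. Below is a minimal, self-contained Scala sketch of that guard pattern; the helper name and the functional parameters are hypothetical stand-ins for the vector's own isNullAt, dictionary-id lookup, and dictionary decode, not part of this patch.

```scala
// Hypothetical sketch of the guarded batch-get loop introduced in this patch:
// for a dictionary-encoded vector, a null slot carries no valid dictionary id,
// so it is skipped and the output array keeps its zero default.
object GuardedBatchGet {
  def getIntsGuarded(
      rowId: Int,
      count: Int,
      isNullAt: Int => Boolean,     // stand-in for the vector's isNullAt
      dictId: Int => Int,           // stand-in for dictionaryIds.getDictId
      decodeToInt: Int => Int       // stand-in for dictionary.decodeToInt
  ): Array[Int] = {
    val array = new Array[Int](count)
    var i = 0
    while (i < count) {
      if (!isNullAt(rowId + i)) {
        array(i) = decodeToInt(dictId(rowId + i))
      }
      i += 1
    }
    array
  }

  def main(args: Array[String]): Unit = {
    // Slot 1 is null; its dictionary id is garbage and must not be decoded.
    val nulls = Array(false, true, false)
    val ids = Array(2, -1, 0)
    val dict = Array(10, 20, 30)
    val result = getIntsGuarded(0, 3, nulls(_), ids(_), dict(_))
    println(result.mkString(", ")) // 30, 0, 10
  }
}
```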
- * @return A Column resulting from the operation. - */ - private def fn( - name: String, other: Any, pysparkFragment: String, pysparkCallSite: String): Column = { - val tupleInfo = (pysparkFragment, pysparkCallSite) - withOrigin(Some(tupleInfo)) { - Column.fn(name, this, lit(other)) - } - } - override def toString: String = toPrettySQL(expr) override def equals(that: Any): Boolean = that match { @@ -310,8 +288,9 @@ class Column(val expr: Expression) extends Logging { val right = lit(other).expr if (this.expr == right) { logWarning( - s"Constructing trivially true equals predicate, '${this.expr} = $right'. " + - "Perhaps you need to use aliases.") + log"Constructing trivially true equals predicate, " + + log"'${MDC(LEFT_EXPR, this.expr)} = ${MDC(RIGHT_EXPR, right)}'. " + + log"Perhaps you need to use aliases.") } fn("=", other) } @@ -516,8 +495,9 @@ class Column(val expr: Expression) extends Logging { val right = lit(other).expr if (this.expr == right) { logWarning( - s"Constructing trivially true equals predicate, '${this.expr} <=> $right'. " + - "Perhaps you need to use aliases.") + log"Constructing trivially true equals predicate, " + + log"'${MDC(LEFT_EXPR, this.expr)} <=> ${MDC(RIGHT_EXPR, right)}'. " + + log"Perhaps you need to use aliases.") } fn("<=>", other) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 9992d8cbba076..9d7a765a24c92 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -77,6 +77,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { if (schema != null) { val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] this.userSpecifiedSchema = Option(replaced) + validateSingleVariantColumn() } this } @@ -106,6 +107,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { */ def option(key: String, value: String): DataFrameReader = { this.extraOptions = this.extraOptions + (key -> value) + validateSingleVariantColumn() this } @@ -149,6 +151,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { */ def options(options: scala.collection.Map[String, String]): DataFrameReader = { this.extraOptions ++= options + validateSingleVariantColumn() this } @@ -766,6 +769,17 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } } + /** + * Ensure that the `singleVariantColumn` option cannot be used if there is also a user specified + * schema. 
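
For the DataFrameReader change above (the validateSingleVariantColumn helper, whose body follows below), here is a hedged usage-level sketch of the behaviour it enforces: setting both a user-specified schema and the single-variant-column option is rejected as soon as both are present on the read builder. The option key spelling, the local session setup, and the expectation of an AnalysisException are illustrative assumptions; the diff itself only references the constant JSONOptions.SINGLE_VARIANT_COLUMN and the error helper QueryCompilationErrors.invalidSingleVariantColumn.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object SingleVariantColumnGuardExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")                    // illustrative local session
      .appName("single-variant-column-guard")
      .getOrCreate()

    val userSchema = StructType(Seq(StructField("v", StringType)))

    try {
      // With this patch, the second builder call trips the new validation,
      // because a user schema and singleVariantColumn are mutually exclusive.
      spark.read
        .schema(userSchema)
        .option("singleVariantColumn", "v")  // assumed option key spelling
        .json("/tmp/does-not-matter.json")
    } catch {
      case e: AnalysisException =>
        println(s"Rejected as expected: ${e.getMessage}")
    } finally {
      spark.stop()
    }
  }
}
```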
+ */ + private def validateSingleVariantColumn(): Unit = { + if (extraOptions.get(JSONOptions.SINGLE_VARIANT_COLUMN).isDefined && + userSpecifiedSchema.isDefined) { + throw QueryCompilationErrors.invalidSingleVariantColumn() + } + } + /////////////////////////////////////////////////////////////////////////////////////// // Builder pattern config options /////////////////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala index 63cee8861c5a4..8ffdbb952b082 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.internal.SQLConf * Use `SparkSession.dataSource` to access this. */ @Evolving -private[sql] class DataSourceRegistration private[sql] (dataSourceManager: DataSourceManager) +class DataSourceRegistration private[sql] (dataSourceManager: DataSourceManager) extends Logging { protected[sql] def registerPython( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index c29fd968fc195..c7511737b2b3f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -95,10 +95,26 @@ private[sql] object Dataset { new Dataset[Row](qe, ExpressionEncoder(qe.analyzed.schema)) } + def ofRows( + sparkSession: SparkSession, + logicalPlan: LogicalPlan, + shuffleCleanupMode: ShuffleCleanupMode): DataFrame = + sparkSession.withActive { + val qe = new QueryExecution( + sparkSession, logicalPlan, shuffleCleanupMode = shuffleCleanupMode) + qe.assertAnalyzed() + new Dataset[Row](qe, ExpressionEncoder(qe.analyzed.schema)) + } + /** A variant of ofRows that allows passing in a tracker so we can track query parsing time. */ - def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan, tracker: QueryPlanningTracker) + def ofRows( + sparkSession: SparkSession, + logicalPlan: LogicalPlan, + tracker: QueryPlanningTracker, + shuffleCleanupMode: ShuffleCleanupMode = DoNotCleanup) : DataFrame = sparkSession.withActive { - val qe = new QueryExecution(sparkSession, logicalPlan, tracker) + val qe = new QueryExecution( + sparkSession, logicalPlan, tracker, shuffleCleanupMode = shuffleCleanupMode) qe.assertAnalyzed() new Dataset[Row](qe, ExpressionEncoder(qe.analyzed.schema)) } @@ -738,7 +754,7 @@ class Dataset[T] private[sql]( * checkpoint directory. 
If false creates a local checkpoint using * the caching subsystem */ - private def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T] = { + private[sql] def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T] = { val actionName = if (reliableCheckpoint) "checkpoint" else "localCheckpoint" withAction(actionName, queryExecution) { physicalPlan => val internalRdd = physicalPlan.execute().map(_.copy()) @@ -3888,8 +3904,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def unpersist(blocking: Boolean): this.type = { - sparkSession.sharedState.cacheManager.uncacheQuery( - sparkSession, logicalPlan, cascade = false, blocking) + sparkSession.sharedState.cacheManager.uncacheQuery(this, cascade = false, blocking) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 862268eba6664..52ab633cd75a7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql import scala.jdk.CollectionConverters._ import org.apache.spark.api.java.function._ +import org.apache.spark.sql.catalyst.analysis.{EliminateEventTimeWatermark, UnresolvedAttribute} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.logical._ @@ -676,6 +677,44 @@ class KeyValueGroupedDataset[K, V] private[sql]( ) } + /** + * (Scala-specific) + * Invokes methods defined in the stateful processor used in arbitrary state API v2. + * We allow the user to act on per-group set of input rows along with keyed state and the + * user can choose to output/return 0 or more rows. + * For a streaming dataframe, we will repeatedly invoke the interface methods for new rows + * in each trigger and the user's state/state variables will be stored persistently across + * invocations. + * + * Downstream operators would use specified eventTimeColumnName to calculate watermark. + * Note that TimeMode is set to EventTime to ensure correct flow of watermark. + * + * @tparam U The type of the output objects. Must be encodable to Spark SQL types. + * @param statefulProcessor Instance of statefulProcessor whose functions will + * be invoked by the operator. + * @param eventTimeColumnName eventTime column in the output dataset. Any operations after + * transformWithState will use the new eventTimeColumn. The user + * needs to ensure that the eventTime for emitted output adheres to + * the watermark boundary, otherwise streaming query will fail. + * @param outputMode The output mode of the stateful processor. + * + * See [[Encoder]] for more details on what types are encodable to Spark SQL. + */ + private[sql] def transformWithState[U: Encoder]( + statefulProcessor: StatefulProcessor[K, V, U], + eventTimeColumnName: String, + outputMode: OutputMode): Dataset[U] = { + val transformWithState = TransformWithState[K, V, U]( + groupingAttributes, + dataAttributes, + statefulProcessor, + TimeMode.EventTime(), + outputMode, + child = logicalPlan + ) + updateEventTimeColumnAfterTransformWithState(transformWithState, eventTimeColumnName) + } + /** * (Java-specific) * Invokes methods defined in the stateful processor used in arbitrary state API v2. 
@@ -702,6 +741,39 @@ class KeyValueGroupedDataset[K, V] private[sql]( transformWithState(statefulProcessor, timeMode, outputMode)(outputEncoder) } + /** + * (Java-specific) + * Invokes methods defined in the stateful processor used in arbitrary state API v2. + * We allow the user to act on per-group set of input rows along with keyed state and the + * user can choose to output/return 0 or more rows. + * + * For a streaming dataframe, we will repeatedly invoke the interface methods for new rows + * in each trigger and the user's state/state variables will be stored persistently across + * invocations. + * + * Downstream operators would use specified eventTimeColumnName to calculate watermark. + * Note that TimeMode is set to EventTime to ensure correct flow of watermark. + * + * @tparam U The type of the output objects. Must be encodable to Spark SQL types. + * @param statefulProcessor Instance of statefulProcessor whose functions will be invoked by the + * operator. + * @param eventTimeColumnName eventTime column in the output dataset. Any operations after + * transformWithState will use the new eventTimeColumn. The user + * needs to ensure that the eventTime for emitted output adheres to + * the watermark boundary, otherwise streaming query will fail. + * @param outputMode The output mode of the stateful processor. + * @param outputEncoder Encoder for the output type. + * + * See [[Encoder]] for more details on what types are encodable to Spark SQL. + */ + private[sql] def transformWithState[U: Encoder]( + statefulProcessor: StatefulProcessor[K, V, U], + eventTimeColumnName: String, + outputMode: OutputMode, + outputEncoder: Encoder[U]): Dataset[U] = { + transformWithState(statefulProcessor, eventTimeColumnName, outputMode)(outputEncoder) + } + /** * (Scala-specific) * Invokes methods defined in the stateful processor used in arbitrary state API v2. @@ -739,19 +811,98 @@ class KeyValueGroupedDataset[K, V] private[sql]( ) } + /** + * (Scala-specific) + * Invokes methods defined in the stateful processor used in arbitrary state API v2. + * Functions as the function above, but with additional eventTimeColumnName for output. + * + * @tparam U The type of the output objects. Must be encodable to Spark SQL types. + * @tparam S The type of initial state objects. Must be encodable to Spark SQL types. + * + * Downstream operators would use specified eventTimeColumnName to calculate watermark. + * Note that TimeMode is set to EventTime to ensure correct flow of watermark. + * + * @param statefulProcessor Instance of statefulProcessor whose functions will + * be invoked by the operator. + * @param eventTimeColumnName eventTime column in the output dataset. Any operations after + * transformWithState will use the new eventTimeColumn. The user + * needs to ensure that the eventTime for emitted output adheres to + * the watermark boundary, otherwise streaming query will fail. + * @param outputMode The output mode of the stateful processor. + * @param initialState User provided initial state that will be used to initiate state for + * the query in the first batch. + * + * See [[Encoder]] for more details on what types are encodable to Spark SQL. 
+ */ + private[sql] def transformWithState[U: Encoder, S: Encoder]( + statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S], + eventTimeColumnName: String, + outputMode: OutputMode, + initialState: KeyValueGroupedDataset[K, S]): Dataset[U] = { + val transformWithState = TransformWithState[K, V, U, S]( + groupingAttributes, + dataAttributes, + statefulProcessor, + TimeMode.EventTime(), + outputMode, + child = logicalPlan, + initialState.groupingAttributes, + initialState.dataAttributes, + initialState.queryExecution.analyzed + ) + + updateEventTimeColumnAfterTransformWithState(transformWithState, eventTimeColumnName) + } + /** * (Java-specific) * Invokes methods defined in the stateful processor used in arbitrary state API v2. - * Functions as the function above, but with additional initial state. + * Functions as the function above, but with additional initialStateEncoder for state encoding. + * + * @tparam U The type of the output objects. Must be encodable to Spark SQL types. + * @tparam S The type of initial state objects. Must be encodable to Spark SQL types. + * @param statefulProcessor Instance of statefulProcessor whose functions will + * be invoked by the operator. + * @param timeMode The time mode semantics of the stateful processor for + * timers and TTL. + * @param outputMode The output mode of the stateful processor. + * @param initialState User provided initial state that will be used to initiate state for + * the query in the first batch. + * @param outputEncoder Encoder for the output type. + * @param initialStateEncoder Encoder for the initial state type. + * + * See [[Encoder]] for more details on what types are encodable to Spark SQL. + */ + private[sql] def transformWithState[U: Encoder, S: Encoder]( + statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S], + timeMode: TimeMode, + outputMode: OutputMode, + initialState: KeyValueGroupedDataset[K, S], + outputEncoder: Encoder[U], + initialStateEncoder: Encoder[S]): Dataset[U] = { + transformWithState(statefulProcessor, timeMode, + outputMode, initialState)(outputEncoder, initialStateEncoder) + } + + /** + * (Java-specific) + * Invokes methods defined in the stateful processor used in arbitrary state API v2. + * Functions as the function above, but with additional eventTimeColumnName for output. + * + * Downstream operators would use specified eventTimeColumnName to calculate watermark. + * Note that TimeMode is set to EventTime to ensure correct flow of watermark. * * @tparam U The type of the output objects. Must be encodable to Spark SQL types. * @tparam S The type of initial state objects. Must be encodable to Spark SQL types. * @param statefulProcessor Instance of statefulProcessor whose functions will * be invoked by the operator. - * @param timeMode The time mode semantics of the stateful processor for timers and TTL. * @param outputMode The output mode of the stateful processor. * @param initialState User provided initial state that will be used to initiate state for * the query in the first batch. + * @param eventTimeColumnName event column in the output dataset. Any operations after + * transformWithState will use the new eventTimeColumn. The user + * needs to ensure that the eventTime for emitted output adheres to + * the watermark boundary, otherwise streaming query will fail. * @param outputEncoder Encoder for the output type. * @param initialStateEncoder Encoder for the initial state type. 
* @@ -759,15 +910,34 @@ class KeyValueGroupedDataset[K, V] private[sql]( */ private[sql] def transformWithState[U: Encoder, S: Encoder]( statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S], - timeMode: TimeMode, outputMode: OutputMode, initialState: KeyValueGroupedDataset[K, S], + eventTimeColumnName: String, outputEncoder: Encoder[U], initialStateEncoder: Encoder[S]): Dataset[U] = { - transformWithState(statefulProcessor, timeMode, + transformWithState(statefulProcessor, eventTimeColumnName, outputMode, initialState)(outputEncoder, initialStateEncoder) } + /** + * Creates a new dataset with updated eventTimeColumn after the transformWithState + * logical node. + */ + private def updateEventTimeColumnAfterTransformWithState[U: Encoder]( + transformWithState: LogicalPlan, + eventTimeColumnName: String): Dataset[U] = { + val transformWithStateDataset = Dataset[U]( + sparkSession, + transformWithState + ) + + Dataset[U](sparkSession, EliminateEventTimeWatermark( + UpdateEventTimeWatermarkColumn( + UnresolvedAttribute(eventTimeColumnName), + None, + transformWithStateDataset.logicalPlan))) + } + /** * (Scala-specific) * Reduces the elements of each group of data using the specified binary function. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/MergeIntoWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/MergeIntoWriter.scala index ca04b9bfc55f0..b7f9c96f82e04 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/MergeIntoWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/MergeIntoWriter.scala @@ -32,11 +32,17 @@ import org.apache.spark.sql.functions.expr * @param table the name of the target table for the merge operation. * @param ds the source Dataset to merge into the target table. * @param on the merge condition. + * @param schemaEvolutionEnabled whether to enable automatic schema evolution for this merge + * operation. Default is `false`. * * @since 4.0.0 */ @Experimental -class MergeIntoWriter[T] private[sql] (table: String, ds: Dataset[T], on: Column) { +class MergeIntoWriter[T] private[sql] ( + table: String, + ds: Dataset[T], + on: Column, + private[sql] val schemaEvolutionEnabled: Boolean = false) { private val df: DataFrame = ds.toDF() @@ -160,6 +166,17 @@ class MergeIntoWriter[T] private[sql] (table: String, ds: Dataset[T], on: Column new WhenNotMatchedBySource[T](this, Some(condition.expr)) } + /** + * Enable automatic schema evolution for this merge operation. + * @return A `MergeIntoWriter` instance with schema evolution enabled. + */ + def withSchemaEvolution(): MergeIntoWriter[T] = { + new MergeIntoWriter[T](this.table, this.ds, this.on, schemaEvolutionEnabled = true) + .withNewMatchedActions(this.matchedActions: _*) + .withNewNotMatchedActions(this.notMatchedActions: _*) + .withNewNotMatchedBySourceActions(this.notMatchedBySourceActions: _*) + } + /** * Executes the merge operation. 
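A usage sketch for the new withSchemaEvolution() step (assuming the Dataset.mergeInto entry point; the table name `target` and the source alias are illustrative):

import org.apache.spark.sql.functions.expr

sourceDf.alias("source")
  .mergeInto("target", expr("target.id = source.id"))
  .whenMatched().updateAll()
  .whenNotMatched().insertAll()
  .withSchemaEvolution()   // returns a new writer with the flag set and the actions above carried over
  .merge()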
*/ @@ -176,23 +193,24 @@ class MergeIntoWriter[T] private[sql] (table: String, ds: Dataset[T], on: Column on.expr, matchedActions, notMatchedActions, - notMatchedBySourceActions) + notMatchedBySourceActions, + schemaEvolutionEnabled) val qe = sparkSession.sessionState.executePlan(merge) qe.assertCommandExecuted() } - private[sql] def withNewMatchedAction(action: MergeAction): MergeIntoWriter[T] = { - this.matchedActions = this.matchedActions :+ action + private[sql] def withNewMatchedActions(actions: MergeAction*): MergeIntoWriter[T] = { + this.matchedActions ++= actions this } - private[sql] def withNewNotMatchedAction(action: MergeAction): MergeIntoWriter[T] = { - this.notMatchedActions = this.notMatchedActions :+ action + private[sql] def withNewNotMatchedActions(actions: MergeAction*): MergeIntoWriter[T] = { + this.notMatchedActions ++= actions this } - private[sql] def withNewNotMatchedBySourceAction(action: MergeAction): MergeIntoWriter[T] = { - this.notMatchedBySourceActions = this.notMatchedBySourceActions :+ action + private[sql] def withNewNotMatchedBySourceActions(actions: MergeAction*): MergeIntoWriter[T] = { + this.notMatchedBySourceActions ++= actions this } } @@ -219,7 +237,7 @@ case class WhenMatched[T] private[sql]( * @return The MergeIntoWriter instance with the update all action configured. */ def updateAll(): MergeIntoWriter[T] = { - mergeIntoWriter.withNewMatchedAction(UpdateStarAction(condition)) + mergeIntoWriter.withNewMatchedActions(UpdateStarAction(condition)) } /** @@ -230,7 +248,7 @@ case class WhenMatched[T] private[sql]( * @return The MergeIntoWriter instance with the update action configured. */ def update(map: Map[String, Column]): MergeIntoWriter[T] = { - mergeIntoWriter.withNewMatchedAction( + mergeIntoWriter.withNewMatchedActions( UpdateAction(condition, map.map(x => Assignment(expr(x._1).expr, x._2.expr)).toSeq)) } @@ -240,7 +258,7 @@ case class WhenMatched[T] private[sql]( * @return The MergeIntoWriter instance with the delete action configured. */ def delete(): MergeIntoWriter[T] = { - mergeIntoWriter.withNewMatchedAction(DeleteAction(condition)) + mergeIntoWriter.withNewMatchedActions(DeleteAction(condition)) } } @@ -266,7 +284,7 @@ case class WhenNotMatched[T] private[sql]( * @return The MergeIntoWriter instance with the insert all action configured. */ def insertAll(): MergeIntoWriter[T] = { - mergeIntoWriter.withNewNotMatchedAction(InsertStarAction(condition)) + mergeIntoWriter.withNewNotMatchedActions(InsertStarAction(condition)) } /** @@ -277,7 +295,7 @@ case class WhenNotMatched[T] private[sql]( * @return The MergeIntoWriter instance with the insert action configured. */ def insert(map: Map[String, Column]): MergeIntoWriter[T] = { - mergeIntoWriter.withNewNotMatchedAction( + mergeIntoWriter.withNewNotMatchedActions( InsertAction(condition, map.map(x => Assignment(expr(x._1).expr, x._2.expr)).toSeq)) } } @@ -302,7 +320,7 @@ case class WhenNotMatchedBySource[T] private[sql]( * @return The MergeIntoWriter instance with the update all action configured. */ def updateAll(): MergeIntoWriter[T] = { - mergeIntoWriter.withNewNotMatchedBySourceAction(UpdateStarAction(condition)) + mergeIntoWriter.withNewNotMatchedBySourceActions(UpdateStarAction(condition)) } /** @@ -313,7 +331,7 @@ case class WhenNotMatchedBySource[T] private[sql]( * @return The MergeIntoWriter instance with the update action configured. 
*/ def update(map: Map[String, Column]): MergeIntoWriter[T] = { - mergeIntoWriter.withNewNotMatchedBySourceAction( + mergeIntoWriter.withNewNotMatchedBySourceActions( UpdateAction(condition, map.map(x => Assignment(expr(x._1).expr, x._2.expr)).toSeq)) } @@ -324,6 +342,6 @@ case class WhenNotMatchedBySource[T] private[sql]( * @return The MergeIntoWriter instance with the delete action configured. */ def delete(): MergeIntoWriter[T] = { - mergeIntoWriter.withNewNotMatchedBySourceAction(DeleteAction(condition)) + mergeIntoWriter.withNewNotMatchedBySourceActions(DeleteAction(condition)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala b/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala index 104e7c101fd1c..30d5943c60922 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql import java.util.UUID -import scala.jdk.CollectionConverters.MapHasAsJava - import org.apache.spark.sql.catalyst.plans.logical.CollectMetrics import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener @@ -47,9 +45,7 @@ import org.apache.spark.util.ArrayImplicits._ * @param name name of the metric * @since 3.3.0 */ -class Observation(val name: String) { - - if (name.isEmpty) throw new IllegalArgumentException("Name must not be empty") +class Observation(name: String) extends ObservationBase(name) { /** * Create an Observation instance without providing a name. This generates a random name. @@ -60,8 +56,6 @@ class Observation(val name: String) { @volatile private var dataframeId: Option[(SparkSession, Long)] = None - @volatile private var metrics: Option[Map[String, Any]] = None - /** * Attach this observation to the given [[Dataset]] to observe aggregation expressions. * @@ -83,55 +77,6 @@ class Observation(val name: String) { ds.observe(name, expr, exprs: _*) } - /** - * (Scala-specific) Get the observed metrics. This waits for the observed dataset to finish - * its first action. Only the result of the first action is available. Subsequent actions do not - * modify the result. - * - * @return the observed metrics as a `Map[String, Any]` - * @throws InterruptedException interrupted while waiting - */ - @throws[InterruptedException] - def get: Map[String, _] = { - synchronized { - // we need to loop as wait might return without us calling notify - // https://en.wikipedia.org/w/index.php?title=Spurious_wakeup&oldid=992601610 - while (this.metrics.isEmpty) { - wait() - } - } - - this.metrics.get - } - - /** - * (Java-specific) Get the observed metrics. This waits for the observed dataset to finish - * its first action. Only the result of the first action is available. Subsequent actions do not - * modify the result. - * - * @return the observed metrics as a `java.util.Map[String, Object]` - * @throws InterruptedException interrupted while waiting - */ - @throws[InterruptedException] - def getAsJava: java.util.Map[String, AnyRef] = { - get.map { case (key, value) => (key, value.asInstanceOf[Object])}.asJava - } - - /** - * Get the observed metrics. This returns the metrics if they are available, otherwise an empty. 
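Assuming the Observation refactoring above is behavior-preserving for callers, a minimal usage sketch of the public API (column names are illustrative):

import org.apache.spark.sql.Observation
import org.apache.spark.sql.functions._

val observation = Observation("stats")
val observed = df.observe(observation, count(lit(1)).as("rows"), max(col("value")).as("maxValue"))
observed.collect()                   // the first action on `observed` materializes the metrics
val metrics = observation.get        // blocks until that first action finishes
val rowCount = metrics("rows")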
- * - * @return the observed metrics as a `Map[String, Any]` - */ - @throws[InterruptedException] - private[sql] def getOrEmpty: Map[String, _] = { - synchronized { - if (metrics.isEmpty) { - wait(100) // Wait for 100ms to see if metrics are available - } - metrics.getOrElse(Map.empty) - } - } - private[sql] def register(sparkSession: SparkSession, dataframeId: Long): Unit = { // makes this class thread-safe: // only the first thread entering this block can set sparkSession @@ -158,9 +103,8 @@ class Observation(val name: String) { case _ => false }) { val row = qe.observedMetrics.get(name) - this.metrics = row.map(r => r.getValuesMap[Any](r.schema.fieldNames.toImmutableArraySeq)) - if (metrics.isDefined) { - notifyAll() + val metrics = row.map(r => r.getValuesMap[Any](r.schema.fieldNames.toImmutableArraySeq)) + if (setMetricsAndNotify(metrics)) { unregister() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index d257a6b771b93..56f13994277d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -130,54 +130,63 @@ abstract class SQLImplicits extends LowPrioritySQLImplicits { * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newIntSeqEncoder: Encoder[Seq[Int]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newLongSeqEncoder: Encoder[Seq[Long]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newDoubleSeqEncoder: Encoder[Seq[Double]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newFloatSeqEncoder: Encoder[Seq[Float]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newByteSeqEncoder: Encoder[Seq[Byte]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newShortSeqEncoder: Encoder[Seq[Short]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newBooleanSeqEncoder: Encoder[Seq[Boolean]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newStringSeqEncoder: Encoder[Seq[String]] = ExpressionEncoder() /** * @since 1.6.1 * @deprecated use [[newSequenceEncoder]] */ + @deprecated("Use newSequenceEncoder instead", "2.2.0") def newProductSeqEncoder[A <: Product : TypeTag]: Encoder[Seq[A]] = ExpressionEncoder() /** @since 2.2.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 15eeca87dcf65..466e4cf813185 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -29,7 +29,8 @@ import scala.util.control.NonFatal import org.apache.spark.{SPARK_VERSION, SparkConf, SparkContext, SparkException, TaskContext} import org.apache.spark.annotation.{DeveloperApi, Experimental, Stable, Unstable} 
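The next several hunks (SparkSession, PythonSQLUtils, SQLUtils, and others below) migrate plain string interpolation to the structured-logging `log` interpolator with MDC keys; a condensed sketch of that pattern (hypothetical Example class, existing CONFIG key from LogKeys):

import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys.CONFIG

class Example extends Logging {
  def warnDeprecated(key: String): Unit = {
    // Before: logWarning(s"Config $key is deprecated.")
    logWarning(log"Config ${MDC(CONFIG, key)} is deprecated.")
  }
}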
import org.apache.spark.api.java.JavaRDD -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CALL_SITE_LONG_FORM, CLASS_NAME} import org.apache.spark.internal.config.{ConfigEntry, EXECUTOR_ALLOW_SPARK_CONTEXT} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} @@ -232,8 +233,12 @@ class SparkSession private( /** * A collection of methods for registering user-defined data sources. + * + * @since 4.0.0 */ - private[sql] def dataSource: DataSourceRegistration = sessionState.dataSourceRegistration + @Experimental + @Unstable + def dataSource: DataSourceRegistration = sessionState.dataSourceRegistration /** * Returns a `StreamingQueryManager` that allows managing all the @@ -1358,13 +1363,13 @@ object SparkSession extends Logging { val session = getActiveSession.orElse(getDefaultSession) if (session.isDefined) { logWarning( - s"""An existing Spark session exists as the active or default session. - |This probably means another suite leaked it. Attempting to stop it before continuing. - |This existing Spark session was created at: - | - |${session.get.creationSite.longForm} - | - """.stripMargin) + log"""An existing Spark session exists as the active or default session. + |This probably means another suite leaked it. Attempting to stop it before continuing. + |This existing Spark session was created at: + | + |${MDC(CALL_SITE_LONG_FORM, session.get.creationSite.longForm)} + | + """.stripMargin) session.get.stop() SparkSession.clearActiveSession() SparkSession.clearDefaultSession() @@ -1391,7 +1396,8 @@ object SparkSession extends Logging { case e@(_: ClassCastException | _: ClassNotFoundException | _: NoClassDefFoundError) => - logWarning(s"Cannot use $extensionConfClassName to configure session extensions.", e) + logWarning(log"Cannot use ${MDC(CLASS_NAME, extensionConfClassName)} to configure " + + log"session extensions.", e) } } extensions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index 62e6cc07b3e92..eb8c1d65a8b53 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -26,7 +26,8 @@ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.SparkException import org.apache.spark.api.python.DechunkedInputStream -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.CLASS_LOADER import org.apache.spark.security.SocketAuthServer import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} @@ -136,8 +137,8 @@ private[sql] object PythonSQLUtils extends Logging { def addJarToCurrentClassLoader(path: String): Unit = { Utils.getContextOrSparkClassLoader match { case cl: MutableURLClassLoader => cl.addURL(Utils.resolveURI(path).toURL) - case cl => logWarning( - s"Unsupported class loader $cl will not update jars in the thread class loader.") + case cl => logWarning(log"Unsupported class loader ${MDC(CLASS_LOADER, cl)} will not " + + log"update jars in the thread class loader.") } } @@ -164,10 +165,6 @@ private[sql] object PythonSQLUtils extends Logging { } } - def timestampDiff(unit: String, start: Column, end: Column): Column = { - 
Column(TimestampDiff(unit, start.expr, end.expr)) - } - def pandasProduct(e: Column, ignoreNA: Boolean): Column = { Column(PandasProduct(e.expr, ignoreNA).toAggregateExpression(false)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index 97b701b7380d5..ecbc57f25ad44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -27,7 +27,8 @@ import org.apache.spark.TaskContext import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.r.SerDe import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.CONFIG import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericRowWithSchema, Literal} @@ -58,9 +59,9 @@ private[sql] object SQLUtils extends Logging { SparkSession.builder().enableHiveSupport().sparkContext(jsc.sc).getOrCreate() } else { if (enableHiveSupport) { - logWarning("SparkR: enableHiveSupport is requested for SparkSession but " + - s"Spark is not built with Hive or ${CATALOG_IMPLEMENTATION.key} is not set to " + - "'hive', falling back to without Hive support.") + logWarning(log"SparkR: enableHiveSupport is requested for SparkSession but " + + log"Spark is not built with Hive or ${MDC(CONFIG, CATALOG_IMPLEMENTATION.key)} " + + log"is not set to 'hive', falling back to without Hive support.") } SparkSession.builder().sparkContext(jsc.sc).getOrCreate() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 36221d728066e..169aad2f234d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.commons.lang3.StringUtils import org.apache.spark.SparkException +import org.apache.spark.internal.LogKeys.CONFIG +import org.apache.spark.internal.MDC import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, CatalogUtils, ClusterBySpec} @@ -379,8 +381,11 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) case AlterViewAs(ResolvedViewIdentifier(ident), originalText, query) => AlterViewAsCommand(ident, originalText, query) + case AlterViewSchemaBinding(ResolvedViewIdentifier(ident), viewSchemaMode) => + AlterViewSchemaBindingCommand(ident, viewSchemaMode) + case CreateView(ResolvedV1Identifier(ident), userSpecifiedColumns, comment, - properties, originalText, child, allowExisting, replace) => + properties, originalText, child, allowExisting, replace, viewSchemaMode) => CreateViewCommand( name = ident, userSpecifiedColumns = userSpecifiedColumns, @@ -390,9 +395,10 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) plan = child, allowExisting = allowExisting, replace = replace, - viewType = PersistedView) + viewType = PersistedView, + viewSchemaMode = viewSchemaMode) - case CreateView(ResolvedIdentifier(catalog, _), _, _, _, _, _, _, _) => + 
case CreateView(ResolvedIdentifier(catalog, _), _, _, _, _, _, _, _, _) => throw QueryCompilationErrors.missingCatalogAbilityError(catalog, "views") case ShowViews(ns: ResolvedNamespace, pattern, output) => @@ -524,9 +530,10 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) if (!createHiveTableByDefault || (ctas && conf.convertCTAS)) { (nonHiveStorageFormat, conf.defaultDataSourceName) } else { - logWarning("A Hive serde table will be created as there is no table provider " + - s"specified. You can set ${SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key} to false " + - "so that native data source table will be created instead.") + logWarning(log"A Hive serde table will be created as there is no table provider " + + log"specified. You can set " + + log"${MDC(CONFIG, SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key)} to false so that " + + log"native data source table will be created instead.") (defaultHiveStorage, DDLUtils.HIVE_PROVIDER) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala new file mode 100644 index 0000000000000..8ae0341e5646c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.catalog.UserDefinedFunction._ +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.types.{DataType, StructType} + +/** + * Represent a SQL function. 
+ * + * @param name qualified name of the SQL function + * @param inputParam function input parameters + * @param returnType function return type + * @param exprText function body as an expression + * @param queryText function body as a query + * @param comment function comment + * @param deterministic whether the function is deterministic + * @param containsSQL whether the function has data access routine to be CONTAINS SQL + * @param isTableFunc whether the function is a table function + * @param properties additional properties to be serialized for the SQL function + * @param owner owner of the function + * @param createTimeMs function creation time in milliseconds + */ +case class SQLFunction( + name: FunctionIdentifier, + inputParam: Option[StructType], + returnType: Either[DataType, StructType], + exprText: Option[String], + queryText: Option[String], + comment: Option[String], + deterministic: Option[Boolean], + containsSQL: Option[Boolean], + isTableFunc: Boolean, + properties: Map[String, String], + owner: Option[String] = None, + createTimeMs: Long = System.currentTimeMillis) extends UserDefinedFunction { + + assert(exprText.nonEmpty || queryText.nonEmpty) + assert((isTableFunc && returnType.isRight) || (!isTableFunc && returnType.isLeft)) + + override val language: RoutineLanguage = LanguageSQL +} + +object SQLFunction { + + /** + * This method returns an optional DataType indicating, when present, either the return type for + * scalar user-defined functions, or a StructType indicating the names and types of the columns in + * the output schema for table functions. If the optional value is empty, this indicates that the + * CREATE FUNCTION statement did not have any RETURNS clause at all (for scalar functions), or + * that it included a RETURNS TABLE clause but without any specified output schema (for table + * functions), prompting the analyzer to infer these metadata instead. + */ + def parseReturnTypeText( + text: String, + isTableFunc: Boolean, + parser: ParserInterface): Option[Either[DataType, StructType]] = { + if (!isTableFunc) { + // This is a scalar user-defined function. + if (text.isEmpty) { + // The CREATE FUNCTION statement did not have any RETURNS clause. + Option.empty[Either[DataType, StructType]] + } else { + // The CREATE FUNCTION statement included a RETURNS clause with an explicit return type. + Some(Left(parseDataType(text, parser))) + } + } else { + // This is a table function. + if (text.equalsIgnoreCase("table")) { + // The CREATE FUNCTION statement had a RETURNS TABLE clause but without any explicit schema. + Option.empty[Either[DataType, StructType]] + } else { + // The CREATE FUNCTION statement included a RETURNS TABLE clause with an explicit schema. + Some(Right(parseTableSchema(text, parser))) + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala new file mode 100644 index 0000000000000..1473f19cb71bd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/catalog/UserDefinedFunction.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
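Illustrative of how parseReturnTypeText above maps the raw RETURNS-clause text (the parser comes from the session; the commented results are informal and assume the inputs shown):

val parser = spark.sessionState.sqlParser                       // a ParserInterface

SQLFunction.parseReturnTypeText("INT", isTableFunc = false, parser)
//   Some(Left(IntegerType))     -- scalar function with an explicit RETURNS INT
SQLFunction.parseReturnTypeText("", isTableFunc = false, parser)
//   None                        -- scalar function without any RETURNS clause
SQLFunction.parseReturnTypeText("table", isTableFunc = true, parser)
//   None                        -- RETURNS TABLE without an explicit output schema
SQLFunction.parseReturnTypeText("a INT, b STRING", isTableFunc = true, parser)
//   Some(Right(...))            -- RETURNS TABLE with an explicit schema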
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.util.CharVarcharUtils +import org.apache.spark.sql.types.{DataType, StructType} + +/** + * The base class for all user defined functions registered via SQL queries. + */ +trait UserDefinedFunction { + + /** + * Qualified name of the function + */ + def name: FunctionIdentifier + + /** + * Additional properties to be serialized for the function. + * Use this to preserve the runtime configuration that should be used during the function + * execution, such as SQL configs etc. See [[SQLConf]] for more info. + */ + def properties: Map[String, String] + + /** + * Owner of the function + */ + def owner: Option[String] + + /** + * Function creation time in milliseconds since the linux epoch + */ + def createTimeMs: Long + + /** + * The language of the user defined function. + */ + def language: RoutineLanguage +} + +object UserDefinedFunction { + def parseTableSchema(text: String, parser: ParserInterface): StructType = { + val parsed = parser.parseTableSchema(text) + CharVarcharUtils.failIfHasCharVarchar(parsed).asInstanceOf[StructType] + } + + def parseDataType(text: String, parser: ParserInterface): DataType = { + val dataType = parser.parseDataType(text) + CharVarcharUtils.failIfHasCharVarchar(dataType) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala index 398f21e01b806..ca04991b50fc2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala @@ -95,7 +95,7 @@ class V2ExpressionBuilder(e: Expression, isPredicate: Boolean = false) { } case Cast(child, dataType, _, evalMode) if evalMode == EvalMode.ANSI || Cast.canUpCast(child.dataType, dataType) => - generateExpression(child).map(v => new V2Cast(v, dataType)) + generateExpression(child).map(v => new V2Cast(v, child.dataType, dataType)) case AggregateExpression(aggregateFunction, Complete, isDistinct, None, _) => generateAggregateFunc(aggregateFunction, isDistinct) case Abs(_, true) => generateExpressionWithName("ABS", expr, isPredicate) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/CachedBatchSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/CachedBatchSerializer.scala index 1113e63cab332..885ddf4110cbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/CachedBatchSerializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/CachedBatchSerializer.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.columnar import org.apache.spark.annotation.{DeveloperApi, Since} -import org.apache.spark.internal.Logging +import 
org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{FILTER, PREDICATE} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -307,7 +308,8 @@ abstract class SimpleMetricsCachedBatchSerializer extends CachedBatchSerializer allowFailures = true)) boundFilter.foreach(_ => - filter.foreach(f => logInfo(s"Predicate $p generates partition filter: $f"))) + filter.foreach(f => logInfo(log"Predicate ${MDC(PREDICATE, p)} generates " + + log"partition filter: ${MDC(FILTER, f)}"))) // If the filter can't be resolved then we are missing required statistics. boundFilter.filter(_.resolved) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala index 91042b59677bf..af3a8d67e3c29 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala @@ -27,7 +27,8 @@ import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkFiles, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Cast, Expression, GenericInternalRow, JsonToStructs, Literal, StructsToJson, UnsafeProjection} @@ -185,7 +186,7 @@ trait BaseScriptTransformationExec extends UnaryExecNode { if (!proc.isAlive) { val exitCode = proc.exitValue() if (exitCode != 0) { - logError(stderrBuffer.toString) // log the stderr circular buffer + logError(log"${MDC(STDERR, stderrBuffer.toString)}") // log the stderr circular buffer throw QueryExecutionErrors.subprocessExitedError(exitCode, stderrBuffer, cause) } } @@ -329,12 +330,13 @@ abstract class BaseScriptTransformationWriterThread extends Thread with Logging // Javadoc this call will not throw an exception: _exception = t proc.destroy() - logError(s"Thread-${this.getClass.getSimpleName}-Feed exit cause by: ", t) + logError(log"Thread-${MDC(CLASS_NAME, this.getClass.getSimpleName)}-Feed " + + log"exit cause by: ", t) } finally { try { Utils.tryLogNonFatalError(outputStream.close()) if (proc.waitFor() != 0) { - logError(stderrBuffer.toString) // log the stderr circular buffer + logError(log"${MDC(STDERR, stderrBuffer.toString)}") // log the stderr circular buffer } } catch { case NonFatal(exceptionFromFinallyBlock) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 4f3cecd17894d..b96f257e6b5b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -17,17 +17,15 @@ package org.apache.spark.sql.execution -import scala.collection.immutable.IndexedSeq - import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.{LogEntry, Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import 
org.apache.spark.sql.catalyst.expressions.{Attribute, SubqueryExpression} import org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint -import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan, ResolvedHint, SubqueryAlias, View} +import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan, ResolvedHint, View} import org.apache.spark.sql.catalyst.trees.TreePattern.PLAN_EXPRESSION import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper @@ -40,7 +38,10 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK /** Holds a cached logical plan and its data */ -case class CachedData(plan: LogicalPlan, cachedRepresentation: InMemoryRelation) { +case class CachedData( + // A normalized resolved plan (See QueryExecution#normalized). + plan: LogicalPlan, + cachedRepresentation: InMemoryRelation) { override def toString: String = s""" |CachedData( @@ -55,7 +56,9 @@ case class CachedData(plan: LogicalPlan, cachedRepresentation: InMemoryRelation) * InMemoryRelation. This relation is automatically substituted query plans that return the * `sameResult` as the originally cached query. * - * Internal to Spark SQL. + * Internal to Spark SQL. All its public APIs take analyzed plans and will normalize them before + * further usage, or take [[Dataset]] and get its normalized plan. See `QueryExecution.normalize` + * for more details about plan normalization. */ class CacheManager extends Logging with AdaptiveSparkPlanHelper { @@ -79,41 +82,43 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { cachedData.isEmpty } + // Test-only + def cacheQuery(query: Dataset[_]): Unit = { + cacheQuery(query, tableName = None, storageLevel = MEMORY_AND_DISK) + } + /** * Caches the data produced by the logical representation of the given [[Dataset]]. - * Unlike `RDD.cache()`, the default storage level is set to be `MEMORY_AND_DISK` because - * recomputing the in-memory columnar representation of the underlying table is expensive. */ def cacheQuery( query: Dataset[_], - tableName: Option[String] = None, - storageLevel: StorageLevel = MEMORY_AND_DISK): Unit = { - cacheQuery(query.sparkSession, query.queryExecution.normalized, tableName, storageLevel) + tableName: Option[String], + storageLevel: StorageLevel): Unit = { + cacheQueryInternal(query.sparkSession, query.queryExecution.normalized, tableName, storageLevel) } /** - * Caches the data produced by the given [[LogicalPlan]]. - * Unlike `RDD.cache()`, the default storage level is set to be `MEMORY_AND_DISK` because - * recomputing the in-memory columnar representation of the underlying table is expensive. + * Caches the data produced by the given [[LogicalPlan]]. The given plan will be normalized + * before being used further. */ def cacheQuery( spark: SparkSession, planToCache: LogicalPlan, - tableName: Option[String]): Unit = { - cacheQuery(spark, planToCache, tableName, MEMORY_AND_DISK) + tableName: Option[String], + storageLevel: StorageLevel): Unit = { + val normalized = QueryExecution.normalize(spark, planToCache) + cacheQueryInternal(spark, normalized, tableName, storageLevel) } - /** - * Caches the data produced by the given [[LogicalPlan]]. - */ - def cacheQuery( + // The `planToCache` should have been normalized. 
+ private def cacheQueryInternal( spark: SparkSession, planToCache: LogicalPlan, tableName: Option[String], storageLevel: StorageLevel): Unit = { if (storageLevel == StorageLevel.NONE) { // Do nothing for StorageLevel.NONE since it will not actually cache any data. - } else if (lookupCachedData(planToCache).nonEmpty) { + } else if (lookupCachedDataInternal(planToCache).nonEmpty) { logWarning("Asked to cache already cached data.") } else { val sessionWithConfigsOff = getOrCloneSessionWithConfigsOff(spark) @@ -126,7 +131,7 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } this.synchronized { - if (lookupCachedData(planToCache).nonEmpty) { + if (lookupCachedDataInternal(planToCache).nonEmpty) { logWarning("Data has already been cached.") } else { val cd = CachedData(planToCache, inMemoryRelation) @@ -140,38 +145,64 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { /** * Un-cache the given plan or all the cache entries that refer to the given plan. - * @param query The [[Dataset]] to be un-cached. - * @param cascade If true, un-cache all the cache entries that refer to the given - * [[Dataset]]; otherwise un-cache the given [[Dataset]] only. + * + * @param query The [[Dataset]] to be un-cached. + * @param cascade If true, un-cache all the cache entries that refer to the given + * [[Dataset]]; otherwise un-cache the given [[Dataset]] only. + * @param blocking Whether to block until all blocks are deleted. */ + def uncacheQuery( + query: Dataset[_], + cascade: Boolean, + blocking: Boolean): Unit = { + uncacheQueryInternal(query.sparkSession, query.queryExecution.normalized, cascade, blocking) + } + + // An overload to provide default value for the `blocking` parameter. def uncacheQuery( query: Dataset[_], cascade: Boolean): Unit = { - uncacheQuery(query.sparkSession, query.queryExecution.normalized, cascade) + uncacheQuery(query, cascade, blocking = false) } /** * Un-cache the given plan or all the cache entries that refer to the given plan. - * @param spark The Spark session. - * @param plan The plan to be un-cached. - * @param cascade If true, un-cache all the cache entries that refer to the given - * plan; otherwise un-cache the given plan only. - * @param blocking Whether to block until all blocks are deleted. + * + * @param spark The Spark session. + * @param plan The plan to be un-cached. + * @param cascade If true, un-cache all the cache entries that refer to the given + * plan; otherwise un-cache the given plan only. + * @param blocking Whether to block until all blocks are deleted. */ def uncacheQuery( spark: SparkSession, plan: LogicalPlan, cascade: Boolean, - blocking: Boolean = false): Unit = { - uncacheQuery(spark, _.sameResult(plan), cascade, blocking) + blocking: Boolean): Unit = { + val normalized = QueryExecution.normalize(spark, plan) + uncacheQueryInternal(spark, normalized, cascade, blocking) + } + + // An overload to provide default value for the `blocking` parameter. + def uncacheQuery( + spark: SparkSession, + plan: LogicalPlan, + cascade: Boolean): Unit = { + uncacheQuery(spark, plan, cascade, blocking = false) + } + + // The `plan` should have been normalized. 
+ private def uncacheQueryInternal( + spark: SparkSession, + plan: LogicalPlan, + cascade: Boolean, + blocking: Boolean): Unit = { + uncacheByCondition(spark, _.sameResult(plan), cascade, blocking) } def uncacheTableOrView(spark: SparkSession, name: Seq[String], cascade: Boolean): Unit = { - uncacheQuery( - spark, - isMatchedTableOrView(_, name, spark.sessionState.conf), - cascade, - blocking = false) + uncacheByCondition( + spark, isMatchedTableOrView(_, name, spark.sessionState.conf), cascade, blocking = false) } private def isMatchedTableOrView(plan: LogicalPlan, name: Seq[String], conf: SQLConf): Boolean = { @@ -180,28 +211,24 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } plan match { - case SubqueryAlias(ident, LogicalRelation(_, _, Some(catalogTable), _)) => - val v1Ident = catalogTable.identifier - isSameName(ident.qualifier :+ ident.name) && isSameName(v1Ident.nameParts) + case LogicalRelation(_, _, Some(catalogTable), _) => + isSameName(catalogTable.identifier.nameParts) - case SubqueryAlias(ident, DataSourceV2Relation(_, _, Some(catalog), Some(v2Ident), _)) => + case DataSourceV2Relation(_, _, Some(catalog), Some(v2Ident), _) => import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper - isSameName(ident.qualifier :+ ident.name) && - isSameName(v2Ident.toQualifiedNameParts(catalog)) + isSameName(v2Ident.toQualifiedNameParts(catalog)) - case SubqueryAlias(ident, View(catalogTable, _, _)) => - val v1Ident = catalogTable.identifier - isSameName(ident.qualifier :+ ident.name) && isSameName(v1Ident.nameParts) + case View(catalogTable, _, _) => + isSameName(catalogTable.identifier.nameParts) - case SubqueryAlias(ident, HiveTableRelation(catalogTable, _, _, _, _)) => - val v1Ident = catalogTable.identifier - isSameName(ident.qualifier :+ ident.name) && isSameName(v1Ident.nameParts) + case HiveTableRelation(catalogTable, _, _, _, _) => + isSameName(catalogTable.identifier.nameParts) case _ => false } } - def uncacheQuery( + private def uncacheByCondition( spark: SparkSession, isMatchedPlan: LogicalPlan => Boolean, cascade: Boolean, @@ -254,10 +281,12 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } /** - * Tries to re-cache all the cache entries that refer to the given plan. + * Tries to re-cache all the cache entries that refer to the given plan. The given plan will be + * normalized before being used further. 
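In short, every public CacheManager entry point now either takes a Dataset (already normalized) or normalizes the plan itself; a sketch of the call shapes from code inside org.apache.spark.sql, using an arbitrary DataFrame df:

val cacheManager = spark.sharedState.cacheManager

df.cache()                                                        // caches the normalized plan
cacheManager.lookupCachedData(df)                                 // Dataset overload, already normalized
cacheManager.lookupCachedData(spark, df.queryExecution.analyzed)  // plan overload, normalizes internally
cacheManager.uncacheQuery(df, cascade = false, blocking = true)   // overload with an explicit blocking flag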
*/ def recacheByPlan(spark: SparkSession, plan: LogicalPlan): Unit = { - recacheByCondition(spark, _.plan.exists(_.sameResult(plan))) + val normalized = QueryExecution.normalize(spark, plan) + recacheByCondition(spark, _.plan.exists(_.sameResult(normalized))) } /** @@ -280,7 +309,7 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } val recomputedPlan = cd.copy(cachedRepresentation = newCache) this.synchronized { - if (lookupCachedData(recomputedPlan.plan).nonEmpty) { + if (lookupCachedDataInternal(recomputedPlan.plan).nonEmpty) { logWarning("While recaching, data was already added to cache.") } else { cachedData = recomputedPlan +: cachedData @@ -291,13 +320,23 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } } - /** Optionally returns cached data for the given [[Dataset]] */ + /** + * Optionally returns cached data for the given [[Dataset]] + */ def lookupCachedData(query: Dataset[_]): Option[CachedData] = { - lookupCachedData(query.queryExecution.normalized) + lookupCachedDataInternal(query.queryExecution.normalized) } - /** Optionally returns cached data for the given [[LogicalPlan]]. */ - def lookupCachedData(plan: LogicalPlan): Option[CachedData] = { + /** + * Optionally returns cached data for the given [[LogicalPlan]]. The given plan will be normalized + * before being used further. + */ + def lookupCachedData(session: SparkSession, plan: LogicalPlan): Option[CachedData] = { + val normalized = QueryExecution.normalize(session, plan) + lookupCachedDataInternal(normalized) + } + + private def lookupCachedDataInternal(plan: LogicalPlan): Option[CachedData] = { val result = cachedData.find(cd => plan.sameResult(cd.plan)) if (result.isDefined) { CacheManager.logCacheOperation(log"Dataframe cache hit for input plan:" + @@ -307,13 +346,16 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { result } - /** Replaces segments of the given logical plan with cached versions where possible. */ - def useCachedData(plan: LogicalPlan): LogicalPlan = { + /** + * Replaces segments of the given logical plan with cached versions where possible. The input + * plan must be normalized. + */ + private[sql] def useCachedData(plan: LogicalPlan): LogicalPlan = { val newPlan = plan transformDown { case command: IgnoreCachedData => command case currentFragment => - lookupCachedData(currentFragment).map { cached => + lookupCachedDataInternal(currentFragment).map { cached => // After cache lookup, we should still keep the hints from the input plan. 
val hints = EliminateResolvedHint.extractHintsFromPlan(currentFragment)._2 val cachedPlan = cached.cachedRepresentation.withOutput(currentFragment.output) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index f583bb665de14..2ebbb9664f67a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -22,6 +22,8 @@ import java.util.concurrent.TimeUnit._ import org.apache.commons.lang3.StringUtils import org.apache.hadoop.fs.Path +import org.apache.spark.internal.LogKeys.{COUNT, MAX_SPLIT_BYTES, OPEN_COST_IN_BYTES} +import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.BucketSpec @@ -713,7 +715,7 @@ case class FileSourceScanExec( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], selectedPartitions: ScanFileListing): RDD[InternalRow] = { - logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") + logInfo(log"Planning with ${MDC(COUNT, bucketSpec.numBuckets)} buckets") val partitionArray = selectedPartitions.toPartitionArray val filesGroupedToBuckets = partitionArray.groupBy { f => BucketingUtils @@ -731,7 +733,7 @@ case class FileSourceScanExec( } val filePartitions = optionalNumCoalescedBuckets.map { numCoalescedBuckets => - logInfo(s"Coalescing to ${numCoalescedBuckets} buckets") + logInfo(log"Coalescing to ${MDC(COUNT, numCoalescedBuckets)} buckets") val coalescedBuckets = prunedFilesGroupedToBuckets.groupBy(_._1 % numCoalescedBuckets) Seq.tabulate(numCoalescedBuckets) { bucketId => val partitionedFiles = coalescedBuckets.get(bucketId).map { @@ -764,8 +766,9 @@ case class FileSourceScanExec( val openCostInBytes = relation.sparkSession.sessionState.conf.filesOpenCostInBytes val maxSplitBytes = FilePartition.maxSplitBytes(relation.sparkSession, selectedPartitions) - logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " + - s"open cost is considered as scanning $openCostInBytes bytes.") + logInfo(log"Planning scan with bin packing, max size: ${MDC(MAX_SPLIT_BYTES, maxSplitBytes)} " + + log"bytes, open cost is considered as scanning ${MDC(OPEN_COST_IN_BYTES, openCostInBytes)} " + + log"bytes.") // Filter files with bucket pruning if possible val bucketingEnabled = relation.sparkSession.sessionState.conf.bucketingEnabled diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/EmptyRelationExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/EmptyRelationExec.scala new file mode 100644 index 0000000000000..085c0b22524c9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/EmptyRelationExec.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.adaptive.LogicalQueryStage +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * A leaf node wrapper for propagated empty relation, which preserves the eliminated logical plan. + * The logical plan might be partially executed, i.e., containing LogicalQueryStage. + */ +case class EmptyRelationExec(@transient logical: LogicalPlan) extends LeafExecNode + with InputRDDCodegen { + private val rdd = sparkContext.emptyRDD[InternalRow] + + // Here we cannot use a def, because logical won't be serialized to executors while this method + // will be called on executors. + override val output: Seq[Attribute] = logical.output + + override protected def doExecute(): RDD[InternalRow] = rdd + + override def executeCollect(): Array[InternalRow] = Array.empty + + override def executeTake(limit: Int): Array[InternalRow] = Array.empty + + override def executeTail(limit: Int): Array[InternalRow] = Array.empty + + protected override def doExecuteColumnar(): RDD[ColumnarBatch] = sparkContext.emptyRDD + + override def inputRDD: RDD[InternalRow] = rdd + + override protected val createUnsafeProjection: Boolean = false + + protected override def stringArgs: Iterator[Any] = Iterator(s"[plan_id=$id]") + + override def generateTreeString( + depth: Int, + lastChildren: java.util.ArrayList[Boolean], + append: String => Unit, + verbose: Boolean, + prefix: String = "", + addSuffix: Boolean = false, + maxFields: Int, + printNodeId: Boolean, + indent: Int = 0): Unit = { + super.generateTreeString(depth, + lastChildren, + append, + verbose, + prefix, + addSuffix, + maxFields, + printNodeId, + indent) + lastChildren.add(true) + logical.generateTreeString( + depth + 1, lastChildren, append, verbose, "", false, maxFields, printNodeId, indent) + lastChildren.remove(lastChildren.size() - 1) + } + + override def doCanonicalize(): SparkPlan = { + this.copy(logical = LocalRelation(logical.output).canonicalized) + } + + override protected[sql] def cleanupResources(): Unit = { + logical.foreach { + case LogicalQueryStage(_, physical) => + physical.cleanupResources() + case _ => + } + super.cleanupResources() + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 252a6290cbc7f..8c7ed7b88d45d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{LOGICAL_PLAN_COLUMNS, OPTIMIZED_PLAN_COLUMNS} +import org.apache.spark.rdd.RDD +import 
org.apache.spark.sql.{Dataset, Encoder, SparkSession} import org.apache.spark.sql.catalyst.InternalRow @@ -226,10 +227,11 @@ object LogicalRDD extends Logging { (Some(rewrittenStatistics), Some(rewrittenConstraints)) }.getOrElse { // can't rewrite stats and constraints, give up - logWarning("The output columns are expected to the same (for name and type) for output " + - "between logical plan and optimized plan, but they aren't. output in logical plan: " + - s"${logicalPlan.output.map(_.simpleString(10))} / output in optimized plan: " + - s"${optimizedPlan.output.map(_.simpleString(10))}") + logWarning(log"The output columns are expected to the same (for name and type) for output " + + log"between logical plan and optimized plan, but they aren't. output in logical plan: " + + log"${MDC(LOGICAL_PLAN_COLUMNS, logicalPlan.output.map(_.simpleString(10)))} " + + log"/ output in optimized plan: " + + log"${MDC(OPTIMIZED_PLAN_COLUMNS, optimizedPlan.output.map(_.simpleString(10)))}") (None, None) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala index 11f6ae0e47ee1..421a963453f0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala @@ -17,9 +17,7 @@ package org.apache.spark.sql.execution -import java.util.Collections.newSetFromMap import java.util.IdentityHashMap -import java.util.Set import scala.collection.mutable.{ArrayBuffer, BitSet} @@ -30,6 +28,8 @@ import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveS import org.apache.spark.sql.execution.exchange.{Exchange, ReusedExchangeExec} object ExplainUtils extends AdaptiveSparkPlanHelper { + def localIdMap: ThreadLocal[java.util.Map[QueryPlan[_], Int]] = QueryPlan.localIdMap + /** * Given a input physical plan, performs the following tasks. * 1. Computes the whole stage codegen id for current operator and records it in the @@ -80,24 +80,26 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { * instances but cached plan is an exception. The `InMemoryRelation#innerChildren` use a shared * plan instance across multi-queries. Add lock for this method to avoid tag race condition. 
*/ - def processPlan[T <: QueryPlan[T]](plan: T, append: String => Unit): Unit = synchronized { + def processPlan[T <: QueryPlan[T]](plan: T, append: String => Unit): Unit = { + val prevIdMap = localIdMap.get() try { - // Initialize a reference-unique set of Operators to avoid accdiental overwrites and to allow - // intentional overwriting of IDs generated in previous AQE iteration - val operators = newSetFromMap[QueryPlan[_]](new IdentityHashMap()) + // Initialize a reference-unique id map to store generated ids, which also avoid accidental + // overwrites and to allow intentional overwriting of IDs generated in previous AQE iteration + val idMap = new IdentityHashMap[QueryPlan[_], Int]() + localIdMap.set(idMap) // Initialize an array of ReusedExchanges to help find Adaptively Optimized Out // Exchanges as part of SPARK-42753 val reusedExchanges = ArrayBuffer.empty[ReusedExchangeExec] var currentOperatorID = 0 - currentOperatorID = generateOperatorIDs(plan, currentOperatorID, operators, reusedExchanges, + currentOperatorID = generateOperatorIDs(plan, currentOperatorID, idMap, reusedExchanges, true) val subqueries = ArrayBuffer.empty[(SparkPlan, Expression, BaseSubqueryExec)] getSubqueries(plan, subqueries) currentOperatorID = subqueries.foldLeft(currentOperatorID) { - (curId, plan) => generateOperatorIDs(plan._3.child, curId, operators, reusedExchanges, + (curId, plan) => generateOperatorIDs(plan._3.child, curId, idMap, reusedExchanges, true) } @@ -105,9 +107,9 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { val optimizedOutExchanges = ArrayBuffer.empty[Exchange] reusedExchanges.foreach{ reused => val child = reused.child - if (!operators.contains(child)) { + if (!idMap.containsKey(child)) { optimizedOutExchanges.append(child) - currentOperatorID = generateOperatorIDs(child, currentOperatorID, operators, + currentOperatorID = generateOperatorIDs(child, currentOperatorID, idMap, reusedExchanges, false) } } @@ -144,7 +146,7 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { append("\n") } } finally { - removeTags(plan) + localIdMap.set(prevIdMap) } } @@ -159,13 +161,15 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { * @param plan Input query plan to process * @param startOperatorID The start value of operation id. The subsequent operations will be * assigned higher value. - * @param visited A unique set of operators visited by generateOperatorIds. The set is scoped - * at the callsite function processPlan. It serves two purpose: Firstly, it is - * used to avoid accidentally overwriting existing IDs that were generated in - * the same processPlan call. Secondly, it is used to allow for intentional ID - * overwriting as part of SPARK-42753 where an Adaptively Optimized Out Exchange - * and its subtree may contain IDs that were generated in a previous AQE - * iteration's processPlan call which would result in incorrect IDs. + * @param idMap A reference-unique map store operators visited by generateOperatorIds and its + * id. This Map is scoped at the callsite function processPlan. It serves three + * purpose: + * Firstly, it stores the QueryPlan - generated ID mapping. Secondly, it is used to + * avoid accidentally overwriting existing IDs that were generated in the same + * processPlan call. Thirdly, it is used to allow for intentional ID overwriting as + * part of SPARK-42753 where an Adaptively Optimized Out Exchange and its subtree + * may contain IDs that were generated in a previous AQE iteration's processPlan + * call which would result in incorrect IDs. 
* @param reusedExchanges A unique set of ReusedExchange nodes visited which will be used to * idenitfy adaptively optimized out exchanges in SPARK-42753. * @param addReusedExchanges Whether to add ReusedExchange nodes to reusedExchanges set. We set it @@ -177,7 +181,7 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { private def generateOperatorIDs( plan: QueryPlan[_], startOperatorID: Int, - visited: Set[QueryPlan[_]], + idMap: java.util.Map[QueryPlan[_], Int], reusedExchanges: ArrayBuffer[ReusedExchangeExec], addReusedExchanges: Boolean): Int = { var currentOperationID = startOperatorID @@ -186,36 +190,35 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { return currentOperationID } - def setOpId(plan: QueryPlan[_]): Unit = if (!visited.contains(plan)) { + def setOpId(plan: QueryPlan[_]): Unit = idMap.computeIfAbsent(plan, plan => { plan match { case r: ReusedExchangeExec if addReusedExchanges => reusedExchanges.append(r) case _ => } - visited.add(plan) currentOperationID += 1 - plan.setTagValue(QueryPlan.OP_ID_TAG, currentOperationID) - } + currentOperationID + }) plan.foreachUp { case _: WholeStageCodegenExec => case _: InputAdapter => case p: AdaptiveSparkPlanExec => - currentOperationID = generateOperatorIDs(p.executedPlan, currentOperationID, visited, + currentOperationID = generateOperatorIDs(p.executedPlan, currentOperationID, idMap, reusedExchanges, addReusedExchanges) if (!p.executedPlan.fastEquals(p.initialPlan)) { - currentOperationID = generateOperatorIDs(p.initialPlan, currentOperationID, visited, + currentOperationID = generateOperatorIDs(p.initialPlan, currentOperationID, idMap, reusedExchanges, addReusedExchanges) } setOpId(p) case p: QueryStageExec => - currentOperationID = generateOperatorIDs(p.plan, currentOperationID, visited, + currentOperationID = generateOperatorIDs(p.plan, currentOperationID, idMap, reusedExchanges, addReusedExchanges) setOpId(p) case other: QueryPlan[_] => setOpId(other) currentOperationID = other.innerChildren.foldLeft(currentOperationID) { - (curId, plan) => generateOperatorIDs(plan, curId, visited, reusedExchanges, + (curId, plan) => generateOperatorIDs(plan, curId, idMap, reusedExchanges, addReusedExchanges) } } @@ -241,7 +244,7 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { } def collectOperatorWithID(plan: QueryPlan[_]): Unit = { - plan.getTagValue(QueryPlan.OP_ID_TAG).foreach { id => + Option(ExplainUtils.localIdMap.get().get(plan)).foreach { id => if (collectedOperators.add(id)) operators += plan } } @@ -334,20 +337,6 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { * `operationId` tag value. 
*/ def getOpId(plan: QueryPlan[_]): String = { - plan.getTagValue(QueryPlan.OP_ID_TAG).map(v => s"$v").getOrElse("unknown") - } - - def removeTags(plan: QueryPlan[_]): Unit = { - def remove(p: QueryPlan[_], children: Seq[QueryPlan[_]]): Unit = { - p.unsetTagValue(QueryPlan.OP_ID_TAG) - p.unsetTagValue(QueryPlan.CODEGEN_ID_TAG) - children.foreach(removeTags) - } - - plan foreach { - case p: AdaptiveSparkPlanExec => remove(p, Seq(p.executedPlan, p.initialPlan)) - case p: QueryStageExec => remove(p, Seq(p.plan)) - case plan: QueryPlan[_] => remove(plan, plan.innerChildren) - } + Option(ExplainUtils.localIdMap.get().get(plan)).map(v => s"$v").getOrElse("unknown") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala index 56289d73c071f..59810adc4b22e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala @@ -22,7 +22,8 @@ import java.io.Closeable import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkEnv, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, MAX_NUM_ROWS_IN_MEMORY_BUFFER} import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.serializer.SerializerManager import org.apache.spark.sql.catalyst.expressions.UnsafeRow @@ -122,8 +123,9 @@ private[sql] class ExternalAppendOnlyUnsafeRowArray( inMemoryBuffer += unsafeRow.copy() } else { if (spillableArray == null) { - logInfo(s"Reached spill threshold of $numRowsInMemoryBufferThreshold rows, switching to " + - s"${classOf[UnsafeExternalSorter].getName}") + logInfo(log"Reached spill threshold of " + + log"${MDC(MAX_NUM_ROWS_IN_MEMORY_BUFFER, numRowsInMemoryBufferThreshold)} rows, " + + log"switching to ${MDC(CLASS_NAME, classOf[UnsafeExternalSorter].getName)}") // We will not sort the rows, so prefixComparator and recordComparator are null spillableArray = UnsafeExternalSorter.create( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala index c59fd77c4bb35..8df650ca39b7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala @@ -17,11 +17,12 @@ package org.apache.spark.sql.execution -import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} -import java.time.{Duration, Instant, LocalDate, LocalDateTime, Period} +import java.time._ import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.expressions.ToStringBase import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.IntervalStringStyles.HIVE_STYLE import org.apache.spark.sql.catalyst.util.IntervalUtils.{durationToMicros, periodToMonths, toDayTimeIntervalString, toYearMonthIntervalString} @@ -35,7 +36,7 @@ import org.apache.spark.util.ArrayImplicits._ /** * Runs a query returning the result in Hive compatible form. 
*/ -object HiveResult { +object HiveResult extends SQLConfHelper { case class TimeFormatters(date: DateFormatter, timestamp: TimestampFormatter) def getTimeFormatters: TimeFormatters = { @@ -45,6 +46,16 @@ object HiveResult { TimeFormatters(dateFormatter, timestampFormatter) } + type BinaryFormatter = Array[Byte] => String + + def getBinaryFormatter: BinaryFormatter = { + if (conf.getConf(SQLConf.BINARY_OUTPUT_STYLE).isEmpty) { + // Keep the legacy behavior for compatibility. + conf.setConf(SQLConf.BINARY_OUTPUT_STYLE, Some("UTF8")) + } + ToStringBase.getBinaryFormatter(_).toString + } + private def stripRootCommandResult(executedPlan: SparkPlan): SparkPlan = executedPlan match { case CommandResultExec(_, plan, _) => plan case other => other @@ -74,11 +85,12 @@ object HiveResult { executedPlan.executeCollect().map(_.getString(1)).toImmutableArraySeq case other => val timeFormatters = getTimeFormatters + val binaryFormatter = getBinaryFormatter val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toImmutableArraySeq // We need the types so we can output struct field names val types = executedPlan.output.map(_.dataType) // Reformat to match hive tab delimited output. - result.map(_.zip(types).map(e => toHiveString(e, false, timeFormatters))) + result.map(_.zip(types).map(e => toHiveString(e, false, timeFormatters, binaryFormatter))) .map(_.mkString("\t")) } @@ -95,7 +107,8 @@ object HiveResult { def toHiveString( a: (Any, DataType), nested: Boolean, - formatters: TimeFormatters): String = a match { + formatters: TimeFormatters, + binaryFormatter: BinaryFormatter): String = a match { case (null, _) => if (nested) "null" else "NULL" case (b, BooleanType) => b.toString case (d: Date, DateType) => formatters.date.format(d) @@ -103,21 +116,22 @@ object HiveResult { case (t: Timestamp, TimestampType) => formatters.timestamp.format(t) case (i: Instant, TimestampType) => formatters.timestamp.format(i) case (l: LocalDateTime, TimestampNTZType) => formatters.timestamp.format(l) - case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) + case (bin: Array[Byte], BinaryType) => binaryFormatter(bin) case (decimal: java.math.BigDecimal, DecimalType()) => decimal.toPlainString case (n, _: NumericType) => n.toString case (s: String, _: StringType) => if (nested) "\"" + s + "\"" else s case (interval: CalendarInterval, CalendarIntervalType) => interval.toString case (seq: scala.collection.Seq[_], ArrayType(typ, _)) => - seq.map(v => (v, typ)).map(e => toHiveString(e, true, formatters)).mkString("[", ",", "]") + seq.map(v => (v, typ)).map(e => toHiveString(e, true, formatters, binaryFormatter)) + .mkString("[", ",", "]") case (m: Map[_, _], MapType(kType, vType, _)) => m.map { case (key, value) => - toHiveString((key, kType), true, formatters) + ":" + - toHiveString((value, vType), true, formatters) + toHiveString((key, kType), true, formatters, binaryFormatter) + ":" + + toHiveString((value, vType), true, formatters, binaryFormatter) }.toSeq.sorted.mkString("{", ",", "}") case (struct: Row, StructType(fields)) => struct.toSeq.zip(fields).map { case (v, t) => - s""""${t.name}":${toHiveString((v, t.dataType), true, formatters)}""" + s""""${t.name}":${toHiveString((v, t.dataType), true, formatters, binaryFormatter)}""" }.mkString("{", ",", "}") case (period: Period, YearMonthIntervalType(startField, endField)) => toYearMonthIntervalString( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 6280e7dd100c7..357484ca19df2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -26,7 +26,8 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.Path import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.EXTENDED_EXPLAIN_GENERATOR import org.apache.spark.rdd.RDD import org.apache.spark.sql.{AnalysisException, ExtendedExplainGenerator, Row, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, QueryPlanningTracker} @@ -59,7 +60,8 @@ class QueryExecution( val sparkSession: SparkSession, val logical: LogicalPlan, val tracker: QueryPlanningTracker = new QueryPlanningTracker, - val mode: CommandExecutionMode.Value = CommandExecutionMode.ALL) extends Logging { + val mode: CommandExecutionMode.Value = CommandExecutionMode.ALL, + val shuffleCleanupMode: ShuffleCleanupMode = DoNotCleanup) extends Logging { val id: Long = QueryExecution.nextExecutionId @@ -132,19 +134,7 @@ class QueryExecution( // The plan that has been normalized by custom rules, so that it's more likely to hit cache. lazy val normalized: LogicalPlan = { - val normalizationRules = sparkSession.sessionState.planNormalizationRules - if (normalizationRules.isEmpty) { - commandExecuted - } else { - val planChangeLogger = new PlanChangeLogger[LogicalPlan]() - val normalized = normalizationRules.foldLeft(commandExecuted) { (p, rule) => - val result = rule.apply(p) - planChangeLogger.logRule(rule.ruleName, p, result) - result - } - planChangeLogger.logBatch("Plan Normalization", commandExecuted, normalized) - normalized - } + QueryExecution.normalize(sparkSession, commandExecuted, Some(tracker)) } lazy val withCachedData: LogicalPlan = sparkSession.withActive { @@ -385,7 +375,8 @@ class QueryExecution( append(s"\n== Extended Information (${extension.title}) ==\n") append(extension.generateExtendedInfo(plan)) } catch { - case NonFatal(e) => logWarning(s"Cannot use $extension to get extended information.", e) + case NonFatal(e) => logWarning(log"Cannot use " + + log"${MDC(EXTENDED_EXPLAIN_GENERATOR, extension)} to get extended information.", e) }) } } @@ -457,6 +448,22 @@ object CommandExecutionMode extends Enumeration { val SKIP, NON_ROOT, ALL = Value } +/** + * Modes for shuffle dependency cleanup. + * + * DoNotCleanup: Do not perform any cleanup. + * SkipMigration: Shuffle dependencies will not be migrated at node decommissions. + * RemoveShuffleFiles: Shuffle dependency files are removed at the end of SQL executions. 
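The three modes above form a small sealed hierarchy that SQLExecution later dispatches on, once per recorded shuffle id. A self-contained sketch of that dispatch shape, assuming nothing beyond plain Scala; the println hooks are hypothetical stand-ins for the real ShuffleManager/BlockManager calls:

```scala
// Illustrative sketch only: mirrors the sealed ShuffleCleanupMode hierarchy described above.
// The println calls are hypothetical stand-ins for the real unregister / skip-migration hooks.
object ShuffleCleanupSketch {
  sealed trait ShuffleCleanupMode
  case object DoNotCleanup extends ShuffleCleanupMode
  case object SkipMigration extends ShuffleCleanupMode
  case object RemoveShuffleFiles extends ShuffleCleanupMode

  def cleanup(mode: ShuffleCleanupMode, shuffleIds: Iterable[Int]): Unit = mode match {
    case DoNotCleanup       => // nothing to clean up
    case SkipMigration      => shuffleIds.foreach(id => println(s"skip migrating shuffle $id"))
    case RemoveShuffleFiles => shuffleIds.foreach(id => println(s"unregister shuffle $id"))
  }

  def main(args: Array[String]): Unit = cleanup(RemoveShuffleFiles, Seq(0, 1, 2))
}
```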
+ */ +sealed trait ShuffleCleanupMode + +case object DoNotCleanup extends ShuffleCleanupMode + +case object SkipMigration extends ShuffleCleanupMode + +case object RemoveShuffleFiles extends ShuffleCleanupMode + + object QueryExecution { private val _nextExecutionId = new AtomicLong(0) @@ -594,4 +601,27 @@ object QueryExecution { case e: Throwable => throw toInternalError(msg, e) } } + + def normalize( + session: SparkSession, + plan: LogicalPlan, + tracker: Option[QueryPlanningTracker] = None): LogicalPlan = { + val normalizationRules = session.sessionState.planNormalizationRules + if (normalizationRules.isEmpty) { + plan + } else { + val planChangeLogger = new PlanChangeLogger[LogicalPlan]() + val normalized = normalizationRules.foldLeft(plan) { (p, rule) => + val startTime = System.nanoTime() + val result = rule.apply(p) + val runTime = System.nanoTime() - startTime + val effective = !result.fastEquals(p) + tracker.foreach(_.recordRuleInvocation(rule.ruleName, runTime, effective)) + planChangeLogger.logRule(rule.ruleName, p, result) + result + } + planChangeLogger.logBatch("Plan Normalization", plan, normalized) + normalized + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index 561deacfb72d9..f4be03c90be75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -20,14 +20,16 @@ package org.apache.spark.sql.execution import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future => JFuture} import java.util.concurrent.atomic.AtomicLong +import scala.jdk.CollectionConverters._ import scala.util.control.NonFatal -import org.apache.spark.{ErrorMessageFormat, JobArtifactSet, SparkException, SparkThrowable, SparkThrowableHelper} +import org.apache.spark.{ErrorMessageFormat, JobArtifactSet, SparkEnv, SparkException, SparkThrowable, SparkThrowableHelper} import org.apache.spark.SparkContext.{SPARK_JOB_DESCRIPTION, SPARK_JOB_INTERRUPT_ON_CANCEL} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{SPARK_DRIVER_PREFIX, SPARK_EXECUTOR_PREFIX} import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.SQL_EVENT_TRUNCATE_LENGTH @@ -115,6 +117,7 @@ object SQLExecution extends Logging { withSQLConfPropagated(sparkSession) { var ex: Option[Throwable] = None + var isExecutedPlanAvailable = false val startTime = System.nanoTime() val startEvent = SparkListenerSQLExecutionStart( executionId = executionId, @@ -147,6 +150,7 @@ object SQLExecution extends Logging { } sc.listenerBus.post( startEvent.copy(physicalPlanDescription = planDesc, sparkPlanInfo = planInfo)) + isExecutedPlanAvailable = true f() } } catch { @@ -161,6 +165,24 @@ object SQLExecution extends Logging { case e => Utils.exceptionString(e) } + if (queryExecution.shuffleCleanupMode != DoNotCleanup + && isExecutedPlanAvailable) { + val shuffleIds = queryExecution.executedPlan match { + case ae: AdaptiveSparkPlanExec => + ae.context.shuffleIds.asScala.keys + case _ => + Iterable.empty + } + shuffleIds.foreach { shuffleId => + queryExecution.shuffleCleanupMode match { + 
case RemoveShuffleFiles => + SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) + case SkipMigration => + SparkEnv.get.blockManager.migratableResolver.addShuffleToSkip(shuffleId) + case _ => // this should not happen + } + } + } val event = SparkListenerSQLExecutionEnd( executionId, System.currentTimeMillis(), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 70a35ea911538..3382a1161ddba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.ExperimentalMethods +import org.apache.spark.sql.catalyst.analysis.RewriteCollationJoin import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -89,9 +90,11 @@ class SparkOptimizer( InferWindowGroupLimit, LimitPushDown, LimitPushDownThroughWindow, - EliminateLimits) :+ + EliminateLimits, + ConstantFolding) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) :+ - Batch("Replace CTE with Repartition", Once, ReplaceCTERefWithRepartition) + Batch("Replace CTE with Repartition", Once, ReplaceCTERefWithRepartition) :+ + Batch("RewriteCollationJoin", Once, RewriteCollationJoin) override def nonExcludableRules: Seq[String] = super.nonExcludableRules :+ ExtractPythonUDFFromJoinCondition.ruleName :+ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala index 7c45b02ee8468..615c8746a3e52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, QueryStageExec} +import org.apache.spark.sql.execution.adaptive.LogicalQueryStage import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.metric.SQLMetricInfo @@ -51,6 +53,19 @@ class SparkPlanInfo( private[execution] object SparkPlanInfo { + private def fromLogicalPlan(plan: LogicalPlan): SparkPlanInfo = { + val childrenInfo = plan match { + case LogicalQueryStage(_, physical) => Seq(fromSparkPlan(physical)) + case _ => (plan.children ++ plan.subqueries).map(fromLogicalPlan) + } + new SparkPlanInfo( + plan.nodeName, + plan.simpleString(SQLConf.get.maxToStringFields), + childrenInfo, + Map[String, String](), + Seq.empty) + } + def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = { val children = plan match { case ReusedExchangeExec(_, child) => child :: Nil @@ -58,6 +73,7 @@ private[execution] object SparkPlanInfo { case a: AdaptiveSparkPlanExec => a.executedPlan :: Nil case stage: QueryStageExec => stage.plan :: Nil case inMemTab: InMemoryTableScanExec => inMemTab.relation.cachedPlan :: Nil + case EmptyRelationExec(logical) => (logical :: Nil) case _ => plan.children ++ plan.subqueries } val metrics = plan.metrics.toSeq.map { case (key, metric) => @@ -69,10 +85,17 @@ 
private[execution] object SparkPlanInfo { case fileScan: FileSourceScanLike => fileScan.metadata case _ => Map[String, String]() } + val childrenInfo = children.flatMap { + case child: SparkPlan => + Some(fromSparkPlan(child)) + case child: LogicalPlan => + Some(fromLogicalPlan(child)) + case _ => None + } new SparkPlanInfo( plan.nodeName, plan.simpleString(SQLConf.get.maxToStringFields), - children.map(fromSparkPlan), + childrenInfo, metadata, metrics) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 8192be2699933..055fec02d2aea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -27,7 +27,7 @@ import org.antlr.v4.runtime.tree.TerminalNode import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, UnresolvedFunctionName, UnresolvedIdentifier} +import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, SchemaEvolution, SchemaTypeEvolution, UnresolvedFunctionName, UnresolvedIdentifier, UnresolvedNamespace} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} import org.apache.spark.sql.catalyst.parser._ @@ -150,6 +150,9 @@ class SparkSqlAstBuilder extends AstBuilder { * }}} */ override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) { + if (!SQLConf.get.collationEnabled) { + throw QueryCompilationErrors.collationNotEnabledError() + } val key = SQLConf.DEFAULT_COLLATION.key SetCommand(Some(key -> Some(ctx.identifier.getText.toUpperCase(Locale.ROOT)))) } @@ -340,7 +343,7 @@ class SparkSqlAstBuilder extends AstBuilder { visitCreateTableClauses(ctx.createTableClauses()) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText).getOrElse( throw QueryParsingErrors.createTempTableNotSpecifyProviderError(ctx)) - val schema = Option(ctx.createOrReplaceTableColTypeList()).map(createSchema) + val schema = Option(ctx.colDefinitionList()).map(createSchema) logWarning(s"CREATE TEMPORARY TABLE ... USING ... is deprecated, please use " + "CREATE TEMPORARY VIEW ... USING ... 
instead") @@ -505,6 +508,7 @@ class SparkSqlAstBuilder extends AstBuilder { } checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) + checkDuplicateClauses(ctx.schemaBinding(), "WITH SCHEMA", ctx) checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED ON", ctx) checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) @@ -524,6 +528,10 @@ class SparkSqlAstBuilder extends AstBuilder { operationNotAllowed("TBLPROPERTIES can't coexist with CREATE TEMPORARY VIEW", ctx) } + if (ctx.TEMPORARY != null && ctx.schemaBinding(0) != null) { + throw QueryParsingErrors.temporaryViewWithSchemaBindingMode(ctx) + } + val viewType = if (ctx.TEMPORARY == null) { PersistedView } else if (ctx.GLOBAL != null) { @@ -543,6 +551,13 @@ class SparkSqlAstBuilder extends AstBuilder { val originalText = source(ctx.query) assert(Option(originalText).isDefined, "'originalText' must be provided to create permanent view") + val schemaBinding = visitSchemaBinding(ctx.schemaBinding(0)) + val finalSchemaBinding = + if (schemaBinding == SchemaEvolution && userSpecifiedColumns.nonEmpty) { + SchemaTypeEvolution + } else { + schemaBinding + } CreateView( withIdentClause(ctx.identifierReference(), UnresolvedIdentifier(_)), userSpecifiedColumns, @@ -551,7 +566,8 @@ class SparkSqlAstBuilder extends AstBuilder { Some(originalText), qPlan, ctx.EXISTS != null, - ctx.REPLACE != null) + ctx.REPLACE != null, + finalSchemaBinding) } else { // Disallows 'CREATE TEMPORARY VIEW IF NOT EXISTS' to be consistent with // 'CREATE TEMPORARY TABLE' @@ -637,6 +653,173 @@ class SparkSqlAstBuilder extends AstBuilder { }) } + /** + * Create a [[CreateUserDefinedFunctionCommand]]. + * + * For example: + * {{{ + * CREATE [OR REPLACE] [TEMPORARY] FUNCTION [IF NOT EXISTS] [db_name.]function_name + * ([param_name param_type [COMMENT param_comment], ...]) + * RETURNS {ret_type | TABLE (ret_name ret_type [COMMENT ret_comment], ...])} + * [routine_characteristic] + * RETURN {expression | query }; + * + * routine_characteristic + * { LANGUAGE {SQL | IDENTIFIER} | + * [NOT] DETERMINISTIC | + * COMMENT function_comment | + * [CONTAINS SQL | READS SQL DATA] } + * }}} + */ + override def visitCreateUserDefinedFunction(ctx: CreateUserDefinedFunctionContext): LogicalPlan = + withOrigin(ctx) { + assert(ctx.expression != null || ctx.query != null) + + if (ctx.EXISTS != null && ctx.REPLACE != null) { + throw QueryParsingErrors.createFuncWithBothIfNotExistsAndReplaceError(ctx) + } + + val inputParamText = Option(ctx.parameters).map(source) + val returnTypeText: String = + if (ctx.RETURNS != null && + (Option(ctx.dataType).nonEmpty || Option(ctx.returnParams).nonEmpty)) { + source(Option(ctx.dataType).getOrElse(ctx.returnParams)) + } else { + "" + } + val exprText = Option(ctx.expression()).map(source) + val queryText = Option(ctx.query()).map(source) + + val (containsSQL, deterministic, comment, optionalLanguage) = + visitRoutineCharacteristics(ctx.routineCharacteristics()) + val language: RoutineLanguage = optionalLanguage.getOrElse(LanguageSQL) + val isTableFunc = ctx.TABLE() != null || returnTypeText.equalsIgnoreCase("table") + + withIdentClause(ctx.identifierReference(), functionIdentifier => { + if (ctx.TEMPORARY == null) { + // TODO: support creating persistent UDFs. 
+ operationNotAllowed(s"creating persistent SQL functions is not supported", ctx) + } else { + // Disallow to define a temporary function with `IF NOT EXISTS` + if (ctx.EXISTS != null) { + throw QueryParsingErrors.defineTempFuncWithIfNotExistsError(ctx) + } + + if (functionIdentifier.length > 2) { + throw QueryParsingErrors.unsupportedFunctionNameError(functionIdentifier, ctx) + } else if (functionIdentifier.length == 2) { + // Temporary function names should not contain database prefix like "database.function" + throw QueryParsingErrors.specifyingDBInCreateTempFuncError(functionIdentifier.head, ctx) + } + + CreateUserDefinedFunctionCommand( + functionIdentifier.asFunctionIdentifier, + inputParamText, + returnTypeText, + exprText, + queryText, + comment, + deterministic, + containsSQL, + language, + isTableFunc, + isTemp = true, + ctx.EXISTS != null, + ctx.REPLACE != null + ) + } + }) + } + + /** + * SQL function routine characteristics. + * Currently only deterministic clause and comment clause are used. + * + * routine language: [LANGUAGE SQL | IDENTIFIER] + * specific name: [SPECIFIC specific_name] + * routine data access: [NO SQL | CONTAINS SQL | READS SQL DATA | MODIFIES SQL DATA] + * routine null call: [RETURNS NULL ON NULL INPUT | CALLED ON NULL INPUT] + * routine determinism: [DETERMINISTIC | NOT DETERMINISTIC] + * comment: [COMMENT function_comment] + * rights: [SQL SECURITY INVOKER | SQL SECURITY DEFINER] + */ + override def visitRoutineCharacteristics(ctx: RoutineCharacteristicsContext) + : (Option[Boolean], Option[Boolean], Option[String], Option[RoutineLanguage]) = + withOrigin(ctx) { + checkDuplicateClauses(ctx.routineLanguage(), "LANGUAGE", ctx) + checkDuplicateClauses(ctx.specificName(), "SPECIFIC", ctx) + checkDuplicateClauses(ctx.sqlDataAccess(), "SQL DATA ACCESS", ctx) + checkDuplicateClauses(ctx.nullCall(), "NULL CALL", ctx) + checkDuplicateClauses(ctx.deterministic(), "DETERMINISTIC", ctx) + checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) + checkDuplicateClauses(ctx.rightsClause(), "SQL SECURITY RIGHTS", ctx) + + val language: Option[RoutineLanguage] = ctx + .routineLanguage() + .asScala + .headOption + .map(x => { + if (x.SQL() != null) { + LanguageSQL + } else { + val name: String = x.IDENTIFIER().getText() + operationNotAllowed(s"Unsupported language for user defined functions: $name", x) + } + }) + + val deterministic = ctx.deterministic().asScala.headOption.map(visitDeterminism) + val comment = visitCommentSpecList(ctx.commentSpec()) + + ctx.specificName().asScala.headOption.foreach(checkSpecificName) + ctx.nullCall().asScala.headOption.foreach(checkNullCall) + ctx.rightsClause().asScala.headOption.foreach(checkRightsClause) + val containsSQL: Option[Boolean] = + ctx.sqlDataAccess().asScala.headOption.map(visitDataAccess) + (containsSQL, deterministic, comment, language) + } + + /** + * Check if the function has a SPECIFIC name, + * which is a way to provide an alternative name for the function. + * This check applies for all user defined functions. + * Use functionName to specify the function that is currently checked. + */ + private def checkSpecificName(ctx: SpecificNameContext): Unit = + withOrigin(ctx) { + operationNotAllowed(s"SQL function with SPECIFIC name is not supported", ctx) + } + + private def checkNullCall(ctx: NullCallContext): Unit = withOrigin(ctx) { + if (ctx.RETURNS() != null) { + operationNotAllowed("SQL function with RETURNS NULL ON NULL INPUT is not supported", ctx) + } + } + + /** + * Check SQL function data access clause. 
Currently only READS SQL DATA and CONTAINS SQL + * are supported. Return true if the data access routine is CONTAINS SQL. + */ + private def visitDataAccess(ctx: SqlDataAccessContext): Boolean = withOrigin(ctx) { + if (ctx.NO() != null) { + operationNotAllowed("SQL function with NO SQL is not supported", ctx) + } + if (ctx.MODIFIES() != null) { + operationNotAllowed("SQL function with MODIFIES SQL DATA is not supported", ctx) + } + return ctx.READS() == null + } + + private def checkRightsClause(ctx: RightsClauseContext): Unit = withOrigin(ctx) { + if (ctx.INVOKER() != null) { + operationNotAllowed("SQL function with SQL SECURITY INVOKER is not supported", ctx) + } + } + + private def visitDeterminism(ctx: DeterministicContext): Boolean = withOrigin(ctx) { + blockBang(ctx.errorCapturingNot()) + ctx.errorCapturingNot() == null + } + /** * Create a DROP FUNCTION statement. * @@ -915,4 +1098,22 @@ class SparkSqlAstBuilder extends AstBuilder { (ctx.LOCAL != null, finalStorage, Some(DDLUtils.HIVE_PROVIDER)) } + + /** + * Create a [[UnsetNamespacePropertiesCommand]] command. + * + * Expected format: + * {{{ + * ALTER (DATABASE|SCHEMA|NAMESPACE) database + * UNSET (DBPROPERTIES | PROPERTIES) ('key1', 'key2'); + * }}} + */ + override def visitUnsetNamespaceProperties( + ctx: UnsetNamespacePropertiesContext): LogicalPlan = withOrigin(ctx) { + val properties = visitPropertyKeys(ctx.propertyList) + val cleanedProperties = cleanNamespaceProperties(properties.map(_ -> "").toMap, ctx).keys.toSeq + UnsetNamespacePropertiesCommand( + withIdentClause(ctx.identifierReference(), UnresolvedNamespace(_)), + cleanedProperties) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index d7ebf786168b8..ed7a6162cc9f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -31,7 +31,6 @@ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.streaming.{InternalOutputModes, StreamingRelationV2} import org.apache.spark.sql.catalyst.types.DataTypeUtils -import org.apache.spark.sql.catalyst.util.UnsafeRowUtils import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.execution.aggregate.AggUtils import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} @@ -206,20 +205,6 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } } - private def hashJoinSupported - (leftKeys: Seq[Expression], rightKeys: Seq[Expression]): Boolean = { - val result = leftKeys.concat(rightKeys).forall(e => UnsafeRowUtils.isBinaryStable(e.dataType)) - if (!result) { - val keysNotSupportingHashJoin = leftKeys.concat(rightKeys).filterNot( - e => UnsafeRowUtils.isBinaryStable(e.dataType)) - logWarning("Hash based joins are not supported due to " + - "joining on keys that don't support binary equality. " + - "Keys not supporting hash joins: " + keysNotSupportingHashJoin - .map(e => e.toString + " due to DataType: " + e.dataType.typeName).mkString(", ")) - } - result - } - def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { // If it is an equi-join, we first look at the join hints w.r.t. 
the following order: @@ -246,8 +231,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { val hashJoinSupport = hashJoinSupported(leftKeys, rightKeys) def createBroadcastHashJoin(onlyLookingAtHint: Boolean) = { if (hashJoinSupport) { - val buildSide = getBroadcastBuildSide( - left, right, joinType, hint, onlyLookingAtHint, conf) + val buildSide = getBroadcastBuildSide(j, onlyLookingAtHint, conf) checkHintBuildSide(onlyLookingAtHint, buildSide, joinType, hint, true) buildSide.map { buildSide => @@ -267,8 +251,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { def createShuffleHashJoin(onlyLookingAtHint: Boolean) = { if (hashJoinSupport) { - val buildSide = getShuffleHashJoinBuildSide( - left, right, joinType, hint, onlyLookingAtHint, conf) + val buildSide = getShuffleHashJoinBuildSide(j, onlyLookingAtHint, conf) checkHintBuildSide(onlyLookingAtHint, buildSide, joinType, hint, false) buildSide.map { buildSide => @@ -440,6 +423,18 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case EventTimeWatermark(columnName, delay, child) => EventTimeWatermarkExec(columnName, delay, planLater(child)) :: Nil + case UpdateEventTimeWatermarkColumn(columnName, delay, child) => + // we expect watermarkDelay to be resolved before physical planning. + if (delay.isEmpty) { + // This is a sanity check. We should not reach here as delay is updated during + // query plan resolution in [[ResolveUpdateEventTimeWatermarkColumn]] Analyzer rule. + throw SparkException.internalError( + "No watermark delay found in UpdateEventTimeWatermarkColumn logical node. " + + "You have hit a query analyzer bug. " + + "Please report your query to Spark user mailing list.") + } + UpdateEventTimeColumnExec(columnName, delay.get, None, planLater(child)) :: Nil + case PhysicalAggregation( namedGroupingExpressions, aggregateExpressions, rewrittenResultExpressions, child) => @@ -964,6 +959,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { execution.SampleExec(lb, ub, withReplacement, seed, planLater(child)) :: Nil case logical.LocalRelation(output, data, _) => LocalTableScanExec(output, data) :: Nil + case logical.EmptyRelation(l) => EmptyRelationExec(l) :: Nil case CommandResult(output, _, plan, data) => CommandResultExec(output, plan, data) :: Nil // We should match the combination of limit and offset first, to get the optimal physical // plan, instead of planning limit and offset separately. 
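The new `case logical.EmptyRelation(l) => EmptyRelationExec(l) :: Nil` arm above follows the usual strategy shape: pattern-match one logical operator and return its physical candidates. A toy, self-contained sketch of that shape, using hypothetical node types rather than Spark's classes:

```scala
// Toy sketch of the strategy pattern: logical node in, candidate physical nodes out.
// All types here are hypothetical analogues, not Spark's LogicalPlan/SparkPlan hierarchy.
object StrategySketch {
  sealed trait LogicalNode
  case class LocalRows(rows: Seq[String]) extends LogicalNode
  case class EmptyRel(original: LogicalNode) extends LogicalNode      // analogue of EmptyRelation

  sealed trait PhysicalNode
  case class LocalScanExec(rows: Seq[String]) extends PhysicalNode
  case class EmptyRelExec(original: LogicalNode) extends PhysicalNode // analogue of EmptyRelationExec

  def basicOperators(plan: LogicalNode): Seq[PhysicalNode] = plan match {
    case LocalRows(rows) => LocalScanExec(rows) :: Nil
    case e: EmptyRel     => EmptyRelExec(e.original) :: Nil // keep the eliminated plan for EXPLAIN output
  }

  def main(args: Array[String]): Unit =
    println(basicOperators(EmptyRel(LocalRows(Seq("a", "b")))))
}
```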
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 5a0bf09a1713b..920f61574770d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -24,6 +24,8 @@ import scala.collection.mutable import scala.util.control.NonFatal import org.apache.spark.{broadcast, SparkException, SparkUnsupportedOperationException} +import org.apache.spark.internal.LogKeys.{CODEGEN_STAGE_ID, CONFIG, ERROR, HUGE_METHOD_LIMIT, MAX_METHOD_CODE_SIZE, TREE_NODE} +import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -163,8 +165,10 @@ trait CodegenSupport extends SparkPlan { } } + @scala.annotation.nowarn("cat=deprecation") val inputVars = inputVarsCandidate match { - case stream: LazyList[ExprCode] => stream.force + case stream: Stream[ExprCode] => stream.force + case lazyList: LazyList[ExprCode] => lazyList.force case other => other } @@ -406,7 +410,7 @@ trait CodegenSupport extends SparkPlan { if (Utils.isTesting) { throw SparkException.internalError(errMsg) } else { - logWarning(s"[BUG] $errMsg Please open a JIRA ticket to report it.") + logWarning(log"[BUG] ${MDC(ERROR, errMsg)} Please open a JIRA ticket to report it.") } } if (parent.limitNotReachedChecks.isEmpty) { @@ -729,17 +733,21 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) } catch { case NonFatal(_) if !Utils.isTesting && conf.codegenFallback => // We should already saw the error message - logWarning(s"Whole-stage codegen disabled for plan (id=$codegenStageId):\n $treeString") + logWarning(log"Whole-stage codegen disabled for plan " + + log"(id=${MDC(CODEGEN_STAGE_ID, codegenStageId)}):\n " + + log"${MDC(TREE_NODE, treeString)}") return child.execute() } // Check if compiled code has a too large function if (compiledCodeStats.maxMethodCodeSize > conf.hugeMethodLimit) { - logInfo(s"Found too long generated codes and JIT optimization might not work: " + - s"the bytecode size (${compiledCodeStats.maxMethodCodeSize}) is above the limit " + - s"${conf.hugeMethodLimit}, and the whole-stage codegen was disabled " + - s"for this plan (id=$codegenStageId). To avoid this, you can raise the limit " + - s"`${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}`:\n$treeString") + logInfo(log"Found too long generated codes and JIT optimization might not work: " + + log"the bytecode size (${MDC(MAX_METHOD_CODE_SIZE, compiledCodeStats.maxMethodCodeSize)})" + + log" is above the limit ${MDC(HUGE_METHOD_LIMIT, conf.hugeMethodLimit)}, " + + log"and the whole-stage codegen was disabled for this plan " + + log"(id=${MDC(CODEGEN_STAGE_ID, codegenStageId)}). To avoid this, you can raise the limit" + + log" `${MDC(CONFIG, SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key)}`:\n" + + log"${MDC(TREE_NODE, treeString)}") return child.execute() } @@ -947,6 +955,10 @@ case class CollapseCodegenStages( // Do not make LogicalTableScanExec the root of WholeStageCodegen // to support the fast driver-local collect/take paths. plan + case plan: EmptyRelationExec => + // Do not make EmptyRelationExec the root of WholeStageCodegen + // to support the fast driver-local collect/take paths. 
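When the logInfo above reports that compiled methods exceed the huge-method limit and whole-stage codegen gets disabled for a plan, the limit it names can be raised per session. A minimal sketch; the value is arbitrary and the query is only there to trigger codegen:

```scala
import org.apache.spark.sql.SparkSession

// Sketch: raise spark.sql.codegen.hugeMethodLimit for the current session, as suggested by the
// log message above when generated bytecode is too large for JIT-friendly compilation.
object HugeMethodLimitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("codegen-limit").getOrCreate()
    spark.conf.set("spark.sql.codegen.hugeMethodLimit", 16000L) // illustrative value
    spark.range(10).selectExpr("id * 2 AS doubled").show()
    spark.stop()
  }
}
```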
+ plan case plan: CommandResultExec => // Do not make CommandResultExec the root of WholeStageCodegen // to support the fast driver-local collect/take paths. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala index 7bee641a00e73..014d23f2f4101 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.adaptive +import org.apache.spark.internal.LogKeys.{BATCH_NAME, RULE_NAME} +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.analysis.UpdateAttributeNullability import org.apache.spark.sql.catalyst.optimizer.{ConvertToLocalRelation, EliminateLimits, OptimizeOneRowPlan} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LogicalPlanIntegrity} @@ -52,7 +54,8 @@ class AQEOptimizer(conf: SQLConf, extendedRuntimeOptimizerRules: Seq[Rule[Logica val filteredRules = batch.rules.filter { rule => val exclude = excludedRules.contains(rule.ruleName) if (exclude) { - logInfo(s"Optimization rule '${rule.ruleName}' is excluded from the optimizer.") + logInfo(log"Optimization rule '${MDC(RULE_NAME, rule.ruleName)}' is excluded from " + + log"the optimizer.") } !exclude } @@ -61,8 +64,8 @@ class AQEOptimizer(conf: SQLConf, extendedRuntimeOptimizerRules: Seq[Rule[Logica } else if (filteredRules.nonEmpty) { Some(Batch(batch.name, batch.strategy, filteredRules: _*)) } else { - logInfo(s"Optimization batch '${batch.name}' is excluded from the optimizer " + - s"as all enclosed rules have been excluded.") + logInfo(log"Optimization batch '${MDC(BATCH_NAME, batch.name)}' is excluded from " + + log"the optimizer as all enclosed rules have been excluded.") None } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEPropagateEmptyRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEPropagateEmptyRelation.scala index 7951a6f36b9bd..7b3e0cd549b85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEPropagateEmptyRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEPropagateEmptyRelation.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.optimizer.PropagateEmptyRelationBase import org.apache.spark.sql.catalyst.planning.ExtractSingleColumnNullAwareAntiJoin +import org.apache.spark.sql.catalyst.plans.logical.EmptyRelation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.TreePattern.{LOCAL_RELATION, LOGICAL_QUERY_STAGE, TRUE_OR_FALSE_LITERAL} import org.apache.spark.sql.execution.aggregate.BaseAggregateExec @@ -39,6 +40,8 @@ object AQEPropagateEmptyRelation extends PropagateEmptyRelationBase { override protected def nonEmpty(plan: LogicalPlan): Boolean = super.nonEmpty(plan) || getEstimatedRowCount(plan).exists(_ > 0) + override protected def empty(plan: LogicalPlan): LogicalPlan = EmptyRelation(plan) + private def isRootRepartition(plan: LogicalPlan): Boolean = plan match { case l: LogicalQueryStage if l.getTagValue(ROOT_REPARTITION).isDefined => true case _ => false @@ -61,6 +64,8 @@ object AQEPropagateEmptyRelation extends PropagateEmptyRelationBase { None } + case _: EmptyRelation => Some(0) + case _ => None } @@ -82,6 +87,13 @@ object 
AQEPropagateEmptyRelation extends PropagateEmptyRelationBase { case _ => false } + // A broadcast query stage can't be executed without the join operator. + // TODO: we can return the original query plan before broadcast. + override protected def canExecuteWithoutJoin(plan: LogicalPlan): Boolean = plan match { + case LogicalQueryStage(_, _: BroadcastQueryStageExec) => false + case _ => true + } + override protected def applyInternal(p: LogicalPlan): LogicalPlan = p.transformUpWithPruning( // LOCAL_RELATION and TRUE_OR_FALSE_LITERAL pattern are matched at // `PropagateEmptyRelationBase.commonApplyFunc` diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveRuleContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveRuleContext.scala new file mode 100644 index 0000000000000..23817be71c89c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveRuleContext.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable + +import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.sql.catalyst.SQLConfHelper + +/** + * Provide the functionality to modify the next plan fragment configs in AQE rules. + * The configs will be cleanup before going to execute next plan fragment. + * To get instance, use: {{{ AdaptiveRuleContext.get() }}} + * + * @param isSubquery if the input query plan is subquery + * @param isFinalStage if the next stage is final stage + */ +@Experimental +@DeveloperApi +case class AdaptiveRuleContext(isSubquery: Boolean, isFinalStage: Boolean) { + + /** + * Set SQL configs for next plan fragment. The configs will affect all of rules in AQE, + * i.e., the runtime optimizer, planner, queryStagePreparationRules, queryStageOptimizerRules, + * columnarRules. + * This configs will be cleared before going to get the next plan fragment. 
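As a usage sketch of the context introduced above: a hypothetical AQE rule (not part of this change) that consults AdaptiveRuleContext.get() and pushes a SQL conf that should only apply to the next plan fragment of the final stage.

```scala
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.AdaptiveRuleContext

// Hypothetical rule, for illustration only: the plan is returned untouched and a config is set
// for the next plan fragment. AdaptiveRuleContext.get() is defined only while running inside AQE.
class TuneFinalStageRule extends Rule[SparkPlan] {
  override def apply(plan: SparkPlan): SparkPlan = {
    AdaptiveRuleContext.get().foreach { ctx =>
      if (ctx.isFinalStage) {
        ctx.setConfig("spark.sql.adaptive.advisoryPartitionSizeInBytes", "128MB")
      }
    }
    plan
  }
}
```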
+ */ + private val nextPlanFragmentConf = new mutable.HashMap[String, String]() + + private[sql] def withFinalStage(isFinalStage: Boolean): AdaptiveRuleContext = { + if (this.isFinalStage == isFinalStage) { + this + } else { + val newRuleContext = copy(isFinalStage = isFinalStage) + newRuleContext.setConfigs(this.configs()) + newRuleContext + } + } + + def setConfig(key: String, value: String): Unit = { + nextPlanFragmentConf.put(key, value) + } + + def setConfigs(kvs: Map[String, String]): Unit = { + kvs.foreach(kv => nextPlanFragmentConf.put(kv._1, kv._2)) + } + + private[sql] def configs(): Map[String, String] = nextPlanFragmentConf.toMap + + private[sql] def clearConfigs(): Unit = nextPlanFragmentConf.clear() +} + +object AdaptiveRuleContext extends SQLConfHelper { + private val ruleContextThreadLocal = new ThreadLocal[AdaptiveRuleContext] + + /** + * If a rule is applied inside AQE then the returned value is always defined, else return None. + */ + def get(): Option[AdaptiveRuleContext] = Option(ruleContextThreadLocal.get()) + + private[sql] def withRuleContext[T](ruleContext: AdaptiveRuleContext)(block: => T): T = { + assert(ruleContext != null) + val origin = ruleContextThreadLocal.get() + ruleContextThreadLocal.set(ruleContext) + try { + val conf = ruleContext.configs() + withSQLConf(conf.toSeq: _*) { + block + } + } finally { + ruleContextThreadLocal.set(origin) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index a5e681535cb82..f21960aeedd64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.adaptive import java.util -import java.util.concurrent.LinkedBlockingQueue +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} import scala.collection.concurrent.TrieMap import scala.collection.mutable @@ -28,6 +28,8 @@ import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.broadcast +import org.apache.spark.internal.{MDC, MessageWithContext} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow @@ -74,13 +76,32 @@ case class AdaptiveSparkPlanExec( @transient private val lock = new Object() - @transient private val logOnLevel: ( => String) => Unit = conf.adaptiveExecutionLogLevel match { - case "TRACE" => logTrace(_) - case "DEBUG" => logDebug(_) - case "INFO" => logInfo(_) - case "WARN" => logWarning(_) - case "ERROR" => logError(_) - case _ => logDebug(_) + @transient private val logOnLevel: ( => MessageWithContext) => Unit = + conf.adaptiveExecutionLogLevel match { + case "TRACE" => logTrace(_) + case "INFO" => logInfo(_) + case "WARN" => logWarning(_) + case "ERROR" => logError(_) + case _ => logDebug(_) + } + + @transient private var ruleContext = new AdaptiveRuleContext( + isSubquery = isSubquery, + isFinalStage = false) + + private def withRuleContext[T](f: => T): T = + AdaptiveRuleContext.withRuleContext(ruleContext) { f } + + private def applyPhysicalRulesWithRuleContext( + plan: => SparkPlan, + rules: Seq[Rule[SparkPlan]], + loggerAndBatchName: Option[(PlanChangeLogger[SparkPlan], String)] = None): SparkPlan = { + // Apply the last rules if exists 
before going to apply the next batch of rules, + // so that we can propagate the configs. + val newPlan = plan + withRuleContext { + applyPhysicalRules(newPlan, rules, loggerAndBatchName) + } } @transient private val planChangeLogger = new PlanChangeLogger[SparkPlan]() @@ -159,7 +180,9 @@ case class AdaptiveSparkPlanExec( collapseCodegenStagesRule ) - private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { + private def optimizeQueryStage( + plan: SparkPlan, + isFinalStage: Boolean): SparkPlan = withRuleContext { val rules = if (isFinalStage && !conf.getConf(SQLConf.ADAPTIVE_EXECUTION_APPLY_FINAL_STAGE_SHUFFLE_OPTIMIZATIONS)) { queryStageOptimizerRules.filterNot(_.isInstanceOf[AQEShuffleReadRule]) @@ -195,7 +218,7 @@ case class AdaptiveSparkPlanExec( } private def applyQueryPostPlannerStrategyRules(plan: SparkPlan): SparkPlan = { - applyPhysicalRules( + applyPhysicalRulesWithRuleContext( plan, context.session.sessionState.adaptiveRulesHolder.queryPostPlannerStrategyRules, Some((planChangeLogger, "AQE Query Post Planner Strategy Rules")) @@ -203,7 +226,7 @@ case class AdaptiveSparkPlanExec( } @transient val initialPlan = context.session.withActive { - applyPhysicalRules( + applyPhysicalRulesWithRuleContext( applyQueryPostPlannerStrategyRules(inputPlan), queryStagePreparationRules, Some((planChangeLogger, "AQE Preparations"))) @@ -280,6 +303,7 @@ case class AdaptiveSparkPlanExec( val errors = new mutable.ArrayBuffer[Throwable]() var stagesToReplace = Seq.empty[QueryStageExec] while (!result.allChildStagesMaterialized) { + ruleContext.clearConfigs() currentPhysicalPlan = result.newPlan if (result.newStages.nonEmpty) { stagesToReplace = result.newStages ++ stagesToReplace @@ -302,6 +326,11 @@ case class AdaptiveSparkPlanExec( try { stage.materialize().onComplete { res => if (res.isSuccess) { + // record shuffle IDs for successful stages for cleanup + stage.plan.collect { + case s: ShuffleExchangeLike => + context.shuffleIds.put(s.shuffleId, true) + } events.offer(StageSuccess(stage, res.get)) } else { events.offer(StageFailure(stage, res.failed.get)) @@ -353,8 +382,9 @@ case class AdaptiveSparkPlanExec( val newCost = costEvaluator.evaluateCost(newPhysicalPlan) if (newCost < origCost || (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) { - logOnLevel("Plan changed:\n" + - sideBySide(currentPhysicalPlan.treeString, newPhysicalPlan.treeString).mkString("\n")) + lazy val plans = + sideBySide(currentPhysicalPlan.treeString, newPhysicalPlan.treeString).mkString("\n") + logOnLevel(log"Plan changed:\n${MDC(QUERY_PLAN, plans)}") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan @@ -365,11 +395,13 @@ case class AdaptiveSparkPlanExec( result = createQueryStages(currentPhysicalPlan) } + ruleContext = ruleContext.withFinalStage(isFinalStage = true) // Run the final plan when there's no more unfinished stages. 
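The "Plan changed:" and "Final plan:" messages produced by logOnLevel above are emitted at the level chosen by the adaptive log-level conf. A small sketch that surfaces them at INFO with default log4j settings; the query is arbitrary, just enough to run through AQE:

```scala
import org.apache.spark.sql.SparkSession

// Sketch: make the AQE plan-change messages visible at INFO instead of the default DEBUG.
object AqeLogLevelSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("aqe-log-level").getOrCreate()
    spark.conf.set("spark.sql.adaptive.enabled", "true")
    spark.conf.set("spark.sql.adaptive.logLevel", "INFO")
    spark.range(0, 10000).selectExpr("id % 10 AS bucket").groupBy("bucket").count().collect()
    spark.stop()
  }
}
```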
- currentPhysicalPlan = applyPhysicalRules( + currentPhysicalPlan = applyPhysicalRulesWithRuleContext( optimizeQueryStage(result.newPlan, isFinalStage = true), postStageCreationRules(supportsColumnar), Some((planChangeLogger, "AQE Post Stage Creation"))) + ruleContext.clearConfigs() _isFinalPlan = true executionId.foreach(onUpdatePlan(_, Seq(currentPhysicalPlan))) currentPhysicalPlan @@ -384,7 +416,7 @@ case class AdaptiveSparkPlanExec( if (shouldUpdatePlan && currentPhysicalPlan.exists(_.subqueries.nonEmpty)) { getExecutionId.foreach(onUpdatePlan(_, Seq.empty)) } - logOnLevel(s"Final plan:\n$currentPhysicalPlan") + logOnLevel(log"Final plan:\n${MDC(QUERY_PLAN, currentPhysicalPlan)}") } override def executeCollect(): Array[InternalRow] = { @@ -587,7 +619,7 @@ case class AdaptiveSparkPlanExec( val queryStage = plan match { case e: Exchange => val optimized = e.withNewChildren(Seq(optimizeQueryStage(e.child, isFinalStage = false))) - val newPlan = applyPhysicalRules( + val newPlan = applyPhysicalRulesWithRuleContext( optimized, postStageCreationRules(outputsColumnar = plan.supportsColumnar), Some((planChangeLogger, "AQE Post Stage Creation"))) @@ -714,9 +746,11 @@ case class AdaptiveSparkPlanExec( private def reOptimize(logicalPlan: LogicalPlan): Option[(SparkPlan, LogicalPlan)] = { try { logicalPlan.invalidateStatsCache() - val optimized = optimizer.execute(logicalPlan) - val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() - val newPlan = applyPhysicalRules( + val optimized = withRuleContext { optimizer.execute(logicalPlan) } + val sparkPlan = withRuleContext { + context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() + } + val newPlan = applyPhysicalRulesWithRuleContext( applyQueryPostPlannerStrategyRules(sparkPlan), preprocessingRules ++ queryStagePreparationRules, Some((planChangeLogger, "AQE Replanning"))) @@ -737,7 +771,8 @@ case class AdaptiveSparkPlanExec( Some((finalPlan, optimized)) } catch { case e: InvalidAQEPlanException[_] => - logOnLevel(s"Re-optimize - ${e.getMessage()}:\n${e.plan}") + logOnLevel(log"Re-optimize - ${MDC(ERROR, e.getMessage())}:\n" + + log"${MDC(QUERY_PLAN, e.plan)}") None } } @@ -795,7 +830,8 @@ case class AdaptiveSparkPlanExec( s.cancel() } catch { case NonFatal(t) => - logError(s"Exception in cancelling query stage: ${s.treeString}", t) + logError(log"Exception in cancelling query stage: " + + log"${MDC(QUERY_PLAN, s.treeString)}", t) } case _ => } @@ -869,6 +905,8 @@ case class AdaptiveExecutionContext(session: SparkSession, qe: QueryExecution) { */ val stageCache: TrieMap[SparkPlan, ExchangeQueryStageExec] = new TrieMap[SparkPlan, ExchangeQueryStageExec]() + + val shuffleIds: ConcurrentHashMap[Int, Boolean] = new ConcurrentHashMap[Int, Boolean]() } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala index 50f2b7c81453e..8517911d70262 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.adaptive import scala.collection.mutable +import org.apache.spark.internal.LogKeys.{CONFIG, SUB_QUERY} +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.expressions import 
org.apache.spark.sql.catalyst.expressions.{DynamicPruningSubquery, ListQuery, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -67,8 +69,8 @@ case class InsertAdaptiveSparkPlan( AdaptiveSparkPlanExec(newPlan, adaptiveExecutionContext, preprocessingRules, isSubquery) } catch { case SubqueryAdaptiveNotSupportedException(subquery) => - logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + - s"but is not supported for sub-query: $subquery.") + logWarning(log"${MDC(CONFIG, SQLConf.ADAPTIVE_EXECUTION_ENABLED.key)} is enabled " + + log"but is not supported for sub-query: ${MDC(SUB_QUERY, subquery)}.") plan } } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala index 8ce2452cc141d..506f52fd9072e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} -import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, RepartitionOperation, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RepartitionOperation, Statistics} +import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.trees.TreePattern.{LOGICAL_QUERY_STAGE, REPARTITION_OPERATION, TreePattern} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.aggregate.BaseAggregateExec @@ -35,8 +36,8 @@ import org.apache.spark.sql.execution.aggregate.BaseAggregateExec // TODO we can potentially include only [[QueryStageExec]] in this class if we make the aggregation // planning aware of partitioning. 
case class LogicalQueryStage( - logicalPlan: LogicalPlan, - physicalPlan: SparkPlan) extends LeafNode { + override val logicalPlan: LogicalPlan, + override val physicalPlan: SparkPlan) extends logical.LogicalQueryStage { override def output: Seq[Attribute] = logicalPlan.output override val isStreaming: Boolean = logicalPlan.isStreaming @@ -71,4 +72,14 @@ case class LogicalQueryStage( } override def maxRows: Option[Long] = stats.rowCount.map(_.min(Long.MaxValue).toLong) + + override def isMaterialized: Boolean = physicalPlan.exists { + case s: QueryStageExec => s.isMaterialized + case _ => false + } + + override def isDirectStage: Boolean = physicalPlan match { + case _: QueryStageExec => true + case _ => false + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala index 7db9271aee0c4..71e138e6152b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala @@ -21,7 +21,7 @@ import java.util.concurrent.atomic.AtomicReference import scala.concurrent.Future -import org.apache.spark.{FutureAction, MapOutputStatistics, SparkException} +import org.apache.spark.{MapOutputStatistics, SparkException} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -51,13 +51,18 @@ abstract class QueryStageExec extends LeafExecNode { */ val plan: SparkPlan + /** + * Name of this query stage which is unique in the entire query plan. + */ + val name: String = s"${this.getClass.getSimpleName}-$id" + /** * Materialize this query stage, to prepare for the execution, like submitting map stages, * broadcasting data, etc. The caller side can use the returned [[Future]] to wait until this * stage is ready. */ final def materialize(): Future[Any] = { - logDebug(s"Materialize query stage ${this.getClass.getSimpleName}: $id") + logDebug(s"Materialize query stage: $name") doMaterialize() } @@ -151,7 +156,12 @@ abstract class ExchangeQueryStageExec extends QueryStageExec { /** * Cancel the stage materialization if in progress; otherwise do nothing. */ - def cancel(): Unit + final def cancel(): Unit = { + logDebug(s"Cancel query stage: $name") + doCancel() + } + + protected def doCancel(): Unit /** * The canonicalized plan before applying query stage optimizer rules. @@ -184,9 +194,7 @@ case class ShuffleQueryStageExec( def advisoryPartitionSize: Option[Long] = shuffle.advisoryPartitionSize - @transient private lazy val shuffleFuture = shuffle.submitShuffleJob - - override protected def doMaterialize(): Future[Any] = shuffleFuture + override protected def doMaterialize(): Future[Any] = shuffle.submitShuffleJob override def newReuseInstance( newStageId: Int, newOutput: Seq[Attribute]): ExchangeQueryStageExec = { @@ -198,18 +206,14 @@ case class ShuffleQueryStageExec( reuse } - override def cancel(): Unit = shuffleFuture match { - case action: FutureAction[MapOutputStatistics] if !action.isCompleted => - action.cancel() - case _ => - } + override protected def doCancel(): Unit = shuffle.cancelShuffleJob /** * Returns the Option[MapOutputStatistics]. If the shuffle map stage has no partition, * this method returns None, as there is no map statistics. 
   */
  def mapStats: Option[MapOutputStatistics] = {
-    assert(resultOption.get().isDefined, s"${getClass.getSimpleName} should already be ready")
+    assert(resultOption.get().isDefined, s"$name should already be ready")
     val stats = resultOption.get().get.asInstanceOf[MapOutputStatistics]
     Option(stats)
   }
@@ -236,9 +240,7 @@ case class BroadcastQueryStageExec(
       throw SparkException.internalError(s"wrong plan for broadcast stage:\n ${plan.treeString}")
   }
-  override protected def doMaterialize(): Future[Any] = {
-    broadcast.submitBroadcastJob
-  }
+  override protected def doMaterialize(): Future[Any] = broadcast.submitBroadcastJob
   override def newReuseInstance(
       newStageId: Int, newOutput: Seq[Attribute]): ExchangeQueryStageExec = {
@@ -250,12 +252,7 @@ case class BroadcastQueryStageExec(
     reuse
   }
-  override def cancel(): Unit = {
-    if (!broadcast.relationFuture.isDone) {
-      sparkContext.cancelJobsWithTag(broadcast.jobTag)
-      broadcast.relationFuture.cancel(true)
-    }
-  }
+  override protected def doCancel(): Unit = broadcast.cancelBroadcastJob()
   override def getRuntimeStatistics: Statistics = broadcast.runtimeStatistics
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala
index 9370b3d8d1d74..bb7d904402ded 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.adaptive
 import scala.collection.mutable.ArrayBuffer
 import org.apache.spark.{MapOutputStatistics, MapOutputTrackerMaster, SparkEnv}
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, LogKeys, MDC}
 import org.apache.spark.sql.execution.{CoalescedPartitionSpec, PartialReducerPartitionSpec, ShufflePartitionSpec}
 object ShufflePartitionsUtil extends Logging {
@@ -61,8 +61,10 @@ object ShufflePartitionsUtil extends Logging {
     val targetSize = maxTargetSize.min(advisoryTargetSize).max(minPartitionSize)
     val shuffleIds = mapOutputStatistics.flatMap(_.map(_.shuffleId)).mkString(", ")
-    logInfo(s"For shuffle($shuffleIds), advisory target size: $advisoryTargetSize, " +
-      s"actual target size $targetSize, minimum partition size: $minPartitionSize")
+    logInfo(log"For shuffle(${MDC(LogKeys.SHUFFLE_ID, shuffleIds)}), advisory target size: " +
+      log"${MDC(LogKeys.ADVISORY_TARGET_SIZE, advisoryTargetSize)}, actual target size " +
+      log"${MDC(LogKeys.TARGET_SIZE, targetSize)}, minimum partition size: " +
+      log"${MDC(LogKeys.PARTITION_SIZE, minPartitionSize)}")
     // If `inputPartitionSpecs` are all empty, it means skew join optimization is not applied.
if (inputPartitionSpecs.forall(_.isEmpty)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregateCodegenSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregateCodegenSupport.scala index 9523bf1a1c023..e2d8ac8988043 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregateCodegenSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregateCodegenSupport.scala @@ -18,6 +18,8 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.SparkException +import org.apache.spark.internal.LogKeys.MAX_JVM_METHOD_PARAMS_LENGTH +import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, ExpressionEquals, UnsafeRow} @@ -340,11 +342,11 @@ trait AggregateCodegenSupport } Some(splitCodes) } else { - val errMsg = "Failed to split aggregate code into small functions because the parameter " + - "length of at least one split function went over the JVM limit: " + - CodeGenerator.MAX_JVM_METHOD_PARAMS_LENGTH + val errMsg = log"Failed to split aggregate code into small functions because the " + + log"parameter length of at least one split function went over the JVM limit: " + + log"${MDC(MAX_JVM_METHOD_PARAMS_LENGTH, CodeGenerator.MAX_JVM_METHOD_PARAMS_LENGTH)}" if (Utils.isTesting) { - throw SparkException.internalError(errMsg) + throw SparkException.internalError(errMsg.message) } else { logInfo(errMsg) None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index bdf17607d77c5..8f2b7ca5cba25 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -22,6 +22,8 @@ import java.util.concurrent.TimeUnit._ import scala.collection.mutable import org.apache.spark.TaskContext +import org.apache.spark.internal.LogKeys.CONFIG +import org.apache.spark.internal.MDC import org.apache.spark.memory.SparkOutOfMemoryError import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -410,8 +412,8 @@ case class HashAggregateExec( private def enableTwoLevelHashMap(): Unit = { if (!checkIfFastHashMapSupported()) { if (!Utils.isTesting) { - logInfo(s"${SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key} is set to true, but" - + " current version of codegened fast hashmap does not support this aggregate.") + logInfo(log"${MDC(CONFIG, SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key)} is set to true, but" + + log" current version of codegened fast hashmap does not support this aggregate.") } } else { isFastHashMapEnabled = true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala index 57b8fd8570f2b..a4a6dc8e4ab01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.{SparkEnv, SparkException, TaskContext} -import org.apache.spark.internal.{config, Logging} +import 
org.apache.spark.internal.{config, Logging, MDC} +import org.apache.spark.internal.LogKeys.{CONFIG, HASH_MAP_SIZE, OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ @@ -174,10 +175,12 @@ class ObjectAggregationIterator( // The hash map gets too large, makes a sorted spill and clear the map. if (hashMap.size >= fallbackCountThreshold && inputRows.hasNext) { logInfo( - s"Aggregation hash map size ${hashMap.size} reaches threshold " + - s"capacity ($fallbackCountThreshold entries), spilling and falling back to sort" + - " based aggregation. You may change the threshold by adjust option " + - SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key + log"Aggregation hash map size ${MDC(HASH_MAP_SIZE, hashMap.size)} reaches threshold " + + log"capacity " + + log"(${MDC(OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD, fallbackCountThreshold)}" + + log" entries), spilling and falling back to sort based aggregation. You may change " + + log"the threshold by adjust option " + + log"${MDC(CONFIG, SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key)}" ) // Falls back to sort-based aggregation diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala index 4a922dcb062e5..9652a48e5270e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala @@ -100,8 +100,8 @@ private[columnar] class FloatColumnAccessor(buffer: ByteBuffer) private[columnar] class DoubleColumnAccessor(buffer: ByteBuffer) extends NativeColumnAccessor(buffer, DOUBLE) -private[columnar] class StringColumnAccessor(buffer: ByteBuffer) - extends NativeColumnAccessor(buffer, STRING) +private[columnar] class StringColumnAccessor(buffer: ByteBuffer, dataType: StringType) + extends NativeColumnAccessor(buffer, STRING(dataType)) private[columnar] class BinaryColumnAccessor(buffer: ByteBuffer) extends BasicColumnAccessor[Array[Byte]](buffer, BINARY) @@ -147,7 +147,7 @@ private[sql] object ColumnAccessor { new LongColumnAccessor(buf) case FloatType => new FloatColumnAccessor(buf) case DoubleType => new DoubleColumnAccessor(buf) - case StringType => new StringColumnAccessor(buf) + case s: StringType => new StringColumnAccessor(buf, s) case BinaryType => new BinaryColumnAccessor(buf) case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => new CompactDecimalColumnAccessor(buf, dt) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala index 367547155beef..9fafdb7948416 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala @@ -122,7 +122,8 @@ private[columnar] class DoubleColumnBuilder extends NativeColumnBuilder(new DoubleColumnStats, DOUBLE) private[columnar] -class StringColumnBuilder extends NativeColumnBuilder(new StringColumnStats, STRING) +class StringColumnBuilder(dataType: StringType) + extends NativeColumnBuilder(new StringColumnStats(dataType), STRING(dataType)) private[columnar] class BinaryColumnBuilder extends ComplexColumnBuilder(new BinaryColumnStats, BINARY) @@ -185,7 +186,7 
@@ private[columnar] object ColumnBuilder { new LongColumnBuilder case FloatType => new FloatColumnBuilder case DoubleType => new DoubleColumnBuilder - case StringType => new StringColumnBuilder + case s: StringType => new StringColumnBuilder(s) case BinaryType => new BinaryColumnBuilder case CalendarIntervalType => new IntervalColumnBuilder case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala index 18ef84262aad3..45f489cb13c2a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala @@ -255,14 +255,16 @@ private[columnar] final class DoubleColumnStats extends ColumnStats { Array[Any](lower, upper, nullCount, count, sizeInBytes) } -private[columnar] final class StringColumnStats extends ColumnStats { +private[columnar] final class StringColumnStats(collationId: Int) extends ColumnStats { + def this(dt: StringType) = this(dt.collationId) + protected var upper: UTF8String = null protected var lower: UTF8String = null override def gatherStats(row: InternalRow, ordinal: Int): Unit = { if (!row.isNullAt(ordinal)) { val value = row.getUTF8String(ordinal) - val size = STRING.actualSize(row, ordinal) + val size = STRING(collationId).actualSize(row, ordinal) gatherValueStats(value, size) } else { gatherNullStats() @@ -270,8 +272,8 @@ private[columnar] final class StringColumnStats extends ColumnStats { } def gatherValueStats(value: UTF8String, size: Int): Unit = { - if (upper == null || value.binaryCompare(upper) > 0) upper = value.clone() - if (lower == null || value.binaryCompare(lower) < 0) lower = value.clone() + if (upper == null || value.semanticCompare(upper, collationId) > 0) upper = value.clone() + if (lower == null || value.semanticCompare(lower, collationId) < 0) lower = value.clone() sizeInBytes += size count += 1 } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala index ee1f9b4133026..b8e63294f3cdc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala @@ -491,8 +491,8 @@ private[columnar] trait DirectCopyColumnType[JvmType] extends ColumnType[JvmType } } -private[columnar] object STRING - extends NativeColumnType(PhysicalStringType(StringType.collationId), 8) +private[columnar] case class STRING(collationId: Int) + extends NativeColumnType(PhysicalStringType(collationId), 8) with DirectCopyColumnType[UTF8String] { override def actualSize(row: InternalRow, ordinal: Int): Int = { @@ -532,6 +532,12 @@ private[columnar] object STRING override def clone(v: UTF8String): UTF8String = v.clone() } +private[columnar] object STRING { + def apply(dt: StringType): STRING = { + STRING(dt.collationId) + } +} + private[columnar] case class COMPACT_DECIMAL(precision: Int, scale: Int) extends NativeColumnType(PhysicalDecimalType(precision, scale), 8) { @@ -821,7 +827,7 @@ private[columnar] object ColumnType { case LongType | TimestampType | TimestampNTZType | _: DayTimeIntervalType => LONG case FloatType => FLOAT case DoubleType => DOUBLE - case StringType => STRING + case s: StringType => STRING(s) case BinaryType => BINARY case i: 
CalendarIntervalType => CALENDAR_INTERVAL case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => COMPACT_DECIMAL(dt) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index 5eadc7d47c92e..75416b8789142 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -86,7 +86,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera classOf[LongColumnAccessor].getName case FloatType => classOf[FloatColumnAccessor].getName case DoubleType => classOf[DoubleColumnAccessor].getName - case StringType => classOf[StringColumnAccessor].getName + case _: StringType => classOf[StringColumnAccessor].getName case BinaryType => classOf[BinaryColumnAccessor].getName case CalendarIntervalType => classOf[IntervalColumnAccessor].getName case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => @@ -101,7 +101,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera val createCode = dt match { case t if CodeGenerator.isPrimitiveType(dt) => s"$accessorName = new $accessorCls(ByteBuffer.wrap(buffers[$index]).order(nativeOrder));" - case NullType | StringType | BinaryType | CalendarIntervalType => + case NullType | BinaryType | CalendarIntervalType => s"$accessorName = new $accessorCls(ByteBuffer.wrap(buffers[$index]).order(nativeOrder));" case other => s"""$accessorName = new $accessorCls(ByteBuffer.wrap(buffers[$index]).order(nativeOrder), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala index 46044f6919d17..86d76856e12bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala @@ -176,7 +176,7 @@ private[columnar] case object RunLengthEncoding extends CompressionScheme { } override def supports(columnType: ColumnType[_]): Boolean = columnType match { - case INT | LONG | SHORT | BYTE | STRING | BOOLEAN => true + case INT | LONG | SHORT | BYTE | _: STRING | BOOLEAN => true case _ => false } @@ -373,7 +373,7 @@ private[columnar] case object DictionaryEncoding extends CompressionScheme { } override def supports(columnType: ColumnType[_]): Boolean = columnType match { - case INT | LONG | STRING => true + case INT | LONG | _: STRING => true case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 299f41eb55e17..65a7a0ebbd916 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -42,7 +42,7 @@ case class AnalyzeColumnCommand( val sessionState = sparkSession.sessionState tableIdent.database match { - case Some(db) if db == sparkSession.sharedState.globalTempViewManager.database => + case Some(db) if db == sparkSession.sharedState.globalTempDB => val plan = 
        sessionState.catalog.getGlobalTempView(tableIdent.identifier).getOrElse {
           throw QueryCompilationErrors.noSuchTableError(db, tableIdent.identifier)
         }
@@ -61,8 +61,8 @@ case class AnalyzeColumnCommand(
   private def analyzeColumnInCachedData(plan: LogicalPlan, sparkSession: SparkSession): Boolean = {
     val cacheManager = sparkSession.sharedState.cacheManager
-    val planToLookup = sparkSession.sessionState.executePlan(plan).analyzed
-    cacheManager.lookupCachedData(planToLookup).map { cachedData =>
+    val df = Dataset.ofRows(sparkSession, plan)
+    cacheManager.lookupCachedData(df).map { cachedData =>
       val columnsToAnalyze = getColumnsToAnalyze(
         tableIdent, cachedData.cachedRepresentation, columnNames, allColumns)
       cacheManager.analyzeColumnCacheQuery(sparkSession, cachedData, columnsToAnalyze)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTablesCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTablesCommand.scala
index c9b22a7d1b258..1650af74bc242 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTablesCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTablesCommand.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.command
 import scala.util.control.NonFatal
+import org.apache.spark.internal.LogKeys.{DATABASE_NAME, ERROR, TABLE_NAME}
+import org.apache.spark.internal.MDC
 import org.apache.spark.sql.{Row, SparkSession}
@@ -37,8 +39,8 @@ case class AnalyzeTablesCommand(
         CommandUtils.analyzeTable(sparkSession, tbl, noScan)
       } catch {
         case NonFatal(e) =>
-          logWarning(s"Failed to analyze table ${tbl.table} in the " +
-            s"database $db because of ${e.toString}", e)
+          logWarning(log"Failed to analyze table ${MDC(TABLE_NAME, tbl.table)} in the " +
+            log"database ${MDC(DATABASE_NAME, db)} because of ${MDC(ERROR, e.toString)}", e)
       }
     }
     Seq.empty[Row]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index eccf16ecea13f..7acd1cb0852b9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -24,7 +24,8 @@ import scala.util.control.NonFatal
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKeys.{COUNT, DATABASE_NAME, ERROR, TABLE_NAME, TIME}
 import org.apache.spark.sql.{Column, SparkSession}
 import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.{ResolvedIdentifier, UnresolvedAttribute}
@@ -91,11 +92,12 @@ object CommandUtils extends Logging {
     } else {
       // Calculate table size as a sum of the visible partitions.
See SPARK-21079 val partitions = sessionState.catalog.listPartitions(catalogTable.identifier) - logInfo(s"Starting to calculate sizes for ${partitions.length} partitions.") + logInfo(log"Starting to calculate sizes for ${MDC(COUNT, partitions.length)} " + + log"partitions.") calculatePartitionStats(spark, catalogTable, partitions, partitionRowCount) } - logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to calculate" + - s" the total size for table ${catalogTable.identifier}.") + logInfo(log"It took ${MDC(TIME, (System.nanoTime() - startTime) / (1000 * 1000))} ms to " + + log"calculate the total size for table ${MDC(TABLE_NAME, catalogTable.identifier)}.") (totalSize, newPartitions) } @@ -154,9 +156,9 @@ object CommandUtils extends Logging { getPathSize(fs, fs.getFileStatus(path)) } catch { case NonFatal(e) => - logWarning( - s"Failed to get the size of table ${identifier.table} in the " + - s"database ${identifier.database} because of ${e.toString}", e) + logWarning(log"Failed to get the size of table ${MDC(TABLE_NAME, identifier.table)} " + + log"in the database ${MDC(DATABASE_NAME, identifier.database)} because of " + + log"${MDC(ERROR, e.toString)}", e) 0L } }.getOrElse(0L) @@ -238,7 +240,7 @@ object CommandUtils extends Logging { // Analyzes a catalog view if the view is cached val table = sparkSession.table(tableIdent.quotedString) val cacheManager = sparkSession.sharedState.cacheManager - if (cacheManager.lookupCachedData(table.logicalPlan).isDefined) { + if (cacheManager.lookupCachedData(table).isDefined) { if (!noScan) { // To collect table stats, materializes an underlying columnar RDD table.count() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala new file mode 100644 index 0000000000000..d2aaa93fcca06 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateSQLFunctionCommand.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.catalog.SQLFunction + +/** + * The DDL command that creates a SQL function. 
+ * For example:
+ * {{{
+ *    CREATE [OR REPLACE] [TEMPORARY] FUNCTION [IF NOT EXISTS] [db_name.]function_name
+ *    ([param_name param_type [COMMENT param_comment], ...])
+ *    RETURNS {ret_type | TABLE (ret_name ret_type [COMMENT ret_comment], ...)}
+ *    [function_properties] function_body;
+ *
+ *    function_properties:
+ *      [NOT] DETERMINISTIC | COMMENT function_comment | [ CONTAINS SQL | READS SQL DATA ]
+ *
+ *    function_body:
+ *      RETURN {expression | TABLE ( query )}
+ * }}}
+ */
+case class CreateSQLFunctionCommand(
+    name: FunctionIdentifier,
+    inputParamText: Option[String],
+    returnTypeText: String,
+    exprText: Option[String],
+    queryText: Option[String],
+    comment: Option[String],
+    isDeterministic: Option[Boolean],
+    containsSQL: Option[Boolean],
+    isTableFunc: Boolean,
+    isTemp: Boolean,
+    ignoreIfExists: Boolean,
+    replace: Boolean)
+  extends CreateUserDefinedFunctionCommand {
+
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    import SQLFunction._
+
+    val parser = sparkSession.sessionState.sqlParser
+
+    val inputParam = inputParamText.map(parser.parseTableSchema)
+    val returnType = parseReturnTypeText(returnTypeText, isTableFunc, parser)
+
+    val function = SQLFunction(
+      name,
+      inputParam,
+      returnType.getOrElse(if (isTableFunc) Right(null) else Left(null)),
+      exprText,
+      queryText,
+      comment,
+      isDeterministic,
+      containsSQL,
+      isTableFunc,
+      Map.empty)
+
+    // TODO: Implement the rest of the method.
+
+    Seq.empty
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala
new file mode 100644
index 0000000000000..bebb0f5cf6c38
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CreateUserDefinedFunctionCommand.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.command
+
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.catalog.{LanguageSQL, RoutineLanguage, UserDefinedFunctionErrors}
+import org.apache.spark.sql.catalyst.plans.logical.IgnoreCachedData
+
+/**
+ * The base class for CreateUserDefinedFunctionCommand
+ */
+abstract class CreateUserDefinedFunctionCommand
+  extends LeafRunnableCommand with IgnoreCachedData
+
+
+object CreateUserDefinedFunctionCommand {
+
+  /**
+   * This factory method serves as a central place to verify required inputs and
+   * returns the CREATE command for the parsed user defined function.
+ */ + // scalastyle:off argcount + def apply( + name: FunctionIdentifier, + inputParamText: Option[String], + returnTypeText: String, + exprText: Option[String], + queryText: Option[String], + comment: Option[String], + isDeterministic: Option[Boolean], + containsSQL: Option[Boolean], + language: RoutineLanguage, + isTableFunc: Boolean, + isTemp: Boolean, + ignoreIfExists: Boolean, + replace: Boolean + ): CreateUserDefinedFunctionCommand = { + // scalastyle:on argcount + + assert(language != null) + + language match { + case LanguageSQL => + CreateSQLFunctionCommand( + name, + inputParamText, + returnTypeText, + exprText, + queryText, + comment, + isDeterministic, + containsSQL, + isTableFunc, + isTemp, + ignoreIfExists, + replace) + + case other => + throw UserDefinedFunctionErrors.unsupportedUserDefinedFunction(other) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala index 67d38b28c83ea..7c690c8ccc08d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.command +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.MDC import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.{CTEInChildren, CTERelationDef, LogicalPlan, WithCTE} @@ -70,7 +72,7 @@ case class InsertIntoDataSourceDirCommand( sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)).toRdd } catch { case ex: AnalysisException => - logError(s"Failed to write to directory " + storage.locationUri.toString, ex) + logError(log"Failed to write to directory ${MDC(URI, storage.locationUri.toString)}", ex) throw ex } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index 672417f1adbf0..4e513fc3e8c1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.command -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CONFIG, CONFIG2, KEY, VALUE} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.parser.ParseException @@ -51,8 +52,9 @@ case class SetCommand(kv: Option[(String, Option[String])]) case Some((SQLConf.Deprecated.MAPRED_REDUCE_TASKS, Some(value))) => val runFunc = (sparkSession: SparkSession) => { logWarning( - s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " + - s"automatically converted to ${SQLConf.SHUFFLE_PARTITIONS.key} instead.") + log"Property ${MDC(CONFIG, SQLConf.Deprecated.MAPRED_REDUCE_TASKS)} is deprecated, " + + log"automatically converted to ${MDC(CONFIG2, SQLConf.SHUFFLE_PARTITIONS.key)} " + + log"instead.") if (value.toInt < 1) { val msg = s"Setting negative ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} for automatically " + @@ -68,8 +70,9 @@ case class SetCommand(kv: Option[(String, 
Option[String])]) case Some((SQLConf.Replaced.MAPREDUCE_JOB_REDUCES, Some(value))) => val runFunc = (sparkSession: SparkSession) => { logWarning( - s"Property ${SQLConf.Replaced.MAPREDUCE_JOB_REDUCES} is Hadoop's property, " + - s"automatically converted to ${SQLConf.SHUFFLE_PARTITIONS.key} instead.") + log"Property ${MDC(CONFIG, SQLConf.Replaced.MAPREDUCE_JOB_REDUCES)} is Hadoop's " + + log"property, automatically converted to " + + log"${MDC(CONFIG2, SQLConf.SHUFFLE_PARTITIONS.key)} instead.") if (value.toInt < 1) { val msg = s"Setting negative ${SQLConf.Replaced.MAPREDUCE_JOB_REDUCES} for automatically " + @@ -111,11 +114,12 @@ case class SetCommand(kv: Option[(String, Option[String])]) } if (sparkSession.conf.get(CATALOG_IMPLEMENTATION.key).equals("hive") && key.startsWith("hive.")) { - logWarning(s"'SET $key=$value' might not work, since Spark doesn't support changing " + - "the Hive config dynamically. Please pass the Hive-specific config by adding the " + - s"prefix spark.hadoop (e.g. spark.hadoop.$key) when starting a Spark application. " + - "For details, see the link: https://spark.apache.org/docs/latest/configuration.html#" + - "dynamically-loading-spark-properties.") + logWarning(log"'SET ${MDC(KEY, key)}=${MDC(VALUE, value)}' might not work, since Spark " + + log"doesn't support changing the Hive config dynamically. Please pass the " + + log"Hive-specific config by adding the prefix spark.hadoop " + + log"(e.g. spark.hadoop.${MDC(KEY, key)}) when starting a Spark application. For " + + log"details, see the link: https://spark.apache.org/docs/latest/configuration.html#" + + log"dynamically-loading-spark-properties.") } sparkSession.conf.set(key, value) Seq(Row(key, value)) @@ -155,8 +159,8 @@ case class SetCommand(kv: Option[(String, Option[String])]) case Some((SQLConf.Deprecated.MAPRED_REDUCE_TASKS, None)) => val runFunc = (sparkSession: SparkSession) => { logWarning( - s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " + - s"showing ${SQLConf.SHUFFLE_PARTITIONS.key} instead.") + log"Property ${MDC(CONFIG, SQLConf.Deprecated.MAPRED_REDUCE_TASKS)} is deprecated, " + + log"showing ${MDC(CONFIG2, SQLConf.SHUFFLE_PARTITIONS.key)} instead.") Seq(Row( SQLConf.SHUFFLE_PARTITIONS.key, sparkSession.sessionState.conf.defaultNumShufflePartitions.toString)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/UnsetNamespacePropertiesCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/UnsetNamespacePropertiesCommand.scala new file mode 100644 index 0000000000000..243b51b09e3bc --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/UnsetNamespacePropertiesCommand.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.ResolvedNamespace +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper +import org.apache.spark.sql.connector.catalog.NamespaceChange + +/** + * A command that unsets database/schema/namespace properties. + * + * The syntax of this command is: + * {{{ + * ALTER (DATABASE|SCHEMA|NAMESPACE) ... + * UNSET (DBPROPERTIES|PROPERTIES) ('key1', 'key2', ...); + * }}} + */ +case class UnsetNamespacePropertiesCommand( + ident: LogicalPlan, + propKeys: Seq[String]) extends UnaryRunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val ResolvedNamespace(catalog, ns, _) = child + val changes = propKeys.map { + NamespaceChange.removeProperty + } + // If the property does not exist, the change should succeed. + catalog.asNamespaceCatalog.alterNamespace(ns.toArray, changes: _*) + + Seq.empty + } + + override def child: LogicalPlan = ident + + override protected def withNewChildInternal( + newChild: LogicalPlan): UnsetNamespacePropertiesCommand = + copy(ident = newChild) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index a1e9c4229b194..ea2736b2c1266 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -25,8 +25,8 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} -import org.apache.spark.sql.catalyst.trees.LeafLike +import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, SupervisingCommand} +import org.apache.spark.sql.catalyst.trees.{LeafLike, UnaryLike} import org.apache.spark.sql.connector.ExternalCommandRunner import org.apache.spark.sql.execution.{CommandExecutionMode, ExplainMode, LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.metric.SQLMetric @@ -51,6 +51,7 @@ trait RunnableCommand extends Command { } trait LeafRunnableCommand extends RunnableCommand with LeafLike[LogicalPlan] +trait UnaryRunnableCommand extends RunnableCommand with UnaryLike[LogicalPlan] /** * A physical operator that executes the run method of a `RunnableCommand` and @@ -157,7 +158,7 @@ case class DataWritingCommandExec(cmd: DataWritingCommand, child: SparkPlan) case class ExplainCommand( logicalPlan: LogicalPlan, mode: ExplainMode) - extends LeafRunnableCommand { + extends RunnableCommand with SupervisingCommand { override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()) @@ -171,6 +172,9 @@ case class ExplainCommand( ("Error occurred during query planning: \n" + cause.getMessage).split("\n") .map(Row(_)).toImmutableArraySeq } + + def withTransformedSupervisedPlan(transformer: LogicalPlan => LogicalPlan): LogicalPlan = + copy(logicalPlan = transformer(logicalPlan)) } /** An explain command for users to see how a streaming batch is executed. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 1283bd8809082..539d8346a5cad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.command import java.net.URI +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.MDC import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.{CTEInChildren, CTERelationDef, LogicalPlan, WithCTE} @@ -230,7 +232,8 @@ case class CreateDataSourceTableAsSelectCommand( dataSource.writeAndRead(mode, query, outputColumnNames) } catch { case ex: AnalysisException => - logError(s"Failed to write to table ${table.identifier.unquotedString}", ex) + logError(log"Failed to write to table " + + log"${MDC(TABLE_NAME, table.identifier.unquotedString)}", ex) throw ex } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index a5e48784ada1a..6f402188910e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.RDD_PARALLEL_LISTING_THRESHOLD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -356,8 +356,8 @@ case class AlterTableUnsetPropertiesCommand( /** - * A command to change the column for a table, only support changing the comment of a non-partition - * column for now. + * A command to change the column for a table, only support changing the comment or collation of + * the data type or nested types (recursively) of a non-partition column for now. * * The syntax of using this command in SQL is: * {{{ @@ -387,32 +387,45 @@ case class AlterTableChangeColumnCommand( } // Find the origin column from dataSchema by column name. val originColumn = findColumnByName(table.dataSchema, columnName, resolver) - // Throw an AnalysisException if the column name/dataType is changed. - if (!columnEqual(originColumn, newColumn, resolver)) { + val validType = canEvolveType(originColumn, newColumn) + // Throw an AnalysisException on attempt to change collation of bucket column. + if (validType && originColumn.dataType != newColumn.dataType) { + val isBucketColumn = table.bucketSpec match { + case Some(bucketSpec) => bucketSpec.bucketColumnNames.exists(resolver(columnName, _)) + case _ => false + } + if (isBucketColumn) { + throw QueryCompilationErrors.cannotAlterCollationBucketColumn( + table.qualifiedName, columnName) + } + } + // Throw an AnalysisException if the column name is changed or we cannot evolve the data type. + // Only changes in collation of column data type or its nested types (recursively) are allowed. 
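The relaxed check above is aimed at collation-only changes to a column's (possibly nested) string type. As a rough sketch of the intended user-facing DDL, under the assumption of a collation-enabled build (table and column names are hypothetical, and the exact ALTER COLUMN form can vary by catalog):

import org.apache.spark.sql.SparkSession

object AlterCollationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()

    spark.sql("CREATE TABLE t (c STRING COLLATE UTF8_BINARY) USING parquet")

    // Intended to be accepted: only the collation of the string type changes,
    // while the column name and the underlying type stay the same.
    spark.sql("ALTER TABLE t ALTER COLUMN c TYPE STRING COLLATE UTF8_LCASE")

    // Still rejected by the command: a genuine type change, not a collation change.
    // spark.sql("ALTER TABLE t ALTER COLUMN c TYPE INT")

    spark.stop()
  }
}

Note also the guard above that refuses a collation change on a bucket column, since bucketing hashes the column values and a different collation would change which values compare equal.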
+ if (!validType || !namesEqual(originColumn, newColumn, resolver)) { throw QueryCompilationErrors.alterTableChangeColumnNotSupportedForColumnTypeError( toSQLId(table.identifier.nameParts), originColumn, newColumn, this.origin) } val newDataSchema = table.dataSchema.fields.map { field => if (field.name == originColumn.name) { - // Create a new column from the origin column with the new comment. - val withNewComment: StructField = - addComment(field, newColumn.getComment()) + // Create a new column from the origin column with the new type and new comment. + val withNewTypeAndComment: StructField = + addComment(withNewType(field, newColumn.dataType), newColumn.getComment()) // Create a new column from the origin column with the new current default value. if (newColumn.getCurrentDefaultValue().isDefined) { if (newColumn.getCurrentDefaultValue().get.nonEmpty) { val result: StructField = - addCurrentDefaultValue(withNewComment, newColumn.getCurrentDefaultValue()) + addCurrentDefaultValue(withNewTypeAndComment, newColumn.getCurrentDefaultValue()) // Check that the proposed default value parses and analyzes correctly, and that the // type of the resulting expression is equivalent or coercible to the destination column // type. ResolveDefaultColumns.analyze(result, "ALTER TABLE ALTER COLUMN") result } else { - withNewComment.clearCurrentDefaultValue() + withNewTypeAndComment.clearCurrentDefaultValue() } } else { - withNewComment + withNewTypeAndComment } } else { field @@ -432,6 +445,10 @@ case class AlterTableChangeColumnCommand( }.getOrElse(throw QueryCompilationErrors.cannotFindColumnError(name, schema.fieldNames)) } + // Change the dataType of the column. + private def withNewType(column: StructField, dataType: DataType): StructField = + column.copy(dataType = dataType) + // Add the comment to a column, if comment is empty, return the original column. private def addComment(column: StructField, comment: Option[String]): StructField = comment.map(column.withComment).getOrElse(column) @@ -442,10 +459,17 @@ case class AlterTableChangeColumnCommand( value.map(column.withCurrentDefaultValue).getOrElse(column) // Compare a [[StructField]] to another, return true if they have the same column - // name(by resolver) and dataType. - private def columnEqual( + // name(by resolver). + private def namesEqual( field: StructField, other: StructField, resolver: Resolver): Boolean = { - resolver(field.name, other.name) && field.dataType == other.dataType + resolver(field.name, other.name) + } + + // Compare dataType of [[StructField]] to another, return true if it is valid to evolve the type + // when altering column. Only changes in collation of data type or its nested types (recursively) + // are allowed. 
+ private def canEvolveType(from: StructField, to: StructField): Boolean = { + DataType.equalsIgnoreCompatibleCollation(from.dataType, to.dataType) } } @@ -696,7 +720,7 @@ case class RepairTableCommand( } val root = new Path(table.location) - logInfo(s"Recover all the partitions in $root") + logInfo(log"Recover all the partitions in ${MDC(LogKeys.PATH, root)}") val hadoopConf = spark.sessionState.newHadoopConf() val fs = root.getFileSystem(hadoopConf) @@ -716,14 +740,16 @@ case class RepairTableCommand( evalPool.shutdown() } val total = partitionSpecsAndLocs.length - logInfo(s"Found $total partitions in $root") + logInfo(log"Found ${MDC(LogKeys.NUM_PARTITIONS, total)} partitions " + + log"in ${MDC(LogKeys.PATH, root)}") val partitionStats = if (spark.sessionState.conf.gatherFastStats) { gatherPartitionStats(spark, partitionSpecsAndLocs, fs, pathFilter, threshold) } else { Map.empty[Path, PartitionStatistics] } - logInfo(s"Finished to gather the fast stats for all $total partitions.") + logInfo(log"Finished to gather the fast stats for all " + + log"${MDC(LogKeys.NUM_PARTITIONS, total)} partitions.") addPartitions(spark, table, partitionSpecsAndLocs, partitionStats) total @@ -736,12 +762,14 @@ case class RepairTableCommand( spark.catalog.refreshTable(tableIdentWithDB) } catch { case NonFatal(e) => - logError(s"Cannot refresh the table '$tableIdentWithDB'. A query of the table " + - "might return wrong result if the table was cached. To avoid such issue, you should " + - "uncache the table manually via the UNCACHE TABLE command after table recovering will " + - "complete fully.", e) + logError(log"Cannot refresh the table '${MDC(LogKeys.TABLE_NAME, tableIdentWithDB)}'. " + + log"A query of the table might return wrong result if the table was cached. " + + log"To avoid such issue, you should uncache the table manually via the UNCACHE TABLE " + + log"command after table recovering will complete fully.", e) } - logInfo(s"Recovered all partitions: added ($addedAmount), dropped ($droppedAmount).") + logInfo(log"Recovered all partitions: " + + log"added (${MDC(LogKeys.NUM_ADDED_PARTITIONS, addedAmount)}), " + + log"dropped (${MDC(LogKeys.NUM_DROPPED_PARTITIONS, droppedAmount)}).") Seq.empty[Row] } @@ -782,12 +810,13 @@ case class RepairTableCommand( scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(partitionNames.head -> value), partitionNames.drop(1), threshold, resolver, evalTaskSupport) } else { - logWarning( - s"expected partition column ${partitionNames.head}, but got ${ps(0)}, ignoring it") + logWarning(log"expected partition column " + + log"${MDC(LogKeys.EXPECTED_PARTITION_COLUMN, partitionNames.head)}," + + log" but got ${MDC(LogKeys.ACTUAL_PARTITION_COLUMN, ps(0))}, ignoring it") Seq.empty } } else { - logWarning(s"ignore ${new Path(path, name)}") + logWarning(log"ignore ${MDC(LogKeys.PATH, new Path(path, name))}") Seq.empty } } @@ -811,7 +840,8 @@ case class RepairTableCommand( Math.min(spark.sparkContext.defaultParallelism, 10000)) // gather the fast stats for all the partitions otherwise Hive metastore will list all the // files for all the new partitions in sequential way, which is super slow. 
- logInfo(s"Gather the fast stats in parallel using $numParallelism tasks.") + logInfo(log"Gather the fast stats in parallel using ${MDC(LogKeys.COUNT, numParallelism)} " + + log"tasks.") spark.sparkContext.parallelize(locations, numParallelism) .mapPartitions { locationsEachPartition => val pathFilter = getPathFilter(serializableConfiguration.value) @@ -1028,7 +1058,8 @@ object DDLUtils extends Logging { DataSource.lookupDataSource(provider, SQLConf.get).getConstructor().newInstance() } catch { case e: Throwable => - logError(s"Failed to find data source: $provider when check data column names.", e) + logError(log"Failed to find data source: ${MDC(LogKeys.DATA_SOURCE, provider)} " + + log"when check data column names.", e) return } source match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 1a97b965da2bd..ee0074dfe61b2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -760,7 +760,7 @@ case class DescribeTableCommand( * 7. Common table expressions (CTEs) */ case class DescribeQueryCommand(queryText: String, plan: LogicalPlan) - extends DescribeCommandBase with CTEInChildren { + extends DescribeCommandBase with SupervisingCommand with CTEInChildren { override val output = DescribeCommandSchema.describeTableAttributes() @@ -776,6 +776,9 @@ case class DescribeQueryCommand(queryText: String, plan: LogicalPlan) override def withCTEDefs(cteDefs: Seq[CTERelationDef]): LogicalPlan = { copy(plan = WithCTE(plan, cteDefs)) } + + def withTransformedSupervisedPlan(transformer: LogicalPlan => LogicalPlan): LogicalPlan = + copy(plan = transformer(plan)) } /** @@ -1110,6 +1113,7 @@ trait ShowCreateTableCommandBase extends SQLConfHelper { showViewDataColumns(metadata, builder) showTableComment(metadata, builder) showViewProperties(metadata, builder) + showViewSchemaBinding(metadata, builder) showViewText(metadata, builder) } @@ -1139,6 +1143,12 @@ trait ShowCreateTableCommandBase extends SQLConfHelper { } } + private def showViewSchemaBinding(metadata: CatalogTable, builder: StringBuilder): Unit = { + if (SQLConf.get.viewSchemaBindingEnabled) { + builder ++= s"WITH SCHEMA ${metadata.viewSchemaMode.toString}\n" + } + } + private def showViewText(metadata: CatalogTable, builder: StringBuilder): Unit = { builder ++= metadata.viewText.mkString("AS ", "", "\n") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index d71d0d43683cb..e1061a46db7b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -22,10 +22,11 @@ import scala.collection.mutable import org.json4s.JsonAST.{JArray, JString} import org.json4s.jackson.JsonMethods._ +import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{SQLConfHelper, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, GlobalTempView, LocalTempView, ViewType} +import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, GlobalTempView, LocalTempView, SchemaEvolution, SchemaUnsupported, ViewSchemaMode, ViewType} import 
org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, TemporaryViewRelation} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, SubqueryExpression, VariableReference} import org.apache.spark.sql.catalyst.plans.logical.{AnalysisOnlyCommand, CTEInChildren, CTERelationDef, LogicalPlan, Project, View, WithCTE} @@ -56,6 +57,7 @@ import org.apache.spark.util.ArrayImplicits._ * @param replace if true, and if the view already exists, updates it; if false, and if the view * already exists, throws analysis exception. * @param viewType the expected view type to be created with this command. + * @param viewSchemaMode the tolerance of the view towards schema changes * @param isAnalyzed whether this command is analyzed or not. */ case class CreateViewCommand( @@ -68,6 +70,7 @@ case class CreateViewCommand( allowExisting: Boolean, replace: Boolean, viewType: ViewType, + viewSchemaMode: ViewSchemaMode = SchemaUnsupported, isAnalyzed: Boolean = false, referredTempFunctions: Seq[String] = Seq.empty) extends RunnableCommand with AnalysisOnlyCommand with CTEInChildren { @@ -106,6 +109,10 @@ case class CreateViewCommand( throw QueryCompilationErrors.cannotCreateViewTooManyColumnsError( name, userSpecifiedColumns.map(_._1), analyzedPlan) } + if (viewSchemaMode == SchemaEvolution) { + throw SparkException.internalError( + "View with user column list has viewSchemaMode EVOLUTION") + } } val catalog = sparkSession.sessionState.catalog @@ -203,7 +210,7 @@ case class CreateViewCommand( val aliasedSchema = CharVarcharUtils.getRawSchema( aliasPlan(session, analyzedPlan).schema, session.sessionState.conf) val newProperties = generateViewProperties( - properties, session, analyzedPlan, aliasedSchema.fieldNames) + properties, session, analyzedPlan.schema.fieldNames, aliasedSchema.fieldNames, viewSchemaMode) CatalogTable( identifier = name, @@ -301,7 +308,11 @@ case class AlterViewAsCommand( CommandUtils.uncacheTableOrView(session, viewIdent) val newProperties = generateViewProperties( - viewMeta.properties, session, analyzedPlan, analyzedPlan.schema.fieldNames) + viewMeta.properties, + session, + analyzedPlan.schema.fieldNames, // The query output column names + analyzedPlan.schema.fieldNames, // Will match the view schema names + viewMeta.viewSchemaMode) val newSchema = CharVarcharUtils.getRawSchema(analyzedPlan.schema) val updatedViewMeta = viewMeta.copy( @@ -318,6 +329,50 @@ case class AlterViewAsCommand( } } +/** + * Alter a view with given schema binding. If the view name contains database prefix, this command + * will alter a permanent view matching the given name, or throw an exception if view not exist. + * Else, this command will try to alter a temporary view first, if view not exist, try permanent + * view next, if still not exist, throw an exception. + * + * @param name the name of this view. + * @param viewSchemaMode The new schema binding mode. 
+ */ +case class AlterViewSchemaBindingCommand(name: TableIdentifier, viewSchemaMode: ViewSchemaMode) + extends LeafRunnableCommand { + + import ViewHelper._ + + override def run(session: SparkSession): Seq[Row] = { + val isTemporary = session.sessionState.catalog.isTempView(name) + if (isTemporary) { + throw QueryCompilationErrors.cannotAlterTempViewWithSchemaBindingError() + } + alterPermanentView(session, viewSchemaMode) + Seq.empty[Row] + } + + private def alterPermanentView(session: SparkSession, viewSchemaMode: ViewSchemaMode): Unit = { + val viewMeta = session.sessionState.catalog.getTableMetadata(name) + + val viewIdent = viewMeta.identifier + + logDebug(s"Try to uncache ${viewIdent.quotedString} before replacing.") + CommandUtils.uncacheTableOrView(session, viewIdent) + + val newProperties = generateViewProperties( + viewMeta.properties, + session, + viewMeta.viewQueryColumnNames.toArray, + viewMeta.schema.fieldNames, + viewSchemaMode) + + val updatedViewMeta = viewMeta.copy(properties = newProperties) + + session.sessionState.catalog.alterTable(updatedViewMeta) + } +} + /** * A command for users to get views in the given database. * If a databaseName is not given, the current database will be used. @@ -360,6 +415,7 @@ object ViewHelper extends SQLConfHelper with Logging { "spark.sql.hive.convertMetastoreParquet", "spark.sql.hive.convertMetastoreOrc", "spark.sql.hive.convertInsertingPartitionedTable", + "spark.sql.hive.convertInsertingUnpartitionedTable", "spark.sql.hive.convertMetastoreCtas", SQLConf.ADDITIONAL_REMOTE_REPOSITORIES.key) @@ -436,6 +492,20 @@ object ViewHelper extends SQLConfHelper with Logging { } } + /** + * Convert the viewSchemaMode to `properties`. + * If the mode is UNSUPPORTED, do not store anything, for backward compatibility. + */ + private def viewSchemaModeToProps(viewSchemaMode: ViewSchemaMode): Map[String, String] = { + if (viewSchemaMode == SchemaUnsupported) { + Map.empty + } else { + val props = new mutable.HashMap[String, String] + props.put(VIEW_SCHEMA_MODE, viewSchemaMode.toString) + props.toMap + } + } + + /** + * Convert the temporary object names to `properties`.
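A companion sketch of what AlterViewSchemaBindingCommand does at the SQL level, again assuming an active SparkSession named spark and the assumed WITH SCHEMA keyword spelling; only permanent views are accepted, mirroring the isTempView check above:

  spark.sql("ALTER VIEW v WITH SCHEMA BINDING")      // permanent view: mode stored in view properties
  spark.sql("CREATE TEMP VIEW tv AS SELECT 1 AS a")
  // spark.sql("ALTER VIEW tv WITH SCHEMA BINDING")  // would fail: temp views reject schema binding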
*/ @@ -485,13 +555,12 @@ object ViewHelper extends SQLConfHelper with Logging { def generateViewProperties( properties: Map[String, String], session: SparkSession, - analyzedPlan: LogicalPlan, + queryOutput: Array[String], fieldNames: Array[String], + viewSchemaMode: ViewSchemaMode, tempViewNames: Seq[Seq[String]] = Seq.empty, tempFunctionNames: Seq[String] = Seq.empty, tempVariableNames: Seq[Seq[String]] = Seq.empty): Map[String, String] = { - // for createViewCommand queryOutput may be different from fieldNames - val queryOutput = analyzedPlan.schema.fieldNames val conf = session.sessionState.conf @@ -506,7 +575,8 @@ object ViewHelper extends SQLConfHelper with Logging { manager.currentCatalog.name, manager.currentNamespace.toImmutableArraySeq) ++ sqlConfigsToProps(conf) ++ generateQueryColumnNames(queryOutput.toImmutableArraySeq) ++ - referredTempNamesToProps(tempViewNames, tempFunctionNames, tempVariableNames) + referredTempNamesToProps(tempViewNames, tempFunctionNames, tempVariableNames) ++ + viewSchemaModeToProps(viewSchemaMode) } /** @@ -718,8 +788,8 @@ object ViewHelper extends SQLConfHelper with Logging { // TBLPROPERTIES is not allowed for temporary view, so we don't use it for // generating temporary view properties val newProperties = generateViewProperties( - Map.empty, session, analyzedPlan, viewSchema.fieldNames, tempViews, - tempFunctions, tempVariables) + Map.empty, session, analyzedPlan.schema.fieldNames, viewSchema.fieldNames, SchemaUnsupported, + tempViews, tempFunctions, tempVariables) CatalogTable( identifier = viewName, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala index b5bf337a5a2e6..1b7b0d702ab98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.{BINARY_COMPARISON, IN} import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{CharType, Metadata, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -66,9 +67,10 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { r.copy(dataCols = cleanedDataCols, partitionCols = cleanedPartCols) }) } - paddingForStringComparison(newPlan) + paddingForStringComparison(newPlan, padCharCol = false) } else { - paddingForStringComparison(plan) + paddingForStringComparison( + plan, padCharCol = !conf.getConf(SQLConf.LEGACY_NO_CHAR_PADDING_IN_PREDICATE)) } } @@ -90,7 +92,7 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { } } - private def paddingForStringComparison(plan: LogicalPlan): LogicalPlan = { + private def paddingForStringComparison(plan: LogicalPlan, padCharCol: Boolean): LogicalPlan = { plan.resolveOperatorsUpWithPruning(_.containsAnyPattern(BINARY_COMPARISON, IN)) { case operator => operator.transformExpressionsUpWithPruning( _.containsAnyPattern(BINARY_COMPARISON, IN)) { @@ -99,12 +101,12 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { // String literal is treated as char type when it's compared to a char type column. // We should pad the shorter one to the longer length. 
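As a worked illustration of the padding rule described in the comment above, here is a sketch assuming an active SparkSession named spark and default char-padding settings:

  spark.sql("CREATE TABLE chars(c CHAR(5)) USING parquet")
  spark.sql("INSERT INTO chars VALUES ('ab')")   // stored as 'ab   ' (padded to length 5)
  // The literal 'ab' is shorter than CHAR(5), so it is right-padded before the comparison;
  // with the new padCharCol flag the column side can also be padded when read-side padding is off.
  spark.sql("SELECT * FROM chars WHERE c = 'ab'").show()   // returns the row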
case b @ BinaryComparison(e @ AttrOrOuterRef(attr), lit) if lit.foldable => - padAttrLitCmp(e, attr.metadata, lit).map { newChildren => + padAttrLitCmp(e, attr.metadata, padCharCol, lit).map { newChildren => b.withNewChildren(newChildren) }.getOrElse(b) case b @ BinaryComparison(lit, e @ AttrOrOuterRef(attr)) if lit.foldable => - padAttrLitCmp(e, attr.metadata, lit).map { newChildren => + padAttrLitCmp(e, attr.metadata, padCharCol, lit).map { newChildren => b.withNewChildren(newChildren.reverse) }.getOrElse(b) @@ -117,9 +119,10 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { val literalCharLengths = literalChars.map(_.numChars()) val targetLen = (length +: literalCharLengths).max Some(i.copy( - value = addPadding(e, length, targetLen), + value = addPadding(e, length, targetLen, alwaysPad = padCharCol), list = list.zip(literalCharLengths).map { - case (lit, charLength) => addPadding(lit, charLength, targetLen) + case (lit, charLength) => + addPadding(lit, charLength, targetLen, alwaysPad = false) } ++ nulls.map(Literal.create(_, StringType)))) case _ => None }.getOrElse(i) @@ -162,6 +165,7 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { private def padAttrLitCmp( expr: Expression, metadata: Metadata, + padCharCol: Boolean, lit: Expression): Option[Seq[Expression]] = { if (expr.dataType == StringType) { CharVarcharUtils.getRawType(metadata).flatMap { @@ -174,7 +178,14 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { if (length < stringLitLen) { Some(Seq(StringRPad(expr, Literal(stringLitLen)), lit)) } else if (length > stringLitLen) { - Some(Seq(expr, StringRPad(lit, Literal(length)))) + val paddedExpr = if (padCharCol) { + StringRPad(expr, Literal(length)) + } else { + expr + } + Some(Seq(paddedExpr, StringRPad(lit, Literal(length)))) + } else if (padCharCol) { + Some(Seq(StringRPad(expr, Literal(length)), lit)) } else { None } @@ -186,7 +197,15 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { } } - private def addPadding(expr: Expression, charLength: Int, targetLength: Int): Expression = { - if (targetLength > charLength) StringRPad(expr, Literal(targetLength)) else expr + private def addPadding( + expr: Expression, + charLength: Int, + targetLength: Int, + alwaysPad: Boolean): Expression = { + if (targetLength > charLength) { + StringRPad(expr, Literal(targetLength)) + } else if (alwaysPad) { + StringRPad(expr, Literal(charLength)) + } else expr } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala index 8a9fbd15e2e81..1858a84213598 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala @@ -26,7 +26,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkContext, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{ACTUAL_NUM_FILES, EXPECTED_NUM_FILES} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker._ @@ -119,9 +120,9 @@ class BasicWriteTaskStatsTracker( } catch { case e: NumberFormatException => // warn but don't dump the whole stack - 
logInfo(s"Failed to parse" + - s" ${BasicWriteJobStatsTracker.FILE_LENGTH_XATTR}:$e;" + - s" bytes written may be under-reported"); + logInfo(log"Failed to parse " + + log"${MDC(LogKeys.FILE_LENGTH_XATTR, BasicWriteJobStatsTracker.FILE_LENGTH_XATTR)}:" + + log"${MDC(LogKeys.ERROR, e)}; bytes written may be under-reported"); case e: UnsupportedOperationException => // this is not unusual; ignore logDebug(s"XAttr not supported on path $path", e); @@ -166,9 +167,9 @@ class BasicWriteTaskStatsTracker( } if (numSubmittedFiles != numFiles) { - logWarning(s"Expected $numSubmittedFiles files, but only saw $numFiles. " + - "This could be due to the output format not writing empty files, " + - "or files being not immediately visible in the filesystem.") + logWarning(log"Expected ${MDC(EXPECTED_NUM_FILES, numSubmittedFiles)} files, but only saw " + + log"${MDC(ACTUAL_NUM_FILES, numFiles)}. This could be due to the output format not " + + log"writing empty files, or files being not immediately visible in the filesystem.") } taskCommitTimeMetric.foreach(_ += taskCommitTime) BasicWriteTaskStats(partitions.toSeq, numFiles, numBytes, numRows) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 4c2d6a4cdf5ef..d88b5ee8877d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -27,7 +27,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, DATA_SOURCE, DATA_SOURCES, PATHS} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogUtils} @@ -695,8 +696,9 @@ object DataSource extends Logging { throw QueryCompilationErrors .foundMultipleXMLDataSourceError(provider1, sourceNames, externalSource.getName) } else if (internalSources.size == 1) { - logWarning(s"Multiple sources found for $provider1 (${sourceNames.mkString(", ")}), " + - s"defaulting to the internal datasource (${internalSources.head.getClass.getName}).") + logWarning(log"Multiple sources found for ${MDC(DATA_SOURCE, provider1)} " + + log"(${MDC(DATA_SOURCES, sourceNames.mkString(", "))}), defaulting to the " + + log"internal datasource (${MDC(CLASS_NAME, internalSources.head.getClass.getName)}).") internalSources.head.getClass } else { throw QueryCompilationErrors.findMultipleDataSourceError(provider1, sourceNames) @@ -784,7 +786,7 @@ object DataSource extends Logging { globResult }.flatten } catch { - case e: SparkException => throw e.getCause + case e: SparkException => throw ThreadUtils.wrapCallerStacktrace(e.getCause) } if (checkFilesExist) { @@ -796,7 +798,7 @@ object DataSource extends Logging { } } } catch { - case e: SparkException => throw e.getCause + case e: SparkException => throw ThreadUtils.wrapCallerStacktrace(e.getCause) } } @@ -807,7 +809,7 @@ object DataSource extends Logging { } if (filteredIn.isEmpty) { logWarning( - s"All paths were ignored:\n ${filteredOut.mkString("\n ")}") + log"All paths were ignored:\n ${MDC(PATHS, filteredOut.mkString("\n "))}") } else { logDebug( s"Some paths were ignored:\n 
${filteredOut.mkString("\n ")}") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala index 2f4555effce3a..93fc6cf367cfc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceManager.scala @@ -21,7 +21,8 @@ import java.util.Locale import java.util.concurrent.ConcurrentHashMap import org.apache.spark.api.python.PythonUtils -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.DATA_SOURCE import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.datasources.v2.python.UserDefinedPythonDataSource import org.apache.spark.util.Utils @@ -53,7 +54,8 @@ class DataSourceManager extends Logging { } val previousValue = runtimeDataSourceBuilders.put(normalizedName, source) if (previousValue != null) { - logWarning(f"The data source $name replaced a previously registered data source.") + logWarning(log"The data source ${MDC(DATA_SOURCE, name)} replaced a previously " + + log"registered data source.") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 845d969df0885..5d2310c130703 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -24,7 +24,8 @@ import scala.collection.mutable import org.apache.hadoop.fs.Path -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PREDICATES import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, QualifiedTableName, SQLConfHelper} @@ -53,7 +54,7 @@ import org.apache.spark.sql.execution.streaming.StreamingRelation import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.{PartitioningUtils => CatalystPartitioningUtils} -import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.util.{CaseInsensitiveStringMap, SchemaUtils} import org.apache.spark.unsafe.types.UTF8String /** @@ -494,47 +495,81 @@ object DataSourceStrategy val partitionSet = AttributeSet(partitionColumns) val predicates = ExpressionSet(normalizedFilters .flatMap(extractPredicatesWithinOutputSet(_, partitionSet))) - logInfo(s"Pruning directories with: ${predicates.mkString(",")}") + logInfo(log"Pruning directories with: ${MDC(PREDICATES, predicates.mkString(","))}") predicates } } + /** + * Creates a collation aware filter if the input data type is string with non-default collation + */ + private def collationAwareFilter(filter: sources.Filter, dataType: DataType): Filter = { + if (!SchemaUtils.hasNonUTF8BinaryCollation(dataType)) { + return filter + } + + filter match { + case sources.EqualTo(attribute, value) => + CollatedEqualTo(attribute, value, dataType) + case sources.EqualNullSafe(attribute, value) => + CollatedEqualNullSafe(attribute, value, dataType) + case sources.GreaterThan(attribute, value) => + CollatedGreaterThan(attribute, value, dataType) + case 
sources.GreaterThanOrEqual(attribute, value) => + CollatedGreaterThanOrEqual(attribute, value, dataType) + case sources.LessThan(attribute, value) => + CollatedLessThan(attribute, value, dataType) + case sources.LessThanOrEqual(attribute, value) => + CollatedLessThanOrEqual(attribute, value, dataType) + case sources.In(attribute, values) => + CollatedIn(attribute, values, dataType) + case sources.StringStartsWith(attribute, value) => + CollatedStringStartsWith(attribute, value, dataType) + case sources.StringEndsWith(attribute, value) => + CollatedStringEndsWith(attribute, value, dataType) + case sources.StringContains(attribute, value) => + CollatedStringContains(attribute, value, dataType) + case other => + other + } + } + private def translateLeafNodeFilter( predicate: Expression, pushableColumn: PushableColumnBase): Option[Filter] = predicate match { - case expressions.EqualTo(pushableColumn(name), Literal(v, t)) => - Some(sources.EqualTo(name, convertToScala(v, t))) - case expressions.EqualTo(Literal(v, t), pushableColumn(name)) => - Some(sources.EqualTo(name, convertToScala(v, t))) - - case expressions.EqualNullSafe(pushableColumn(name), Literal(v, t)) => - Some(sources.EqualNullSafe(name, convertToScala(v, t))) - case expressions.EqualNullSafe(Literal(v, t), pushableColumn(name)) => - Some(sources.EqualNullSafe(name, convertToScala(v, t))) - - case expressions.GreaterThan(pushableColumn(name), Literal(v, t)) => - Some(sources.GreaterThan(name, convertToScala(v, t))) - case expressions.GreaterThan(Literal(v, t), pushableColumn(name)) => - Some(sources.LessThan(name, convertToScala(v, t))) - - case expressions.LessThan(pushableColumn(name), Literal(v, t)) => - Some(sources.LessThan(name, convertToScala(v, t))) - case expressions.LessThan(Literal(v, t), pushableColumn(name)) => - Some(sources.GreaterThan(name, convertToScala(v, t))) - - case expressions.GreaterThanOrEqual(pushableColumn(name), Literal(v, t)) => - Some(sources.GreaterThanOrEqual(name, convertToScala(v, t))) - case expressions.GreaterThanOrEqual(Literal(v, t), pushableColumn(name)) => - Some(sources.LessThanOrEqual(name, convertToScala(v, t))) - - case expressions.LessThanOrEqual(pushableColumn(name), Literal(v, t)) => - Some(sources.LessThanOrEqual(name, convertToScala(v, t))) - case expressions.LessThanOrEqual(Literal(v, t), pushableColumn(name)) => - Some(sources.GreaterThanOrEqual(name, convertToScala(v, t))) + case expressions.EqualTo(e @ pushableColumn(name), Literal(v, t)) => + Some(collationAwareFilter(sources.EqualTo(name, convertToScala(v, t)), e.dataType)) + case expressions.EqualTo(Literal(v, t), e @ pushableColumn(name)) => + Some(collationAwareFilter(sources.EqualTo(name, convertToScala(v, t)), e.dataType)) + + case expressions.EqualNullSafe(e @ pushableColumn(name), Literal(v, t)) => + Some(collationAwareFilter(sources.EqualNullSafe(name, convertToScala(v, t)), e.dataType)) + case expressions.EqualNullSafe(Literal(v, t), e @ pushableColumn(name)) => + Some(collationAwareFilter(sources.EqualNullSafe(name, convertToScala(v, t)), e.dataType)) + + case expressions.GreaterThan(e @ pushableColumn(name), Literal(v, t)) => + Some(collationAwareFilter(sources.GreaterThan(name, convertToScala(v, t)), e.dataType)) + case expressions.GreaterThan(Literal(v, t), e @ pushableColumn(name)) => + Some(collationAwareFilter(sources.LessThan(name, convertToScala(v, t)), e.dataType)) + + case expressions.LessThan(e @ pushableColumn(name), Literal(v, t)) => + Some(collationAwareFilter(sources.LessThan(name, convertToScala(v, 
t)), e.dataType)) + case expressions.LessThan(Literal(v, t), e @ pushableColumn(name)) => + Some(collationAwareFilter(sources.GreaterThan(name, convertToScala(v, t)), e.dataType)) + + case expressions.GreaterThanOrEqual(e @ pushableColumn(name), Literal(v, t)) => + Some(collationAwareFilter(sources.GreaterThanOrEqual(name, convertToScala(v, t)), e.dataType)) + case expressions.GreaterThanOrEqual(Literal(v, t), e @ pushableColumn(name)) => + Some(collationAwareFilter(sources.LessThanOrEqual(name, convertToScala(v, t)), e.dataType)) + + case expressions.LessThanOrEqual(e @ pushableColumn(name), Literal(v, t)) => + Some(collationAwareFilter(sources.LessThanOrEqual(name, convertToScala(v, t)), e.dataType)) + case expressions.LessThanOrEqual(Literal(v, t), e @ pushableColumn(name)) => + Some(collationAwareFilter(sources.GreaterThanOrEqual(name, convertToScala(v, t)), e.dataType)) case expressions.InSet(e @ pushableColumn(name), set) => val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) - Some(sources.In(name, set.toArray.map(toScala))) + Some(collationAwareFilter(sources.In(name, set.toArray.map(toScala)), e.dataType)) // Because we only convert In to InSet in Optimizer when there are more than certain // items. So it is possible we still get an In expression here that needs to be pushed @@ -542,20 +577,20 @@ object DataSourceStrategy case expressions.In(e @ pushableColumn(name), list) if list.forall(_.isInstanceOf[Literal]) => val hSet = list.map(_.eval(EmptyRow)) val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) - Some(sources.In(name, hSet.toArray.map(toScala))) + Some(collationAwareFilter(sources.In(name, hSet.toArray.map(toScala)), e.dataType)) case expressions.IsNull(pushableColumn(name)) => Some(sources.IsNull(name)) case expressions.IsNotNull(pushableColumn(name)) => Some(sources.IsNotNull(name)) - case expressions.StartsWith(pushableColumn(name), Literal(v: UTF8String, StringType)) => - Some(sources.StringStartsWith(name, v.toString)) + case expressions.StartsWith(e @ pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(collationAwareFilter(sources.StringStartsWith(name, v.toString), e.dataType)) - case expressions.EndsWith(pushableColumn(name), Literal(v: UTF8String, StringType)) => - Some(sources.StringEndsWith(name, v.toString)) + case expressions.EndsWith(e @ pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(collationAwareFilter(sources.StringEndsWith(name, v.toString), e.dataType)) - case expressions.Contains(pushableColumn(name), Literal(v: UTF8String, StringType)) => - Some(sources.StringContains(name, v.toString)) + case expressions.Contains(e @ pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(collationAwareFilter(sources.StringContains(name, v.toString), e.dataType)) case expressions.Literal(true, BooleanType) => Some(sources.AlwaysTrue) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala index 0db5de7243404..c80dc83079675 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala @@ -26,9 +26,9 @@ import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization import org.apache.spark.{SparkException, SparkUpgradeException} -import org.apache.spark.sql.{SPARK_LEGACY_DATETIME_METADATA_KEY, 
SPARK_LEGACY_INT96_METADATA_KEY, SPARK_TIMEZONE_METADATA_KEY, SPARK_VERSION_METADATA_KEY} +import org.apache.spark.sql.{sources, SPARK_LEGACY_DATETIME_METADATA_KEY, SPARK_LEGACY_INT96_METADATA_KEY, SPARK_TIMEZONE_METADATA_KEY, SPARK_VERSION_METADATA_KEY} import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogUtils} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, Expression, ExpressionSet, GetStructField, PredicateHelper} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, ExpressionSet, PredicateHelper} import org.apache.spark.sql.catalyst.util.RebaseDateTime import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} @@ -36,7 +36,7 @@ import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.{CaseInsensitiveStringMap, SchemaUtils} +import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.Utils @@ -280,22 +280,15 @@ object DataSourceUtils extends PredicateHelper { (ExpressionSet(partitionFilters ++ extraPartitionFilter).toSeq, dataFilters) } - /** - * Determines whether a filter should be pushed down to the data source or not. - * - * @param expression The filter expression to be evaluated. - * @param isCollationPushDownSupported Whether the data source supports collation push down. - * @return A boolean indicating whether the filter should be pushed down or not. - */ - def shouldPushFilter(expression: Expression, isCollationPushDownSupported: Boolean): Boolean = { - if (!expression.deterministic) return false - - isCollationPushDownSupported || !expression.exists { - case childExpression @ (_: Attribute | _: GetStructField) => - // don't push down filters for types with non-binary sortable collation - // as it could lead to incorrect results - SchemaUtils.hasNonBinarySortableCollatedString(childExpression.dataType) - + def containsFiltersWithCollation(filter: sources.Filter): Boolean = { + filter match { + case sources.And(left, right) => + containsFiltersWithCollation(left) || containsFiltersWithCollation(right) + case sources.Or(left, right) => + containsFiltersWithCollation(left) || containsFiltersWithCollation(right) + case sources.Not(child) => + containsFiltersWithCollation(child) + case _: sources.CollatedFilter => true case _ => false } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index 0785b0cbe9e23..36c59950fe209 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -223,12 +223,6 @@ trait FileFormat { */ def fileConstantMetadataExtractors: Map[String, PartitionedFile => Any] = FileFormat.BASE_METADATA_EXTRACTORS - - /** - * Returns whether the file format supports filter push down - * for non utf8 binary collated columns. 
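To motivate the collation-aware filter wrappers introduced above, here is a sketch of why a plain byte-wise EqualTo cannot simply be pushed down for a case-insensitive column; an active SparkSession named spark is assumed, and the COLLATE syntax follows recent Spark SQL:

  spark.sql("CREATE TABLE names(name STRING COLLATE UTF8_LCASE) USING parquet")
  spark.sql("INSERT INTO names VALUES ('Alice'), ('ALICE')")
  // Under UTF8_LCASE both rows equal 'alice'. A binary EqualTo evaluated by the source could
  // drop them, so the filter is translated to a CollatedFilter that sources can detect, for
  // example via DataSourceUtils.containsFiltersWithCollation, and then handle or skip.
  spark.sql("SELECT count(*) FROM names WHERE name = 'alice'").show()   // expected count: 2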
- */ - def supportsCollationPushDown: Boolean = false } object FileFormat { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala index 1dbb6ce26f693..7d071124b0b30 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala @@ -22,7 +22,8 @@ import org.apache.hadoop.fs.{FileAlreadyExistsException, Path} import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.TaskOutputFileAlreadyExistException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CONFIG, NUM_CONCURRENT_WRITER} import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec} import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.shuffle.FetchFailedException @@ -558,9 +559,11 @@ class DynamicPartitionDataConcurrentWriter( new WriterStatus(currentWriter, recordsInFile, fileCounter)) if (concurrentWriters.size >= concurrentOutputWriterSpec.maxWriters && !sorted) { // Fall back to sort-based sequential writer mode. - logInfo(s"Number of concurrent writers ${concurrentWriters.size} reaches the threshold. " + - "Fall back from concurrent writers to sort-based sequential writer. You may change " + - s"threshold with configuration ${SQLConf.MAX_CONCURRENT_OUTPUT_FILE_WRITERS.key}") + logInfo(log"Number of concurrent writers " + + log"${MDC(NUM_CONCURRENT_WRITER, concurrentWriters.size)} reaches the threshold. " + + log"Fall back from concurrent writers to sort-based sequential writer. You may change " + + log"threshold with configuration " + + log"${MDC(CONFIG, SQLConf.MAX_CONCURRENT_OUTPUT_FILE_WRITERS.key)}") sorted = true } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 3bfa3413f6796..91749ddd794fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -26,7 +26,8 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow @@ -269,19 +270,20 @@ object FileFormatWriter extends Logging { val ret = f val commitMsgs = ret.map(_.commitMsg) - logInfo(s"Start to commit write Job ${description.uuid}.") + logInfo(log"Start to commit write Job ${MDC(LogKeys.UUID, description.uuid)}.") val (_, duration) = Utils .timeTakenMs { committer.commitJob(job, commitMsgs.toImmutableArraySeq) } - logInfo(s"Write Job ${description.uuid} committed. Elapsed time: $duration ms.") + logInfo(log"Write Job ${MDC(LogKeys.UUID, description.uuid)} committed. 
" + + log"Elapsed time: ${MDC(LogKeys.ELAPSED_TIME, duration)} ms.") processStats( description.statsTrackers, ret.map(_.summary.stats).toImmutableArraySeq, duration) - logInfo(s"Finished processing stats for write job ${description.uuid}.") + logInfo(log"Finished processing stats for write job ${MDC(LogKeys.UUID, description.uuid)}.") // return a set of all the partition paths that were updated during this job ret.map(_.summary.updatedPartitions).reduceOption(_ ++ _).getOrElse(Set.empty) } catch { case cause: Throwable => - logError(s"Aborting job ${description.uuid}.", cause) + logError(log"Aborting job ${MDC(WRITE_JOB_UUID, description.uuid)}.", cause) committer.abortJob(job) throw cause } @@ -404,7 +406,7 @@ object FileFormatWriter extends Logging { })(catchBlock = { // If there is an error, abort the task dataWriter.abort() - logError(s"Job $jobId aborted.") + logError(log"Job ${MDC(JOB_ID, jobId)} aborted.") }, finallyBlock = { dataWriter.close() }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndexOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndexOptions.scala index 1c352e3748f21..5a300dae4daab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndexOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndexOptions.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils object FileIndexOptions extends DataSourceOptions { val IGNORE_MISSING_FILES = newOption(FileSourceOptions.IGNORE_MISSING_FILES) + val IGNORE_INVALID_PARTITION_PATHS = newOption("ignoreInvalidPartitionPaths") val TIME_ZONE = newOption(DateTimeUtils.TIMEZONE_OPTION) val RECURSIVE_FILE_LOOKUP = newOption("recursiveFileLookup") val BASE_PATH_PARAM = newOption("basePath") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala index 836f0b0698793..8a47a28de845c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala @@ -21,7 +21,8 @@ import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import org.apache.spark.Partition -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CONFIG, DESIRED_NUM_PARTITIONS, MAX_NUM_PARTITIONS, NUM_PARTITIONS} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.ScanFileListing @@ -98,9 +99,11 @@ object FilePartition extends Logging { val desiredSplitBytes = (totalSizeInBytes / BigDecimal(maxPartNum.get)).setScale(0, RoundingMode.UP).longValue val desiredPartitions = getFilePartitions(partitionedFiles, desiredSplitBytes, openCostBytes) - logWarning(s"The number of partitions is ${partitions.size}, which exceeds the maximum " + - s"number configured: ${maxPartNum.get}. Spark rescales it to ${desiredPartitions.size} " + - s"by ignoring the configuration of ${SQLConf.FILES_MAX_PARTITION_BYTES.key}.") + logWarning(log"The number of partitions is ${MDC(NUM_PARTITIONS, partitions.size)}, " + + log"which exceeds the maximum number configured: " + + log"${MDC(MAX_NUM_PARTITIONS, maxPartNum.get)}. 
Spark rescales it to " + + log"${MDC(DESIRED_NUM_PARTITIONS, desiredPartitions.size)} by ignoring the " + + log"configuration of ${MDC(CONFIG, SQLConf.FILES_MAX_PARTITION_BYTES.key)}.") desiredPartitions } else { partitions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 825b8154f6815..9bcdbadf7c5c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -24,6 +24,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.{Partition => RDDPartition, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.internal.LogKeys.{CURRENT_FILE, PATH} +import org.apache.spark.internal.MDC import org.apache.spark.paths.SparkPath import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession @@ -231,7 +233,7 @@ class FileScanRDD( if (files.hasNext) { currentFile = files.next() updateMetadataRow() - logInfo(s"Reading File $currentFile") + logInfo(log"Reading File ${MDC(CURRENT_FILE, currentFile)}") // Sets InputFileBlockHolder for the file block's information InputFileBlockHolder .set(currentFile.urlEncodedPath, currentFile.start, currentFile.length) @@ -259,14 +261,14 @@ class FileScanRDD( } } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: $currentFile", e) + logWarning(log"Skipped missing file: ${MDC(PATH, currentFile)}", e) finished = true null // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw e case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => - logWarning( - s"Skipped the rest of the content in the corrupted file: $currentFile", e) + logWarning(log"Skipped the rest of the content in the corrupted file: " + + log"${MDC(PATH, currentFile)}", e) finished = true null } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index f2dcbe26104f7..27019ab047ff2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -21,7 +21,8 @@ import java.util.Locale import scala.collection.mutable -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{NUM_PRUNED, POST_SCAN_FILTERS, PUSHED_FILTERS, TOTAL} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions @@ -137,9 +138,8 @@ object FileSourceStrategy extends Strategy with PredicateHelper with Logging { val numBucketsSelected = matchedBuckets.cardinality() - logInfo { - s"Pruned ${numBuckets - numBucketsSelected} out of $numBuckets buckets." 
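The logging change in this hunk follows the same structured-logging pattern applied throughout the patch. A minimal sketch of the pattern in isolation, using Spark's internal Logging API and LogKeys entries that already appear in this diff:

  import org.apache.spark.internal.{Logging, LogKeys, MDC}

  class PruningReporter extends Logging {
    def report(pruned: Int, total: Int): Unit = {
      // MDC attaches a stable key to each interpolated value so logs can be parsed downstream.
      logInfo(log"Pruned ${MDC(LogKeys.NUM_PRUNED, pruned)} out of ${MDC(LogKeys.TOTAL, total)} buckets.")
    }
  }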
- } + logInfo(log"Pruned ${MDC(NUM_PRUNED, numBuckets - numBucketsSelected)} " + + log"out of ${MDC(TOTAL, numBuckets)} buckets.") // None means all the buckets need to be scanned if (numBucketsSelected == numBuckets) { @@ -160,11 +160,8 @@ object FileSourceStrategy extends Strategy with PredicateHelper with Logging { // - filters that need to be evaluated again after the scan val filterSet = ExpressionSet(filters) - val filtersToPush = filters.filter(f => - DataSourceUtils.shouldPushFilter(f, fsRelation.fileFormat.supportsCollationPushDown)) - val normalizedFilters = DataSourceStrategy.normalizeExprs( - filtersToPush, l.output) + filters.filter(_.deterministic), l.output) val partitionColumns = l.resolve( @@ -206,19 +203,18 @@ object FileSourceStrategy extends Strategy with PredicateHelper with Logging { DataSourceUtils.supportNestedPredicatePushdown(fsRelation) val pushedFilters = dataFilters .flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) - logInfo(s"Pushed Filters: ${pushedFilters.mkString(",")}") + logInfo(log"Pushed Filters: ${MDC(PUSHED_FILTERS, pushedFilters.mkString(","))}") // Predicates with both partition keys and attributes need to be evaluated after the scan. val afterScanFilters = filterSet -- partitionKeyFilters.filter(_.references.nonEmpty) - logInfo(s"Post-Scan Filters: ${afterScanFilters.mkString(",")}") + logInfo(log"Post-Scan Filters: ${MDC(POST_SCAN_FILTERS, afterScanFilters.mkString(","))}") val filterAttributes = AttributeSet(afterScanFilters ++ stayUpFilters) val requiredExpressions: Seq[NamedExpression] = filterAttributes.toSeq ++ projects val requiredAttributes = AttributeSet(requiredExpressions) - val readDataColumns = dataColumns + val readDataColumns = dataColumnsWithoutPartitionCols .filter(requiredAttributes.contains) - .filterNot(partitionColumns.contains) // Metadata attributes are part of a column of type struct up to this point. Here we extract // this column from the schema and specify a matcher for that. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala index 80002ecdaf8da..2bb8476a9f0e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala @@ -25,7 +25,8 @@ import scala.jdk.CollectionConverters._ import com.google.common.cache._ import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CACHED_TABLE_PARTITION_METADATA_SIZE, MAX_TABLE_PARTITION_METADATA_SIZE} import org.apache.spark.sql.SparkSession import org.apache.spark.util.SizeEstimator @@ -111,8 +112,8 @@ private class SharedInMemoryCache(maxSizeInBytes: Long, cacheTTL: Long) extends override def weigh(key: (ClientId, Path), value: Array[FileStatus]): Int = { val estimate = (SizeEstimator.estimate(key) + SizeEstimator.estimate(value)) / weightScale if (estimate > Int.MaxValue) { - logWarning(s"Cached table partition metadata size is too big. Approximating to " + - s"${Int.MaxValue.toLong * weightScale}.") + logWarning(log"Cached table partition metadata size is too big. 
Approximating to " + + log"${MDC(CACHED_TABLE_PARTITION_METADATA_SIZE, Int.MaxValue.toLong * weightScale)}.") Int.MaxValue } else { estimate.toInt @@ -126,9 +127,10 @@ private class SharedInMemoryCache(maxSizeInBytes: Long, cacheTTL: Long) extends if (removed.getCause == RemovalCause.SIZE && warnedAboutEviction.compareAndSet(false, true)) { logWarning( - "Evicting cached table partition metadata from memory due to size constraints " + - "(spark.sql.hive.filesourcePartitionFileCacheSize = " - + maxSizeInBytes + " bytes). This may impact query planning performance.") + log"Evicting cached table partition metadata from memory due to size constraints " + + log"(spark.sql.hive.filesourcePartitionFileCacheSize = " + + log"${MDC(MAX_TABLE_PARTITION_METADATA_SIZE, maxSizeInBytes)} bytes). " + + log"This may impact query planning performance.") } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 0f66aa816d96c..3b8a20c7cf741 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -24,7 +24,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{COUNT, ELAPSED_TIME} import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.FileSourceOptions @@ -136,8 +137,8 @@ class InMemoryFileIndex( fileStatusCache.putLeafFiles(path, leafFiles.toArray) output ++= leafFiles } - logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to list leaf files" + - s" for ${paths.length} paths.") + logInfo(log"It took ${MDC(ELAPSED_TIME, (System.nanoTime() - startTime) / (1000 * 1000))} ms" + + log" to list leaf files for ${MDC(COUNT, paths.length)} paths.") output } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index 3efe614bcef92..07be3f89872cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -24,7 +24,8 @@ import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{COUNT, PERCENT, TOTAL} import org.apache.spark.paths.SparkPath import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} @@ -69,6 +70,13 @@ abstract class PartitioningAwareFileIndex( caseInsensitiveMap.getOrElse(FileIndexOptions.RECURSIVE_FILE_LOOKUP, "false").toBoolean } + protected lazy val ignoreInvalidPartitionPaths: Boolean = { + caseInsensitiveMap + .get(FileIndexOptions.IGNORE_INVALID_PARTITION_PATHS) + .map(_.toBoolean) + .getOrElse(sparkSession.sessionState.conf.ignoreInvalidPartitionPaths) + } + override def listFiles( partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): 
Seq[PartitionDirectory] = { def isNonEmptyFile(f: FileStatus): Boolean = { @@ -161,7 +169,8 @@ abstract class PartitioningAwareFileIndex( userSpecifiedSchema = userSpecifiedSchema, caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis, validatePartitionColumns = sparkSession.sessionState.conf.validatePartitionColumns, - timeZoneId = timeZoneId) + timeZoneId = timeZoneId, + ignoreInvalidPartitionPaths = ignoreInvalidPartitionPaths) } } @@ -190,8 +199,8 @@ abstract class PartitioningAwareFileIndex( val total = partitions.length val selectedSize = selected.length val percentPruned = (1 - selectedSize.toDouble / total.toDouble) * 100 - s"Selected $selectedSize partitions out of $total, " + - s"pruned ${if (total == 0) "0" else s"$percentPruned%"} partitions." + log"Selected ${MDC(COUNT, selectedSize)} partitions out of ${MDC(TOTAL, total)}, " + + log"pruned ${MDC(PERCENT, if (total == 0) "0" else percentPruned)} partitions." } selected diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 56cba0e0561d1..3b2d601b81fb5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -106,9 +106,10 @@ object PartitioningUtils extends SQLConfHelper { userSpecifiedSchema: Option[StructType], caseSensitive: Boolean, validatePartitionColumns: Boolean, - timeZoneId: String): PartitionSpec = { + timeZoneId: String, + ignoreInvalidPartitionPaths: Boolean): PartitionSpec = { parsePartitions(paths, typeInference, basePaths, userSpecifiedSchema, caseSensitive, - validatePartitionColumns, DateTimeUtils.getZoneId(timeZoneId)) + validatePartitionColumns, DateTimeUtils.getZoneId(timeZoneId), ignoreInvalidPartitionPaths) } private[datasources] def parsePartitions( @@ -118,7 +119,8 @@ object PartitioningUtils extends SQLConfHelper { userSpecifiedSchema: Option[StructType], caseSensitive: Boolean, validatePartitionColumns: Boolean, - zoneId: ZoneId): PartitionSpec = { + zoneId: ZoneId, + ignoreInvalidPartitionPaths: Boolean): PartitionSpec = { val userSpecifiedDataTypes = if (userSpecifiedSchema.isDefined) { val nameToDataType = userSpecifiedSchema.get.fields.map(f => f.name -> f.dataType).toMap if (!caseSensitive) { @@ -171,7 +173,7 @@ object PartitioningUtils extends SQLConfHelper { // TODO: Selective case sensitivity. val discoveredBasePaths = optDiscoveredBasePaths.flatten.map(_.toString.toLowerCase()) assert( - discoveredBasePaths.distinct.size == 1, + ignoreInvalidPartitionPaths || discoveredBasePaths.distinct.size == 1, "Conflicting directory structures detected. 
Suspicious paths:\b" + discoveredBasePaths.distinct.mkString("\n\t", "\n\t", "\n\n") + "If provided paths are partition directories, please set " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index b0431d1df3987..1dffea4e1bc87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -63,8 +63,7 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { _)) if filters.nonEmpty && fsRelation.partitionSchema.nonEmpty => val normalizedFilters = DataSourceStrategy.normalizeExprs( - filters.filter(f => !SubqueryExpression.hasSubquery(f) && - DataSourceUtils.shouldPushFilter(f, fsRelation.fileFormat.supportsCollationPushDown)), + filters.filter(f => f.deterministic && !SubqueryExpression.hasSubquery(f)), logicalRelation.output) val (partitionKeyFilters, _) = DataSourceUtils .getPartitionFiltersAndDataFilters(partitionSchema, normalizedFilters) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 144be2316f091..03e988eb0bd2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,7 +21,8 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.CLASS_NAME import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf @@ -44,7 +45,8 @@ class SQLHadoopMapReduceCommitProtocol( configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) if (clazz != null) { - logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") + logInfo(log"Using user defined output committer class " + + log"${MDC(CLASS_NAME, clazz.getCanonicalName)}") // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat // has an associated output committer. 
To override this output committer, @@ -64,7 +66,8 @@ class SQLHadoopMapReduceCommitProtocol( committer = ctor.newInstance() } } - logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") + logInfo(log"Using output committer class " + + log"${MDC(CLASS_NAME, committer.getClass.getCanonicalName)}") committer } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/V1Writes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/V1Writes.scala index d7a8d7aec0b7b..1d6c2a6f81124 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/V1Writes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/V1Writes.scala @@ -213,9 +213,9 @@ object V1WritesUtils { } } - def getWriteFilesOpt(child: SparkPlan): Option[WriteFilesExec] = { + def getWriteFilesOpt(child: SparkPlan): Option[WriteFilesExecBase] = { child.collectFirst { - case w: WriteFilesExec => w + case w: WriteFilesExecBase => w } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteFiles.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteFiles.scala index a4fd57e7dffad..c6c34b7fcea3f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteFiles.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteFiles.scala @@ -58,6 +58,14 @@ case class WriteFiles( copy(child = newChild) } +trait WriteFilesExecBase extends UnaryExecNode { + override def output: Seq[Attribute] = Seq.empty + + override protected def doExecute(): RDD[InternalRow] = { + throw SparkException.internalError(s"$nodeName does not support doExecute") + } +} + /** * Responsible for writing files. */ @@ -67,9 +75,7 @@ case class WriteFilesExec( partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], options: Map[String, String], - staticPartitions: TablePartitionSpec) extends UnaryExecNode { - override def output: Seq[Attribute] = Seq.empty - + staticPartitions: TablePartitionSpec) extends WriteFilesExecBase { override protected def doExecuteWrite( writeFilesSpec: WriteFilesSpec): RDD[WriterCommitMessage] = { val rdd = child.execute() @@ -105,10 +111,6 @@ case class WriteFilesExec( } } - override protected def doExecute(): RDD[InternalRow] = { - throw SparkException.internalError(s"$nodeName does not support doExecute") - } - override protected def stringArgs: Iterator[Any] = Iterator(child) override protected def withNewChildInternal(newChild: SparkPlan): WriteFilesExec = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index cf7c536bdaecb..a8730c20dbcb5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.datasources.csv import java.io.{FileNotFoundException, IOException} import java.nio.charset.{Charset, StandardCharsets} +import scala.util.control.NonFatal + import com.univocity.parsers.csv.CsvParser import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} @@ -28,11 +30,14 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.spark.TaskContext import org.apache.spark.input.{PortableDataStream, StreamInputFormat} -import 
org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PATH +import org.apache.spark.paths.SparkPath import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.csv.{CSVHeaderChecker, CSVInferSchema, CSVOptions, UnivocityParser} +import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.text.TextFileFormat @@ -202,13 +207,16 @@ object MultiLineCSVDataSource extends CSVDataSource with Logging { encoding = parsedOptions.charset) } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: ${lines.getPath()}", e) + logWarning(log"Skipped missing file: ${MDC(PATH, lines.getPath())}", e) Array.empty[Array[String]] case e: FileNotFoundException if !ignoreMissingFiles => throw e case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => - logWarning( - s"Skipped the rest of the content in the corrupted file: ${lines.getPath()}", e) + logWarning(log"Skipped the rest of the content in the corrupted file: " + + log"${MDC(PATH, lines.getPath())}", e) Array.empty[Array[String]] + case NonFatal(e) => + val path = SparkPath.fromPathString(lines.getPath()) + throw QueryExecutionErrors.cannotReadFilesError(e, path.urlEncoded) } }.take(1).headOption match { case Some(firstRow) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala index 43db0c6eef114..481cc80fe5225 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala @@ -25,7 +25,7 @@ import org.apache.commons.io.FilenameUtils import org.apache.spark.SparkFiles import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.TimestampNTZType import org.apache.spark.util.Utils @@ -52,7 +52,14 @@ class JDBCOptions( */ val asProperties: Properties = { val properties = new Properties() - parameters.originalMap.foreach { case (k, v) => properties.setProperty(k, v) } + parameters.originalMap.foreach { case (k, v) => + // If an option value is `null`, throw a user-friendly error. Keys here cannot be null, as + // scala's implementation of Maps prohibits null keys. 
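A brief sketch of the user-facing effect of the null-value check that follows; the connection details are placeholders and an active SparkSession named spark is assumed:

  val opts = new java.util.HashMap[String, String]()
  opts.put("url", "jdbc:postgresql://localhost:5432/db")   // placeholder URL
  opts.put("dbtable", "t")
  opts.put("sessionInitStatement", null)   // previously surfaced as an NPE from java.util.Properties
  // With this change the read fails early with QueryCompilationErrors.nullDataSourceOption("sessionInitStatement").
  spark.read.format("jdbc").options(opts).load()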
+ if (v == null) { + throw QueryCompilationErrors.nullDataSourceOption(k) + } + properties.setProperty(k, v) + } properties } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 8c430e231e399..1b71dc9221f78 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -23,7 +23,8 @@ import scala.util.Using import scala.util.control.NonFatal import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.SQL_TEXT import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.expressions.filter.Predicate @@ -266,7 +267,7 @@ class JDBCRDD( options.sessionInitStatement match { case Some(sql) => val statement = conn.prepareStatement(sql) - logInfo(s"Executing sessionInitStatement: $sql") + logInfo(log"Executing sessionInitStatement: ${MDC(SQL_TEXT, sql)}") try { statement.setQueryTimeout(options.queryTimeout) statement.execute() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala index 4f19d3df40b3c..2c4158dfe1533 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala @@ -21,7 +21,8 @@ import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import org.apache.spark.Partition -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLAUSES, LOWER_BOUND, NEW_VALUE, NUM_PARTITIONS, OLD_VALUE, UPPER_BOUND} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession, SQLContext} import org.apache.spark.sql.catalyst.analysis._ @@ -114,12 +115,12 @@ private[sql] object JDBCRelation extends Logging { (upperBound - lowerBound) < 0) { partitioning.numPartitions } else { - logWarning("The number of partitions is reduced because the specified number of " + - "partitions is less than the difference between upper bound and lower bound. " + - s"Updated number of partitions: ${upperBound - lowerBound}; Input number of " + - s"partitions: ${partitioning.numPartitions}; " + - s"Lower bound: ${boundValueToString(lowerBound)}; " + - s"Upper bound: ${boundValueToString(upperBound)}.") + logWarning(log"The number of partitions is reduced because the specified number of " + + log"partitions is less than the difference between upper bound and lower bound. 
" + + log"Updated number of partitions: ${MDC(NEW_VALUE, upperBound - lowerBound)}; " + + log"Input number of partitions: ${MDC(OLD_VALUE, partitioning.numPartitions)}; " + + log"Lower bound: ${MDC(LOWER_BOUND, boundValueToString(lowerBound))}; " + + log"Upper bound: ${MDC(UPPER_BOUND, boundValueToString(upperBound))}.") upperBound - lowerBound } @@ -163,8 +164,9 @@ private[sql] object JDBCRelation extends Logging { i = i + 1 } val partitions = ans.toArray - logInfo(s"Number of partitions: $numPartitions, WHERE clauses of these partitions: " + - partitions.map(_.asInstanceOf[JDBCPartition].whereClause).mkString(", ")) + val clauses = partitions.map(_.asInstanceOf[JDBCPartition].whereClause).mkString(", ") + logInfo(log"Number of partitions: ${MDC(NUM_PARTITIONS, numPartitions)}, " + + log"WHERE clauses of these partitions: ${MDC(CLAUSES, clauses)}") partitions } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 53b0b8b5d29de..f7d2d61eab653 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -19,10 +19,9 @@ package org.apache.spark.sql.execution.datasources.jdbc import java.math.{BigDecimal => JBigDecimal} import java.nio.charset.StandardCharsets -import java.sql.{Connection, Date, JDBCType, PreparedStatement, ResultSet, ResultSetMetaData, SQLException, Timestamp} +import java.sql.{Connection, Date, JDBCType, PreparedStatement, ResultSet, ResultSetMetaData, SQLException, Time, Timestamp} import java.time.{Instant, LocalDate} import java.util -import java.util.concurrent.TimeUnit import scala.annotation.tailrec import scala.collection.mutable.ArrayBuffer @@ -32,7 +31,8 @@ import scala.util.control.NonFatal import org.apache.spark.{SparkThrowable, SparkUnsupportedOperationException, TaskContext} import org.apache.spark.executor.InputMetrics -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{DEFAULT_ISOLATION_LEVEL, ISOLATION_LEVEL} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.analysis.{DecimalPrecision, Resolver} @@ -40,6 +40,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.connector.catalog.{Identifier, TableChange} import org.apache.spark.sql.connector.catalog.index.{SupportsIndex, TableIndex} @@ -486,15 +487,8 @@ object JdbcUtils extends Logging with SQLConfHelper { // It stores the number of milliseconds after midnight, 00:00:00.000000 case TimestampType if metadata.contains("logical_time_type") => (rs: ResultSet, row: InternalRow, pos: Int) => { - val rawTime = rs.getTime(pos + 1) - if (rawTime != null) { - val localTimeMicro = TimeUnit.NANOSECONDS.toMicros( - rawTime.toLocalTime().toNanoOfDay()) - val utcTimeMicro = toUTCTime(localTimeMicro, conf.sessionLocalTimeZone) - row.setLong(pos, utcTimeMicro) - } else 
{ - row.update(pos, null) - } + row.update(pos, nullSafeConvert[Time]( + rs.getTime(pos + 1), t => Math.multiplyExact(t.getTime, MICROS_PER_MILLIS))) } case TimestampType => @@ -506,6 +500,14 @@ object JdbcUtils extends Logging with SQLConfHelper { row.update(pos, null) } + case TimestampNTZType if metadata.contains("logical_time_type") => + (rs: ResultSet, row: InternalRow, pos: Int) => + val micros = nullSafeConvert[Time](rs.getTime(pos + 1), t => { + val time = dialect.convertJavaTimestampToTimestampNTZ(new Timestamp(t.getTime)) + localDateTimeToMicros(time) + }) + row.update(pos, micros) + case TimestampNTZType => (rs: ResultSet, row: InternalRow, pos: Int) => val t = rs.getTimestamp(pos + 1) @@ -771,11 +773,13 @@ object JdbcUtils extends Logging with SQLConfHelper { // Finally update to actually requested level if possible finalIsolationLevel = isolationLevel } else { - logWarning(s"Requested isolation level $isolationLevel is not supported; " + - s"falling back to default isolation level $defaultIsolation") + logWarning(log"Requested isolation level ${MDC(ISOLATION_LEVEL, isolationLevel)} " + + log"is not supported; falling back to default isolation level " + + log"${MDC(DEFAULT_ISOLATION_LEVEL, defaultIsolation)}") } } else { - logWarning(s"Requested isolation level $isolationLevel, but transactions are unsupported") + logWarning(log"Requested isolation level ${MDC(ISOLATION_LEVEL, isolationLevel)}, " + + log"but transactions are unsupported") } } catch { case NonFatal(e) => logWarning("Exception while detecting transaction support", e) @@ -875,16 +879,15 @@ object JdbcUtils extends Logging with SQLConfHelper { * Compute the schema string for this RDD. */ def schemaString( + dialect: JdbcDialect, schema: StructType, caseSensitive: Boolean, - url: String, createTableColumnTypes: Option[String] = None): String = { val sb = new StringBuilder() - val dialect = JdbcDialects.get(url) val userSpecifiedColTypesMap = createTableColumnTypes - .map(parseUserSpecifiedCreateTableColumnTypes(schema, caseSensitive, _)) + .map(parseUserSpecifiedCreateTableColumnTypes(dialect, schema, caseSensitive, _)) .getOrElse(Map.empty[String, String]) - schema.fields.foreach { field => + schema.foreach { field => val name = dialect.quoteIdentifier(field.name) val typ = userSpecifiedColTypesMap .getOrElse(field.name, getJdbcType(field.dataType, dialect).databaseTypeDefinition) @@ -900,6 +903,7 @@ object JdbcUtils extends Logging with SQLConfHelper { * use in-place of the default data type. 
*/ private def parseUserSpecifiedCreateTableColumnTypes( + dialect: JdbcDialect, schema: StructType, caseSensitive: Boolean, createTableColumnTypes: String): Map[String, String] = { @@ -916,7 +920,9 @@ object JdbcUtils extends Logging with SQLConfHelper { } } - val userSchemaMap = userSchema.fields.map(f => f.name -> f.dataType.catalogString).toMap + val userSchemaMap = userSchema + .map(f => f.name -> getJdbcType(f.dataType, dialect).databaseTypeDefinition) + .toMap if (caseSensitive) userSchemaMap else CaseInsensitiveMap(userSchemaMap) } @@ -985,7 +991,7 @@ object JdbcUtils extends Logging with SQLConfHelper { val statement = conn.createStatement val dialect = JdbcDialects.get(options.url) val strSchema = schemaString( - schema, caseSensitive, options.url, options.createTableColumnTypes) + dialect, schema, caseSensitive, options.createTableColumnTypes) try { statement.setQueryTimeout(options.queryTimeout) dialect.createTable(statement, tableName, strSchema, options) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 7fb6e98fb0468..6174c017f6047 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -55,8 +55,10 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { options, sparkSession.sessionState.conf.sessionLocalTimeZone, sparkSession.sessionState.conf.columnNameOfCorruptRecord) - JsonDataSource(parsedOptions).inferSchema( - sparkSession, files, parsedOptions) + parsedOptions.singleVariantColumn match { + case Some(columnName) => Some(StructType(Array(StructField(columnName, VariantType)))) + case None => JsonDataSource(parsedOptions).inferSchema(sparkSession, files, parsedOptions) + } } override def prepareWrite( @@ -134,7 +136,7 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = other.isInstanceOf[JsonFileFormat] override def supportDataType(dataType: DataType): Boolean = dataType match { - case _: VariantType => false + case _: VariantType => true case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala index 55602ce2ed9b4..5727c502a7097 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala @@ -21,7 +21,8 @@ import java.nio.charset.{Charset, StandardCharsets} import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{ENCODING, PATH} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JSONOptions, JSONOptionsInRead} import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriter} @@ -40,8 +41,9 @@ class JsonOutputWriter( } if (JSONOptionsInRead.denyList.contains(encoding)) { - logWarning(s"The JSON file ($path) was written in the encoding ${encoding.displayName()}" + - " which can be read back by Spark only if multiLine is enabled.") 
+ logWarning(log"The JSON file (${MDC(PATH, path)}) was written in the encoding " + + log"${MDC(ENCODING, encoding.displayName())} which can be read back by Spark only " + + log"if multiLine is enabled.") } private val writer = CodecStreams.createOutputStreamWriter(context, new Path(path), encoding) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index 24943b37d0590..50c28c783b4cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -31,7 +31,8 @@ import org.apache.orc.{BooleanColumnStatistics, ColumnStatistics, DateColumnStat import org.apache.spark.{SPARK_VERSION_SHORT, SparkException} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.sql.{SPARK_VERSION_METADATA_KEY, SparkSession} import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow} import org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution @@ -87,7 +88,7 @@ object OrcUtils extends Logging { } catch { case e: org.apache.orc.FileFormatException => if (ignoreCorruptFiles) { - logWarning(s"Skipped the footer in the corrupted file: $file", e) + logWarning(log"Skipped the footer in the corrupted file: ${MDC(PATH, file)}", e) None } else { throw QueryExecutionErrors.cannotReadFooterForFileError(file, e) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index df367766501d4..e5fbf8be1f0c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -32,7 +32,8 @@ import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GRO import org.apache.parquet.hadoop._ import org.apache.spark.TaskContext -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{PATH, SCHEMA} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -409,8 +410,8 @@ object ParquetFileFormat extends Logging { } .recover { case cause: Throwable => logWarning( - s"""Failed to parse serialized Spark schema in Parquet key-value metadata: - |\t$serializedSchema + log"""Failed to parse serialized Spark schema in Parquet key-value metadata: + |\t${MDC(SCHEMA, serializedSchema)} """.stripMargin, cause) } @@ -450,7 +451,7 @@ object ParquetFileFormat extends Logging { conf, currentFile, SKIP_ROW_GROUPS))) } catch { case e: RuntimeException => if (ignoreCorruptFiles) { - logWarning(s"Skipped the footer in the corrupted file: $currentFile", e) + logWarning(log"Skipped the footer in the corrupted file: ${MDC(PATH, currentFile)}", e) None } else { throw QueryExecutionErrors.cannotReadFooterForFileError(currentFile.getPath, e) @@ -526,8 +527,8 @@ object ParquetFileFormat extends Logging { }.recoverWith { case cause: Throwable => logWarning( - "Failed to parse and ignored serialized Spark schema in " + - s"Parquet key-value 
metadata:\n\t$schemaString", cause) + log"Failed to parse and ignored serialized Spark schema in " + + log"Parquet key-value metadata:\n\t${MDC(SCHEMA, schemaString)}", cause) Failure(cause) }.toOption } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala index 5020bf7333dea..3e111252bc6fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala @@ -33,7 +33,8 @@ import org.apache.parquet.schema.{PrimitiveType, Types} import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName import org.apache.spark.{SparkException, SparkUnsupportedOperationException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, CONFIG} import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow @@ -433,10 +434,11 @@ object ParquetUtils extends Logging { classOf[OutputCommitter]) if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) { - logInfo("Using default output committer for Parquet: " + - classOf[ParquetOutputCommitter].getCanonicalName) + logInfo(log"Using default output committer for Parquet: " + + log"${MDC(CLASS_NAME, classOf[ParquetOutputCommitter].getCanonicalName)}") } else { - logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName) + logInfo(log"Using user defined output committer for Parquet: " + + log"${MDC(CLASS_NAME, committerClass.getCanonicalName)}") } conf.setClass( @@ -485,9 +487,9 @@ object ParquetUtils extends Logging { if (ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass)) { // output summary is requested, but the class is not a Parquet Committer - logWarning(s"Committer $committerClass is not a ParquetOutputCommitter and cannot" + - s" create job summaries. " + - s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE.") + logWarning(log"Committer ${MDC(CLASS_NAME, committerClass)} is not a " + + log"ParquetOutputCommitter and cannot create job summaries. 
Set Parquet option " + + log"${MDC(CONFIG, ParquetOutputFormat.JOB_SUMMARY_LEVEL)} to NONE.") } new OutputWriterFactory { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index fbb2ecb70d395..2f39a1962d2c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -30,11 +30,12 @@ import org.apache.spark.sql.catalyst.util.TypeUtils._ import org.apache.spark.sql.connector.expressions.{FieldReference, RewritableTransform} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.command.DDLUtils +import org.apache.spark.sql.execution.command.ViewHelper.generateViewProperties import org.apache.spark.sql.execution.datasources.{CreateTable => CreateTableV1} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.InsertableRelation -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.util.ArrayImplicits._ @@ -620,3 +621,65 @@ object CollationCheck extends (LogicalPlan => Unit) { private def isCollationExpression(expression: Expression): Boolean = expression.isInstanceOf[Collation] || expression.isInstanceOf[Collate] } + + +/** + * This rule checks for references to views WITH SCHEMA [TYPE] EVOLUTION and synchronizes the + * catalog if evolution was detected. + * It does so by walking the resolved plan looking for View operators for persisted views. 
+ */ +object ViewSyncSchemaToMetaStore extends (LogicalPlan => Unit) { + def apply(plan: LogicalPlan): Unit = { + plan.foreach { + case View(metaData, false, viewQuery) + if (metaData.viewSchemaMode == SchemaTypeEvolution || + metaData.viewSchemaMode == SchemaEvolution) => + val viewSchemaMode = metaData.viewSchemaMode + val viewFields = metaData.schema.fields + val viewQueryFields = viewQuery.schema.fields + val session = SparkSession.getActiveSession.get + val redoSignature = + viewSchemaMode == SchemaEvolution && viewFields.length != viewQueryFields.length + val fieldNames = viewQuery.schema.fieldNames + + val redo = redoSignature || viewFields.zipWithIndex.exists { case (field, index) => + val planField = viewQueryFields(index) + (field.dataType != planField.dataType || + field.nullable != planField.nullable || + (viewSchemaMode == SchemaEvolution && ( + field.getComment() != planField.getComment() || + field.name != planField.name))) + } + + if (redo) { + val newProperties = if (viewSchemaMode == SchemaEvolution) { + generateViewProperties( + metaData.properties, + session, + fieldNames, + fieldNames, + metaData.viewSchemaMode) + } else { + metaData.properties + } + val newSchema = if (viewSchemaMode == SchemaTypeEvolution) { + val newFields = viewQuery.schema.map { + case StructField(name, dataType, nullable, _) => + StructField(name, dataType, nullable, + viewFields.find(_.name == name).get.metadata) + } + StructType(newFields) + } else { + viewQuery.schema + } + SchemaUtils.checkColumnNameDuplication(fieldNames.toImmutableArraySeq, + session.sessionState.conf.resolver) + val updatedViewMeta = metaData.copy( + properties = newProperties, + schema = newSchema) + session.sessionState.catalog.alterTable(updatedViewMeta) + } + case _ => // OK + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala index caa4e3ed386b3..5de51e55816e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala @@ -49,6 +49,12 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { } } + private def verifyReadSchema(schema: StructType): Unit = { + if (schema.size > 1) { + throw QueryCompilationErrors.textDataSourceWithMultiColumnsError(schema) + } + } + override def isSplitable( sparkSession: SparkSession, options: Map[String, String], @@ -98,9 +104,7 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { - assert( - requiredSchema.length <= 1, - "Text data source only produces a single data column named \"value\".") + verifyReadSchema(requiredSchema) val textOptions = new TextOptions(options) val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala index 28241fb0a67ae..56c44a1256815 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala @@ -19,9 +19,11 @@ 
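A note on the logging changes that recur throughout the hunks above and below: plain s-interpolated messages are rewritten with the log interpolator, and each dynamic value is wrapped in MDC with a declared LogKey so structured appenders can emit it as a named field while the rendered message text stays the same. A hedged illustration of the pattern, assuming only the internal Logging, MDC and LogKeys API that these hunks already import (ExampleComponent and warnMissing are illustrative names):

    import org.apache.spark.internal.{Logging, MDC}
    import org.apache.spark.internal.LogKeys.TABLE_NAME

    class ExampleComponent extends Logging {
      def warnMissing(table: String): Unit = {
        // Before: logWarning(s"Table $table not found.")
        // After: the value is tagged with a LogKey, so it can be surfaced as a
        // separate field in structured log output.
        logWarning(log"Table ${MDC(TABLE_NAME, table)} not found.")
      }
    }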
package org.apache.spark.sql.execution.datasources.v2 import java.util.Locale -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.internal.LogKeys.OPTIONS +import org.apache.spark.internal.MDC +import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.LocalTempView +import org.apache.spark.sql.catalyst.analysis.{LocalTempView, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap @@ -32,7 +34,6 @@ import org.apache.spark.storage.StorageLevel trait BaseCacheTableExec extends LeafV2CommandExec { def relationName: String def planToCache: LogicalPlan - def dataFrameForCachedPlan: DataFrame def isLazy: Boolean def options: Map[String, String] @@ -44,18 +45,15 @@ trait BaseCacheTableExec extends LeafV2CommandExec { val withoutStorageLevel = options .filter { case (k, _) => k.toLowerCase(Locale.ROOT) != storageLevelKey } if (withoutStorageLevel.nonEmpty) { - logWarning(s"Invalid options: ${withoutStorageLevel.mkString(", ")}") + logWarning(log"Invalid options: ${MDC(OPTIONS, withoutStorageLevel.mkString(", "))}") } - session.sharedState.cacheManager.cacheQuery( - session, - planToCache, - Some(relationName), - storageLevel) + val df = Dataset.ofRows(session, planToCache) + session.sharedState.cacheManager.cacheQuery(df, Some(relationName), storageLevel) if (!isLazy) { // Performs eager caching. - dataFrameForCachedPlan.count() + df.count() } Seq.empty @@ -72,10 +70,6 @@ case class CacheTableExec( override lazy val relationName: String = multipartIdentifier.quoted override lazy val planToCache: LogicalPlan = relation - - override lazy val dataFrameForCachedPlan: DataFrame = { - Dataset.ofRows(session, planToCache) - } } case class CacheTableAsSelectExec( @@ -87,7 +81,10 @@ case class CacheTableAsSelectExec( referredTempFunctions: Seq[String]) extends BaseCacheTableExec { override lazy val relationName: String = tempViewName - override lazy val planToCache: LogicalPlan = { + override def planToCache: LogicalPlan = UnresolvedRelation(Seq(tempViewName)) + + override def run(): Seq[InternalRow] = { + // CACHE TABLE AS TABLE creates a temp view and caches the temp view. 
CreateViewCommand( name = TableIdentifier(tempViewName), userSpecifiedColumns = Nil, @@ -101,12 +98,7 @@ case class CacheTableAsSelectExec( isAnalyzed = true, referredTempFunctions = referredTempFunctions ).run(session) - - dataFrameForCachedPlan.logicalPlan - } - - override lazy val dataFrameForCachedPlan: DataFrame = { - session.table(tempViewName) + super.run() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala index 63c8dc6517b9e..60d44101da3b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala @@ -21,6 +21,8 @@ import java.util import scala.jdk.CollectionConverters._ +import org.apache.spark.internal.LogKeys.{INDEX_NAME, TABLE_NAME} +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.IndexAlreadyExistsException import org.apache.spark.sql.catalyst.expressions.Attribute @@ -55,7 +57,8 @@ case class CreateIndexExec( indexName, columns.map(_._1).toArray, colProperties, propertiesWithIndexType.asJava) } catch { case _: IndexAlreadyExistsException if ignoreIfExists => - logWarning(s"Index $indexName already exists in table ${table.name}. Ignoring.") + logWarning(log"Index ${MDC(INDEX_NAME, indexName)} already exists in " + + log"table ${MDC(TABLE_NAME, table.name)}. Ignoring.") } Seq.empty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala index cb51b7f75f33a..76ba53ef99a00 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.jdk.CollectionConverters.MapHasAsJava +import org.apache.spark.internal.LogKeys.NAMESPACE +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.NamespaceAlreadyExistsException import org.apache.spark.sql.catalyst.expressions.Attribute @@ -47,7 +49,8 @@ case class CreateNamespaceExec( catalog.createNamespace(ns, (properties ++ ownership).asJava) } catch { case _: NamespaceAlreadyExistsException if ifNotExists => - logWarning(s"Namespace ${namespace.quoted} was created concurrently. Ignoring.") + logWarning(log"Namespace ${MDC(NAMESPACE, namespace.quoted)} was created concurrently. 
" + + log"Ignoring.") } } else if (!ifNotExists) { throw QueryCompilationErrors.namespaceAlreadyExistsError(ns) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala index 5f3ed7a5bc76c..f55fbafe11ddb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.jdk.CollectionConverters._ +import org.apache.spark.internal.LogKeys.TABLE_NAME +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.expressions.Attribute @@ -44,7 +46,8 @@ case class CreateTableExec( catalog.createTable(identifier, columns, partitioning.toArray, tableProperties.asJava) } catch { case _: TableAlreadyExistsException if ignoreIfExists => - logWarning(s"Table ${identifier.quoted} was created concurrently. Ignoring.") + logWarning( + log"Table ${MDC(TABLE_NAME, identifier.quoted)} was created concurrently. Ignoring.") } } else if (!ignoreIfExists) { throw QueryCompilationErrors.tableAlreadyExistsError(identifier) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 828d737f93fa9..7a668b75c3c73 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -22,7 +22,8 @@ import scala.collection.mutable import org.apache.commons.lang3.StringUtils import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.EXPR import org.apache.spark.sql.{SparkSession, Strategy} import org.apache.spark.sql.catalyst.analysis.{ResolvedIdentifier, ResolvedNamespace, ResolvedPartitionSpec, ResolvedTable} import org.apache.spark.sql.catalyst.catalog.CatalogUtils @@ -82,7 +83,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat // given table, the cache's storage level is returned. 
private def invalidateTableCache(r: ResolvedTable)(): Option[StorageLevel] = { val v2Relation = DataSourceV2Relation.create(r.table, Some(r.catalog), Some(r.identifier)) - val cache = session.sharedState.cacheManager.lookupCachedData(v2Relation) + val cache = session.sharedState.cacheManager.lookupCachedData(session, v2Relation) session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade = true) if (cache.isDefined) { val cacheLevel = cache.get.cachedRepresentation.cacheBuilder.storageLevel @@ -650,7 +651,7 @@ private[sql] object DataSourceV2Strategy extends Logging { Some(new Predicate("IN", FieldReference(name) +: literals)) case other => - logWarning(s"Can't translate $other to source filter, unsupported expression") + logWarning(log"Can't translate ${MDC(EXPR, other)} to source filter, unsupported expression") None } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIndexExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIndexExec.scala index 085f961193771..4fe6c3cd4a0e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIndexExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIndexExec.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.v2 +import org.apache.spark.internal.LogKeys.INDEX_NAME +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.NoSuchIndexException import org.apache.spark.sql.catalyst.expressions.Attribute @@ -34,7 +36,7 @@ case class DropIndexExec( table.dropIndex(indexName) } catch { case _: NoSuchIndexException if ignoreIfNotExists => - logWarning(s"Index $indexName does not exist. Ignoring.") + logWarning(log"Index ${MDC(INDEX_NAME, indexName)} does not exist. Ignoring.") } Seq.empty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala index 2f443a0bb1fad..b9f058b55ed02 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.hadoop.mapreduce.Job -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.connector.write.{BatchWrite, DataWriterFactory, PhysicalWriteInfo, WriterCommitMessage} import org.apache.spark.sql.execution.datasources.{WriteJobDescription, WriteTaskResult} @@ -33,14 +33,15 @@ class FileBatchWrite( extends BatchWrite with Logging { override def commit(messages: Array[WriterCommitMessage]): Unit = { val results = messages.map(_.asInstanceOf[WriteTaskResult]) - logInfo(s"Start to commit write Job ${description.uuid}.") + logInfo(log"Start to commit write Job ${MDC(LogKeys.UUID, description.uuid)}.") val (_, duration) = Utils .timeTakenMs { committer.commitJob(job, results.map(_.commitMsg).toImmutableArraySeq) } - logInfo(s"Write Job ${description.uuid} committed. Elapsed time: $duration ms.") + logInfo(log"Write Job ${MDC(LogKeys.UUID, description.uuid)} committed. 
" + + log"Elapsed time: ${MDC(LogKeys.ELAPSED_TIME, duration)} ms.") processStats( description.statsTrackers, results.map(_.summary.stats).toImmutableArraySeq, duration) - logInfo(s"Finished processing stats for write job ${description.uuid}.") + logInfo(log"Finished processing stats for write job ${MDC(LogKeys.UUID, description.uuid)}.") } override def useCommitCoordinator(): Boolean = false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala index c7783c4e9b29b..2679f14144569 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.execution.datasources.v2 import java.io.{FileNotFoundException, IOException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CURRENT_FILE, PARTITIONED_FILE_READER} import org.apache.spark.rdd.InputFileBlockHolder import org.apache.spark.sql.catalyst.FileSourceOptions import org.apache.spark.sql.connector.read.PartitionReader @@ -38,7 +39,7 @@ class FilePartitionReader[T]( if (currentReader == null) { if (files.hasNext) { val file = files.next() - logInfo(s"Reading file $file") + logInfo(log"Reading file ${MDC(CURRENT_FILE, file)}") // Sets InputFileBlockHolder for the file block's information InputFileBlockHolder.set(file.urlEncodedPath, file.start, file.length) try { @@ -64,8 +65,8 @@ class FilePartitionReader[T]( currentReader != null && currentReader.next() } catch { case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => - logWarning( - s"Skipped the rest of the content in the corrupted file: $currentReader", e) + logWarning(log"Skipped the rest of the content in the corrupted file: " + + log"${MDC(PARTITIONED_FILE_READER, currentReader)}", e) false case e: Throwable => throw FileDataSourceV2.attachFilePath(currentReader.file.urlEncodedPath, e) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala index 61d61ee7af250..d890107277d6c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala @@ -21,7 +21,8 @@ import java.util.{Locale, OptionalLong} import org.apache.commons.lang3.StringUtils import org.apache.hadoop.fs.Path -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{PATH, REASON} import org.apache.spark.internal.config.IO_WARNING_LARGEFILETHRESHOLD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, ExpressionSet} @@ -164,8 +165,8 @@ trait FileScan extends Scan val path = splitFiles(0).toPath if (!isSplitable(path) && splitFiles(0).length > sparkSession.sparkContext.getConf.get(IO_WARNING_LARGEFILETHRESHOLD)) { - logWarning(s"Loading one large unsplittable file ${path.toString} with only one " + - s"partition, the reason is: ${getFileUnSplittableReason(path)}") + logWarning(log"Loading one large unsplittable file ${MDC(PATH, path.toString)} with only " + + log"one partition, the reason is: 
${MDC(REASON, getFileUnSplittableReason(path))}") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala index 7cd2779f86f95..447a36fe622c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala @@ -70,10 +70,9 @@ abstract class FileScanBuilder( } override def pushFilters(filters: Seq[Expression]): Seq[Expression] = { - val (filtersToPush, filtersToRemain) = filters.partition( - f => DataSourceUtils.shouldPushFilter(f, supportsCollationPushDown)) + val (deterministicFilters, nonDeterminsticFilters) = filters.partition(_.deterministic) val (partitionFilters, dataFilters) = - DataSourceUtils.getPartitionFiltersAndDataFilters(partitionSchema, filtersToPush) + DataSourceUtils.getPartitionFiltersAndDataFilters(partitionSchema, deterministicFilters) this.partitionFilters = partitionFilters this.dataFilters = dataFilters val translatedFilters = mutable.ArrayBuffer.empty[sources.Filter] @@ -84,7 +83,7 @@ abstract class FileScanBuilder( } } pushedDataFilters = pushDataFilters(translatedFilters.toArray) - dataFilters ++ filtersToRemain + dataFilters ++ nonDeterminsticFilters } override def pushedFilters: Array[Predicate] = pushedDataFilters.map(_.toV2) @@ -96,12 +95,6 @@ abstract class FileScanBuilder( */ protected def pushDataFilters(dataFilters: Array[Filter]): Array[Filter] = Array.empty[Filter] - /** - * Returns whether the file scan builder supports filter pushdown - * for non utf8 binary collated columns. - */ - protected def supportsCollationPushDown: Boolean = false - private def createRequiredNameSet(): Set[String] = requiredSchema.fields.map(PartitioningUtils.getColName(_, isCaseSensitive)).toSet diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala index 4b1a099d3bac9..f18424b4bcb86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala @@ -38,7 +38,7 @@ case class FileWriterFactory ( @transient private lazy val jobId = SparkHadoopWriterUtils.createJobID(jobTrackerID, 0) override def createWriter(partitionId: Int, realTaskId: Long): DataWriter[InternalRow] = { - val taskAttemptContext = createTaskAttemptContext(partitionId) + val taskAttemptContext = createTaskAttemptContext(partitionId, realTaskId.toInt & Int.MaxValue) committer.setupTask(taskAttemptContext) if (description.partitionColumns.isEmpty) { new SingleDirectoryDataWriter(description, taskAttemptContext, committer) @@ -47,9 +47,11 @@ case class FileWriterFactory ( } } - private def createTaskAttemptContext(partitionId: Int): TaskAttemptContextImpl = { + private def createTaskAttemptContext( + partitionId: Int, + realTaskId: Int): TaskAttemptContextImpl = { val taskId = new TaskID(jobId, TaskType.MAP, partitionId) - val taskAttemptId = new TaskAttemptID(taskId, 0) + val taskAttemptId = new TaskAttemptID(taskId, realTaskId) // Set up the configuration object val hadoopConf = description.serializableHadoopConf.value hadoopConf.set("mapreduce.job.id", jobId.toString) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/GroupBasedRowLevelOperationScanPlanning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/GroupBasedRowLevelOperationScanPlanning.scala index 87f70eb696b66..8b8cdc06d398b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/GroupBasedRowLevelOperationScanPlanning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/GroupBasedRowLevelOperationScanPlanning.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources.v2 +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression, ExpressionSet, PredicateHelper, SubqueryExpression} import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral import org.apache.spark.sql.catalyst.planning.{GroupBasedRowLevelOperation, PhysicalOperation} @@ -62,14 +63,16 @@ object GroupBasedRowLevelOperationScanPlanning extends Rule[LogicalPlan] with Pr val (scan, output) = PushDownUtils.pruneColumns(scanBuilder, relation, relation.output, Nil) + // scalastyle:off line.size.limit logInfo( - s""" - |Pushing operators to ${relation.name} - |Pushed filters: $pushedFiltersStr - |Filters evaluated on data source side: ${evaluatedFilters.mkString(", ")} - |Filters evaluated on Spark side: ${postScanFilters.mkString(", ")} - |Output: ${output.mkString(", ")} - """.stripMargin) + log""" + |Pushing operators to ${MDC(LogKeys.RELATION_NAME, relation.name)} + |Pushed filters: ${MDC(LogKeys.PUSHED_FILTERS, pushedFiltersStr)} + |Filters evaluated on data source side: ${MDC(LogKeys.EVALUATED_FILTERS, evaluatedFilters.mkString(", "))} + |Filters evaluated on Spark side: ${MDC(LogKeys.POST_SCAN_FILTERS, postScanFilters.mkString(", "))}} + |Output: ${MDC(LogKeys.RELATION_OUTPUT, output.mkString(", "))} + """.stripMargin) + // scalastyle:on line.size.limit rd transformDown { // simplify the join condition in MERGE operations by discarding already evaluated filters diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanPartitioningAndOrdering.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanPartitioningAndOrdering.scala index cb7c3efdbe482..cfab28bbd15ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanPartitioningAndOrdering.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanPartitioningAndOrdering.scala @@ -16,7 +16,8 @@ */ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.CLASS_NAME import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions.V2ExpressionUtils import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -58,8 +59,9 @@ object V2ScanPartitioningAndOrdering extends Rule[LogicalPlan] with SQLConfHelpe } case _: UnknownPartitioning => None case p => - logWarning(s"Spark ignores the partitioning ${p.getClass.getSimpleName}." + - " Please use KeyGroupedPartitioning for better performance") + logWarning( + log"Spark ignores the partitioning ${MDC(CLASS_NAME, p.getClass.getSimpleName)}. 
" + + log"Please use KeyGroupedPartitioning for better performance") None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala index 8c262cf56e8b5..2b6fcd9d547f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable +import org.apache.spark.internal.LogKeys.{AGGREGATE_FUNCTIONS, GROUP_BY_EXPRS, POST_SCAN_FILTERS, PUSHED_FILTERS, RELATION_NAME, RELATION_OUTPUT} +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.expressions.{aggregate, Alias, And, Attribute, AttributeMap, AttributeReference, AttributeSet, Cast, Expression, IntegerLiteral, Literal, NamedExpression, PredicateHelper, ProjectionOverSchema, SortOrder, SubqueryExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.optimizer.CollapseProject @@ -86,11 +88,11 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper { val postScanFilters = postScanFiltersWithoutSubquery ++ normalizedFiltersWithSubquery logInfo( - s""" - |Pushing operators to ${sHolder.relation.name} - |Pushed Filters: $pushedFiltersStr - |Post-Scan Filters: ${postScanFilters.mkString(",")} - """.stripMargin) + log""" + |Pushing operators to ${MDC(RELATION_NAME, sHolder.relation.name)} + |Pushed Filters: ${MDC(PUSHED_FILTERS, pushedFiltersStr)} + |Post-Scan Filters: ${MDC(POST_SCAN_FILTERS, postScanFilters.mkString(","))} + """.stripMargin) val filterCondition = postScanFilters.reduceLeftOption(And) filterCondition.map(Filter(_, sHolder)).getOrElse(sHolder) @@ -214,13 +216,13 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper { holder.pushedAggOutputMap = AttributeMap(groupOutputMap ++ aggOutputMap) holder.output = newOutput logInfo( - s""" - |Pushing operators to ${holder.relation.name} - |Pushed Aggregate Functions: - | ${translatedAgg.aggregateExpressions().mkString(", ")} - |Pushed Group by: - | ${translatedAgg.groupByExpressions.mkString(", ")} - """.stripMargin) + log""" + |Pushing operators to ${MDC(RELATION_NAME, holder.relation.name)} + |Pushed Aggregate Functions: + | ${MDC(AGGREGATE_FUNCTIONS, translatedAgg.aggregateExpressions().mkString(", "))} + |Pushed Group by: + | ${MDC(GROUP_BY_EXPRS, translatedAgg.groupByExpressions.mkString(", "))} + """.stripMargin) if (canCompletePushDown) { val projectExpressions = finalResultExprs.map { expr => @@ -361,9 +363,9 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper { sHolder.builder, sHolder.relation, normalizedProjects, normalizedFilters) logInfo( - s""" - |Output: ${output.mkString(", ")} - """.stripMargin) + log""" + |Output: ${MDC(RELATION_OUTPUT, output.mkString(", "))} + """.stripMargin) val wrappedScan = getWrappedScan(scan, sHolder) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index c65c15fb0ef28..5632595de7cf8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.jdk.CollectionConverters._ import org.apache.spark.{SparkEnv, SparkException, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.NoSuchTableException @@ -376,8 +376,9 @@ trait V2TableWriteExec extends V2CommandExec with UnaryExecNode { val messages = new Array[WriterCommitMessage](rdd.partitions.length) val totalNumRowsAccumulator = new LongAccumulator() - logInfo(s"Start processing data source write support: $batchWrite. " + - s"The input RDD has ${messages.length} partitions.") + logInfo(log"Start processing data source write support: " + + log"${MDC(LogKeys.BATCH_WRITE, batchWrite)}. The input RDD has " + + log"${MDC(LogKeys.COUNT, messages.length)}} partitions.") // Avoid object not serializable issue. val writeMetrics: Map[String, SQLMetric] = customMetrics @@ -396,22 +397,24 @@ trait V2TableWriteExec extends V2CommandExec with UnaryExecNode { } ) - logInfo(s"Data source write support $batchWrite is committing.") + logInfo(log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} is committing.") batchWrite.commit(messages) - logInfo(s"Data source write support $batchWrite committed.") + logInfo(log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} committed.") commitProgress = Some(StreamWriterCommitProgress(totalNumRowsAccumulator.value)) } catch { case cause: Throwable => - logError(s"Data source write support $batchWrite is aborting.") + logError( + log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} is aborting.") try { batchWrite.abort(messages) } catch { case t: Throwable => - logError(s"Data source write support $batchWrite failed to abort.") + logError(log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} " + + log"failed to abort.") cause.addSuppressed(t) throw QueryExecutionErrors.writingJobFailedError(cause) } - logError(s"Data source write support $batchWrite aborted.") + logError(log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} aborted.") throw cause } @@ -449,34 +452,45 @@ trait WritingSparkTask[W <: DataWriter[InternalRow]] extends Logging with Serial val coordinator = SparkEnv.get.outputCommitCoordinator val commitAuthorized = coordinator.canCommit(stageId, stageAttempt, partId, attemptId) if (commitAuthorized) { - logInfo(s"Commit authorized for partition $partId (task $taskId, attempt $attemptId, " + - s"stage $stageId.$stageAttempt)") + logInfo(log"Commit authorized for partition ${MDC(LogKeys.PARTITION_ID, partId)} " + + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}." 
+ + log"${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") + dataWriter.commit() } else { val commitDeniedException = QueryExecutionErrors.commitDeniedError( partId, taskId, attemptId, stageId, stageAttempt) - logInfo(commitDeniedException.getMessage) + logInfo(log"${MDC(LogKeys.ERROR, commitDeniedException.getMessage)}") // throwing CommitDeniedException will trigger the catch block for abort throw commitDeniedException } } else { - logInfo(s"Writer for partition ${context.partitionId()} is committing.") + logInfo(log"Writer for partition ${MDC(LogKeys.PARTITION_ID, context.partitionId())} " + + log"is committing.") dataWriter.commit() } - logInfo(s"Committed partition $partId (task $taskId, attempt $attemptId, " + - s"stage $stageId.$stageAttempt)") + logInfo(log"Committed partition ${MDC(LogKeys.PARTITION_ID, partId)} " + + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}.${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") DataWritingSparkTaskResult(iterWithMetrics.count, msg) })(catchBlock = { // If there is an error, abort this writer - logError(s"Aborting commit for partition $partId (task $taskId, attempt $attemptId, " + - s"stage $stageId.$stageAttempt)") + logError(log"Aborting commit for partition ${MDC(LogKeys.PARTITION_ID, partId)} " + + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}.${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") dataWriter.abort() - logError(s"Aborted commit for partition $partId (task $taskId, attempt $attemptId, " + - s"stage $stageId.$stageAttempt)") + logError(log"Aborted commit for partition ${MDC(LogKeys.PARTITION_ID, partId)} " + + log"(task ${MDC(LogKeys.TASK_ID, taskId)}, " + + log"attempt ${MDC(LogKeys.TASK_ATTEMPT_ID, attemptId)}, " + + log"stage ${MDC(LogKeys.STAGE_ID, stageId)}.${MDC(LogKeys.STAGE_ATTEMPT, stageAttempt)})") }, finallyBlock = { dataWriter.close() }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCScanBuilder.scala index dc834893db210..230f30fb1d069 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCScanBuilder.scala @@ -89,7 +89,7 @@ case class JDBCScanBuilder( override def pushAggregation(aggregation: Aggregation): Boolean = { if (!jdbcOptions.pushDownAggregate) return false - val compiledAggs = aggregation.aggregateExpressions.flatMap(dialect.compileAggregate) + val compiledAggs = aggregation.aggregateExpressions.flatMap(dialect.compileExpression) if (compiledAggs.length != aggregation.aggregateExpressions.length) return false val compiledGroupBys = aggregation.groupByExpressions.flatMap(dialect.compileExpression) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala index dbd8ee5981daa..e7a3fe0f8aa7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala @@ -131,13 +131,16 @@ class JDBCTableCatalog extends TableCatalog 
checkNamespace(ident.namespace()) val optionsWithTableName = new JDBCOptions( options.parameters + (JDBCOptions.JDBC_TABLE_NAME -> getTableName(ident))) - try { + JdbcUtils.classifyException( + errorClass = "FAILED_JDBC.LOAD_TABLE", + messageParameters = Map( + "url" -> options.getRedactUrl(), + "tableName" -> toSQLId(ident)), + dialect, + description = s"Failed to load table: $ident" + ) { val schema = JDBCRDD.resolveTable(optionsWithTableName) JDBCTable(ident, schema, optionsWithTableName) - } catch { - case e: SQLException => - logWarning("Failed to load table", e) - throw QueryCompilationErrors.noSuchTableError(ident) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonMicroBatchStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonMicroBatchStream.scala index 71e6c29bc299b..0fc1df4cd1e9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonMicroBatchStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonMicroBatchStream.scala @@ -16,12 +16,15 @@ */ package org.apache.spark.sql.execution.datasources.v2.python +import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory} -import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} +import org.apache.spark.sql.connector.read.streaming.{AcceptsLatestSeenOffset, MicroBatchStream, Offset} +import org.apache.spark.sql.execution.datasources.v2.python.PythonMicroBatchStream.nextStreamId import org.apache.spark.sql.execution.python.PythonStreamingSourceRunner import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.storage.{PythonStreamBlockId, StorageLevel} case class PythonStreamingSourceOffset(json: String) extends Offset @@ -30,11 +33,22 @@ class PythonMicroBatchStream( shortName: String, outputSchema: StructType, options: CaseInsensitiveStringMap - ) extends MicroBatchStream with Logging { + ) + extends MicroBatchStream + with Logging + with AcceptsLatestSeenOffset { private def createDataSourceFunc = ds.source.createPythonFunction( ds.getOrCreateDataSourceInPython(shortName, options, Some(outputSchema)).dataSource) + private val streamId = nextStreamId + private var nextBlockId = 0L + + // planInputPartitions() may be called multiple times for the current microbatch. + // Cache the result of planInputPartitions() because it may involve sending data + // from Python to the JVM.
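The comment above motivates the cachedInputPartition field declared just below: planning can run more than once for the same microbatch, and for the simple (non-partitioned) reader each call would otherwise re-transfer the prefetched rows from Python. A simplified sketch of that memoization by (start, end) offset pair (OffsetRangeCache and getOrCompute are illustrative names, not Spark's):

    // Memoize the last planning result keyed by its (start, end) offset pair so that
    // repeated calls within one microbatch reuse it instead of recomputing it.
    class OffsetRangeCache[T] {
      private var cached: Option[(String, String, T)] = None

      def getOrCompute(start: String, end: String)(compute: => T): T = cached match {
        case Some((s, e, value)) if s == start && e == end => value
        case _ =>
          val value = compute
          cached = Some((start, end, value))
          value
      }
    }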
+ private var cachedInputPartition: Option[(String, String, PythonStreamingInputPartition)] = None + private val runner: PythonStreamingSourceRunner = new PythonStreamingSourceRunner(createDataSourceFunc, outputSchema) runner.init() @@ -44,9 +58,35 @@ class PythonMicroBatchStream( override def latestOffset(): Offset = PythonStreamingSourceOffset(runner.latestOffset()) override def planInputPartitions(start: Offset, end: Offset): Array[InputPartition] = { - runner.partitions(start.asInstanceOf[PythonStreamingSourceOffset].json, - end.asInstanceOf[PythonStreamingSourceOffset].json) - .zipWithIndex.map(p => PythonInputPartition(p._2, p._1)) + val startOffsetJson = start.asInstanceOf[PythonStreamingSourceOffset].json + val endOffsetJson = end.asInstanceOf[PythonStreamingSourceOffset].json + + if (cachedInputPartition.exists(p => p._1 == startOffsetJson && p._2 == endOffsetJson)) { + return Array(cachedInputPartition.get._3) + } + + val (partitions, rows) = runner.partitions(startOffsetJson, endOffsetJson) + if (rows.isDefined) { + // Only SimpleStreamReader without partitioning prefetch data. + assert(partitions.length == 1) + nextBlockId = nextBlockId + 1 + val blockId = PythonStreamBlockId(streamId, nextBlockId) + SparkEnv.get.blockManager.putIterator( + blockId, rows.get, StorageLevel.MEMORY_AND_DISK_SER, true) + val partition = PythonStreamingInputPartition(0, partitions.head, Some(blockId)) + cachedInputPartition.foreach(_._3.dropCache()) + cachedInputPartition = Some((startOffsetJson, endOffsetJson, partition)) + Array(partition) + } else { + partitions.zipWithIndex + .map(p => PythonStreamingInputPartition(p._2, p._1, None)) + } + } + + override def setLatestSeenOffset(offset: Offset): Unit = { + // Call planPartition on python with an empty offset range to initialize the start offset + // for the prefetching of simple reader. 
+ runner.partitions(offset.json(), offset.json()) } private lazy val readInfo: PythonDataSourceReadInfo = { @@ -57,7 +97,7 @@ class PythonMicroBatchStream( } override def createReaderFactory(): PartitionReaderFactory = { - new PythonPartitionReaderFactory( + new PythonStreamingPartitionReaderFactory( ds.source, readInfo.func, outputSchema, None) } @@ -66,9 +106,18 @@ class PythonMicroBatchStream( } override def stop(): Unit = { + cachedInputPartition.foreach(_._3.dropCache()) runner.stop() } override def deserializeOffset(json: String): Offset = PythonStreamingSourceOffset(json) } +object PythonMicroBatchStream { + private var currentId = 0 + def nextStreamId: Int = synchronized { + currentId = currentId + 1 + currentId + } +} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonScan.scala index 8fefc8b144a1f..8ebb91c01fc5c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonScan.scala @@ -41,6 +41,9 @@ class PythonScan( override def supportedCustomMetrics(): Array[CustomMetric] = ds.source.createPythonMetrics() + + override def columnarSupportMode(): Scan.ColumnarSupportMode = + Scan.ColumnarSupportMode.UNSUPPORTED } class PythonBatch( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingPartitionReaderFactory.scala new file mode 100644 index 0000000000000..7d80cc2728102 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingPartitionReaderFactory.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package org.apache.spark.sql.execution.datasources.v2.python + +import org.apache.spark.SparkEnv +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.metric.CustomTaskMetric +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.types.StructType +import org.apache.spark.storage.PythonStreamBlockId + + +case class PythonStreamingInputPartition( + index: Int, + pickedPartition: Array[Byte], + blockId: Option[PythonStreamBlockId]) extends InputPartition { + def dropCache(): Unit = { + blockId.foreach(SparkEnv.get.blockManager.master.removeBlock(_)) + } +} + +class PythonStreamingPartitionReaderFactory( + source: UserDefinedPythonDataSource, + pickledReadFunc: Array[Byte], + outputSchema: StructType, + jobArtifactUUID: Option[String]) + extends PartitionReaderFactory with Logging { + + override def createReader(partition: InputPartition): PartitionReader[InternalRow] = { + val part = partition.asInstanceOf[PythonStreamingInputPartition] + + // Maybe read from cached block prefetched by SimpleStreamReader + lazy val cachedBlock = if (part.blockId.isDefined) { + val block = SparkEnv.get.blockManager.get[InternalRow](part.blockId.get) + .map(_.data.asInstanceOf[Iterator[InternalRow]]) + if (block.isEmpty) { + logWarning(log"Prefetched block ${MDC(LogKeys.BLOCK_ID, part.blockId)} " + + log"for Python data source not found.") + } + block + } else None + + new PartitionReader[InternalRow] { + + private[this] val metrics: Map[String, SQLMetric] = PythonCustomMetric.pythonMetrics + + private val outputIter = if (cachedBlock.isEmpty) { + // Evaluate the python read UDF if the partition is not cached as block. 
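Editorial note: the reader defined below prefers rows prefetched into block storage and only re-runs the Python read function when that block is gone. A small sketch of just that decision, with stand-in names (CachedOrRecompute, evaluate) and a plain println in place of structured logging:

// Prefer prefetched rows; fall back to evaluating the Python read function only
// when the cached block is no longer available.
object CachedOrRecompute {
  def rowsFor[Row](
      partitionIndex: Int,
      cachedBlock: Option[Iterator[Row]],   // rows prefetched by the driver, if still present
      evaluate: () => Iterator[Row]): Iterator[Row] =
    cachedBlock.getOrElse {
      println(s"prefetched block for partition $partitionIndex not found; recomputing")
      evaluate()
    }
}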
+ val evaluatorFactory = source.createMapInBatchEvaluatorFactory( + pickledReadFunc, + "read_from_data_source", + UserDefinedPythonDataSource.readInputSchema, + outputSchema, + metrics, + jobArtifactUUID) + + evaluatorFactory.createEvaluator().eval( + part.index, Iterator.single(InternalRow(part.pickedPartition))) + } else cachedBlock.get + + override def next(): Boolean = outputIter.hasNext + + override def get(): InternalRow = outputIter.next() + + override def close(): Unit = {} + + override def currentMetricsValues(): Array[CustomTaskMetric] = { + source.createPythonTaskMetrics(metrics.map { case (k, v) => k -> v.value }) + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingSinkCommitRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingSinkCommitRunner.scala index a444fdfff7d96..b04ebe92910ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingSinkCommitRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingSinkCommitRunner.scala @@ -17,17 +17,14 @@ package org.apache.spark.sql.execution.datasources.v2.python -import java.io.{BufferedInputStream, BufferedOutputStream, DataInputStream, DataOutputStream} +import java.io.{DataInputStream, DataOutputStream} -import scala.jdk.CollectionConverters._ +import net.razorvine.pickle.Pickler -import org.apache.spark.SparkEnv -import org.apache.spark.api.python.{PythonFunction, PythonWorker, PythonWorkerFactory, PythonWorkerUtils, SpecialLengths} -import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.BUFFER_SIZE -import org.apache.spark.internal.config.Python.PYTHON_AUTH_SOCKET_TIMEOUT +import org.apache.spark.api.python.{PythonFunction, PythonWorkerUtils, SpecialLengths} import org.apache.spark.sql.connector.write.WriterCommitMessage import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.python.PythonPlannerRunner import org.apache.spark.sql.types.StructType /** @@ -38,78 +35,22 @@ import org.apache.spark.sql.types.StructType * from the socket, then commit or abort a microbatch. */ class PythonStreamingSinkCommitRunner( - func: PythonFunction, + dataSourceCls: PythonFunction, schema: StructType, - overwrite: Boolean) extends Logging { - val workerModule: String = "pyspark.sql.worker.python_streaming_sink_runner" - - private val conf = SparkEnv.get.conf - protected val bufferSize: Int = conf.get(BUFFER_SIZE) - protected val authSocketTimeout = conf.get(PYTHON_AUTH_SOCKET_TIMEOUT) - - private val envVars: java.util.Map[String, String] = func.envVars - private val pythonExec: String = func.pythonExec - private var pythonWorker: Option[PythonWorker] = None - private var pythonWorkerFactory: Option[PythonWorkerFactory] = None - protected val pythonVer: String = func.pythonVer - - private var dataOut: DataOutputStream = null - private var dataIn: DataInputStream = null - - /** - * Initializes the Python worker for running the streaming sink committer. 
- */ - def init(): Unit = { - logInfo(s"Initializing Python runner pythonExec: $pythonExec") - val env = SparkEnv.get - - val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") - envVars.put("SPARK_LOCAL_DIRS", localdir) - - envVars.put("SPARK_AUTH_SOCKET_TIMEOUT", authSocketTimeout.toString) - envVars.put("SPARK_BUFFER_SIZE", bufferSize.toString) - - val workerFactory = - new PythonWorkerFactory(pythonExec, workerModule, envVars.asScala.toMap, false) - val (worker: PythonWorker, _) = workerFactory.createSimpleWorker(blockingMode = true) - pythonWorker = Some(worker) - pythonWorkerFactory = Some(workerFactory) - - val stream = new BufferedOutputStream( - pythonWorker.get.channel.socket().getOutputStream, bufferSize) - dataOut = new DataOutputStream(stream) - - PythonWorkerUtils.writePythonVersion(pythonVer, dataOut) - - val pythonIncludes = func.pythonIncludes.asScala.toSet - PythonWorkerUtils.writeSparkFiles(Some("streaming_job"), pythonIncludes, dataOut) - - // Send the user function to python process - PythonWorkerUtils.writePythonFunction(func, dataOut) - + messages: Array[WriterCommitMessage], + batchId: Long, + overwrite: Boolean, + abort: Boolean) extends PythonPlannerRunner[Unit](dataSourceCls) { + override val workerModule: String = "pyspark.sql.worker.python_streaming_sink_runner" + + override protected def writeToPython(dataOut: DataOutputStream, pickler: Pickler): Unit = { + // Send the user function to python process. + PythonWorkerUtils.writePythonFunction(dataSourceCls, dataOut) + // Send the output schema. PythonWorkerUtils.writeUTF(schema.json, dataOut) - dataOut.writeBoolean(overwrite) - dataOut.flush() - - dataIn = new DataInputStream( - new BufferedInputStream(pythonWorker.get.channel.socket().getInputStream, bufferSize)) - - val initStatus = dataIn.readInt() - if (initStatus == SpecialLengths.PYTHON_EXCEPTION_THROWN) { - val msg = PythonWorkerUtils.readUTF(dataIn) - throw QueryExecutionErrors.pythonStreamingDataSourceRuntimeError( - action = "initialize streaming sink", msg) - } - } - - init() - - def commitOrAbort( - messages: Array[WriterCommitMessage], - batchId: Long, - abort: Boolean): Unit = { + // Send the commit messages. dataOut.writeInt(messages.length) messages.foreach { message => // Commit messages can be null if there are task failures. @@ -121,10 +62,14 @@ class PythonStreamingSinkCommitRunner( } } dataOut.writeLong(batchId) + // Send whether to invoke `abort` instead of `commit`. dataOut.writeBoolean(abort) - dataOut.flush() - val status = dataIn.readInt() - if (status == SpecialLengths.PYTHON_EXCEPTION_THROWN) { + } + + override protected def receiveFromPython(dataIn: DataInputStream): Unit = { + // Receive any exceptions thrown in the Python worker. 
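Editorial note: after the refactor above, the commit runner only has to describe what it writes to the Python worker and how it interprets the single status integer it reads back; the worker lifecycle now lives in PythonPlannerRunner. A toy, self-contained sketch of that request/status shape over plain Java data streams (the constants and class names are stand-ins, not the real protocol):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

// Write a batch id plus an abort flag, then read a single status int back and
// raise if the worker reported an exception.
object CommitProtocolSketch {
  private val ExceptionThrown = -1   // stand-in for SpecialLengths.PYTHON_EXCEPTION_THROWN
  private val Ok = 0

  def writeRequest(out: DataOutputStream, batchId: Long, abort: Boolean): Unit = {
    out.writeLong(batchId)
    out.writeBoolean(abort)
    out.flush()
  }

  def readResponse(in: DataInputStream, abort: Boolean): Unit = {
    val code = in.readInt()
    if (code == ExceptionThrown) {
      val action = if (abort) "abort" else "commit"
      throw new RuntimeException(s"streaming sink $action failed in the Python worker")
    }
  }

  def main(args: Array[String]): Unit = {
    // Simulate a worker that replies Ok.
    val request = new ByteArrayOutputStream()
    writeRequest(new DataOutputStream(request), batchId = 7L, abort = false)

    val reply = new ByteArrayOutputStream()
    new DataOutputStream(reply).writeInt(Ok)
    readResponse(new DataInputStream(new ByteArrayInputStream(reply.toByteArray)), abort = false)
    println("commit acknowledged")
  }
}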
+ val code = dataIn.readInt() + if (code == SpecialLengths.PYTHON_EXCEPTION_THROWN) { val msg = PythonWorkerUtils.readUTF(dataIn) val action = if (abort) "abort" else "commit" throw QueryExecutionErrors.pythonStreamingDataSourceRuntimeError(action, msg) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingWrite.scala index 483fd5a4e0a1e..4c149437a3009 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonStreamingWrite.scala @@ -42,9 +42,6 @@ class PythonStreamingWrite( ds.getOrCreateDataSourceInPython(shortName, info.options(), Some(info.schema())).dataSource ) - private lazy val pythonStreamingSinkCommitRunner = - new PythonStreamingSinkCommitRunner(createDataSourceFunc, info.schema(), isTruncate) - override def createStreamingWriterFactory( physicalInfo: PhysicalWriteInfo): StreamingDataWriterFactory = { val writeInfo = ds.source.createWriteInfoInPython( @@ -60,11 +57,23 @@ class PythonStreamingWrite( } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { - pythonStreamingSinkCommitRunner.commitOrAbort(messages, epochId, false) + new PythonStreamingSinkCommitRunner( + createDataSourceFunc, + info.schema(), + messages, + batchId = epochId, + overwrite = isTruncate, + abort = false).runInPython() } override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { - pythonStreamingSinkCommitRunner.commitOrAbort(messages, epochId, true) + new PythonStreamingSinkCommitRunner( + createDataSourceFunc, + info.schema(), + messages, + batchId = epochId, + overwrite = isTruncate, + abort = true).runInPython() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala index 1a8f444042c23..e2724cb59754d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala @@ -116,12 +116,16 @@ case class StateSourceOptions( batchId: Long, operatorId: Int, storeName: String, - joinSide: JoinSideValues) { + joinSide: JoinSideValues, + snapshotStartBatchId: Option[Long], + snapshotPartitionId: Option[Int]) { def stateCheckpointLocation: Path = new Path(resolvedCpLocation, DIR_NAME_STATE) override def toString: String = { s"StateSourceOptions(checkpointLocation=$resolvedCpLocation, batchId=$batchId, " + - s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide)" + s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide, " + + s"snapshotStartBatchId=${snapshotStartBatchId.getOrElse("None")}, " + + s"snapshotPartitionId=${snapshotPartitionId.getOrElse("None")})" } } @@ -131,6 +135,8 @@ object StateSourceOptions extends DataSourceOptions { val OPERATOR_ID = newOption("operatorId") val STORE_NAME = newOption("storeName") val JOIN_SIDE = newOption("joinSide") + val SNAPSHOT_START_BATCH_ID = newOption("snapshotStartBatchId") + val SNAPSHOT_PARTITION_ID = newOption("snapshotPartitionId") object JoinSideValues extends Enumeration { type JoinSideValues = Value @@ -190,7 +196,30 @@ object StateSourceOptions extends DataSourceOptions { throw 
StateDataSourceErrors.conflictOptions(Seq(JOIN_SIDE, STORE_NAME)) } - StateSourceOptions(resolvedCpLocation, batchId, operatorId, storeName, joinSide) + val snapshotStartBatchId = Option(options.get(SNAPSHOT_START_BATCH_ID)).map(_.toLong) + if (snapshotStartBatchId.exists(_ < 0)) { + throw StateDataSourceErrors.invalidOptionValueIsNegative(SNAPSHOT_START_BATCH_ID) + } else if (snapshotStartBatchId.exists(_ > batchId)) { + throw StateDataSourceErrors.invalidOptionValue( + SNAPSHOT_START_BATCH_ID, s"value should be less than or equal to $batchId") + } + + val snapshotPartitionId = Option(options.get(SNAPSHOT_PARTITION_ID)).map(_.toInt) + if (snapshotPartitionId.exists(_ < 0)) { + throw StateDataSourceErrors.invalidOptionValueIsNegative(SNAPSHOT_PARTITION_ID) + } + + // both snapshotPartitionId and snapshotStartBatchId are required at the same time, because + // each partition may have different checkpoint status + if (snapshotPartitionId.isDefined && snapshotStartBatchId.isEmpty) { + throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_START_BATCH_ID) + } else if (snapshotPartitionId.isEmpty && snapshotStartBatchId.isDefined) { + throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_PARTITION_ID) + } + + StateSourceOptions( + resolvedCpLocation, batchId, operatorId, storeName, + joinSide, snapshotStartBatchId, snapshotPartitionId) } private def resolvedCheckpointLocation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala index bbfe3a3f373ec..f09a2763031e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow} import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil -import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, PrefixKeyScanStateEncoderSpec, ReadStateStore, StateStoreConf, StateStoreId, StateStoreProvider, StateStoreProviderId} +import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration @@ -93,7 +93,19 @@ class StatePartitionReader( } private lazy val store: ReadStateStore = { - provider.getReadStore(partition.sourceOptions.batchId + 1) + partition.sourceOptions.snapshotStartBatchId match { + case None => provider.getReadStore(partition.sourceOptions.batchId + 1) + + case Some(snapshotStartBatchId) => + if (!provider.isInstanceOf[SupportsFineGrainedReplay]) { + throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay( + provider.getClass.toString) + } + provider.asInstanceOf[SupportsFineGrainedReplay] + .replayReadStateFromSnapshot( + snapshotStartBatchId + 1, + partition.sourceOptions.batchId + 1) + } } private lazy val iter: Iterator[InternalRow] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateScanBuilder.scala index 
0d69bf708e94f..ffcbcd0872e10 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateScanBuilder.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan, ScanBuilder} import org.apache.spark.sql.execution.datasources.v2.state.StateSourceOptions.JoinSideValues import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.{LeftSide, RightSide} -import org.apache.spark.sql.execution.streaming.state.StateStoreConf +import org.apache.spark.sql.execution.streaming.state.{StateStoreConf, StateStoreErrors} import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration @@ -81,9 +81,20 @@ class StateScan( assert((tail - head + 1) == partitionNums.length, s"No continuous partitions in state: ${partitionNums.mkString("Array(", ", ", ")")}") - partitionNums.map { - pn => new StateStoreInputPartition(pn, queryId, sourceOptions) - }.toArray + sourceOptions.snapshotPartitionId match { + case None => partitionNums.map { pn => + new StateStoreInputPartition(pn, queryId, sourceOptions) + }.toArray + + case Some(snapshotPartitionId) => + if (partitionNums.contains(snapshotPartitionId)) { + Array(new StateStoreInputPartition(snapshotPartitionId, queryId, sourceOptions)) + } else { + throw StateStoreErrors.stateStoreSnapshotPartitionNotFound( + snapshotPartitionId, sourceOptions.operatorId, + sourceOptions.stateCheckpointLocation.toString) + } + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateTable.scala index 824968e709baf..dbd39f519e500 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateTable.scala @@ -49,16 +49,21 @@ class StateTable( } override def name(): String = { - val desc = s"StateTable " + + var desc = s"StateTable " + s"[stateCkptLocation=${sourceOptions.stateCheckpointLocation}]" + s"[batchId=${sourceOptions.batchId}][operatorId=${sourceOptions.operatorId}]" + s"[storeName=${sourceOptions.storeName}]" if (sourceOptions.joinSide != JoinSideValues.none) { - desc + s"[joinSide=${sourceOptions.joinSide}]" - } else { - desc + desc += s"[joinSide=${sourceOptions.joinSide}]" + } + if (sourceOptions.snapshotStartBatchId.isDefined) { + desc += s"[snapshotStartBatchId=${sourceOptions.snapshotStartBatchId}]" + } + if (sourceOptions.snapshotPartitionId.isDefined) { + desc += s"[snapshotPartitionId=${sourceOptions.snapshotPartitionId}]" } + desc } override def capabilities(): util.Set[TableCapability] = CAPABILITY diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StreamStreamJoinStatePartitionReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StreamStreamJoinStatePartitionReader.scala index e5a5dddefef5b..91f42db46dfb0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StreamStreamJoinStatePartitionReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StreamStreamJoinStatePartitionReader.scala @@ -116,7 +116,8 @@ class StreamStreamJoinStatePartitionReader( partitionId = 
partition.partition, formatVersion, skippedNullValueCount = None, - useStateStoreCoordinator = false + useStateStoreCoordinator = false, + snapshotStartVersion = partition.sourceOptions.snapshotStartBatchId.map(_ + 1) ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala index 761d88b5431fa..d19a8adb6ba26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.connector.read.PartitionReaderFactory +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.text.TextOptions import org.apache.spark.sql.execution.datasources.v2.TextBasedFileScan @@ -44,6 +45,12 @@ case class TextScan( private val optionsAsScala = options.asScala.toMap private lazy val textOptions: TextOptions = new TextOptions(optionsAsScala) + private def verifyReadSchema(schema: StructType): Unit = { + if (schema.size > 1) { + throw QueryCompilationErrors.textDataSourceWithMultiColumnsError(schema) + } + } + override def isSplitable(path: Path): Boolean = { super.isSplitable(path) && !textOptions.wholeText } @@ -58,9 +65,7 @@ case class TextScan( } override def createReaderFactory(): PartitionReaderFactory = { - assert( - readDataSchema.length <= 1, - "Text data source only produces a single data column named \"value\".") + verifyReadSchema(readDataSchema) val hadoopConf = { val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap // Hadoop Configurations are case sensitive. 
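Editorial note, recapping the snapshotStartBatchId / snapshotPartitionId handling added to StateSourceOptions a few hunks above: the two options are only meaningful as a pair, must be non-negative, and the snapshot start batch cannot be newer than the batch being read. A standalone sketch of those checks using plain require instead of StateDataSourceErrors:

// Validation rules for the snapshot-based state source options.
object SnapshotOptionCheck {
  def validate(
      batchId: Long,
      snapshotStartBatchId: Option[Long],
      snapshotPartitionId: Option[Int]): Unit = {
    snapshotStartBatchId.foreach { start =>
      require(start >= 0, "snapshotStartBatchId must not be negative")
      require(start <= batchId, s"snapshotStartBatchId must be <= $batchId")
    }
    snapshotPartitionId.foreach { pid =>
      require(pid >= 0, "snapshotPartitionId must not be negative")
    }
    // Each partition may have a different checkpoint layout, so the options
    // are only meaningful when specified together.
    require(
      snapshotStartBatchId.isDefined == snapshotPartitionId.isDefined,
      "snapshotStartBatchId and snapshotPartitionId must be specified together")
  }
}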
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XSDToSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XSDToSchema.scala index 87082299615c3..c03c0ba11de57 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XSDToSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XSDToSchema.scala @@ -47,7 +47,7 @@ object XSDToSchema extends Logging{ def read(xsdPath: Path): StructType = { val in = ValidatorUtil.openSchemaFile(xsdPath) val xmlSchemaCollection = new XmlSchemaCollection() - xmlSchemaCollection.setBaseUri(xsdPath.getParent.toString) + xmlSchemaCollection.setBaseUri(xsdPath.toString) val xmlSchema = xmlSchemaCollection.read(new InputStreamReader(in)) getStructType(xmlSchema) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlInputFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlInputFormat.scala index 4359ac02f5f58..6169cec6f8210 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlInputFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlInputFormat.scala @@ -20,7 +20,7 @@ import java.io.{InputStream, InputStreamReader, IOException, Reader} import java.nio.ByteBuffer import java.nio.charset.Charset -import org.apache.commons.io.input.CountingInputStream +import org.apache.commons.io.input.BoundedInputStream import org.apache.hadoop.fs.Seekable import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.io.compress._ @@ -67,7 +67,7 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { private var end: Long = _ private var reader: Reader = _ private var filePosition: Seekable = _ - private var countingIn: CountingInputStream = _ + private var countingIn: BoundedInputStream = _ private var readerLeftoverCharFn: () => Boolean = _ private var readerByteBuffer: ByteBuffer = _ private var decompressor: Decompressor = _ @@ -117,7 +117,9 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { } } else { fsin.seek(start) - countingIn = new CountingInputStream(fsin) + countingIn = BoundedInputStream.builder() + .setInputStream(fsin) + .get() in = countingIn // don't use filePosition in this case. 
We have to count bytes read manually } @@ -156,7 +158,7 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { if (filePosition != null) { filePosition.getPos } else { - start + countingIn.getByteCount - + start + countingIn.getCount - readerByteBuffer.remaining() - (if (readerLeftoverCharFn()) 1 else 0) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala index 866a62a3a0776..7844f470b0ef0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala @@ -25,6 +25,8 @@ import scala.concurrent.duration.NANOSECONDS import scala.util.control.NonFatal import org.apache.spark.{broadcast, SparkException} +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow @@ -65,11 +67,22 @@ trait BroadcastExchangeLike extends Exchange { * It also does the preparations work, such as waiting for the subqueries. */ final def submitBroadcastJob: scala.concurrent.Future[broadcast.Broadcast[Any]] = executeQuery { + materializationStarted.set(true) completionFuture } protected def completionFuture: scala.concurrent.Future[broadcast.Broadcast[Any]] + /** + * Cancels broadcast job. + */ + final def cancelBroadcastJob(): Unit = { + if (isMaterializationStarted() && !this.relationFuture.isDone) { + sparkContext.cancelJobsWithTag(this.jobTag) + this.relationFuture.cancel(true) + } + } + /** * Returns the runtime statistics after broadcast materialization. 
*/ @@ -212,7 +225,7 @@ case class BroadcastExchangeExec( relationFuture.get(timeout, TimeUnit.SECONDS).asInstanceOf[broadcast.Broadcast[T]] } catch { case ex: TimeoutException => - logError(s"Could not execute broadcast in $timeout secs.", ex) + logError(log"Could not execute broadcast in ${MDC(TIMEOUT, timeout)} secs.", ex) if (!relationFuture.isDone) { sparkContext.cancelJobsWithTag(jobTag) relationFuture.cancel(true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index a0f74ef6c3d02..67d879bdd8bf4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -407,11 +408,13 @@ case class EnsureRequirements( val leftPartValues = leftSpec.partitioning.partitionValues val rightPartValues = rightSpec.partitioning.partitionValues + val numLeftPartValues = MDC(LogKeys.NUM_LEFT_PARTITION_VALUES, leftPartValues.size) + val numRightPartValues = MDC(LogKeys.NUM_RIGHT_PARTITION_VALUES, rightPartValues.size) logInfo( - s""" - |Left side # of partitions: ${leftPartValues.size} - |Right side # of partitions: ${rightPartValues.size} - |""".stripMargin) + log""" + |Left side # of partitions: $numLeftPartValues + |Right side # of partitions: $numRightPartValues + |""".stripMargin) // As partition keys are compatible, we can pick either left or right as partition // expressions @@ -421,7 +424,8 @@ case class EnsureRequirements( .mergePartitions(leftSpec.partitioning, rightSpec.partitioning, partitionExprs) .map(v => (v, 1)) - logInfo(s"After merging, there are ${mergedPartValues.size} partitions") + logInfo(log"After merging, there are " + + log"${MDC(LogKeys.NUM_PARTITIONS, mergedPartValues.size)} partitions") var replicateLeftSide = false var replicateRightSide = false @@ -445,8 +449,8 @@ case class EnsureRequirements( val canReplicateRight = canReplicateRightSide(joinType) if (!canReplicateLeft && !canReplicateRight) { - logInfo("Skipping partially clustered distribution as it cannot be applied for " + - s"join type '$joinType'") + logInfo(log"Skipping partially clustered distribution as it cannot be applied for " + + log"join type '${MDC(LogKeys.JOIN_TYPE, joinType)}'") } else { val leftLink = left.logicalLink val rightLink = right.logicalLink @@ -455,12 +459,16 @@ case class EnsureRequirements( leftLink.isDefined && rightLink.isDefined && leftLink.get.stats.sizeInBytes > 1 && rightLink.get.stats.sizeInBytes > 1) { + val leftLinkStatsSizeInBytes = MDC(LogKeys.LEFT_LOGICAL_PLAN_STATS_SIZE_IN_BYTES, + leftLink.get.stats.sizeInBytes) + val rightLinkStatsSizeInBytes = MDC(LogKeys.RIGHT_LOGICAL_PLAN_STATS_SIZE_IN_BYTES, + rightLink.get.stats.sizeInBytes) logInfo( - s""" + log""" |Using plan statistics to determine which side of join to fully |cluster partition values: - |Left side size (in bytes): ${leftLink.get.stats.sizeInBytes} - |Right side size (in bytes): ${rightLink.get.stats.sizeInBytes} + |Left side size (in bytes): $leftLinkStatsSizeInBytes + |Right side size (in bytes): 
$rightLinkStatsSizeInBytes |""".stripMargin) leftLink.get.stats.sizeInBytes < rightLink.get.stats.sizeInBytes } else { @@ -477,12 +485,14 @@ case class EnsureRequirements( // of partitions can be applied. For instance, replication should not be allowed for // the left-hand side of a right outer join. if (replicateLeftSide && !canReplicateLeft) { - logInfo("Left-hand side is picked but cannot be applied to join type " + - s"'$joinType'. Skipping partially clustered distribution.") + logInfo(log"Left-hand side is picked but cannot be applied to join type " + + log"'${MDC(LogKeys.JOIN_TYPE, joinType)}'. Skipping partially clustered " + + log"distribution.") replicateLeftSide = false } else if (replicateRightSide && !canReplicateRight) { - logInfo("Right-hand side is picked but cannot be applied to join type " + - s"'$joinType'. Skipping partially clustered distribution.") + logInfo(log"Right-hand side is picked but cannot be applied to join type " + + log"'${MDC(LogKeys.JOIN_TYPE, joinType)}'. Skipping partially clustered " + + log"distribution.") replicateRightSide = false } else { // In partially clustered distribution, we should use un-grouped partition values @@ -499,8 +509,8 @@ case class EnsureRequirements( InternalRowComparableWrapper(partVal, partitionExprs), numParts)) } - logInfo("After applying partially clustered distribution, there are " + - s"${mergedPartValues.map(_._2).sum} partitions.") + logInfo(log"After applying partially clustered distribution, there are " + + log"${MDC(LogKeys.NUM_PARTITIONS, mergedPartValues.map(_._2).sum)} partitions.") applyPartialClustering = true } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala index c02beea4f879c..154070a954f3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.exchange +import java.util.concurrent.atomic.AtomicBoolean + import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -34,6 +36,17 @@ import org.apache.spark.sql.vectorized.ColumnarBatch * "Volcano -- An Extensible and Parallel Query Evaluation System" by Goetz Graefe. */ abstract class Exchange extends UnaryExecNode { + /** + * This flag aims to detect if the stage materialization is started. This helps + * to avoid unnecessary AQE stage materialization when the stage is canceled. + */ + protected val materializationStarted = new AtomicBoolean() + + /** + * Exposes status if the materialization is started + */ + def isMaterializationStarted(): Boolean = materializationStarted.get() + override def output: Seq[Attribute] = child.output final override val nodePatterns: Seq[TreePattern] = Seq(EXCHANGE) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 69705afbb7c71..70c08edfd8678 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -47,6 +47,15 @@ import org.apache.spark.util.random.XORShiftRandom */ trait ShuffleExchangeLike extends Exchange { + /** + * The asynchronous job that materializes the shuffle. 
It also does the preparations work, + * such as waiting for the subqueries. + */ + @transient private lazy val shuffleFuture: Future[MapOutputStatistics] = executeQuery { + materializationStarted.set(true) + mapOutputStatisticsFuture + } + /** * Returns the number of mappers of this shuffle. */ @@ -68,15 +77,25 @@ trait ShuffleExchangeLike extends Exchange { def shuffleOrigin: ShuffleOrigin /** - * The asynchronous job that materializes the shuffle. It also does the preparations work, - * such as waiting for the subqueries. + * Submits the shuffle job. */ - final def submitShuffleJob: Future[MapOutputStatistics] = executeQuery { - mapOutputStatisticsFuture - } + final def submitShuffleJob: Future[MapOutputStatistics] = shuffleFuture protected def mapOutputStatisticsFuture: Future[MapOutputStatistics] + /** + * Cancels the shuffle job. + */ + final def cancelShuffleJob: Unit = { + if (isMaterializationStarted()) { + shuffleFuture match { + case action: FutureAction[MapOutputStatistics] if !action.isCompleted => + action.cancel() + case _ => + } + } + } + /** * Returns the shuffle RDD with specified partition specs. */ @@ -86,6 +105,11 @@ trait ShuffleExchangeLike extends Exchange { * Returns the runtime statistics after shuffle materialization. */ def runtimeStatistics: Statistics + + /** + * The shuffle ID. + */ + def shuffleId: Int } // Describes where the shuffle operator comes from. @@ -166,6 +190,8 @@ case class ShuffleExchangeExec( Statistics(dataSize, Some(rowCount)) } + override def shuffleId: Int = shuffleDependency.shuffleId + /** * A [[ShuffleDependency]] that will partition rows of its child based on * the partitioning scheme defined in `newPartitioning`. Those partitions of diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index 8ace74c517943..c0fb1c37b2102 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution +import org.apache.spark.TaskContext import org.apache.spark.rdd.{ParallelCollectionRDD, RDD} import org.apache.spark.serializer.Serializer import org.apache.spark.sql.catalyst.InternalRow @@ -26,7 +27,7 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.metric.{SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} -import org.apache.spark.util.ArrayImplicits._ +import org.apache.spark.sql.execution.python.HybridRowQueue import org.apache.spark.util.collection.Utils /** @@ -68,13 +69,13 @@ case class CollectLimitExec(limit: Int = -1, child: SparkPlan, offset: Int = 0) override lazy val metrics = readMetrics ++ writeMetrics protected override def doExecute(): RDD[InternalRow] = { val childRDD = child.execute() - if (childRDD.getNumPartitions == 0) { + if (childRDD.getNumPartitions == 0 || limit == 0) { new ParallelCollectionRDD(sparkContext, Seq.empty[InternalRow], 1, Map.empty) } else { val singlePartitionRDD = if (childRDD.getNumPartitions == 1) { childRDD } else { - val locallyLimited = if (limit >= 0) { + val locallyLimited = if (limit > 0) { childRDD.mapPartitionsInternal(_.take(limit)) } else { childRDD @@ -118,18 +119,57 @@ case class CollectLimitExec(limit: Int = -1, child: SparkPlan, offset: Int = 0) * logical plan, which happens when the 
user is collecting results back to the driver. */ case class CollectTailExec(limit: Int, child: SparkPlan) extends LimitExec { + assert(limit >= 0) + override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = SinglePartition override def executeCollect(): Array[InternalRow] = child.executeTail(limit) + private val serializer: Serializer = new UnsafeRowSerializer(child.output.size) + private lazy val writeMetrics = + SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext) + private lazy val readMetrics = + SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) + override lazy val metrics = readMetrics ++ writeMetrics protected override def doExecute(): RDD[InternalRow] = { - // This is a bit hacky way to avoid a shuffle and scanning all data when it performs - // at `Dataset.tail`. - // Since this execution plan and `execute` are currently called only when - // `Dataset.tail` is invoked, the jobs are always executed when they are supposed to be. - - // If we use this execution plan separately like `Dataset.limit` without an actual - // job launch, we might just have to mimic the implementation of `CollectLimitExec`. - sparkContext.parallelize(executeCollect().toImmutableArraySeq, numSlices = 1) + val childRDD = child.execute() + if (childRDD.getNumPartitions == 0 || limit == 0) { + new ParallelCollectionRDD(sparkContext, Seq.empty[InternalRow], 1, Map.empty) + } else { + val singlePartitionRDD = if (childRDD.getNumPartitions == 1) { + childRDD + } else { + val locallyLimited = childRDD.mapPartitionsInternal(takeRight) + new ShuffledRowRDD( + ShuffleExchangeExec.prepareShuffleDependency( + locallyLimited, + child.output, + SinglePartition, + serializer, + writeMetrics), + readMetrics) + } + singlePartitionRDD.mapPartitionsInternal(takeRight) + } + } + + private def takeRight(iter: Iterator[InternalRow]): Iterator[InternalRow] = { + if (iter.isEmpty) { + Iterator.empty[InternalRow] + } else { + val context = TaskContext.get() + val queue = HybridRowQueue.apply(context.taskMemoryManager(), output.size) + context.addTaskCompletionListener[Unit](_ => queue.close()) + var count = 0 + while (iter.hasNext) { + queue.add(iter.next().copy().asInstanceOf[UnsafeRow]) + if (count < limit) { + count += 1 + } else { + queue.remove() + } + } + Iterator.range(0, count).map(_ => queue.remove()) + } } override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = @@ -355,7 +395,8 @@ case class TakeOrderedAndProjectExec( val orderByString = truncatedString(sortOrder, "[", ",", "]", maxFields) val outputString = truncatedString(output, "[", ",", "]", maxFields) - s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" + val offsetStr = if (offset > 0) s" offset=$offset," else "" + s"TakeOrderedAndProject(limit=$limit,$offsetStr orderBy=$orderByString, output=$outputString)" } override def stringArgs: Iterator[Any] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala index 8eeb919d0bafd..ae982f2f87f2e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ApplyInPandasWithStatePythonRunner.scala @@ -27,6 +27,8 @@ import org.json4s._ import org.json4s.jackson.JsonMethods._ import 
org.apache.spark.api.python._ +import org.apache.spark.internal.LogKeys.CONFIG +import org.apache.spark.internal.MDC import org.apache.spark.sql.Row import org.apache.spark.sql.api.python.PythonSQLUtils import org.apache.spark.sql.catalyst.InternalRow @@ -88,9 +90,9 @@ class ApplyInPandasWithStatePythonRunner( override val bufferSize: Int = { val configuredSize = sqlConf.pandasUDFBufferSize if (configuredSize < 4) { - logWarning("Pandas execution requires more than 4 bytes. Please configure bigger value " + - s"for the configuration '${SQLConf.PANDAS_UDF_BUFFER_SIZE.key}'. " + - "Force using the value '4'.") + logWarning(log"Pandas execution requires more than 4 bytes. Please configure bigger value " + + log"for the configuration '${MDC(CONFIG, SQLConf.PANDAS_UDF_BUFFER_SIZE.key)}'. " + + log"Force using the value '4'.") 4 } else { configuredSize diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala index e353bf5a51e9a..a8a6fa97c52a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.python +import org.apache.spark.internal.LogKeys.{RDD_ID, SPARK_PLAN_ID} +import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -108,7 +110,8 @@ case class AttachDistributedSequenceExec( override protected[sql] def cleanupResources(): Unit = { try { if (cached != null && cached.getStorageLevel != StorageLevel.NONE) { - logWarning(s"clean up cached RDD(${cached.id}) in AttachDistributedSequenceExec($id)") + logWarning(log"clean up cached RDD(${MDC(RDD_ID, cached.id)}) in " + + log"AttachDistributedSequenceExec(${MDC(SPARK_PLAN_ID, id)})") cached.unpersist(blocking = false) } } finally { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index d69213552136d..fca277dae5d55 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -24,7 +24,7 @@ import scala.jdk.CollectionConverters._ import net.razorvine.pickle.{IObjectPickler, Opcodes, Pickler} -import org.apache.spark.SparkException +import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.api.python.SerDeUtil import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -183,10 +183,11 @@ object EvaluatePython { case c if c.getClass.isArray => val array = c.asInstanceOf[Array[_]] if (array.length != fields.length) { - throw SparkException.internalError( - s"Input row doesn't have expected number of values required by the schema. " + - s"${fields.length} fields are required while ${array.length} values are provided." 
- ) + throw new SparkIllegalArgumentException( + errorClass = "STRUCT_ARRAY_LENGTH_MISMATCH", + messageParameters = Map( + "expected" -> fields.length.toString, + "actual" -> array.length.toString)) } val row = new GenericInternalRow(fields.length) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala index 90922d89ad10b..e7d4aa9f04607 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala @@ -49,7 +49,7 @@ private[python] trait PythonArrowOutput[OUT <: AnyRef] { self: BasePythonRunner[ startTime: Long, env: SparkEnv, worker: PythonWorker, - pid: Option[Long], + pid: Option[Int], releasedOrClosed: AtomicBoolean, context: TaskContext): Iterator[OUT] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonStreamingSourceRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonStreamingSourceRunner.scala index dd9c5a25e8a74..33612b6947f27 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonStreamingSourceRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonStreamingSourceRunner.scala @@ -23,13 +23,20 @@ import java.io.{BufferedInputStream, BufferedOutputStream, DataInputStream, Data import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters._ +import org.apache.arrow.vector.ipc.ArrowStreamReader + import org.apache.spark.SparkEnv import org.apache.spark.api.python.{PythonFunction, PythonWorker, PythonWorkerFactory, PythonWorkerUtils, SpecialLengths} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.PYTHON_EXEC import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.internal.config.Python.PYTHON_AUTH_SOCKET_TIMEOUT +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} object PythonStreamingSourceRunner { // When the python process for python_streaming_source_runner receives one of the @@ -38,6 +45,11 @@ object PythonStreamingSourceRunner { val LATEST_OFFSET_FUNC_ID = 885 val PARTITIONS_FUNC_ID = 886 val COMMIT_FUNC_ID = 887 + // Status code for JVM to decide how to receive prefetched record batches + // for simple stream reader. + val PREFETCHED_RECORDS_NOT_FOUND = 0 + val NON_EMPTY_PYARROW_RECORD_BATCHES = 1 + val EMPTY_PYARROW_RECORD_BATCHES = 2 } /** @@ -71,7 +83,7 @@ class PythonStreamingSourceRunner( * Initializes the Python worker for running the streaming source. 
*/ def init(): Unit = { - logInfo(s"Initializing Python runner pythonExec: $pythonExec") + logInfo(log"Initializing Python runner pythonExec: ${MDC(PYTHON_EXEC, pythonExec)}") val env = SparkEnv.get val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") @@ -101,6 +113,8 @@ class PythonStreamingSourceRunner( // Send output schema PythonWorkerUtils.writeUTF(outputSchema.json, dataOut) + dataOut.writeInt(SQLConf.get.arrowMaxRecordsPerBatch) + dataOut.flush() dataIn = new DataInputStream( @@ -147,7 +161,8 @@ class PythonStreamingSourceRunner( /** * Invokes partitions(start, end) function of the stream reader and receive the return value. */ - def partitions(start: String, end: String): Array[Array[Byte]] = { + def partitions(start: String, end: String): (Array[Array[Byte]], Option[Iterator[InternalRow]]) = + { dataOut.writeInt(PARTITIONS_FUNC_ID) PythonWorkerUtils.writeUTF(start, dataOut) PythonWorkerUtils.writeUTF(end, dataOut) @@ -164,7 +179,20 @@ class PythonStreamingSourceRunner( val pickledPartition: Array[Byte] = PythonWorkerUtils.readBytes(dataIn) pickledPartitions.append(pickledPartition) } - pickledPartitions.toArray + val prefetchedRecordsStatus = dataIn.readInt() + val iter: Option[Iterator[InternalRow]] = prefetchedRecordsStatus match { + case NON_EMPTY_PYARROW_RECORD_BATCHES => Some(readArrowRecordBatches()) + case PREFETCHED_RECORDS_NOT_FOUND => None + case EMPTY_PYARROW_RECORD_BATCHES => Some(Iterator.empty) + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + val msg = PythonWorkerUtils.readUTF(dataIn) + throw QueryExecutionErrors.pythonStreamingDataSourceRuntimeError( + action = "planPartitions", msg) + case _ => + throw QueryExecutionErrors.pythonStreamingDataSourceRuntimeError( + action = "planPartitions", s"unknown status code $prefetchedRecordsStatus") + } + (pickledPartitions.toArray, iter) } /** @@ -186,7 +214,8 @@ class PythonStreamingSourceRunner( * Stop the python worker process and invoke stop() on stream reader. */ def stop(): Unit = { - logInfo(s"Stopping streaming runner for module: $workerModule.") + logInfo(log"Stopping streaming runner for module: " + + log"${MDC(LogKeys.MODULE_NAME, workerModule)}.") try { pythonWorkerFactory.foreach { factory => pythonWorker.foreach { worker => @@ -199,4 +228,30 @@ class PythonStreamingSourceRunner( logError("Exception when trying to kill worker", e) } } + + private val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stream reader for $pythonExec", 0, Long.MaxValue) + + def readArrowRecordBatches(): Iterator[InternalRow] = { + assert(dataIn.readInt() == SpecialLengths.START_ARROW_STREAM) + val reader = new ArrowStreamReader(dataIn, allocator) + val root = reader.getVectorSchemaRoot() + // When input is empty schema can't be read. + val schema = ArrowUtils.fromArrowSchema(root.getSchema()) + assert(schema == outputSchema) + + val vectors = root.getFieldVectors().asScala.map { vector => + new ArrowColumnVector(vector) + }.toArray[ColumnVector] + val rows = ArrayBuffer[InternalRow]() + while (reader.loadNextBatch()) { + val batch = new ColumnarBatch(vectors) + batch.setNumRows(root.getRowCount) + // Need to copy the row because the ColumnarBatch row iterator use + // the same underlying Internal row. 
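Editorial note: the copy() mentioned in the comment above is load-bearing, because the columnar batch's row iterator reuses one mutable row, so buffering the references without copying would alias every collected element to the last row read. A tiny standalone demonstration of that hazard (MutableRow is illustrative, not a Spark class):

// Demonstrates why rows must be copied before buffering when the producing
// iterator reuses a single mutable object for every element.
final class MutableRow(var value: Int) {
  def copy(): MutableRow = new MutableRow(value)
}

object RowReuseDemo {
  private def reusedRows(n: Int): Iterator[MutableRow] = {
    val shared = new MutableRow(0)
    Iterator.range(0, n).map { i => shared.value = i; shared }
  }

  def main(args: Array[String]): Unit = {
    val aliased = reusedRows(3).toArray.map(_.value)                // Array(2, 2, 2)
    val copied = reusedRows(3).map(_.copy()).toArray.map(_.value)   // Array(0, 1, 2)
    println(s"without copy: ${aliased.mkString(",")}; with copy: ${copied.mkString(",")}")
  }
}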
+ rows.appendAll(batch.rowIterator().asScala.map(_.copy())) + } + reader.close(false) + rows.iterator + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala index bbe9fbfc748db..87ff5a0ec4333 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala @@ -80,7 +80,7 @@ abstract class BasePythonUDFRunner( startTime: Long, env: SparkEnv, worker: PythonWorker, - pid: Option[Long], + pid: Option[Int], releasedOrClosed: AtomicBoolean, context: TaskContext): Iterator[Array[Byte]] = { new ReaderIterator( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala index 5e0c5ff92fdab..ce30a54c8d4e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.memory.MemoryBlock +import org.apache.spark.util.Utils /** * A RowQueue is an FIFO queue for UnsafeRow. @@ -288,8 +289,12 @@ private[python] case class HybridRowQueue( } } -private[python] object HybridRowQueue { +private[sql] object HybridRowQueue { def apply(taskMemoryMgr: TaskMemoryManager, file: File, fields: Int): HybridRowQueue = { HybridRowQueue(taskMemoryMgr, file, fields, SparkEnv.get.serializerManager) } + + def apply(taskMemoryMgr: TaskMemoryManager, fields: Int): HybridRowQueue = { + apply(taskMemoryMgr, new File(Utils.getLocalDir(SparkEnv.get.conf)), fields) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala index 819fd1bd297f8..45ecf87009505 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala @@ -30,6 +30,7 @@ import org.apache.spark.TaskContext import org.apache.spark.api.r._ import org.apache.spark.api.r.SpecialLengths import org.apache.spark.broadcast.Broadcast +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.ArrowWriter import org.apache.spark.sql.types.StructType @@ -138,6 +139,10 @@ class ArrowRRunner( private var batchLoaded = true + private def format(v: Double): String = { + "%.3f".format(v) + } + protected override def read(): ColumnarBatch = try { if (reader != null && batchLoaded) { batchLoaded = reader.loadNextBatch() @@ -161,17 +166,14 @@ class ArrowRRunner( val input = dataStream.readDouble val compute = dataStream.readDouble val output = dataStream.readDouble - logInfo( - ("Times: boot = %.3f s, init = %.3f s, broadcast = %.3f s, " + - "read-input = %.3f s, compute = %.3f s, write-output = %.3f s, " + - "total = %.3f s").format( - boot, - init, - broadcast, - input, - compute, - output, - boot + init + broadcast + input + compute + output)) + logInfo(log"Times: boot = ${MDC(LogKeys.BOOT, format(boot))} s, " + + log"init = ${MDC(LogKeys.INIT, format(init))} s, " + + log"broadcast = ${MDC(LogKeys.BROADCAST, 
format(broadcast))} s, " + + log"read-input = ${MDC(LogKeys.INPUT, format(input))} s, " + + log"compute = ${MDC(LogKeys.COMPUTE, format(compute))} s, " + + log"write-output = ${MDC(LogKeys.OUTPUT, format(output))} s, " + + log"total = ${MDC(LogKeys.TOTAL, + format(boot + init + broadcast + input + compute + output))} s") read() case length if length > 0 => // Likewise, there looks no way to send each batch in streaming format via socket diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncCommitLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncCommitLog.scala index 686e0bb868865..6db01624fd26b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncCommitLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncCommitLog.scala @@ -22,6 +22,7 @@ import java.util.concurrent.{CompletableFuture, ConcurrentLinkedDeque, ThreadPoo import scala.jdk.CollectionConverters._ +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.errors.QueryExecutionErrors @@ -125,7 +126,8 @@ class AsyncCommitLog(sparkSession: SparkSession, path: String, executorService: } } catch { case e: Throwable => - logError(s"Encountered error while writing batch ${batchId} to commit log", e) + logError(log"Encountered error while writing batch " + + log"${MDC(LogKeys.BATCH_ID, batchId)} to commit log", e) future.completeExceptionally(e) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncOffsetSeqLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncOffsetSeqLog.scala index a89a9132e03e0..54a8855b77cdb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncOffsetSeqLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncOffsetSeqLog.scala @@ -23,6 +23,7 @@ import java.util.concurrent.atomic.AtomicLong import scala.jdk.CollectionConverters._ +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.util.{Clock, SystemClock} @@ -159,7 +160,8 @@ class AsyncOffsetSeqLog( } } catch { case e: Throwable => - logError(s"Encountered error while writing batch ${batchId} to offset log", e) + logError(log"Encountered error while writing batch " + + log"${MDC(LogKeys.BATCH_ID, batchId)} to offset log", e) future.completeExceptionally(e) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncProgressTrackingMicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncProgressTrackingMicroBatchExecution.scala index ec24ec0fd335c..4a7cb5b71a77f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncProgressTrackingMicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncProgressTrackingMicroBatchExecution.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.streaming import java.util.concurrent._ import java.util.concurrent.atomic.AtomicLong +import org.apache.spark.internal.LogKeys.{BATCH_ID, PRETTY_ID_STRING} +import org.apache.spark.internal.MDC import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.streaming.WriteToStream import org.apache.spark.sql.errors.QueryExecutionErrors @@ -156,8 +158,8 @@ class AsyncProgressTrackingMicroBatchExecution( } }) 
.exceptionally((th: Throwable) => { - logError(s"Encountered error while performing" + - s" async offset write for batch ${execCtx.batchId}", th) + logError(log"Encountered error while performing async offset write for batch " + + log"${MDC(BATCH_ID, execCtx.batchId)}", th) errorNotifier.markError(th) return }) @@ -188,8 +190,8 @@ class AsyncProgressTrackingMicroBatchExecution( commitLog .addAsync(execCtx.batchId, CommitMetadata(watermarkTracker.currentWatermark)) .exceptionally((th: Throwable) => { - logError(s"Got exception during async write to commit log" + - s" for batch ${execCtx.batchId}", th) + logError(log"Got exception during async write to commit log for batch " + + log"${MDC(BATCH_ID, execCtx.batchId)}", th) errorNotifier.markError(th) return }) @@ -221,7 +223,8 @@ class AsyncProgressTrackingMicroBatchExecution( super.cleanup() ThreadUtils.shutdown(asyncWritesExecutorService) - logInfo(s"Async progress tracking executor pool for query ${prettyIdString} has been shutdown") + logInfo(log"Async progress tracking executor pool for query " + + log"${MDC(PRETTY_ID_STRING, prettyIdString)} has been shutdown") } // used for testing diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AvailableNowDataStreamWrapper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AvailableNowDataStreamWrapper.scala index 18dd2eba083ad..f42250c3c702d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AvailableNowDataStreamWrapper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AvailableNowDataStreamWrapper.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{DELEGATE, READ_LIMIT} import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, ReadLimit, SparkDataStream, SupportsAdmissionControl, SupportsTriggerAvailableNow} import org.apache.spark.sql.connector.read.streaming @@ -29,10 +30,10 @@ class AvailableNowDataStreamWrapper(val delegate: SparkDataStream) extends SparkDataStream with SupportsTriggerAvailableNow with Logging { // See SPARK-45178 for more details. - logWarning("Activating the wrapper implementation of Trigger.AvailableNow for source " + - s"[$delegate]. Note that this might introduce possibility of deduplication, dataloss, " + - "correctness issue. Enable the config with extreme care. We strongly recommend to contact " + - "the data source developer to support Trigger.AvailableNow.") + logWarning(log"Activating the wrapper implementation of Trigger.AvailableNow for source " + + log"[${MDC(DELEGATE, delegate)}]. Note that this might introduce possibility of " + + log"deduplication, dataloss, correctness issue. Enable the config with extreme care. 
We " + + log"strongly recommend to contact the data source developer to support Trigger.AvailableNow.") private var fetchedOffset: streaming.Offset = _ @@ -71,8 +72,8 @@ class AvailableNowDataStreamWrapper(val delegate: SparkDataStream) case s: SupportsAdmissionControl => val limit = s.getDefaultReadLimit if (limit != ReadLimit.allAvailable()) { - logWarning(s"The read limit $limit is ignored because source $delegate does not " + - "support running Trigger.AvailableNow queries.") + logWarning(log"The read limit ${MDC(READ_LIMIT, limit)} is ignored because source " + + log"${MDC(DELEGATE, delegate)} does not support running Trigger.AvailableNow queries.") } ReadLimit.allAvailable() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManager.scala index 34c5dee0997b9..982cc13c40868 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManager.scala @@ -26,7 +26,8 @@ import org.apache.hadoop.fs._ import org.apache.hadoop.fs.local.{LocalFs, RawLocalFs} import org.apache.hadoop.fs.permission.FsPermission -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{FINAL_PATH, PATH, TEMP_PATH} import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.streaming.CheckpointFileManager.RenameHelperMethods import org.apache.spark.sql.internal.SQLConf @@ -143,7 +144,8 @@ object CheckpointFileManager extends Logging { this(fm, path, generateTempPath(path), overwrite) } - logInfo(s"Writing atomically to $finalPath using temp file $tempPath") + logInfo(log"Writing atomically to ${MDC(FINAL_PATH, finalPath)} using temp file " + + log"${MDC(TEMP_PATH, tempPath)}") @volatile private var terminated = false override def close(): Unit = synchronized { @@ -154,8 +156,8 @@ object CheckpointFileManager extends Logging { fm.renameTempFile(tempPath, finalPath, overwriteIfPossible) } catch { case fe: FileAlreadyExistsException => - logWarning( - s"Failed to rename temp file $tempPath to $finalPath because file exists", fe) + logWarning(log"Failed to rename temp file ${MDC(TEMP_PATH, tempPath)} to " + + log"${MDC(PATH, finalPath)} because file exists", fe) if (!overwriteIfPossible) throw fe } @@ -165,7 +167,8 @@ object CheckpointFileManager extends Logging { s"But $finalPath does not exist.") } - logInfo(s"Renamed temp file $tempPath to $finalPath") + logInfo(log"Renamed temp file ${MDC(TEMP_PATH, tempPath)} to " + + log"${MDC(FINAL_PATH, finalPath)}") } finally { terminated = true } @@ -178,13 +181,13 @@ object CheckpointFileManager extends Logging { underlyingStream.close() } catch { case NonFatal(e) => - logWarning(s"Error cancelling write to $finalPath, " + - s"continuing to delete temp path $tempPath", e) + logWarning(log"Error cancelling write to ${MDC(PATH, finalPath)}, continuing to " + + log"delete temp path ${MDC(TEMP_PATH, tempPath)}", e) } fm.delete(tempPath) } catch { case NonFatal(e) => - logWarning(s"Error deleting temp file $tempPath", e) + logWarning(log"Error deleting temp file ${MDC(TEMP_PATH, tempPath)}", e) } finally { terminated = true } @@ -210,10 +213,10 @@ object CheckpointFileManager extends Logging { } catch { case e: UnsupportedFileSystemException => logWarning( - "Could not use FileContext API for managing 
Structured Streaming checkpoint files at " + - s"$path. Using FileSystem API instead for managing log files. If the implementation " + - s"of FileSystem.rename() is not atomic, then the correctness and fault-tolerance of" + - s"your Structured Streaming is not guaranteed.") + log"Could not use FileContext API for managing Structured Streaming checkpoint files " + + log"at ${MDC(PATH, path)}. Using FileSystem API instead for managing log files. If " + + log"the implementation of FileSystem.rename() is not atomic, then the correctness " + + log"and fault-tolerance of your Structured Streaming is not guaranteed.") new FileSystemBasedCheckpointFileManager(path, hadoopConf) } } @@ -274,7 +277,8 @@ class FileSystemBasedCheckpointFileManager(path: Path, hadoopConf: Configuration throw QueryExecutionErrors.renameSrcPathNotFoundError(srcPath) } else { val e = QueryExecutionErrors.failedRenameTempFileError(srcPath, dstPath) - logWarning(e.getMessage) + logWarning(log"Failed to rename temp file ${MDC(TEMP_PATH, srcPath)} to " + + log"${MDC(PATH, dstPath)} as FileSystem.rename returned false.") throw e } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ColumnFamilySchemaUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ColumnFamilySchemaUtils.scala new file mode 100644 index 0000000000000..68f3fa434389e --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ColumnFamilySchemaUtils.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.streaming + +import org.apache.spark.sql.Encoder +import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchema.{COMPOSITE_KEY_ROW_SCHEMA, KEY_ROW_SCHEMA, VALUE_ROW_SCHEMA, VALUE_ROW_SCHEMA_WITH_TTL} +import org.apache.spark.sql.execution.streaming.state.{ColumnFamilySchema, ColumnFamilySchemaV1, NoPrefixKeyStateEncoderSpec, PrefixKeyScanStateEncoderSpec} + +trait ColumnFamilySchemaUtils { + def getValueStateSchema[T](stateName: String, hasTtl: Boolean): ColumnFamilySchema + + def getListStateSchema[T](stateName: String, hasTtl: Boolean): ColumnFamilySchema + + def getMapStateSchema[K, V]( + stateName: String, + userKeyEnc: Encoder[K], + hasTtl: Boolean): ColumnFamilySchema +} + +object ColumnFamilySchemaUtilsV1 extends ColumnFamilySchemaUtils { + + def getValueStateSchema[T](stateName: String, hasTtl: Boolean): ColumnFamilySchemaV1 = { + ColumnFamilySchemaV1( + stateName, + KEY_ROW_SCHEMA, + if (hasTtl) { + VALUE_ROW_SCHEMA_WITH_TTL + } else { + VALUE_ROW_SCHEMA + }, + NoPrefixKeyStateEncoderSpec(KEY_ROW_SCHEMA)) + } + + def getListStateSchema[T](stateName: String, hasTtl: Boolean): ColumnFamilySchemaV1 = { + ColumnFamilySchemaV1( + stateName, + KEY_ROW_SCHEMA, + if (hasTtl) { + VALUE_ROW_SCHEMA_WITH_TTL + } else { + VALUE_ROW_SCHEMA + }, + NoPrefixKeyStateEncoderSpec(KEY_ROW_SCHEMA)) + } + + def getMapStateSchema[K, V]( + stateName: String, + userKeyEnc: Encoder[K], + hasTtl: Boolean): ColumnFamilySchemaV1 = { + ColumnFamilySchemaV1( + stateName, + COMPOSITE_KEY_ROW_SCHEMA, + if (hasTtl) { + VALUE_ROW_SCHEMA_WITH_TTL + } else { + VALUE_ROW_SCHEMA + }, + PrefixKeyScanStateEncoderSpec(COMPOSITE_KEY_ROW_SCHEMA, 1), + Some(userKeyEnc.schema)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala index 8d38bba1f2a63..d6770452e71f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.util.Utils @@ -103,8 +104,9 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag]( defaultCompactInterval, compactibleBatchIds(0).toInt) } assert(interval > 0, s"intervalValue = $interval not positive value.") - logInfo(s"Set the compact interval to $interval " + - s"[defaultCompactInterval: $defaultCompactInterval]") + logInfo(log"Set the compact interval to ${MDC(LogKeys.COMPACT_INTERVAL, interval)} " + + log"[defaultCompactInterval: " + + log"${MDC(LogKeys.DEFAULT_COMPACT_INTERVAL, defaultCompactInterval)}]") interval } @@ -240,7 +242,8 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag]( } if (elapsedMs >= COMPACT_LATENCY_WARN_THRESHOLD_MS) { - logWarning(s"Compacting took $elapsedMs ms for compact batch $batchId") + logWarning(log"Compacting took ${MDC(LogKeys.ELAPSED_TIME, elapsedMs)} ms for " + + log"compact batch ${MDC(LogKeys.BATCH_ID, batchId)}") } else { logDebug(s"Compacting took $elapsedMs ms for compact batch $batchId") } @@ -306,8 +309,9 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : 
ClassTag]( assert(isCompactionBatch(minCompactionBatchId, compactInterval), s"$minCompactionBatchId is not a compaction batch") - logInfo(s"Current compact batch id = $currentBatchId " + - s"min compaction batch id to delete = $minCompactionBatchId") + logInfo(log"Current compact batch id = ${MDC(LogKeys.CURRENT_BATCH_ID, currentBatchId)} " + + log"min compaction batch id to delete = " + + log"${MDC(LogKeys.MIN_COMPACTION_BATCH_ID, minCompactionBatchId)}") val expiredTime = System.currentTimeMillis() - fileCleanupDelayMs fileManager.list(metadataPath, (path: Path) => { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala index 7e094fee32547..54041abdc9ab4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -19,11 +19,14 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Predicate, SortOrder, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReference import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark +import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark.updateEventTimeColumn +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.util.DateTimeUtils.microsToMillis +import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 @@ -107,25 +110,72 @@ case class EventTimeWatermarkExec( } // Update the metadata on the eventTime column to include the desired delay. - override val output: Seq[Attribute] = child.output.map { a => - if (a semanticEquals eventTime) { - val updatedMetadata = new MetadataBuilder() - .withMetadata(a.metadata) - .putLong(EventTimeWatermark.delayKey, delayMs) - .build() - a.withMetadata(updatedMetadata) - } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { - // Remove existing watermark - val updatedMetadata = new MetadataBuilder() - .withMetadata(a.metadata) - .remove(EventTimeWatermark.delayKey) - .build() - a.withMetadata(updatedMetadata) - } else { - a - } + override val output: Seq[Attribute] = { + val delayMs = EventTimeWatermark.getDelayMs(delay) + updateEventTimeColumn(child.output, delayMs, eventTime) } override protected def withNewChildInternal(newChild: SparkPlan): EventTimeWatermarkExec = copy(child = newChild) } + +/** + * Updates the event time column to [[eventTime]] in the child output. + * Any watermark calculations performed after this node will use the + * updated eventTimeColumn. + * + * This node also ensures that output emitted by the child node adheres + * to watermark. If the child node emits rows which are older than global + * watermark, the node will throw an query execution error and fail the user + * query. 
+ */ +case class UpdateEventTimeColumnExec( + eventTime: Attribute, + delay: CalendarInterval, + eventTimeWatermarkForLateEvents: Option[Long], + child: SparkPlan) extends UnaryExecNode { + + override protected def doExecute(): RDD[InternalRow] = { + child.execute().mapPartitions[InternalRow] { dataIterator => + val watermarkExpression = WatermarkSupport.watermarkExpression( + Some(eventTime), eventTimeWatermarkForLateEvents) + + if (watermarkExpression.isEmpty) { + // watermark should always be defined in this node. + throw QueryExecutionErrors.cannotGetEventTimeWatermarkError() + } + + val predicate = Predicate.create(watermarkExpression.get, child.output) + new Iterator[InternalRow] { + override def hasNext: Boolean = dataIterator.hasNext + + override def next(): InternalRow = { + val row = dataIterator.next() + if (predicate.eval(row)) { + // child node emitted a row which is older than current watermark + // this is not allowed + val boundEventTimeExpression = bindReference[Expression](eventTime, child.output) + val eventTimeProjection = UnsafeProjection.create(boundEventTimeExpression) + val rowEventTime = eventTimeProjection(row) + throw QueryExecutionErrors.emittedRowsAreOlderThanWatermark( + eventTimeWatermarkForLateEvents.get, rowEventTime.getLong(0)) + } + row + } + } + } + } + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = child.outputPartitioning + + // Update the metadata on the eventTime column to include the desired delay. + override val output: Seq[Attribute] = { + val delayMs = EventTimeWatermark.getDelayMs(delay) + updateEventTimeColumn(child.output, delayMs, eventTime) + } + + override protected def withNewChildInternal(newChild: SparkPlan): UpdateEventTimeColumnExec = + copy(child = newChild) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala index 07c1ccc432cdb..b259f9dbcdcb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala @@ -125,6 +125,30 @@ class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging matchedMode } + /** + * maximum number of files to cache to be processed in subsequent batches + */ + val maxCachedFiles: Int = parameters.get("maxCachedFiles").map { str => + Try(str.toInt).filter(_ >= 0).getOrElse { + throw new IllegalArgumentException( + s"Invalid value '$str' for option 'maxCachedFiles', must be an integer greater than or " + + "equal to 0") + } + }.getOrElse(10000) + + /** + * ratio of cached input to max files/bytes to allow for listing from input source when + * there are fewer cached files/bytes than could be available to be read + */ + val discardCachedInputRatio: Float = parameters.get("discardCachedInputRatio").map { str => + Try(str.toFloat).filter(x => 0 <= x && x <= 1).getOrElse { + throw new IllegalArgumentException( + s"Invalid value '$str' for option 'discardCachedInputRatio', must be a positive float " + + "between 0 and 1" + ) + } + }.getOrElse(0.2f) + private def withBooleanParameter(name: String, default: Boolean) = { parameters.get(name).map { str => try { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 
ea8db3c99de92..638da08d0fd9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -23,7 +23,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{BATCH_ID, ERROR, PATH} import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ @@ -60,8 +61,8 @@ object FileStreamSink extends Logging { } catch { case e: SparkException => throw e case NonFatal(e) => - logWarning(s"Assume no metadata directory. Error while looking for " + - s"metadata directory in the path: $singlePath.", e) + logWarning(log"Assume no metadata directory. Error while looking for " + + log"metadata directory in the path: ${MDC(PATH, singlePath)}.", e) false } case _ => false @@ -84,7 +85,7 @@ object FileStreamSink extends Logging { } catch { case NonFatal(e) => // We may not have access to this directory. Don't fail the query if that happens. - logWarning(e.getMessage, e) + logWarning(log"${MDC(ERROR, e.getMessage)}", e) false } if (legacyMetadataPathExists) { @@ -145,7 +146,7 @@ class FileStreamSink( override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatestBatchId().getOrElse(-1L)) { - logInfo(s"Skipping already committed batch $batchId") + logInfo(log"Skipping already committed batch ${MDC(BATCH_ID, batchId)}") } else { val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala index d8aa31be47972..556438811c44d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.streaming import org.apache.hadoop.fs.FileStatus +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.MDC import org.apache.spark.paths.SparkPath import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf @@ -101,7 +103,7 @@ class FileStreamSinkLog( val retentionMs: Long = _retentionMs match { case Some(retention) => - logInfo(s"Retention is set to $retention ms") + logInfo(log"Retention is set to ${MDC(TIME_UNITS, retention)} ms") retention case _ => Long.MaxValue diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index eacbd0447d16f..4a9b2d11b7e0f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, GlobFilter, Path} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import 
org.apache.spark.paths.SparkPath import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap @@ -114,6 +114,11 @@ class FileStreamSource( "the same and causes data lost.") } + + private val maxCachedFiles = sourceOptions.maxCachedFiles + + private val discardCachedInputRatio = sourceOptions.discardCachedInputRatio + /** A mapping from a file that we have processed to some timestamp it was last modified. */ // Visible for testing and debugging in production. val seenFiles = new SeenFilesMap(maxFileAgeMs, fileNameOnly) @@ -125,8 +130,9 @@ class FileStreamSource( } seenFiles.purge() - logInfo(s"maxFilesPerBatch = $maxFilesPerBatch, " + - s"maxBytesPerBatch = $maxBytesPerBatch, maxFileAgeMs = $maxFileAgeMs") + logInfo(log"maxFilesPerBatch = ${MDC(LogKeys.NUM_FILES, maxFilesPerBatch)}, " + + log"maxBytesPerBatch = ${MDC(LogKeys.NUM_BYTES, maxBytesPerBatch)}, " + + log"maxFileAgeMs = ${MDC(LogKeys.TIME_UNITS, maxFileAgeMs)}") private var unreadFiles: Seq[NewFileEntry] = _ @@ -178,12 +184,14 @@ class FileStreamSource( } } + val shouldCache = !sourceOptions.latestFirst && allFilesForTriggerAvailableNow == null + // Obey user's setting to limit the number of files in this batch trigger. val (batchFiles, unselectedFiles) = limit match { - case files: ReadMaxFiles if !sourceOptions.latestFirst => + case files: ReadMaxFiles if shouldCache => // we can cache and reuse remaining fetched list of files in further batches val (bFiles, usFiles) = newFiles.splitAt(files.maxFiles()) - if (usFiles.size < files.maxFiles() * DISCARD_UNSEEN_INPUT_RATIO) { + if (usFiles.size < files.maxFiles() * discardCachedInputRatio) { // Discard unselected files if the number of files are smaller than threshold. // This is to avoid the case when the next batch would have too few files to read // whereas there're new files available. @@ -194,14 +202,14 @@ class FileStreamSource( } case files: ReadMaxFiles => - // implies "sourceOptions.latestFirst = true" which we want to refresh the list per batch + // don't use the cache, just take files for the next batch (newFiles.take(files.maxFiles()), null) - case files: ReadMaxBytes if !sourceOptions.latestFirst => + case files: ReadMaxBytes if shouldCache => // we can cache and reuse remaining fetched list of files in further batches val (FilesSplit(bFiles, _), FilesSplit(usFiles, rSize)) = takeFilesUntilMax(newFiles, files.maxBytes()) - if (rSize.toDouble < (files.maxBytes() * DISCARD_UNSEEN_INPUT_RATIO)) { + if (rSize.toDouble < (files.maxBytes() * discardCachedInputRatio)) { // Discard unselected files if the total size of files is smaller than threshold. // This is to avoid the case when the next batch would have too small of a size of // files to read whereas there're new files available. 
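// Editor's sketch (not part of the patch): how the two new file-source options introduced
// above could be set on a streaming query. The option names and their defaults
// (maxCachedFiles = 10000, discardCachedInputRatio = 0.2) come from the FileStreamOptions
// hunk earlier in this diff; the format, schema and input path below are hypothetical.
import org.apache.spark.sql.SparkSession

object FileSourceCachingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("file-source-caching-sketch").getOrCreate()
    val input = spark.readStream
      .format("json")
      .schema("id LONG, payload STRING")
      .option("maxFilesPerTrigger", 100)       // pre-existing per-batch limit
      .option("maxCachedFiles", 5000)          // cap on listed-but-unread files kept for later batches
      .option("discardCachedInputRatio", 0.3)  // re-list instead of draining a too-small cache
      .load("/data/incoming")                  // hypothetical path
    input.writeStream.format("console").start().awaitTermination()
  }
}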
@@ -212,16 +220,16 @@ class FileStreamSource( } case files: ReadMaxBytes => + // don't use the cache, just take files for the next batch val (FilesSplit(bFiles, _), _) = takeFilesUntilMax(newFiles, files.maxBytes()) - // implies "sourceOptions.latestFirst = true" which we want to refresh the list per batch (bFiles, null) case _: ReadAllAvailable => (newFiles, null) } if (unselectedFiles != null && unselectedFiles.nonEmpty) { - logTrace(s"Taking first $MAX_CACHED_UNSEEN_FILES unread files.") - unreadFiles = unselectedFiles.take(MAX_CACHED_UNSEEN_FILES) + logTrace(s"Taking first $maxCachedFiles unread files.") + unreadFiles = unselectedFiles.take(maxCachedFiles) logTrace(s"${unreadFiles.size} unread files are available for further batches.") } else { unreadFiles = null @@ -250,7 +258,8 @@ class FileStreamSource( FileEntry(path = p.urlEncoded, timestamp = timestamp, batchId = metadataLogCurrentOffset) }.toArray if (metadataLog.add(metadataLogCurrentOffset, fileEntries)) { - logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files") + logInfo(log"Log offset set to ${MDC(LogKeys.LOG_OFFSET, metadataLogCurrentOffset)} " + + log"with ${MDC(LogKeys.NUM_FILES, batchFiles.size)} new files") } else { throw new IllegalStateException("Concurrent update to the log. Multiple streaming jobs " + s"detected for $metadataLogCurrentOffset") @@ -290,7 +299,9 @@ class FileStreamSource( assert(startOffset <= endOffset) val files = metadataLog.get(Some(startOffset + 1), Some(endOffset)).flatMap(_._2) - logInfo(s"Processing ${files.length} files from ${startOffset + 1}:$endOffset") + logInfo(log"Processing ${MDC(LogKeys.NUM_FILES, files.length)} files from " + + log"${MDC(LogKeys.FILE_START_OFFSET, startOffset + 1)}:" + + log"${MDC(LogKeys.FILE_END_OFFSET, endOffset)}") logTrace(s"Files are:\n\t" + files.mkString("\n\t")) val newDataSource = DataSource( @@ -380,7 +391,8 @@ class FileStreamSource( val listingTimeMs = NANOSECONDS.toMillis(endTime - startTime) if (listingTimeMs > 2000) { // Output a warning when listing files uses more than 2 seconds. - logWarning(s"Listed ${files.size} file(s) in $listingTimeMs ms") + logWarning(log"Listed ${MDC(LogKeys.NUM_FILES, files.size)} file(s) in " + + log"${MDC(LogKeys.ELAPSED_TIME, listingTimeMs)} ms") } else { logTrace(s"Listed ${files.size} file(s) in $listingTimeMs ms") } @@ -421,9 +433,6 @@ object FileStreamSource { /** Timestamp for file modification time, in ms since January 1, 1970 UTC. 
*/ type Timestamp = Long - val DISCARD_UNSEEN_INPUT_RATIO = 0.2 - val MAX_CACHED_UNSEEN_FILES = 10000 - case class FileEntry( path: String, // uri-encoded path string timestamp: Timestamp, @@ -628,11 +637,13 @@ object FileStreamSource { logDebug(s"Archiving completed file $curPath to $newPath") if (!fileSystem.rename(curPath, newPath)) { - logWarning(s"Fail to move $curPath to $newPath / skip moving file.") + logWarning(log"Fail to move ${MDC(LogKeys.CURRENT_PATH, curPath)} to " + + log"${MDC(LogKeys.NEW_PATH, newPath)} / skip moving file.") } } catch { case NonFatal(e) => - logWarning(s"Fail to move $curPath to $newPath / skip moving file.", e) + logWarning(log"Fail to move ${MDC(LogKeys.CURRENT_PATH, curPath)} to " + + log"${MDC(LogKeys.NEW_PATH, newPath)} / skip moving file.", e) } } } @@ -646,12 +657,14 @@ object FileStreamSource { logDebug(s"Removing completed file $curPath") if (!fileSystem.delete(curPath, false)) { - logWarning(s"Failed to remove $curPath / skip removing file.") + logWarning( + log"Failed to remove ${MDC(LogKeys.CURRENT_PATH, curPath)} / skip removing file.") } } catch { case NonFatal(e) => // Log to error but swallow exception to avoid process being stopped - logWarning(s"Fail to remove $curPath / skip removing file.", e) + logWarning( + log"Fail to remove ${MDC(LogKeys.CURRENT_PATH, curPath)} / skip removing file.", e) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index 01b16b63fa278..3969320aa1a8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -18,6 +18,8 @@ package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS +import org.apache.hadoop.conf.Configuration + import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -187,6 +189,14 @@ trait FlatMapGroupsWithStateExecBase }) } + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, + batchId: Long, + stateSchemaVersion: Int): Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + groupingAttributes.toStructType, stateManager.stateSchema, session.sqlContext.sessionState) + } + override protected def doExecute(): RDD[InternalRow] = { stateManager // force lazy init at driver metrics // force lazy init at driver diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index b3eedbf93f040..251cc16acdf43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -29,7 +29,8 @@ import org.apache.hadoop.fs._ import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf @@ -265,7 +266,7 @@ class HDFSMetadataLog[T <: AnyRef 
: ClassTag](sparkSession: SparkSession, path: override def getLatest(): Option[(Long, T)] = { listBatches.sorted.lastOption.map { batchId => - logInfo(s"Getting latest batch $batchId") + logInfo(log"Getting latest batch ${MDC(BATCH_ID, batchId)}") (batchId, getExistingBatch(batchId)) } } @@ -335,7 +336,7 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: batchCache.synchronized { batchCache.keySet.asScala.toArray } - logInfo("BatchIds found from listing: " + batchIds.sorted.mkString(", ")) + logInfo(log"BatchIds found from listing: ${MDC(BATCH_ID, batchIds.sorted.mkString(", "))}") if (batchIds.isEmpty) { Array.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index cfccfff3a1382..2759af639390b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -22,7 +22,8 @@ import java.util.concurrent.atomic.AtomicInteger import org.apache.hadoop.fs.Path -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{BATCH_TIMESTAMP, ERROR} import org.apache.spark.sql.{SparkSession, Strategy} import org.apache.spark.sql.catalyst.QueryPlanningTracker import org.apache.spark.sql.catalyst.expressions.{CurrentBatchTimestamp, ExpressionWithRandomSeed} @@ -30,7 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern._ import org.apache.spark.sql.errors.QueryExecutionErrors -import org.apache.spark.sql.execution.{LocalLimitExec, QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} +import org.apache.spark.sql.execution.{LocalLimitExec, QueryExecution, SerializeFromObjectExec, SparkPlan, SparkPlanner, UnaryExecNode} import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, MergingSessionsExec, ObjectHashAggregateExec, SortAggregateExec, UpdatingSessionsExec} import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike @@ -84,6 +85,12 @@ class IncrementalExecution( .map(SQLConf.SHUFFLE_PARTITIONS.valueConverter) .getOrElse(sparkSession.sessionState.conf.numShufflePartitions) + /** + * This value dictates which schema format version the state schema should be written in + * for all operators other than TransformWithState. + */ + private val STATE_SCHEMA_DEFAULT_VERSION: Int = 2 + /** * See [SPARK-18339] * Walk the optimized logical plan and replace CurrentBatchTimestamp @@ -101,7 +108,7 @@ class IncrementalExecution( tracker).transformAllExpressionsWithPruning( _.containsAnyPattern(CURRENT_LIKE, EXPRESSION_WITH_RANDOM_SEED)) { case ts @ CurrentBatchTimestamp(timestamp, _, _) => - logInfo(s"Current batch timestamp = $timestamp") + logInfo(log"Current batch timestamp = ${MDC(BATCH_TIMESTAMP, timestamp)}") ts.toLiteral case e: ExpressionWithRandomSeed => e.withNewSeed(Utils.random.nextLong()) } @@ -186,14 +193,31 @@ class IncrementalExecution( } } - object WriteStatefulOperatorMetadataRule extends SparkPlanPartialRule { + // Planning rule used to record the state schema for the first run and validate state schema + // changes across query runs. 
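// Editor's sketch (not part of the patch): the structured-logging pattern that the
// logInfo/logWarning rewrites throughout these hunks follow. The log"..." interpolator is
// provided via the Logging trait, and MDC tags each interpolated value with a LogKey so
// downstream log pipelines can index it. LogKeys.BATCH_ID and LogKeys.ELAPSED_TIME are
// reused here because they already appear in the hunks above; the class and method names
// are hypothetical.
package org.apache.spark.sql.execution.streaming.sketch

import org.apache.spark.internal.{Logging, LogKeys, MDC}

class CommitLoggingSketch extends Logging {
  def reportCommit(batchId: Long, elapsedMs: Long): Unit = {
    // Each MDC(...) pair becomes a key/value field in the structured log output, while the
    // rendered message still reads like the old string-interpolated form.
    logInfo(log"Committed batch ${MDC(LogKeys.BATCH_ID, batchId)} in " +
      log"${MDC(LogKeys.ELAPSED_TIME, elapsedMs)} ms")
  }
}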
+ object StateSchemaAndOperatorMetadataRule extends SparkPlanPartialRule { override val rule: PartialFunction[SparkPlan, SparkPlan] = { - case stateStoreWriter: StateStoreWriter if isFirstBatch => - val metadata = stateStoreWriter.operatorStateMetadata() - val metadataWriter = new OperatorStateMetadataWriter(new Path( - checkpointLocation, stateStoreWriter.getStateInfo.operatorId.toString), hadoopConf) - metadataWriter.write(metadata) - stateStoreWriter + // In the case of TransformWithStateExec, we want to collect this StateSchema + // filepath, and write this path out in the OperatorStateMetadata file + case statefulOp: StatefulOperator if isFirstBatch => + val stateSchemaVersion = statefulOp match { + case _: TransformWithStateExec => sparkSession.sessionState.conf. + getConf(SQLConf.STREAMING_TRANSFORM_WITH_STATE_OP_STATE_SCHEMA_VERSION) + case _ => STATE_SCHEMA_DEFAULT_VERSION + } + val stateSchemaPaths = statefulOp. + validateAndMaybeEvolveStateSchema(hadoopConf, currentBatchId, stateSchemaVersion) + // write out the state schema paths to the metadata file + statefulOp match { + case stateStoreWriter: StateStoreWriter => + val metadata = stateStoreWriter.operatorStateMetadata() + // TODO: [SPARK-48849] Populate metadata with stateSchemaPaths if metadata version is v2 + val metadataWriter = new OperatorStateMetadataWriter(new Path( + checkpointLocation, stateStoreWriter.getStateInfo.operatorId.toString), hadoopConf) + metadataWriter.write(metadata) + case _ => + } + statefulOp } } @@ -346,6 +370,28 @@ class IncrementalExecution( eventTimeWatermarkForEviction = inputWatermarkForEviction(m.stateInfo.get) ) + // UpdateEventTimeColumnExec is used to tag the eventTime column, and validate + // emitted rows adhere to watermark in the output of transformWithState. + // Hence, this node shares the same watermark value as TransformWithStateExec. + // However, given that UpdateEventTimeColumnExec does not store any state, it + // does not have any StateInfo. We simply use the StateInfo of transformWithStateExec + // to propagate watermark to both UpdateEventTimeColumnExec and transformWithStateExec. + case UpdateEventTimeColumnExec(eventTime, delay, None, + SerializeFromObjectExec(serializer, + t: TransformWithStateExec)) if t.stateInfo.isDefined => + + val stateInfo = t.stateInfo.get + val iwLateEvents = inputWatermarkForLateEvents(stateInfo) + val iwEviction = inputWatermarkForEviction(stateInfo) + + UpdateEventTimeColumnExec(eventTime, delay, iwLateEvents, + SerializeFromObjectExec(serializer, + t.copy( + eventTimeWatermarkForLateEvents = iwLateEvents, + eventTimeWatermarkForEviction = iwEviction) + )) + + case t: TransformWithStateExec if t.stateInfo.isDefined => t.copy( eventTimeWatermarkForLateEvents = inputWatermarkForLateEvents(t.stateInfo.get), @@ -419,9 +465,9 @@ class IncrementalExecution( } catch { case e: Exception => // no need to throw fatal error, returns empty map - logWarning("Error reading metadata path for stateful operator. " + - s"This may due to no prior committed batch, or previously run on lower versions:" + - s" ${e.getMessage}") + logWarning(log"Error reading metadata path for stateful operator. 
This may due to " + + log"no prior committed batch, or previously run on lower versions: " + + log"${MDC(ERROR, e.getMessage)}") } } ret @@ -448,9 +494,11 @@ class IncrementalExecution( if (isFirstBatch && currentBatchId != 0) { checkOperatorValidWithMetadata(planWithStateOpId) } - // The rule doesn't change the plan but cause the side effect that metadata is written - // in the checkpoint directory of stateful operator. - planWithStateOpId transform WriteStatefulOperatorMetadataRule.rule + + // The rule below doesn't change the plan but can cause the side effect that + // metadata/schema is written in the checkpoint directory of stateful operator. + planWithStateOpId transform StateSchemaAndOperatorMetadataRule.rule + simulateWatermarkPropagation(planWithStateOpId) planWithStateOpId transform WatermarkPropagationRule.rule } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala index 32bc21cea6ed4..dc72f8bcd5600 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImplWithTTL.scala @@ -137,6 +137,7 @@ class ListStateImplWithTTL[S]( /** Remove this state. */ override def clear(): Unit = { store.remove(stateTypesEncoder.encodeGroupingKey(), stateName) + clearTTLState() } private def validateNewState(newState: Array[S]): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 46ce33687890d..b382642eb6bf6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -25,7 +25,8 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{BATCH_ID, PATH} import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.sql.errors.QueryExecutionErrors @@ -74,7 +75,7 @@ class ManifestFileCommitProtocol(jobId: String, path: String) pendingCommitFiles.clear() if (fileLog.add(batchId, fileStatuses)) { - logInfo(s"Committed batch $batchId") + logInfo(log"Committed batch ${MDC(BATCH_ID, batchId)}") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } @@ -95,7 +96,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String) } } catch { case e: IOException => - logWarning(s"Fail to remove temporary file $path, continue removing next.", e) + logWarning(log"Fail to remove temporary file ${MDC(PATH, path)}, " + + log"continue removing next.", e) } } pendingCommitFiles.clear() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala new file mode 100644 index 0000000000000..2ab06f36dd5f7 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImplWithTTL.scala @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.streaming + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Encoder +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchema.{COMPOSITE_KEY_ROW_SCHEMA, VALUE_ROW_SCHEMA_WITH_TTL} +import org.apache.spark.sql.execution.streaming.state.{PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors} +import org.apache.spark.sql.streaming.{MapState, TTLConfig} +import org.apache.spark.util.NextIterator + +/** + * Class that provides a concrete implementation for map state associated with state + * variables (with ttl expiration support) used in the streaming transformWithState operator. + * @param store - reference to the StateStore instance to be used for storing state + * @param stateName - name of the state variable + * @param keyExprEnc - Spark SQL encoder for key + * @param userKeyEnc - Spark SQL encoder for the map key + * @param valEncoder - SQL encoder for state variable + * @param ttlConfig - the ttl configuration (time to live duration etc.) + * @param batchTimestampMs - current batch processing timestamp. + * @tparam K - type of key for map state variable + * @tparam V - type of value for map state variable + * @return - instance of MapState of type [K,V] that can be used to store state persistently + */ +class MapStateImplWithTTL[K, V]( + store: StateStore, + stateName: String, + keyExprEnc: ExpressionEncoder[Any], + userKeyEnc: Encoder[K], + valEncoder: Encoder[V], + ttlConfig: TTLConfig, + batchTimestampMs: Long) extends CompositeKeyTTLStateImpl(stateName, store, batchTimestampMs) + with MapState[K, V] with Logging { + + private val keySerializer = keyExprEnc.createSerializer() + private val stateTypesEncoder = new CompositeKeyStateEncoder( + keySerializer, userKeyEnc, valEncoder, COMPOSITE_KEY_ROW_SCHEMA, stateName, hasTtl = true) + + private val ttlExpirationMs = + StateTTL.calculateExpirationTimeForDuration(ttlConfig.ttlDuration, batchTimestampMs) + + initialize() + + private def initialize(): Unit = { + store.createColFamilyIfAbsent(stateName, COMPOSITE_KEY_ROW_SCHEMA, VALUE_ROW_SCHEMA_WITH_TTL, + PrefixKeyScanStateEncoderSpec(COMPOSITE_KEY_ROW_SCHEMA, 1)) + } + + /** Whether state exists or not. 
*/ + override def exists(): Boolean = { + iterator().nonEmpty + } + + /** Get the state value if it exists */ + override def getValue(key: K): V = { + StateStoreErrors.requireNonNullStateValue(key, stateName) + val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey(key) + val retRow = store.get(encodedCompositeKey, stateName) + + if (retRow != null) { + if (!stateTypesEncoder.isExpired(retRow, batchTimestampMs)) { + stateTypesEncoder.decodeValue(retRow) + } else { + null.asInstanceOf[V] + } + } else { + null.asInstanceOf[V] + } + } + + /** Check if the user key is contained in the map */ + override def containsKey(key: K): Boolean = { + StateStoreErrors.requireNonNullStateValue(key, stateName) + getValue(key) != null + } + + /** Update value for given user key */ + override def updateValue(key: K, value: V): Unit = { + StateStoreErrors.requireNonNullStateValue(key, stateName) + StateStoreErrors.requireNonNullStateValue(value, stateName) + + val serializedGroupingKey = stateTypesEncoder.serializeGroupingKey() + val serializedUserKey = stateTypesEncoder.serializeUserKey(key) + + val encodedValue = stateTypesEncoder.encodeValue(value, ttlExpirationMs) + val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey( + serializedGroupingKey, serializedUserKey) + store.put(encodedCompositeKey, encodedValue, stateName) + + upsertTTLForStateKey(ttlExpirationMs, serializedGroupingKey, serializedUserKey) + } + + /** Get the map associated with grouping key */ + override def iterator(): Iterator[(K, V)] = { + val encodedGroupingKey = stateTypesEncoder.encodeGroupingKey() + val unsafeRowPairIterator = store.prefixScan(encodedGroupingKey, stateName) + new NextIterator[(K, V)] { + override protected def getNext(): (K, V) = { + val iter = unsafeRowPairIterator.dropWhile { rowPair => + stateTypesEncoder.isExpired(rowPair.value, batchTimestampMs) + } + if (iter.hasNext) { + val currentRowPair = iter.next() + val key = stateTypesEncoder.decodeCompositeKey(currentRowPair.key) + val value = stateTypesEncoder.decodeValue(currentRowPair.value) + (key, value) + } else { + finished = true + null.asInstanceOf[(K, V)] + } + } + + override protected def close(): Unit = {} + } + } + + /** Get the list of keys present in map associated with grouping key */ + override def keys(): Iterator[K] = { + iterator().map(_._1) + } + + /** Get the list of values present in map associated with grouping key */ + override def values(): Iterator[V] = { + iterator().map(_._2) + } + + /** Remove user key from map state */ + override def removeKey(key: K): Unit = { + StateStoreErrors.requireNonNullStateValue(key, stateName) + val compositeKey = stateTypesEncoder.encodeCompositeKey(key) + store.remove(compositeKey, stateName) + } + + /** Remove this state. */ + override def clear(): Unit = { + keys().foreach { itr => + removeKey(itr) + } + clearTTLState() + } + + /** + * Clears the user state associated with this grouping key + * if it has expired. This function is called by Spark to perform + * cleanup at the end of transformWithState processing. + * + * Spark uses a secondary index to determine if the user state for + * this grouping key has expired. However, its possible that the user + * has updated the TTL and secondary index is out of date. Implementations + * must validate that the user State has actually expired before cleanup based + * on their own State data. + * + * @param groupingKey grouping key for which cleanup should be performed. + * @param userKey user key for which cleanup should be performed. 
+ */ + override def clearIfExpired(groupingKey: Array[Byte], userKey: Array[Byte]): Long = { + val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey(groupingKey, userKey) + val retRow = store.get(encodedCompositeKey, stateName) + var numRemovedElements = 0L + if (retRow != null) { + if (stateTypesEncoder.isExpired(retRow, batchTimestampMs)) { + store.remove(encodedCompositeKey, stateName) + numRemovedElements += 1 + } + } + numRemovedElements + } + + /* + * Internal methods to probe state for testing. The below methods exist for unit tests + * to read the state ttl values, and ensure that values are persisted correctly in + * the underlying state store. + */ + + /** + * Retrieves the value from State even if its expired. This method is used + * in tests to read the state store value, and ensure if its cleaned up at the + * end of the micro-batch. + */ + private[sql] def getWithoutEnforcingTTL(userKey: K): Option[V] = { + val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey(userKey) + val retRow = store.get(encodedCompositeKey, stateName) + + if (retRow != null) { + val resState = stateTypesEncoder.decodeValue(retRow) + Some(resState) + } else { + None + } + } + + /** + * Read the ttl value associated with the grouping and user key. + */ + private[sql] def getTTLValue(userKey: K): Option[(V, Long)] = { + val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey(userKey) + val retRow = store.get(encodedCompositeKey, stateName) + + // if the returned row is not null, we want to return the value associated with the + // ttlExpiration + Option(retRow).flatMap { row => + val ttlExpiration = stateTypesEncoder.decodeTtlExpirationMs(row) + ttlExpiration.map(expiration => (stateTypesEncoder.decodeValue(row), expiration)) + } + } + + /** + * Get all ttl values stored in ttl state for current implicit + * grouping key. 
+ */ + private[sql] def getKeyValuesInTTLState(): Iterator[(K, Long)] = { + val ttlIterator = ttlIndexIterator() + val implicitGroupingKey = stateTypesEncoder.serializeGroupingKey() + var nextValue: Option[(K, Long)] = None + + new Iterator[(K, Long)] { + override def hasNext: Boolean = { + while (nextValue.isEmpty && ttlIterator.hasNext) { + val nextTtlValue = ttlIterator.next() + val groupingKey = nextTtlValue.groupingKey + if (groupingKey sameElements implicitGroupingKey) { + val userKey = stateTypesEncoder.decodeUserKeyFromTTLRow(nextTtlValue) + nextValue = Some(userKey, nextTtlValue.expirationMs) + } + } + nextValue.isDefined + } + + override def next(): (K, Long) = { + val result = nextValue.get + nextValue = None + result + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala index 6eaccfb6d9347..45bb69a9c056b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileIndex.scala @@ -21,6 +21,8 @@ import scala.collection.mutable import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.MDC import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.types.StructType @@ -47,7 +49,7 @@ class MetadataLogFileIndex( metadataDir } - logInfo(s"Reading streaming file log from $metadataDirectory") + logInfo(log"Reading streaming file log from ${MDC(METADATA_DIRECTORY, metadataDirectory)}") private val metadataLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toString) private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index ae5a033538abf..f636413f7c518 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.streaming import scala.collection.mutable.{Map => MutableMap} import scala.collection.mutable +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp, FileSourceMetadataAttribute, LocalTimestamp} @@ -101,17 +102,17 @@ class MicroBatchExecution( // See SPARK-45178 for more details. if (sparkSession.sessionState.conf.getConf( SQLConf.STREAMING_TRIGGER_AVAILABLE_NOW_WRAPPER_ENABLED)) { - logInfo("Configured to use the wrapper of Trigger.AvailableNow for query " + - s"$prettyIdString.") + logInfo(log"Configured to use the wrapper of Trigger.AvailableNow for query " + + log"${MDC(LogKeys.PRETTY_ID_STRING, prettyIdString)}.") MultiBatchExecutor() } else { val supportsTriggerAvailableNow = sources.distinct.forall { src => val supports = src.isInstanceOf[SupportsTriggerAvailableNow] if (!supports) { - logWarning(s"source [$src] does not support Trigger.AvailableNow. 
Falling back to " + - "single batch execution. Note that this may not guarantee processing new data if " + - "there is an uncommitted batch. Please consult with data source developer to " + - "support Trigger.AvailableNow.") + logWarning(log"source [${MDC(LogKeys.SPARK_DATA_STREAM, src)}] does not support " + + log"Trigger.AvailableNow. Falling back to single batch execution. Note that this " + + log"may not guarantee processing new data if there is an uncommitted batch. " + + log"Please consult with data source developer to support Trigger.AvailableNow.") } supports @@ -156,7 +157,9 @@ class MicroBatchExecution( val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" val source = dataSourceV1.createSource(metadataPath) nextSourceId += 1 - logInfo(s"Using Source [$source] from DataSourceV1 named '$sourceName' [$dataSourceV1]") + logInfo(log"Using Source [${MDC(LogKeys.STREAMING_SOURCE, source)}] " + + log"from DataSourceV1 named '${MDC(LogKeys.STREAMING_DATA_SOURCE_NAME, sourceName)}' " + + log"[${MDC(LogKeys.STREAMING_DATA_SOURCE_DESCRIPTION, dataSourceV1)}]") StreamingExecutionRelation(source, output, dataSourceV1.catalogTable)(sparkSession) }) @@ -169,7 +172,9 @@ class MicroBatchExecution( // Materialize source to avoid creating it in every batch val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" nextSourceId += 1 - logInfo(s"Reading table [$table] from DataSourceV2 named '$srcName' $dsStr") + logInfo(log"Reading table [${MDC(LogKeys.STREAMING_TABLE, table)}] " + + log"from DataSourceV2 named '${MDC(LogKeys.STREAMING_DATA_SOURCE_NAME, srcName)}' " + + log"${MDC(LogKeys.STREAMING_DATA_SOURCE_DESCRIPTION, dsStr)}") // TODO: operator pushdown. val scan = table.newScanBuilder(options).build() val stream = scan.toMicroBatchStream(metadataPath) @@ -187,7 +192,9 @@ class MicroBatchExecution( val source = v1.get.asInstanceOf[StreamingRelation].dataSource.createSource(metadataPath) nextSourceId += 1 - logInfo(s"Using Source [$source] from DataSourceV2 named '$srcName' $dsStr") + logInfo(log"Using Source [${MDC(LogKeys.STREAMING_SOURCE, source)}] from " + + log"DataSourceV2 named '${MDC(LogKeys.STREAMING_DATA_SOURCE_NAME, srcName)}' " + + log"${MDC(LogKeys.STREAMING_DATA_SOURCE_DESCRIPTION, dsStr)}") // We don't have a catalog table but may have a table identifier. Given this is about // v1 fallback path, we just give up and set the catalog table as None. 
StreamingExecutionRelation(source, output, None)(sparkSession) @@ -211,8 +218,8 @@ class MicroBatchExecution( case s: SupportsAdmissionControl => val limit = s.getDefaultReadLimit if (limit != ReadLimit.allAvailable()) { - logWarning( - s"The read limit $limit for $s is ignored when Trigger.Once is used.") + logWarning(log"The read limit ${MDC(LogKeys.READ_LIMIT, limit)} for " + + log"${MDC(LogKeys.SPARK_DATA_STREAM, s)} is ignored when Trigger.Once is used.") } s -> ReadLimit.allAvailable() case s => @@ -278,7 +285,7 @@ class MicroBatchExecution( // microBatchThread may spawn new jobs, so we need to cancel again to prevent a leak sparkSession.sparkContext.cancelJobGroup(runId.toString) } - logInfo(s"Query $prettyIdString was stopped") + logInfo(log"Query ${MDC(LogKeys.PRETTY_ID_STRING, prettyIdString)} was stopped") } private val watermarkPropagator = WatermarkPropagator(sparkSession.sessionState.conf) @@ -288,7 +295,8 @@ class MicroBatchExecution( // shutdown and cleanup required for async log purge mechanism asyncLogPurgeShutdown() - logInfo(s"Async log purge executor pool for query ${prettyIdString} has been shutdown") + logInfo(log"Async log purge executor pool for query " + + log"${MDC(LogKeys.PRETTY_ID_STRING, prettyIdString)} has been shutdown") } private def initializeExecution( @@ -304,7 +312,7 @@ class MicroBatchExecution( setLatestExecutionContext(execCtx) populateStartOffsets(execCtx, sparkSessionForStream) - logInfo(s"Stream started from ${execCtx.startOffsets}") + logInfo(log"Stream started from ${MDC(LogKeys.STREAMING_OFFSETS_START, execCtx.startOffsets)}") execCtx } /** @@ -411,12 +419,12 @@ class MicroBatchExecution( def validateOffsetLogAndGetPrevOffset(latestBatchId: Long): Option[OffsetSeq] = { if (latestBatchId != 0) { Some(offsetLog.get(latestBatchId - 1).getOrElse { - logError(s"The offset log for batch ${latestBatchId - 1} doesn't exist, " + - s"which is required to restart the query from the latest batch $latestBatchId " + - "from the offset log. Please ensure there are two subsequent offset logs " + - "available for the latest batch via manually deleting the offset file(s). " + - "Please also ensure the latest batch for commit log is equal or one batch " + - "earlier than the latest batch for offset log.") + logError(log"The offset log for batch ${MDC(LogKeys.BATCH_ID, latestBatchId - 1)} " + + log"doesn't exist, which is required to restart the query from the latest batch " + + log"${MDC(LogKeys.LATEST_BATCH_ID, latestBatchId)} from the offset log. Please ensure " + + log"there are two subsequent offset logs available for the latest batch via manually " + + log"deleting the offset file(s). Please also ensure the latest batch for commit log is " + + log"equal or one batch earlier than the latest batch for offset log.") throw new IllegalStateException(s"batch ${latestBatchId - 1} doesn't exist") }) } else { @@ -510,16 +518,17 @@ class MicroBatchExecution( // here, so we do nothing here. 
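// Editor's sketch (not part of the patch): the user-facing trigger behind the
// Trigger.AvailableNow handling and warnings in the MicroBatchExecution hunks above.
// Sources that do not implement SupportsTriggerAvailableNow fall back to single-batch
// execution (or to the wrapper, when the wrapper conf mentioned earlier is enabled).
// The "rate" source and console sink are placeholders for illustration only.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object AvailableNowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("available-now-sketch").getOrCreate()
    val query = spark.readStream
      .format("rate")
      .load()
      .writeStream
      .format("console")
      .trigger(Trigger.AvailableNow())   // process all currently available data, then stop
      .start()
    query.awaitTermination()
  }
}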
} } else if (latestCommittedBatchId < latestBatchId - 1) { - logWarning(s"Batch completion log latest batch id is " + - s"${latestCommittedBatchId}, which is not trailing " + - s"batchid $latestBatchId by one") + logWarning(log"Batch completion log latest batch id is " + + log"${MDC(LogKeys.LATEST_COMMITTED_BATCH_ID, latestCommittedBatchId)}, which is " + + log"not trailing batchid ${MDC(LogKeys.LATEST_BATCH_ID, latestBatchId)} by one") } case None => logInfo("no commit log present") } // initialize committed offsets to start offsets of the most recent committed batch committedOffsets = execCtx.startOffsets - logInfo(s"Resuming at batch ${execCtx.batchId} with committed offsets " + - s"${execCtx.startOffsets} and available offsets ${execCtx.endOffsets}") + logInfo(log"Resuming at batch ${MDC(LogKeys.BATCH_ID, execCtx.batchId)} with committed " + + log"offsets ${MDC(LogKeys.STREAMING_OFFSETS_START, execCtx.startOffsets)} and " + + log"available offsets ${MDC(LogKeys.STREAMING_OFFSETS_END, execCtx.endOffsets)}") case None => // We are starting this stream for the first time. logInfo(s"Starting new streaming query.") execCtx.batchId = 0 @@ -748,8 +757,8 @@ class MicroBatchExecution( } } else if (catalogTable.exists(_ ne newRelation.catalogTable.get)) { // Output a warning if `catalogTable` is provided by the source rather than engine - logWarning( - s"Source $source should not produce the information of catalog table by its own.") + logWarning(log"Source ${MDC(LogKeys.SPARK_DATA_STREAM, source)} should not " + + log"produce the information of catalog table by its own.") } newRelation } @@ -873,8 +882,8 @@ class MicroBatchExecution( throw QueryExecutionErrors.concurrentStreamLogUpdate(execCtx.batchId) } - logInfo(s"Committed offsets for batch ${execCtx.batchId}. " + - s"Metadata ${execCtx.offsetSeqMetadata.toString}") + logInfo(log"Committed offsets for batch ${MDC(LogKeys.BATCH_ID, execCtx.batchId)}. 
" + + log"Metadata ${MDC(LogKeys.OFFSET_SEQUENCE_METADATA, execCtx.offsetSeqMetadata.toString)}") } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index 006d6221e55aa..f0be33ad9a9d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -20,12 +20,13 @@ package org.apache.spark.sql.execution.streaming import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CONFIG, DEFAULT_VALUE, NEW_VALUE, OLD_VALUE, TIP} import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.RuntimeConfig import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, SparkDataStream} import org.apache.spark.sql.execution.streaming.state.{FlatMapGroupsWithStateExecHelper, StreamingAggregationStateManager, SymmetricHashJoinStateManager} -import org.apache.spark.sql.internal.SQLConf.{FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION, _} +import org.apache.spark.sql.internal.SQLConf._ /** @@ -143,8 +144,9 @@ object OffsetSeqMetadata extends Logging { // Config value exists in the metadata, update the session config with this value val optionalValueInSession = sessionConf.getOption(confKey) if (optionalValueInSession.isDefined && optionalValueInSession.get != valueInMetadata) { - logWarning(s"Updating the value of conf '$confKey' in current session from " + - s"'${optionalValueInSession.get}' to '$valueInMetadata'.") + logWarning(log"Updating the value of conf '${MDC(CONFIG, confKey)}' in current " + + log"session from '${MDC(OLD_VALUE, optionalValueInSession.get)}' " + + log"to '${MDC(NEW_VALUE, valueInMetadata)}'.") } sessionConf.set(confKey, valueInMetadata) @@ -156,14 +158,15 @@ object OffsetSeqMetadata extends Logging { case Some(defaultValue) => sessionConf.set(confKey, defaultValue) - logWarning(s"Conf '$confKey' was not found in the offset log, " + - s"using default value '$defaultValue'") + logWarning(log"Conf '${MDC(CONFIG, confKey)}' was not found in the offset log, " + + log"using default value '${MDC(DEFAULT_VALUE, defaultValue)}'") case None => val valueStr = sessionConf.getOption(confKey).map { v => s" Using existing session conf value '$v'." }.getOrElse { " No value set in session conf." } - logWarning(s"Conf '$confKey' was not found in the offset log. $valueStr") + logWarning(log"Conf '${MDC(CONFIG, confKey)}' was not found in the offset log. 
" + + log"${MDC(TIP, valueStr)}") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index 0d32eed9b6bdb..c440ec451b724 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -25,7 +25,7 @@ import java.util.{Optional, UUID} import scala.collection.mutable import scala.jdk.CollectionConverters._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.optimizer.InlineCTE import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan, WithCTE} @@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.v2.{MicroBatchScanExec, StreamingDataSourceV2ScanRelation, StreamWriterCommitProgress} import org.apache.spark.sql.streaming._ import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryIdleEvent, QueryProgressEvent} -import org.apache.spark.util.Clock +import org.apache.spark.util.{Clock, Utils} /** * Responsible for continually reporting statistics about the amount of data processed as well @@ -81,7 +81,8 @@ class ProgressReporter( addNewProgress(newProgress) postEvent(new QueryProgressEvent(newProgress)) - logInfo(s"Streaming query made progress: $newProgress") + logInfo( + log"Streaming query made progress: ${MDC(LogKeys.STREAMING_QUERY_PROGRESS, newProgress)}") } private def addNewProgress(newProgress: StreamingQueryProgress): Unit = { @@ -103,8 +104,8 @@ class ProgressReporter( addNewProgress(newProgress) if (lastNoExecutionProgressEventTime > Long.MinValue) { postEvent(new QueryIdleEvent(id, runId, formatTimestamp(currentTriggerStartTimestamp))) - logInfo(s"Streaming query has been idle and waiting for new data more than " + - s"${noDataProgressEventInterval} ms.") + logInfo(log"Streaming query has been idle and waiting for new data more than " + + log"${MDC(LogKeys.TIME_UNITS, noDataProgressEventInterval)} ms.") } lastNoExecutionProgressEventTime = now @@ -333,33 +334,44 @@ abstract class ProgressContext( inputTimeSec: Double, processingTimeSec: Double): Seq[SourceProgress] = { sources.distinct.map { source => - val numRecords = execStats.flatMap(_.inputRows.get(source)).getOrElse(0L) - val sourceMetrics = source match { - case withMetrics: ReportsSourceMetrics => - withMetrics.metrics(Optional.ofNullable(latestStreamProgress.get(source).orNull)) - case _ => Map[String, String]().asJava + val (result, duration) = Utils.timeTakenMs { + val numRecords = execStats.flatMap(_.inputRows.get(source)).getOrElse(0L) + val sourceMetrics = source match { + case withMetrics: ReportsSourceMetrics => + withMetrics.metrics(Optional.ofNullable(latestStreamProgress.get(source).orNull)) + case _ => Map[String, String]().asJava + } + new SourceProgress( + description = source.toString, + startOffset = currentTriggerStartOffsets.get(source).orNull, + endOffset = currentTriggerEndOffsets.get(source).orNull, + latestOffset = currentTriggerLatestOffsets.get(source).orNull, + numInputRows = numRecords, + inputRowsPerSecond = numRecords / inputTimeSec, + processedRowsPerSecond = numRecords / processingTimeSec, + metrics = sourceMetrics + ) } - new SourceProgress( - description = source.toString, - startOffset = 
currentTriggerStartOffsets.get(source).orNull, - endOffset = currentTriggerEndOffsets.get(source).orNull, - latestOffset = currentTriggerLatestOffsets.get(source).orNull, - numInputRows = numRecords, - inputRowsPerSecond = numRecords / inputTimeSec, - processedRowsPerSecond = numRecords / processingTimeSec, - metrics = sourceMetrics - ) + logInfo(log"Extracting source progress metrics for source=" + + log"${MDC(LogKeys.SOURCE, source.toString)} " + + log"took duration_ms=${MDC(LogKeys.DURATION, duration)}") + result } } private def extractSinkProgress(execStats: Option[ExecutionStats]): SinkProgress = { - val sinkOutput = execStats.flatMap(_.outputRows) - val sinkMetrics = sink match { - case withMetrics: ReportsSinkMetrics => withMetrics.metrics() - case _ => Map[String, String]().asJava - } + val (result, duration) = Utils.timeTakenMs { + val sinkOutput = execStats.flatMap(_.outputRows) + val sinkMetrics = sink match { + case withMetrics: ReportsSinkMetrics => withMetrics.metrics() + case _ => Map[String, String]().asJava + } - SinkProgress(sink.toString, sinkOutput, sinkMetrics) + SinkProgress(sink.toString, sinkOutput, sinkMetrics) + } + logInfo(log"Extracting sink progress metrics for sink=${MDC(LogKeys.SINK, sink.toString)} " + + log"took duration_ms=${MDC(LogKeys.DURATION, duration)}") + result } /** @@ -382,9 +394,10 @@ abstract class ProgressContext( val finishTriggerDurationMillis = triggerClock.getTimeMillis() - triggerEndTimestamp val thresholdForLoggingMillis = 60 * 1000 if (finishTriggerDurationMillis > math.max(thresholdForLoggingMillis, processingTimeMills)) { - logWarning("Query progress update takes longer than batch processing time. Progress " + - s"update takes $finishTriggerDurationMillis milliseconds. Batch processing takes " + - s"$processingTimeMills milliseconds") + logWarning(log"Query progress update takes longer than batch processing time. Progress " + + log"update takes ${MDC(LogKeys.FINISH_TRIGGER_DURATION, finishTriggerDurationMillis)} " + + log"milliseconds. 
Batch processing takes " + + log"${MDC(LogKeys.PROCESSING_TIME, processingTimeMills)} milliseconds") } } @@ -485,11 +498,10 @@ abstract class ProgressContext( if (!metricWarningLogged) { def toString[T](seq: Seq[T]): String = s"(size = ${seq.size}), ${seq.mkString(", ")}" - logWarning( - "Could not report metrics as number leaves in trigger logical plan did not match that" + - s" of the execution plan:\n" + - s"logical plan leaves: ${toString(allLogicalPlanLeaves)}\n" + - s"execution plan leaves: ${toString(allExecPlanLeaves)}\n") + logWarning(log"Could not report metrics as number leaves in trigger logical plan did " + + log"not match that of the execution plan:\nlogical plan leaves: " + + log"${MDC(LogKeys.LOGICAL_PLAN_LEAVES, toString(allLogicalPlanLeaves))}\nexecution " + + log"plan leaves: ${MDC(LogKeys.EXECUTION_PLAN_LEAVES, toString(allExecPlanLeaves))}\n") metricWarningLogged = true } Map.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ResolveWriteToStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ResolveWriteToStream.scala index 35bb7db6a6e13..0d4ab9d147b8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ResolveWriteToStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ResolveWriteToStream.scala @@ -23,6 +23,8 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.Path +import org.apache.spark.internal.LogKeys.{CHECKPOINT_LOCATION, CHECKPOINT_ROOT, CONFIG, PATH} +import org.apache.spark.internal.MDC import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -42,8 +44,8 @@ object ResolveWriteToStream extends Rule[LogicalPlan] with SQLConfHelper { val (resolvedCheckpointLocation, deleteCheckpointOnStop) = resolveCheckpointLocation(s) if (conf.adaptiveExecutionEnabled) { - logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} " + - "is not supported in streaming DataFrames/Datasets and will be disabled.") + logWarning(log"${MDC(CONFIG, SQLConf.ADAPTIVE_EXECUTION_ENABLED.key)} " + + log"is not supported in streaming DataFrames/Datasets and will be disabled.") } if (conf.isUnsupportedOperationCheckEnabled) { @@ -77,10 +79,11 @@ object ResolveWriteToStream extends Rule[LogicalPlan] with SQLConfHelper { if (s.useTempCheckpointLocation) { deleteCheckpointOnStop = true val tempDir = Utils.createTempDir(namePrefix = "temporary").getCanonicalPath - logWarning("Temporary checkpoint location created which is deleted normally when" + - s" the query didn't fail: $tempDir. If it's required to delete it under any" + - s" circumstances, please set ${SQLConf.FORCE_DELETE_TEMP_CHECKPOINT_LOCATION.key} to" + - s" true. Important to know deleting temp checkpoint folder is best effort.") + logWarning(log"Temporary checkpoint location created which is deleted normally when" + + log" the query didn't fail: ${MDC(PATH, tempDir)}. If it's required to delete " + + log"it under any circumstances, please set " + + log"${MDC(CONFIG, SQLConf.FORCE_DELETE_TEMP_CHECKPOINT_LOCATION.key)} to" + + log" true. Important to know deleting temp checkpoint folder is best effort.") // SPARK-42676 - Write temp checkpoints for streaming queries to local filesystem // even if default FS is set differently. // This is a band-aid fix. 
Ideally we should convert `tempDir` to URIs, but there @@ -131,7 +134,8 @@ object ResolveWriteToStream extends Rule[LogicalPlan] with SQLConfHelper { val checkpointDir = fileManager.createCheckpointDirectory() checkpointDir.toString } - logInfo(s"Checkpoint root $checkpointLocation resolved to $resolvedCheckpointRoot.") + logInfo(log"Checkpoint root ${MDC(CHECKPOINT_LOCATION, checkpointLocation)} " + + log"resolved to ${MDC(CHECKPOINT_ROOT, resolvedCheckpointRoot)}.") (resolvedCheckpointRoot, deleteCheckpointOnStop) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala index 56b0731e0db47..ed881b49ec1e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala @@ -27,6 +27,9 @@ import org.apache.spark.sql.types.{BinaryType, LongType, StructType} object TransformWithStateKeyValueRowSchema { val KEY_ROW_SCHEMA: StructType = new StructType().add("key", BinaryType) + val COMPOSITE_KEY_ROW_SCHEMA: StructType = new StructType() + .add("key", BinaryType) + .add("userKey", BinaryType) val VALUE_ROW_SCHEMA: StructType = new StructType() .add("value", BinaryType) val VALUE_ROW_SCHEMA_WITH_TTL: StructType = new StructType() @@ -192,6 +195,28 @@ class CompositeKeyStateEncoder[GK, K, V]( compositeKeyRow } + def decodeUserKeyFromTTLRow(row: CompositeKeyTTLRow): K = { + val bytes = row.userKey + reusedKeyRow.pointTo(bytes, bytes.length) + val userKey = userKeyRowToObjDeserializer.apply(reusedKeyRow) + userKey + } + + /** + * Grouping key and user key are encoded as a row of `schemaForCompositeKeyRow` schema. + * Grouping key will be encoded in `RocksDBStateEncoder` as the prefix column. + */ + def encodeCompositeKey( + groupingKeyByteArr: Array[Byte], + userKeyByteArr: Array[Byte]): UnsafeRow = { + val compositeKeyRow = compositeKeyProjection(InternalRow(groupingKeyByteArr, userKeyByteArr)) + compositeKeyRow + } + + def serializeUserKey(userKey: K): Array[Byte] = { + userKeySerializer.apply(userKey).asInstanceOf[UnsafeRow].getBytes + } + /** * The input row is of composite Key schema. * Only user key is returned though grouping key also exist in the row. 
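The encoder additions above pack the grouping key and the user key into a single two-column BinaryType row, so the grouping key can serve as the prefix column when the row is stored in RocksDB. A rough, self-contained sketch of that encoding; the object name is invented, but the schema mirrors COMPOSITE_KEY_ROW_SCHEMA from this patch:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{BinaryType, StructType}

// Sketch of the composite-key layout: column 0 holds the serialized grouping
// key (usable as a store prefix), column 1 holds the serialized user key.
object CompositeKeyEncodingSketch {
  private val schema: StructType = new StructType()
    .add("key", BinaryType)
    .add("userKey", BinaryType)
  private val projection = UnsafeProjection.create(schema)

  def encode(groupingKey: Array[Byte], userKey: Array[Byte]): UnsafeRow =
    projection(InternalRow(groupingKey, userKey))
}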
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala index 885df96a206a0..277e1516425d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala @@ -19,14 +19,16 @@ package org.apache.spark.sql.execution.streaming import java.util import java.util.UUID +import scala.collection.mutable + import org.apache.spark.TaskContext import org.apache.spark.internal.Logging import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.streaming.StatefulProcessorHandleState.PRE_INIT import org.apache.spark.sql.execution.streaming.state._ -import org.apache.spark.sql.streaming.{ListState, MapState, QueryInfo, StatefulProcessorHandle, TimeMode, TTLConfig, ValueState} +import org.apache.spark.sql.streaming.{ListState, MapState, QueryInfo, TimeMode, TTLConfig, ValueState} import org.apache.spark.util.Utils /** @@ -48,7 +50,7 @@ object ImplicitGroupingKeyTracker { */ object StatefulProcessorHandleState extends Enumeration { type StatefulProcessorHandleState = Value - val CREATED, INITIALIZED, DATA_PROCESSED, TIMER_PROCESSED, CLOSED = Value + val CREATED, PRE_INIT, INITIALIZED, DATA_PROCESSED, TIMER_PROCESSED, CLOSED = Value } class QueryInfoImpl( @@ -85,7 +87,7 @@ class StatefulProcessorHandleImpl( isStreaming: Boolean = true, batchTimestampMs: Option[Long] = None, metrics: Map[String, SQLMetric] = Map.empty) - extends StatefulProcessorHandle with Logging { + extends StatefulProcessorHandleImplBase(timeMode) with Logging { import StatefulProcessorHandleState._ /** @@ -96,6 +98,8 @@ class StatefulProcessorHandleImpl( private val BATCH_QUERY_ID = "00000000-0000-0000-0000-000000000000" + currState = CREATED + private def buildQueryInfo(): QueryInfo = { val taskCtxOpt = Option(TaskContext.get()) val (queryId, batchId) = if (!isStreaming) { @@ -113,22 +117,14 @@ class StatefulProcessorHandleImpl( private lazy val currQueryInfo: QueryInfo = buildQueryInfo() - private var currState: StatefulProcessorHandleState = CREATED - private def incrementMetric(metricName: String): Unit = { metrics.get(metricName).foreach(_.add(1)) } - def setHandleState(newState: StatefulProcessorHandleState): Unit = { - currState = newState - } - - def getHandleState: StatefulProcessorHandleState = currState - override def getValueState[T]( stateName: String, valEncoder: Encoder[T]): ValueState[T] = { - verifyStateVarOperations("get_value_state") + verifyStateVarOperations("get_value_state", CREATED) incrementMetric("numValueStateVars") val resultState = new ValueStateImpl[T](store, stateName, keyEncoder, valEncoder) resultState @@ -138,7 +134,7 @@ class StatefulProcessorHandleImpl( stateName: String, valEncoder: Encoder[T], ttlConfig: TTLConfig): ValueState[T] = { - verifyStateVarOperations("get_value_state") + verifyStateVarOperations("get_value_state", CREATED) validateTTLConfig(ttlConfig, stateName) assert(batchTimestampMs.isDefined) @@ -153,25 +149,6 @@ class StatefulProcessorHandleImpl( private lazy val timerState = new TimerStateImpl(store, timeMode, keyEncoder) - private def verifyStateVarOperations(operationType: String): Unit = { - 
if (currState != CREATED) { - throw StateStoreErrors.cannotPerformOperationWithInvalidHandleState(operationType, - currState.toString) - } - } - - private def verifyTimerOperations(operationType: String): Unit = { - if (timeMode == NoTime) { - throw StateStoreErrors.cannotPerformOperationWithInvalidTimeMode(operationType, - timeMode.toString) - } - - if (currState < INITIALIZED || currState >= TIMER_PROCESSED) { - throw StateStoreErrors.cannotPerformOperationWithInvalidHandleState(operationType, - currState.toString) - } - } - /** * Function to register a timer for the given expiryTimestampMs * @param expiryTimestampMs - timestamp in milliseconds for the timer to expire @@ -232,14 +209,14 @@ class StatefulProcessorHandleImpl( * @param stateName - name of the state variable */ override def deleteIfExists(stateName: String): Unit = { - verifyStateVarOperations("delete_if_exists") + verifyStateVarOperations("delete_if_exists", CREATED) if (store.removeColFamilyIfExists(stateName)) { incrementMetric("numDeletedStateVars") } } override def getListState[T](stateName: String, valEncoder: Encoder[T]): ListState[T] = { - verifyStateVarOperations("get_list_state") + verifyStateVarOperations("get_list_state", CREATED) incrementMetric("numListStateVars") val resultState = new ListStateImpl[T](store, stateName, keyEncoder, valEncoder) resultState @@ -265,7 +242,7 @@ class StatefulProcessorHandleImpl( valEncoder: Encoder[T], ttlConfig: TTLConfig): ListState[T] = { - verifyStateVarOperations("get_list_state") + verifyStateVarOperations("get_list_state", CREATED) validateTTLConfig(ttlConfig, stateName) assert(batchTimestampMs.isDefined) @@ -281,12 +258,29 @@ class StatefulProcessorHandleImpl( stateName: String, userKeyEnc: Encoder[K], valEncoder: Encoder[V]): MapState[K, V] = { - verifyStateVarOperations("get_map_state") + verifyStateVarOperations("get_map_state", CREATED) incrementMetric("numMapStateVars") val resultState = new MapStateImpl[K, V](store, stateName, keyEncoder, userKeyEnc, valEncoder) resultState } + override def getMapState[K, V]( + stateName: String, + userKeyEnc: Encoder[K], + valEncoder: Encoder[V], + ttlConfig: TTLConfig): MapState[K, V] = { + verifyStateVarOperations("get_map_state", CREATED) + validateTTLConfig(ttlConfig, stateName) + + assert(batchTimestampMs.isDefined) + val mapStateWithTTL = new MapStateImplWithTTL[K, V](store, stateName, keyEncoder, userKeyEnc, + valEncoder, ttlConfig, batchTimestampMs.get) + incrementMetric("numMapStateWithTTLVars") + ttlStates.add(mapStateWithTTL) + + mapStateWithTTL + } + private def validateTTLConfig(ttlConfig: TTLConfig, stateName: String): Unit = { val ttlDuration = ttlConfig.ttlDuration if (timeMode != TimeMode.ProcessingTime()) { @@ -296,3 +290,111 @@ class StatefulProcessorHandleImpl( } } } + +/** + * This DriverStatefulProcessorHandleImpl is used within TransformWithExec + * on the driver side to collect the columnFamilySchemas before any processing is + * actually done. We need this class because we can only collect the schemas after + * the StatefulProcessor is initialized. 
+ */ +class DriverStatefulProcessorHandleImpl(timeMode: TimeMode) + extends StatefulProcessorHandleImplBase(timeMode) { + + private[sql] val columnFamilySchemaUtils = ColumnFamilySchemaUtilsV1 + + // Because this is only happening on the driver side, there is only + // one task modifying and accessing this map at a time + private[sql] val columnFamilySchemas: mutable.Map[String, ColumnFamilySchema] = + new mutable.HashMap[String, ColumnFamilySchema]() + + def getColumnFamilySchemas: Map[String, ColumnFamilySchema] = columnFamilySchemas.toMap + + override def getValueState[T](stateName: String, valEncoder: Encoder[T]): ValueState[T] = { + verifyStateVarOperations("get_value_state", PRE_INIT) + val colFamilySchema = columnFamilySchemaUtils. + getValueStateSchema(stateName, false) + columnFamilySchemas.put(stateName, colFamilySchema) + null.asInstanceOf[ValueState[T]] + } + + override def getValueState[T]( + stateName: String, + valEncoder: Encoder[T], + ttlConfig: TTLConfig): ValueState[T] = { + verifyStateVarOperations("get_value_state", PRE_INIT) + val colFamilySchema = columnFamilySchemaUtils. + getValueStateSchema(stateName, true) + columnFamilySchemas.put(stateName, colFamilySchema) + null.asInstanceOf[ValueState[T]] + } + + override def getListState[T](stateName: String, valEncoder: Encoder[T]): ListState[T] = { + verifyStateVarOperations("get_list_state", PRE_INIT) + val colFamilySchema = columnFamilySchemaUtils. + getListStateSchema(stateName, false) + columnFamilySchemas.put(stateName, colFamilySchema) + null.asInstanceOf[ListState[T]] + } + + override def getListState[T]( + stateName: String, + valEncoder: Encoder[T], + ttlConfig: TTLConfig): ListState[T] = { + verifyStateVarOperations("get_list_state", PRE_INIT) + val colFamilySchema = columnFamilySchemaUtils. + getListStateSchema(stateName, true) + columnFamilySchemas.put(stateName, colFamilySchema) + null.asInstanceOf[ListState[T]] + } + + override def getMapState[K, V]( + stateName: String, + userKeyEnc: Encoder[K], + valEncoder: Encoder[V]): MapState[K, V] = { + verifyStateVarOperations("get_map_state", PRE_INIT) + val colFamilySchema = columnFamilySchemaUtils. + getMapStateSchema(stateName, userKeyEnc, false) + columnFamilySchemas.put(stateName, colFamilySchema) + null.asInstanceOf[MapState[K, V]] + } + + override def getMapState[K, V]( + stateName: String, + userKeyEnc: Encoder[K], + valEncoder: Encoder[V], + ttlConfig: TTLConfig): MapState[K, V] = { + verifyStateVarOperations("get_map_state", PRE_INIT) + val colFamilySchema = columnFamilySchemaUtils. + getMapStateSchema(stateName, userKeyEnc, true) + columnFamilySchemas.put(stateName, colFamilySchema) + null.asInstanceOf[MapState[K, V]] + } + + /** Function to return queryInfo for currently running task */ + override def getQueryInfo(): QueryInfo = { + new QueryInfoImpl(UUID.randomUUID(), UUID.randomUUID(), 0L) + } + + /** + * Methods that are only included to satisfy the interface. + * These methods will fail if called from the driver side, as the handle + * will be in the PRE_INIT phase, and all these timer operations need to be + * called from the INITIALIZED phase. 
+ */ + override def registerTimer(expiryTimestampMs: Long): Unit = { + verifyTimerOperations("register_timer") + } + + override def deleteTimer(expiryTimestampMs: Long): Unit = { + verifyTimerOperations("delete_timer") + } + + override def listTimers(): Iterator[Long] = { + verifyTimerOperations("list_timers") + Iterator.empty + } + + override def deleteIfExists(stateName: String): Unit = { + verifyStateVarOperations("delete_if_exists", PRE_INIT) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImplBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImplBase.scala new file mode 100644 index 0000000000000..3b952967e35d9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImplBase.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.streaming + +import org.apache.spark.sql.catalyst.plans.logical.NoTime +import org.apache.spark.sql.execution.streaming.StatefulProcessorHandleState.{INITIALIZED, PRE_INIT, StatefulProcessorHandleState, TIMER_PROCESSED} +import org.apache.spark.sql.execution.streaming.state.StateStoreErrors +import org.apache.spark.sql.streaming.{StatefulProcessorHandle, TimeMode} + +abstract class StatefulProcessorHandleImplBase(timeMode: TimeMode) + extends StatefulProcessorHandle { + + protected var currState: StatefulProcessorHandleState = PRE_INIT + + def setHandleState(newState: StatefulProcessorHandleState): Unit = { + currState = newState + } + + def getHandleState: StatefulProcessorHandleState = currState + + def verifyTimerOperations(operationType: String): Unit = { + if (timeMode == NoTime) { + throw StateStoreErrors.cannotPerformOperationWithInvalidTimeMode(operationType, + timeMode.toString) + } + + if (currState < INITIALIZED || currState >= TIMER_PROCESSED) { + throw StateStoreErrors.cannotPerformOperationWithInvalidHandleState(operationType, + currState.toString) + } + } + + def verifyStateVarOperations( + operationType: String, + requiredState: StatefulProcessorHandleState): Unit = { + if (currState != requiredState) { + throw StateStoreErrors.cannotPerformOperationWithInvalidHandleState(operationType, + currState.toString) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 50a73082a8c4a..81f7acdb755bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -32,7 +32,8 @@ import 
com.google.common.util.concurrent.UncheckedExecutionException import org.apache.hadoop.fs.Path import org.apache.spark.{JobArtifactSet, SparkContext, SparkException, SparkThrowable} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CHECKPOINT_PATH, CHECKPOINT_ROOT, LOGICAL_PLAN, PATH, PRETTY_ID_STRING, SPARK_DATA_STREAM} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ @@ -260,7 +261,8 @@ abstract class StreamExecution( * has been posted to all the listeners. */ def start(): Unit = { - logInfo(s"Starting $prettyIdString. Use $resolvedCheckpointRoot to store the query checkpoint.") + logInfo(log"Starting ${MDC(PRETTY_ID_STRING, prettyIdString)}. " + + log"Use ${MDC(CHECKPOINT_ROOT, resolvedCheckpointRoot)} to store the query checkpoint.") queryExecutionThread.setDaemon(true) queryExecutionThread.start() startLatch.await() // Wait until thread started and QueryStart event has been posted @@ -318,6 +320,10 @@ abstract class StreamExecution( batchWatermarkMs = 0, batchTimestampMs = 0, sparkSessionForStream.conf) if (state.compareAndSet(INITIALIZING, ACTIVE)) { + // Log logical plan at the start of the query to help debug issues related to + // plan changes. + logInfo(log"Finish initializing with logical plan:\n${MDC(LOGICAL_PLAN, logicalPlan)}") + // Unblock `awaitInitialization` initializationLatch.countDown() runActivatedStream(sparkSessionForStream) @@ -366,7 +372,7 @@ abstract class StreamExecution( case _ => None } - logError(s"Query $prettyIdString terminated with error", e) + logError(log"Query ${MDC(PRETTY_ID_STRING, prettyIdString)} terminated with error", e) getLatestExecutionContext().updateStatusMessage(s"Terminated with exception: $message") // Rethrow the fatal errors to allow the user using `Thread.UncaughtExceptionHandler` to // handle them @@ -405,13 +411,13 @@ abstract class StreamExecution( .getConf(SQLConf.FORCE_DELETE_TEMP_CHECKPOINT_LOCATION) || exception.isEmpty)) { val checkpointPath = new Path(resolvedCheckpointRoot) try { - logInfo(s"Deleting checkpoint $checkpointPath.") + logInfo(log"Deleting checkpoint ${MDC(CHECKPOINT_PATH, checkpointPath)}.") fileManager.delete(checkpointPath) } catch { case NonFatal(e) => // Deleting temp checkpoint folder is best effort, don't throw non fatal exceptions // when we cannot delete them. - logWarning(s"Cannot delete $checkpointPath", e) + logWarning(log"Cannot delete ${MDC(PATH, checkpointPath)}", e) } } } finally { @@ -446,7 +452,8 @@ abstract class StreamExecution( source.stop() } catch { case NonFatal(e) => - logWarning(s"Failed to stop streaming source: $source. Resources may have leaked.", e) + logWarning(log"Failed to stop streaming source: ${MDC(SPARK_DATA_STREAM, source)}. 
" + + log"Resources may have leaked.", e) } } } @@ -682,7 +689,7 @@ object StreamExecution { classOf[ClosedByInterruptException].getName) val PROXY_ERROR = ( "py4j.protocol.Py4JJavaError: An error occurred while calling" + - s".+(\\r\\n|\\r|\\n): (${IO_EXCEPTION_NAMES.mkString("|")})").r + s"((.|\\r\\n|\\r|\\n)*)(${IO_EXCEPTION_NAMES.mkString("|")})").r @scala.annotation.tailrec def isInterruptionException(e: Throwable, sc: SparkContext): Boolean = e match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala index 978cb3c34f606..84519150ca42b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream @@ -60,7 +60,7 @@ object StreamMetadata extends Logging { Some(metadata) } catch { case NonFatal(e) => - logError(s"Error reading stream metadata from $metadataFile", e) + logError(log"Error reading stream metadata from ${MDC(LogKeys.PATH, metadataFile)}", e) throw e } finally { IOUtils.closeQuietly(input) @@ -91,7 +91,8 @@ object StreamMetadata extends Logging { if (output != null) { output.cancel() } - logError(s"Error writing stream metadata $metadata to $metadataFile", e) + logError(log"Error writing stream metadata ${MDC(LogKeys.METADATA, metadata)} to " + + log"${MDC(LogKeys.PATH, metadataFile)}", e) throw e } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index 20a05a1000338..ea275a28780ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS +import org.apache.hadoop.conf.Configuration + import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, JoinedRow, Literal, Predicate, UnsafeProjection, UnsafeRow} @@ -31,6 +33,7 @@ import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager.KeyToValuePair import org.apache.spark.sql.internal.{SessionState, SQLConf} +import org.apache.spark.sql.types.StructType import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} @@ -243,6 +246,26 @@ case class StreamingSymmetricHashJoinExec( watermarkUsedForStateCleanup && watermarkHasChanged } + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, + batchId: Long, + stateSchemaVersion: Int): Array[String] = { + var result: Map[String, (StructType, 
StructType)] = Map.empty + // get state schema for state stores on left side of the join + result ++= SymmetricHashJoinStateManager.getSchemaForStateStores(LeftSide, + left.output, leftKeys, stateFormatVersion) + + // get state schema for state stores on right side of the join + result ++= SymmetricHashJoinStateManager.getSchemaForStateStores(RightSide, + right.output, rightKeys, stateFormatVersion) + + // validate and maybe evolve schema for all state stores across both sides of the join + result.iterator.flatMap { case (stateStoreName, (keySchema, valueSchema)) => + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + keySchema, valueSchema, session.sessionState, storeName = stateStoreName) + }.toArray + } + protected override def doExecute(): RDD[InternalRow] = { val stateStoreCoord = session.sessionState.streamingQueryManager.stateStoreCoordinator val stateStoreNames = SymmetricHashJoinStateManager.allStateStoreNames(LeftSide, RightSide) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala index b245f8fc14d47..02efcefe19ca6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TTLState.scala @@ -27,6 +27,10 @@ object StateTTLSchema { val TTL_KEY_ROW_SCHEMA: StructType = new StructType() .add("expirationMs", LongType) .add("groupingKey", BinaryType) + val TTL_COMPOSITE_KEY_ROW_SCHEMA: StructType = new StructType() + .add("expirationMs", LongType) + .add("groupingKey", BinaryType) + .add("userKey", BinaryType) val TTL_VALUE_ROW_SCHEMA: StructType = StructType(Array(StructField("__dummy__", NullType))) } @@ -41,6 +45,18 @@ case class SingleKeyTTLRow( groupingKey: Array[Byte], expirationMs: Long) +/** + * Encapsulates the ttl row information stored in [[CompositeKeyTTLStateImpl]]. + * + * @param groupingKey grouping key for which ttl is set + * @param userKey user key for which ttl is set + * @param expirationMs expiration time for the grouping key + */ +case class CompositeKeyTTLRow( + groupingKey: Array[Byte], + userKey: Array[Byte], + expirationMs: Long) + /** * Represents the underlying state for secondary TTL Index for a user defined * state variable. @@ -59,23 +75,6 @@ trait TTLState { * @return number of values cleaned up. */ def clearExpiredState(): Long - - /** - * Clears the user state associated with this grouping key - * if it has expired. This function is called by Spark to perform - * cleanup at the end of transformWithState processing. - * - * Spark uses a secondary index to determine if the user state for - * this grouping key has expired. However, its possible that the user - * has updated the TTL and secondary index is out of date. Implementations - * must validate that the user State has actually expired before cleanup based - * on their own State data. - * - * @param groupingKey grouping key for which cleanup should be performed. - * - * @return how many state objects were cleaned up. - */ - def clearIfExpired(groupingKey: Array[Byte]): Long } /** @@ -99,6 +98,18 @@ abstract class SingleKeyTTLStateImpl( store.createColFamilyIfAbsent(ttlColumnFamilyName, TTL_KEY_ROW_SCHEMA, TTL_VALUE_ROW_SCHEMA, RangeKeyScanStateEncoderSpec(TTL_KEY_ROW_SCHEMA, Seq(0)), isInternal = true) + /** + * This function will be called when clear() on State Variables + * with ttl enabled is called. 
This function should clear any + * associated ttlState, since we are clearing the user state. + */ + def clearTTLState(): Unit = { + val iterator = store.iterator(ttlColumnFamilyName) + iterator.foreach { kv => + store.remove(kv.key, ttlColumnFamilyName) + } + } + def upsertTTLForStateKey( expirationMs: Long, groupingKey: Array[Byte]): Unit = { @@ -163,6 +174,112 @@ } } } + + /** + * Clears the user state associated with this grouping key + * if it has expired. This function is called by Spark to perform + * cleanup at the end of transformWithState processing. + * + * Spark uses a secondary index to determine if the user state for + * this grouping key has expired. However, its possible that the user + * has updated the TTL and secondary index is out of date. Implementations + * must validate that the user State has actually expired before cleanup based + * on their own State data. + * + * @param groupingKey grouping key for which cleanup should be performed. + * + * @return how many state objects were cleaned up. + */ + def clearIfExpired(groupingKey: Array[Byte]): Long +} + +/** + * Manages the ttl information for user state keyed with a composite key (grouping key and user key). + */ +abstract class CompositeKeyTTLStateImpl( + stateName: String, + store: StateStore, + ttlExpirationMs: Long) + extends TTLState { + + import org.apache.spark.sql.execution.streaming.StateTTLSchema._ + + private val ttlColumnFamilyName = s"_ttl_$stateName" + private val ttlKeyEncoder = UnsafeProjection.create(TTL_COMPOSITE_KEY_ROW_SCHEMA) + + // empty row used for values + private val EMPTY_ROW = + UnsafeProjection.create(Array[DataType](NullType)).apply(InternalRow.apply(null)) + + store.createColFamilyIfAbsent(ttlColumnFamilyName, TTL_COMPOSITE_KEY_ROW_SCHEMA, + TTL_VALUE_ROW_SCHEMA, RangeKeyScanStateEncoderSpec(TTL_COMPOSITE_KEY_ROW_SCHEMA, + Seq(0)), isInternal = true) + + def clearTTLState(): Unit = { + val iterator = store.iterator(ttlColumnFamilyName) + iterator.foreach { kv => + store.remove(kv.key, ttlColumnFamilyName) + } + } + + def upsertTTLForStateKey( + expirationMs: Long, + groupingKey: Array[Byte], + userKey: Array[Byte]): Unit = { + val encodedTtlKey = ttlKeyEncoder(InternalRow(expirationMs, groupingKey, userKey)) + store.put(encodedTtlKey, EMPTY_ROW, ttlColumnFamilyName) + } + + /** + * Clears any state which has ttl older than [[ttlExpirationMs]]. + */ + override def clearExpiredState(): Long = { + val iterator = store.iterator(ttlColumnFamilyName) + var numRemovedElements = 0L + iterator.takeWhile { kv => + val expirationMs = kv.key.getLong(0) + StateTTL.isExpired(expirationMs, ttlExpirationMs) + }.foreach { kv => + val groupingKey = kv.key.getBinary(1) + val userKey = kv.key.getBinary(2) + numRemovedElements += clearIfExpired(groupingKey, userKey) + store.remove(kv.key, ttlColumnFamilyName) + } + numRemovedElements + } + + private[sql] def ttlIndexIterator(): Iterator[CompositeKeyTTLRow] = { + val ttlIterator = store.iterator(ttlColumnFamilyName) + + new Iterator[CompositeKeyTTLRow] { + override def hasNext: Boolean = ttlIterator.hasNext + + override def next(): CompositeKeyTTLRow = { + val kv = ttlIterator.next() + CompositeKeyTTLRow( + expirationMs = kv.key.getLong(0), + groupingKey = kv.key.getBinary(1), + userKey = kv.key.getBinary(2) + ) + } + } + } + + /** + * Clears the user state associated with this grouping key + * if it has expired. This function is called by Spark to perform + * cleanup at the end of transformWithState processing. 
+ * + * Spark uses a secondary index to determine if the user state for + * this grouping key has expired. However, its possible that the user + * has updated the TTL and secondary index is out of date. Implementations + * must validate that the user State has actually expired before cleanup based + * on their own State data. + * + * @param groupingKey grouping key for which cleanup should be performed. + * @param userKey user key for which cleanup should be performed. + */ + def clearIfExpired(groupingKey: Array[Byte], userKey: Array[Byte]): Long } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala index e83c83df53229..a9c5a70e995d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TimerStateImpl.scala @@ -16,7 +16,8 @@ */ package org.apache.spark.sql.execution.streaming -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{EXPIRY_TIMESTAMP, KEY} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ @@ -130,8 +131,8 @@ class TimerStateImpl( def registerTimer(expiryTimestampMs: Long): Unit = { val groupingKey = getGroupingKey(keyToTsCFName) if (exists(groupingKey, expiryTimestampMs)) { - logWarning(s"Failed to register timer for key=$groupingKey and " + - s"timestamp=$expiryTimestampMs since it already exists") + logWarning(log"Failed to register timer for key=${MDC(KEY, groupingKey)} and " + + log"timestamp=${MDC(EXPIRY_TIMESTAMP, expiryTimestampMs)} ms since it already exists") } else { store.put(encodeKey(groupingKey, expiryTimestampMs), EMPTY_ROW, keyToTsCFName) store.put(encodeSecIndexKey(groupingKey, expiryTimestampMs), EMPTY_ROW, tsToKeyCFName) @@ -147,8 +148,8 @@ class TimerStateImpl( val groupingKey = getGroupingKey(keyToTsCFName) if (!exists(groupingKey, expiryTimestampMs)) { - logWarning(s"Failed to delete timer for key=$groupingKey and " + - s"timestamp=$expiryTimestampMs since it does not exist") + logWarning(log"Failed to delete timer for key=${MDC(KEY, groupingKey)} and " + + log"timestamp=${MDC(EXPIRY_TIMESTAMP, expiryTimestampMs)} ms since it does not exist") } else { store.remove(encodeKey(groupingKey, expiryTimestampMs), keyToTsCFName) store.remove(encodeSecIndexKey(groupingKey, expiryTimestampMs), tsToKeyCFName) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala index f5d2610d78d9e..60a9f54628a66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala @@ -19,6 +19,9 @@ package org.apache.spark.sql.execution.streaming import java.util.UUID import java.util.concurrent.TimeUnit.NANOSECONDS +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path + import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -78,15 +81,66 @@ case class TransformWithStateExec( override def shortName: String = "transformWithStateExec" override def 
shouldRunAnotherBatch(newInputWatermark: Long): Boolean = { + if (timeMode == ProcessingTime) { + // TODO: check if we can return true only if actual timers are registered, or there is + // expired state + true + } else if (outputMode == OutputMode.Append || outputMode == OutputMode.Update) { + eventTimeWatermarkForEviction.isDefined && + newInputWatermark > eventTimeWatermarkForEviction.get + } else { + false + } + } + + /** + * We initialize this processor handle in the driver to run the init function + * and fetch the schemas of the state variables initialized in this processor. + * @return a new instance of the driver processor handle + */ + private def getDriverProcessorHandle(): DriverStatefulProcessorHandleImpl = { + val driverProcessorHandle = new DriverStatefulProcessorHandleImpl(timeMode) + driverProcessorHandle.setHandleState(StatefulProcessorHandleState.PRE_INIT) + statefulProcessor.setHandle(driverProcessorHandle) + statefulProcessor.init(outputMode, timeMode) + driverProcessorHandle + } + + /** + * Fetching the columnFamilySchemas from the StatefulProcessorHandle + * after init is called. + */ + private def getColFamilySchemas(): Map[String, ColumnFamilySchema] = { + val columnFamilySchemas = getDriverProcessorHandle().getColumnFamilySchemas + closeProcessorHandle() + columnFamilySchemas + } + + /** + * This method is used for the driver-side stateful processor after we + * have collected all the necessary schemas. + * This instance of the stateful processor won't be used again. + */ + private def closeProcessorHandle(): Unit = { + statefulProcessor.close() + statefulProcessor.setHandle(null) + } + + /** + * Controls watermark propagation to downstream modes. If timeMode is + * ProcessingTime, the output rows cannot be interpreted in eventTime, hence + * this node will not propagate watermark in this timeMode. + * + * For timeMode EventTime, output watermark is same as input Watermark because + * transformWithState does not allow users to set the event time column to be + * earlier than the watermark. 
+ */ + override def produceOutputWatermark(inputWatermarkMs: Long): Option[Long] = { timeMode match { case ProcessingTime => - // TODO: check if we can return true only if actual timers are registered, or there is - // expired state - true - case EventTime => - eventTimeWatermarkForEviction.isDefined && - newInputWatermark > eventTimeWatermarkForEviction.get - case _ => false + None + case _ => + Some(inputWatermarkMs) } } @@ -313,11 +367,55 @@ case class TransformWithStateExec( "Number of value state variables with TTL"), StatefulOperatorCustomSumMetric("numListStateWithTTLVars", "Number of list state variables with TTL"), + StatefulOperatorCustomSumMetric("numMapStateWithTTLVars", + "Number of map state variables with TTL"), StatefulOperatorCustomSumMetric("numValuesRemovedDueToTTLExpiry", "Number of values removed due to TTL expiry") ) } + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, + batchId: Long, + stateSchemaVersion: Int): Array[String] = { + assert(stateSchemaVersion >= 3) + val newColumnFamilySchemas = getColFamilySchemas() + val schemaFile = new StateSchemaV3File( + hadoopConf, stateSchemaDirPath(StateStoreId.DEFAULT_STORE_NAME).toString) + // TODO: [SPARK-48849] Read the schema path from the OperatorStateMetadata file + // and validate it with the new schema + + // Write the new schema to the schema file + val schemaPath = schemaFile.addWithUUID(batchId, newColumnFamilySchemas.values.toList) + Array(schemaPath.toString) + } + + private def validateSchemas( + oldSchemas: List[ColumnFamilySchema], + newSchemas: Map[String, ColumnFamilySchema]): Unit = { + oldSchemas.foreach { case oldSchema: ColumnFamilySchemaV1 => + newSchemas.get(oldSchema.columnFamilyName).foreach { + case newSchema: ColumnFamilySchemaV1 => + StateSchemaCompatibilityChecker.check( + (oldSchema.keySchema, oldSchema.valueSchema), + (newSchema.keySchema, newSchema.valueSchema), + ignoreValueSchema = false + ) + } + } + } + + private def stateSchemaDirPath(storeName: String): Path = { + assert(storeName == StateStoreId.DEFAULT_STORE_NAME) + def stateInfo = getStateInfo + val stateCheckpointPath = + new Path(getStateInfo.checkpointLocation, + s"${stateInfo.operatorId.toString}") + + val storeNamePath = new Path(stateCheckpointPath, storeName) + new Path(new Path(storeNamePath, "_metadata"), "schema") + } + override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TriggerExecutor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TriggerExecutor.scala index 143230759724a..bfa838e43e288 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TriggerExecutor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TriggerExecutor.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{ELAPSED_TIME, TRIGGER_INTERVAL} import org.apache.spark.util.{Clock, SystemClock} trait TriggerExecutor { @@ -98,8 +99,9 @@ case class ProcessingTimeExecutor( /** Called when a batch falls behind */ def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = { - logWarning("Current batch is falling behind. The trigger interval is " + - s"${intervalMs} milliseconds, but spent ${realElapsedTimeMs} milliseconds") + logWarning(log"Current batch is falling behind. 
The trigger interval is " + + log"${MDC(TRIGGER_INTERVAL, intervalMs)} milliseconds, but spent " + + log"${MDC(ELAPSED_TIME, realElapsedTimeMs)} milliseconds") } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala index dbfa4586dc0a6..0ed5a6f29a984 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImplWithTTL.scala @@ -96,6 +96,7 @@ class ValueStateImplWithTTL[S]( /** Function to remove state for given key */ override def clear(): Unit = { store.remove(stateTypesEncoder.encodeGroupingKey(), stateName) + clearTTLState() } def clearIfExpired(groupingKey: Array[Byte]): Long = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala index b0f8cf9cd1846..54c47ec4e6ed8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala @@ -21,7 +21,8 @@ import java.util.Locale import scala.collection.mutable -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.RuntimeConfig import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.internal.SQLConf @@ -113,7 +114,9 @@ case class WatermarkTracker(policy: MultipleWatermarkPolicy) extends Logging { // `org.apache.spark.sql.execution.streaming.MultipleWatermarkPolicy` implementations. 
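For context on the chooseGlobalWatermark call that follows: when a query has several event-time operators, the policy configured by spark.sql.streaming.multipleWatermarkPolicy decides how their watermarks are combined, "min" (the default) following the slowest operator and "max" the fastest. A hedged approximation of what those two policies compute, not the actual MultipleWatermarkPolicy implementations:

// Approximation only: combine per-operator watermarks under a "min" or "max"
// policy. The real implementations live behind the MultipleWatermarkPolicy trait.
def combineWatermarks(operatorWatermarksMs: Seq[Long], policy: String): Long = {
  require(operatorWatermarksMs.nonEmpty, "need at least one per-operator watermark")
  policy match {
    case "min" => operatorWatermarksMs.min
    case "max" => operatorWatermarksMs.max
  }
}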
val chosenGlobalWatermark = policy.chooseGlobalWatermark(operatorToWatermarkMap.values.toSeq) if (chosenGlobalWatermark > globalWatermarkMs) { - logInfo(s"Updating event-time watermark from $globalWatermarkMs to $chosenGlobalWatermark ms") + logInfo(log"Updating event-time watermark from " + + log"${MDC(GLOBAL_WATERMARK, globalWatermarkMs)} " + + log"to ${MDC(CHOSEN_WATERMARK, chosenGlobalWatermark)} ms") globalWatermarkMs = chosenGlobalWatermark } else { logDebug(s"Event time watermark didn't move: $chosenGlobalWatermark < $globalWatermarkMs") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 920a7c68314b7..633aaf2682dbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -25,6 +25,8 @@ import java.util.function.UnaryOperator import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.SparkEnv +import org.apache.spark.internal.LogKeys._ +import org.apache.spark.internal.MDC import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.{CurrentDate, CurrentTimestampLike, LocalTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -83,7 +85,9 @@ class ContinuousExecution( v2ToRelationMap.getOrElseUpdate(s, { val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" nextSourceId += 1 - logInfo(s"Reading table [$table] from DataSourceV2 named '$sourceName' $dsStr") + logInfo(log"Reading table [${MDC(STREAMING_TABLE, table)}] " + + log"from DataSourceV2 named '${MDC(STREAMING_DATA_SOURCE_NAME, sourceName)}' " + + log"${MDC(STREAMING_DATA_SOURCE_DESCRIPTION, dsStr)}") // TODO: operator pushdown. val scan = table.newScanBuilder(options).build() val stream = scan.toContinuousStream(metadataPath) @@ -276,7 +280,7 @@ class ContinuousExecution( false } else if (isActive) { execCtx.batchId = epochEndpoint.askSync[Long](IncrementAndGetEpoch) - logInfo(s"New epoch ${execCtx.batchId} is starting.") + logInfo(log"New epoch ${MDC(BATCH_ID, execCtx.batchId)} is starting.") true } else { false @@ -307,7 +311,8 @@ class ContinuousExecution( } catch { case t: Throwable if StreamExecution.isInterruptionException(t, sparkSession.sparkContext) && state.get() == RECONFIGURING => - logInfo(s"Query $id ignoring exception from reconfiguring: $t") + logInfo(log"Query ${MDC(QUERY_ID, id)} ignoring exception from reconfiguring: " + + log"${MDC(ERROR, t)}") // interrupted by reconfiguration - swallow exception so we can restart the query } finally { // The above execution may finish before getting interrupted, for example, a Spark job having @@ -440,7 +445,8 @@ class ContinuousExecution( */ def stopInNewThread(error: Throwable): Unit = { if (failure.compareAndSet(null, error)) { - logError(s"Query $prettyIdString received exception $error") + logError(log"Query ${MDC(PRETTY_ID_STRING, prettyIdString)} received exception " + + log"${MDC(ERROR, error)}") stopInNewThread() } } @@ -476,7 +482,7 @@ class ContinuousExecution( // We just need to interrupt the long running job. 
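The stop and reconfigure paths in this area deliberately treat interruption-related exceptions as expected, because stopping a continuous query works by interrupting its long-running threads. A condensed sketch of that pattern with invented names (the real checks go through StreamExecution.isInterruptionException and the query's state):

import scala.util.control.NonFatal

// Illustrative shutdown loop: an interrupt during an intentional stop is
// expected and ignored; an unexpected interrupt still propagates (NonFatal
// does not match InterruptedException); other non-fatal errors are returned.
def runUntilStopped(stopRequested: () => Boolean)(body: () => Unit): Option[Throwable] = {
  try {
    while (!stopRequested()) body()
    None
  } catch {
    case _: InterruptedException if stopRequested() => None
    case NonFatal(t) => Some(t)
  }
}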
interruptAndAwaitExecutionThreadTermination() } - logInfo(s"Query $prettyIdString was stopped") + logInfo(log"Query ${MDC(PRETTY_ID_STRING, prettyIdString)} was stopped") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala index 8e5548ca2acad..398df496d15ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala @@ -23,7 +23,8 @@ import java.util.concurrent.{ArrayBlockingQueue, TimeUnit} import scala.util.control.NonFatal import org.apache.spark.{SparkEnv, TaskContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.connector.read.PartitionReader @@ -159,7 +160,8 @@ class ContinuousQueuedDataReader( } catch { case _: InterruptedException => // Continuous shutdown always involves an interrupt; do nothing and shut down quietly. - logInfo(s"shutting down interrupted data reader thread $getName") + logInfo(log"shutting down interrupted data reader thread " + + log"${MDC(THREAD_NAME, getName)}") case NonFatal(t) => failureReason = t diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala index b41b3c329712f..420c3e3be16d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala @@ -29,7 +29,8 @@ import org.json4s.{DefaultFormats, Formats, NoTypeHints} import org.json4s.jackson.Serialization import org.apache.spark.SparkEnv -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{HOST, PORT} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -179,7 +180,7 @@ class TextSocketContinuousStream( val line = reader.readLine() if (line == null) { // End of file reached - logWarning(s"Stream closed by $host:$port") + logWarning(log"Stream closed by ${MDC(HOST, host)}:${MDC(PORT, port)}") return } TextSocketContinuousStream.this.synchronized { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala index 1d6ba87145d4a..d5daa9a875f83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala @@ -18,6 +18,8 @@ package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.{Partition, SparkEnv, TaskContext} +import org.apache.spark.internal.{LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.write.DataWriter @@ -68,8 +70,8 @@ class ContinuousWriteRDD(var prev: RDD[InternalRow], writerFactory: StreamingDat } CustomMetrics.updateMetrics( dataWriter.currentMetricsValues.toImmutableArraySeq, customMetrics) - logInfo(s"Writer for partition ${context.partitionId()} " + - s"in epoch ${EpochTracker.getCurrentEpoch.get} is committing.") + logInfo(log"Writer for partition ${MDC(PARTITION_ID, context.partitionId())} " + + log"in epoch ${MDC(EPOCH, EpochTracker.getCurrentEpoch.get)} is committing.") val msg = dataWriter.commit() epochCoordinator.send( CommitPartitionEpoch( @@ -77,8 +79,8 @@ class ContinuousWriteRDD(var prev: RDD[InternalRow], writerFactory: StreamingDat EpochTracker.getCurrentEpoch.get, msg) ) - logInfo(s"Writer for partition ${context.partitionId()} " + - s"in epoch ${EpochTracker.getCurrentEpoch.get} committed.") + logInfo(log"Writer for partition ${MDC(PARTITION_ID, context.partitionId())} " + + log"in epoch ${MDC(EPOCH, EpochTracker.getCurrentEpoch.get)} committed.") EpochTracker.incrementCurrentEpoch() } catch { case _: InterruptedException => @@ -87,9 +89,11 @@ class ContinuousWriteRDD(var prev: RDD[InternalRow], writerFactory: StreamingDat })(catchBlock = { // If there is an error, abort this writer. We enter this callback in the middle of // rethrowing an exception, so compute() will stop executing at this point. - logError(s"Writer for partition ${context.partitionId()} is aborting.") + logError(log"Writer for partition ${MDC(LogKeys.PARTITION_ID, context.partitionId())} " + + log"is aborting.") if (dataWriter != null) dataWriter.abort() - logError(s"Writer for partition ${context.partitionId()} aborted.") + logError(log"Writer for partition ${MDC(LogKeys.PARTITION_ID, context.partitionId())} " + + log"aborted.") }, finallyBlock = { if (dataWriter != null) dataWriter.close() }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala index 2b7d68f9b98bf..42ce32e1bc674 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.streaming.continuous -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute @@ -47,8 +48,8 @@ case class WriteToContinuousDataSourceExec(write: StreamingWrite, query: SparkPl PhysicalWriteInfoImpl(queryRdd.getNumPartitions)) val rdd = new ContinuousWriteRDD(queryRdd, writerFactory, metrics) - logInfo(s"Start processing data source write support: $write. " + - s"The input RDD has ${rdd.partitions.length} partitions.") + logInfo(log"Start processing data source write support: ${MDC(STREAMING_WRITE, write)}. 
" + + log"The input RDD has ${MDC(NUM_PARTITIONS, rdd.partitions.length)} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala index 60cbaa7e79b95..c687caafdef37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala @@ -63,7 +63,7 @@ class ForeachBatchSink[T](batchWriter: (Dataset[T], Long) => Unit, encoder: Expr /** * Exception that wraps the exception thrown in the user provided function in ForeachBatch sink. */ -private[streaming] case class ForeachBatchUserFuncException(cause: Throwable) +private[sql] case class ForeachBatchUserFuncException(cause: Throwable) extends SparkException( errorClass = "FOREACH_BATCH_USER_FUNCTION_ERROR", messageParameters = Map("reason" -> Option(cause.getMessage).getOrElse("")), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala index 5640c7d3ca769..6705201c67316 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala @@ -23,7 +23,8 @@ import java.util.concurrent.TimeUnit import org.apache.commons.io.IOUtils -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -92,7 +93,7 @@ class RateStreamMicroBatchStream( metadataLog.get(0).getOrElse { val offset = LongOffset(clock.getTimeMillis()) metadataLog.add(0, offset) - logInfo(s"Start time: $offset") + logInfo(log"Start time: ${MDC(TIME_UNITS, offset)}") offset }.offset } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala index a01f40bead893..597b981ebe556 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala @@ -25,7 +25,8 @@ import javax.annotation.concurrent.GuardedBy import scala.collection.mutable.ListBuffer -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{HOST, PORT} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} @@ -79,7 +80,7 @@ class TextSocketMicroBatchStream(host: String, port: Int, numPartitions: Int) val line = reader.readLine() if (line == null) { // End of file reached - logWarning(s"Stream closed by $host:$port") + logWarning(log"Stream closed by ${MDC(HOST, host)}:${MDC(PORT, port)}") return } 
TextSocketMicroBatchStream.this.synchronized { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreMap.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreMap.scala index 32ff87f754d74..fe59703a1f458 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreMap.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreMap.scala @@ -32,7 +32,6 @@ trait HDFSBackedStateStoreMap { def remove(key: UnsafeRow): UnsafeRow def iterator(): Iterator[UnsafeRowPair] def prefixScan(prefixKey: UnsafeRow): Iterator[UnsafeRowPair] - def clear(): Unit } object HDFSBackedStateStoreMap { @@ -80,8 +79,6 @@ class NoPrefixHDFSBackedStateStoreMap extends HDFSBackedStateStoreMap { override def prefixScan(prefixKey: UnsafeRow): Iterator[UnsafeRowPair] = { throw SparkUnsupportedOperationException() } - - override def clear(): Unit = map.clear() } class PrefixScannableHDFSBackedStateStoreMap( @@ -170,9 +167,4 @@ class PrefixScannableHDFSBackedStateStoreMap( .iterator .map { key => unsafeRowPair.withRows(key, map.get(key)) } } - - override def clear(): Unit = { - map.clear() - prefixKeyToKeysMap.clear() - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index 2ecfa0931042b..c4a41ceb4caf4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.spark.{SparkConf, SparkEnv} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC, MessageWithContext} import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.errors.QueryExecutionErrors @@ -71,7 +71,8 @@ import org.apache.spark.util.ArrayImplicits._ * to ensure re-executed RDD operations re-apply updates on the correct past version of the * store. 
*/ -private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with Logging { +private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with Logging + with SupportsFineGrainedReplay { private val providerName = "HDFSBackedStateStoreProvider" @@ -169,7 +170,9 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with verify(state == UPDATING, "Cannot commit after already committed or aborted") commitUpdates(newVersion, mapToUpdate, compressedStream) state = COMMITTED - logInfo(s"Committed version $newVersion for $this to file $finalDeltaFile") + logInfo(log"Committed version ${MDC(LogKeys.COMMITTED_VERSION, newVersion)} " + + log"for ${MDC(LogKeys.STATE_STORE_PROVIDER, this)} to file " + + log"${MDC(LogKeys.FILE_NAME, finalDeltaFile)}") newVersion } catch { case e: Throwable => @@ -187,7 +190,8 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with } else { state = ABORTED } - logInfo(s"Aborted version $newVersion for $this") + logInfo(log"Aborted version ${MDC(LogKeys.STATE_STORE_VERSION, newVersion)} " + + log"for ${MDC(LogKeys.STATE_STORE_PROVIDER, this)}") } /** @@ -253,14 +257,16 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with /** Get the state store for making updates to create a new `version` of the store. */ override def getStore(version: Long): StateStore = { val newMap = getLoadedMapForStore(version) - logInfo(s"Retrieved version $version of ${HDFSBackedStateStoreProvider.this} for update") + logInfo(log"Retrieved version ${MDC(LogKeys.STATE_STORE_VERSION, version)} " + + log"of ${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for update") new HDFSBackedStateStore(version, newMap) } /** Get the state store for reading to specific `version` of the store. */ override def getReadStore(version: Long): ReadStateStore = { val newMap = getLoadedMapForStore(version) - logInfo(s"Retrieved version $version of ${HDFSBackedStateStoreProvider.this} for readonly") + logInfo(log"Retrieved version ${MDC(LogKeys.STATE_STORE_VERSION, version)} of " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for readonly") new HDFSBackedReadStateStore(version, newMap) } @@ -337,16 +343,19 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with /** Do maintenance backing data files, including creating snapshots and cleaning up old files */ override def doMaintenance(): Unit = { try { - doSnapshot() + doSnapshot("maintenance") cleanup() } catch { case NonFatal(e) => - logWarning(s"Error performing snapshot and cleaning up $this") + logWarning(log"Error performing snapshot and cleaning up " + toMessageWithContext) } } override def close(): Unit = { - synchronized { loadedMaps.values.asScala.foreach(_.clear()) } + // Clearing the map resets the TreeMap.root to null, and therefore entries inside the + // `loadedMaps` will be de-referenced and GCed automatically when their reference + // counts become 0. 
+ synchronized { loadedMaps.clear() } } override def supportedCustomMetrics: Seq[StateStoreCustomMetric] = { @@ -354,9 +363,14 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with Nil } + private def toMessageWithContext: MessageWithContext = { + log"HDFSStateStoreProvider[id = (op=${MDC(LogKeys.OP_ID, stateStoreId.operatorId)}," + + log"part=${MDC(LogKeys.PARTITION_ID, stateStoreId.partitionId)})," + + log"dir = ${MDC(LogKeys.PATH, baseDir)}]" + } + override def toString(): String = { - s"HDFSStateStoreProvider[" + - s"id = (op=${stateStoreId.operatorId},part=${stateStoreId.partitionId}),dir = $baseDir]" + toMessageWithContext.message } /* Internal fields and methods */ @@ -428,6 +442,27 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with private def putStateIntoStateCacheMap( newVersion: Long, map: HDFSBackedStateStoreMap): Unit = synchronized { + val loadedEntries = loadedMaps.size() + val earliestLoadedVersion: Option[Long] = if (loadedEntries > 0) { + Some(loadedMaps.lastKey()) + } else { + None + } + + if (earliestLoadedVersion.isDefined) { + logInfo(log"Trying to add version=${MDC(LogKeys.STATE_STORE_VERSION, newVersion)} to state " + + log"cache map with current_size=${MDC(LogKeys.NUM_LOADED_ENTRIES, loadedEntries)} and " + + log"earliest_loaded_version=" + + log"${MDC(LogKeys.EARLIEST_LOADED_VERSION, earliestLoadedVersion.get)}} " + + log"and max_versions_to_retain_in_memory=" + + log"${MDC(LogKeys.NUM_VERSIONS_RETAIN, numberOfVersionsToRetainInMemory)}") + } else { + logInfo(log"Trying to add version=${MDC(LogKeys.STATE_STORE_VERSION, newVersion)} to state " + + log"cache map with current_size=${MDC(LogKeys.NUM_LOADED_ENTRIES, loadedEntries)} and " + + log"max_versions_to_retain_in_memory=" + + log"${MDC(LogKeys.NUM_VERSIONS_RETAIN, numberOfVersionsToRetainInMemory)}") + } + if (numberOfVersionsToRetainInMemory <= 0) { if (loadedMaps.size() > 0) loadedMaps.clear() return @@ -463,9 +498,9 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with return loadedCurrentVersionMap.get } - logWarning(s"The state for version $version doesn't exist in loadedMaps. " + - "Reading snapshot file and delta files if needed..." + - "Note that this is normal for the first batch of starting query.") + logWarning(log"The state for version ${MDC(LogKeys.FILE_VERSION, version)} doesn't exist in " + + log"loadedMaps. Reading snapshot file and delta files if needed..." 
+ + log"Note that this is normal for the first batch of starting query.") loadedMapCacheMissCount.increment() @@ -585,10 +620,14 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with } finally { if (input != null) input.close() } - logInfo(s"Read delta file for version $version of $this from $fileToRead") + logInfo(log"Read delta file for version ${MDC(LogKeys.FILE_VERSION, version)} " + + log"of ${MDC(LogKeys.STATE_STORE_PROVIDER, this)} from ${MDC(LogKeys.FILE_NAME, fileToRead)}") } - private def writeSnapshotFile(version: Long, map: HDFSBackedStateStoreMap): Unit = { + private def writeSnapshotFile( + version: Long, + map: HDFSBackedStateStoreMap, + opType: String): Unit = { val targetFile = snapshotFile(version) var rawOutput: CancellableFSDataOutputStream = null var output: DataOutputStream = null @@ -612,7 +651,9 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with cancelDeltaFile(compressedStream = output, rawStream = rawOutput) throw e } - logInfo(s"Written snapshot file for version $version of $this at $targetFile") + logInfo(log"Written snapshot file for version ${MDC(LogKeys.FILE_VERSION, version)} of " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, this)} at ${MDC(LogKeys.FILE_NAME, targetFile)} " + + log"for ${MDC(LogKeys.OP_TYPE, opType)}") } /** @@ -637,11 +678,17 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with // SPARK-42668 - Catch and log any other exception thrown while trying to cancel // raw stream or close compressed stream. case NonFatal(ex) => - logInfo(s"Failed to cancel delta file for provider=$stateStoreId " + - s"with exception=$ex") + logInfo(log"Failed to cancel delta file for " + + log"provider=${MDC(LogKeys.STATE_STORE_ID, stateStoreId)} " + + log"with exception=${MDC(LogKeys.ERROR, ex)}") } } + /** + * Try to read the snapshot file. If the snapshot file is not available, return [[None]]. 
+ * + * @param version the version of the snapshot file + */ private def readSnapshotFile(version: Long): Option[HDFSBackedStateStoreMap] = { val fileToRead = snapshotFile(version) val map = HDFSBackedStateStoreMap.create(keySchema, numColsPrefixKey) @@ -687,7 +734,8 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with } } } - logInfo(s"Read snapshot file for version $version of $this from $fileToRead") + logInfo(log"Read snapshot file for version ${MDC(LogKeys.SNAPSHOT_VERSION, version)} of " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, this)} from ${MDC(LogKeys.FILE_NAME, fileToRead)}") Some(map) } catch { case _: FileNotFoundException => @@ -699,7 +747,7 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with /** Perform a snapshot of the store to allow delta files to be consolidated */ - private def doSnapshot(): Unit = { + private def doSnapshot(opType: String): Unit = { try { val (files, e1) = Utils.timeTakenMs(fetchFiles()) logDebug(s"fetchFiles() took $e1 ms.") @@ -711,7 +759,7 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with synchronized { Option(loadedMaps.get(lastVersion)) } match { case Some(map) => if (deltaFilesForLastVersion.size > storeConf.minDeltasForSnapshot) { - val (_, e2) = Utils.timeTakenMs(writeSnapshotFile(lastVersion, map)) + val (_, e2) = Utils.timeTakenMs(writeSnapshotFile(lastVersion, map, opType)) logDebug(s"writeSnapshotFile() took $e2 ms.") } case None => @@ -720,7 +768,7 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with } } catch { case NonFatal(e) => - logWarning(s"Error doing snapshots for $this", e) + logWarning(log"Error doing snapshots for " + toMessageWithContext, e) } } @@ -745,13 +793,15 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with } } logDebug(s"deleting files took $e2 ms.") - logInfo(s"Deleted files older than ${earliestFileToRetain.version} for $this: " + - filesToDelete.mkString(", ")) + logInfo(log"Deleted files older than " + + log"${MDC(LogKeys.FILE_VERSION, earliestFileToRetain.version)} for " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, this)}: " + + log"${MDC(LogKeys.FILE_NAME, filesToDelete.mkString(", "))}") } } } catch { case NonFatal(e) => - logWarning(s"Error cleaning up files for $this", e) + logWarning(log"Error cleaning up files for " + toMessageWithContext, e) } } @@ -804,8 +854,8 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with } case "snapshot" => versionToFiles.put(version, StoreFile(version, path, isSnapshot = true)) - case _ => - logWarning(s"Could not identify file $path for $this") + case _ => logWarning( + log"Could not identify file ${MDC(LogKeys.PATH, path)} for " + toMessageWithContext) } } } @@ -839,4 +889,93 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with throw new IllegalStateException(msg) } } + + /** + * Get the state store of endVersion by applying delta files on the snapshot of snapshotVersion. + * If snapshot for snapshotVersion does not exist, an error will be thrown. 
+ * + * @param snapshotVersion checkpoint version of the snapshot to start with + * @param endVersion checkpoint version to end with + * @return [[HDFSBackedStateStore]] + */ + override def replayStateFromSnapshot(snapshotVersion: Long, endVersion: Long): StateStore = { + val newMap = replayLoadedMapFromSnapshot(snapshotVersion, endVersion) + logInfo(log"Retrieved snapshot at version " + + log"${MDC(LogKeys.STATE_STORE_VERSION, snapshotVersion)} and apply delta files to version " + + log"${MDC(LogKeys.STATE_STORE_VERSION, endVersion)} of " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for update") + new HDFSBackedStateStore(endVersion, newMap) + } + + /** + * Get the state store of endVersion for reading by applying delta files on the snapshot of + * snapshotVersion. If snapshot for snapshotVersion does not exist, an error will be thrown. + * + * @param snapshotVersion checkpoint version of the snapshot to start with + * @param endVersion checkpoint version to end with + * @return [[HDFSBackedReadStateStore]] + */ + override def replayReadStateFromSnapshot(snapshotVersion: Long, endVersion: Long): + ReadStateStore = { + val newMap = replayLoadedMapFromSnapshot(snapshotVersion, endVersion) + logInfo(log"Retrieved snapshot at version " + + log"${MDC(LogKeys.STATE_STORE_VERSION, snapshotVersion)} and apply delta files to version " + + log"${MDC(LogKeys.STATE_STORE_VERSION, endVersion)} of " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for read-only") + new HDFSBackedReadStateStore(endVersion, newMap) + } + + /** + * Construct the state map at endVersion from snapshot of version snapshotVersion. + * Returns a new [[HDFSBackedStateStoreMap]] + * @param snapshotVersion checkpoint version of the snapshot to start with + * @param endVersion checkpoint version to end with + */ + private def replayLoadedMapFromSnapshot(snapshotVersion: Long, endVersion: Long): + HDFSBackedStateStoreMap = synchronized { + try { + if (snapshotVersion < 1) { + throw QueryExecutionErrors.unexpectedStateStoreVersion(snapshotVersion) + } + if (endVersion < snapshotVersion) { + throw QueryExecutionErrors.unexpectedStateStoreVersion(endVersion) + } + + val newMap = HDFSBackedStateStoreMap.create(keySchema, numColsPrefixKey) + newMap.putAll(constructMapFromSnapshot(snapshotVersion, endVersion)) + + newMap + } + catch { + case e: Throwable => throw QueryExecutionErrors.cannotLoadStore(e) + } + } + + private def constructMapFromSnapshot(snapshotVersion: Long, endVersion: Long): + HDFSBackedStateStoreMap = { + val (result, elapsedMs) = Utils.timeTakenMs { + val startVersionMap = synchronized { Option(loadedMaps.get(snapshotVersion)) } match { + case Some(value) => Option(value) + case None => readSnapshotFile(snapshotVersion) + } + if (startVersionMap.isEmpty) { + throw StateStoreErrors.stateStoreSnapshotFileNotFound( + snapshotFile(snapshotVersion).toString, toString()) + } + + // Load all the deltas from the version after the start version up to the end version. 
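The loop that follows applies each delta file in version order on top of the map loaded from the snapshot. A minimal standalone sketch of that replay idea, assuming hypothetical loadSnapshot and applyDelta helpers in place of readSnapshotFile and updateFromDeltaFile:

    // Sketch only: replay deltas (snapshotVersion + 1 .. endVersion) onto the
    // state loaded from the snapshot, mirroring constructMapFromSnapshot.
    def replay[S](
        snapshotVersion: Long,
        endVersion: Long,
        loadSnapshot: Long => S,
        applyDelta: (Long, S) => Unit): S = {
      require(snapshotVersion >= 1 && endVersion >= snapshotVersion)
      val state = loadSnapshot(snapshotVersion)
      (snapshotVersion + 1 to endVersion).foreach(v => applyDelta(v, state))
      state
    }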
+ val resultMap = HDFSBackedStateStoreMap.create(keySchema, numColsPrefixKey) + resultMap.putAll(startVersionMap.get) + for (deltaVersion <- snapshotVersion + 1 to endVersion) { + updateFromDeltaFile(deltaVersion, resultMap) + } + + resultMap + } + + logDebug(s"Loading snapshot at version $snapshotVersion and apply delta files to version " + + s"$endVersion takes $elapsedMs ms.") + + result + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadata.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadata.scala index b58c805af9d60..8ce883038401d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadata.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadata.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.fs.{FSDataOutputStream, Path} import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, MetadataVersionUtil} /** @@ -105,7 +105,8 @@ class OperatorStateMetadataWriter(stateCheckpointPath: Path, hadoopConf: Configu outputStream.close() } catch { case e: Throwable => - logError(s"Fail to write state metadata file to $metadataFilePath", e) + logError( + log"Fail to write state metadata file to ${MDC(LogKeys.META_FILE, metadataFilePath)}", e) outputStream.cancel() throw e } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala index 61c3d349655fd..28ad197ffb4af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala @@ -23,7 +23,7 @@ import java.util.concurrent.TimeUnit import javax.annotation.concurrent.GuardedBy import scala.collection.{mutable, Map} -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.jdk.CollectionConverters._ import scala.ref.WeakReference import scala.util.Try @@ -36,7 +36,7 @@ import org.rocksdb.CompressionType._ import org.rocksdb.TickerType._ import org.apache.spark.TaskContext -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{LogEntry, Logging, LogKeys, MDC} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.util.{NextIterator, Utils} @@ -74,7 +74,11 @@ class RocksDB( loggingId: String = "", useColumnFamilies: Boolean = false) extends Logging { - case class RocksDBSnapshot(checkpointDir: File, version: Long, numKeys: Long) { + case class RocksDBSnapshot( + checkpointDir: File, + version: Long, + numKeys: Long, + capturedFileMappings: RocksDBFileMappings) { def close(): Unit = { silentDeleteRecursively(checkpointDir, s"Free up local checkpoint of snapshot $version") } @@ -82,6 +86,7 @@ class RocksDB( @volatile private var latestSnapshot: Option[RocksDBSnapshot] = None @volatile private var lastSnapshotVersion = 0L + private val oldSnapshots = new ListBuffer[RocksDBSnapshot] RocksDBLoader.loadLibrary() @@ -177,10 +182,13 @@ class RocksDB( assert(version >= 0) acquire(LoadStore) recordedMetrics = None - logInfo(s"Loading $version") + 
logInfo(log"Loading ${MDC(LogKeys.VERSION_NUM, version)}") try { if (loadedVersion != version) { closeDB() + // deep copy is needed to avoid race condition + // between maintenance and task threads + fileManager.copyFileMapping() val latestSnapshotVersion = fileManager.getLatestSnapshotVersion(version) val metadata = fileManager.loadCheckpointFromDfs(latestSnapshotVersion, workingDir) loadedVersion = latestSnapshotVersion @@ -189,7 +197,6 @@ class RocksDB( if (lastSnapshotVersion > latestSnapshotVersion) { // discard any newer snapshots lastSnapshotVersion = 0L - latestSnapshot = None } openDB() @@ -212,7 +219,7 @@ class RocksDB( if (conf.resetStatsOnLoad) { nativeStats.reset } - logInfo(s"Loaded $version") + logInfo(log"Loaded ${MDC(LogKeys.VERSION_NUM, version)}") } catch { case t: Throwable => loadedVersion = -1 // invalidate loaded data @@ -226,12 +233,88 @@ class RocksDB( this } + /** + * Load from the start snapshot version and apply all the changelog records to reach the + * end version. Note that this will copy all the necessary files from DFS to local disk as needed, + * and possibly restart the native RocksDB instance. + * + * @param snapshotVersion version of the snapshot to start with + * @param endVersion end version + * @return A RocksDB instance loaded with the state endVersion replayed from snapshotVersion. + * Note that the instance will be read-only since this method is only used in State Data + * Source. + */ + def loadFromSnapshot(snapshotVersion: Long, endVersion: Long): RocksDB = { + assert(snapshotVersion >= 0 && endVersion >= snapshotVersion) + acquire(LoadStore) + recordedMetrics = None + logInfo( + log"Loading snapshot at version ${MDC(LogKeys.VERSION_NUM, snapshotVersion)} and apply " + + log"changelog files to version ${MDC(LogKeys.VERSION_NUM, endVersion)}.") + try { + replayFromCheckpoint(snapshotVersion, endVersion) + + logInfo( + log"Loaded snapshot at version ${MDC(LogKeys.VERSION_NUM, snapshotVersion)} and apply " + + log"changelog files to version ${MDC(LogKeys.VERSION_NUM, endVersion)}.") + } catch { + case t: Throwable => + loadedVersion = -1 // invalidate loaded data + throw t + } + this + } + + /** + * Load from the start checkpoint version and apply all the changelog records to reach the + * end version. + * If the start version does not exist, it will throw an exception. + * + * @param snapshotVersion start checkpoint version + * @param endVersion end version + */ + private def replayFromCheckpoint(snapshotVersion: Long, endVersion: Long): Any = { + closeDB() + val metadata = fileManager.loadCheckpointFromDfs(snapshotVersion, workingDir) + loadedVersion = snapshotVersion + + // reset last snapshot version + if (lastSnapshotVersion > snapshotVersion) { + // discard any newer snapshots + lastSnapshotVersion = 0L + latestSnapshot = None + } + openDB() + + numKeysOnWritingVersion = if (!conf.trackTotalNumberOfRows) { + // we don't track the total number of rows - discard the number being track + -1L + } else if (metadata.numKeys < 0) { + // we track the total number of rows, but the snapshot doesn't have tracking number + // need to count keys now + countKeys() + } else { + metadata.numKeys + } + if (loadedVersion != endVersion) replayChangelog(endVersion) + // After changelog replay the numKeysOnWritingVersion will be updated to + // the correct number of keys in the loaded version. 
+ numKeysOnLoadedVersion = numKeysOnWritingVersion + fileManagerMetrics = fileManager.latestLoadCheckpointMetrics + + if (conf.resetStatsOnLoad) { + nativeStats.reset + } + } + /** * Replay change log from the loaded version to the target version. */ private def replayChangelog(endVersion: Long): Unit = { for (v <- loadedVersion + 1 to endVersion) { - logInfo(s"replaying changelog from version $loadedVersion -> $endVersion") + logInfo(log"replaying changelog from version " + + log"${MDC(LogKeys.LOADED_VERSION, loadedVersion)} -> " + + log"${MDC(LogKeys.END_VERSION, endVersion)}") var changelogReader: StateStoreChangelogReader = null try { changelogReader = fileManager.getChangelogReader(v, useColumnFamilies) @@ -461,7 +544,7 @@ class RocksDB( verifyColFamilyOperations("iterator", colFamilyName) val iter = db.newIterator(colFamilyNameToHandleMap(colFamilyName)) - logInfo(s"Getting iterator from version $loadedVersion") + logInfo(log"Getting iterator from version ${MDC(LogKeys.LOADED_VERSION, loadedVersion)}") iter.seekToFirst() // Attempt to close this iterator if there is a task failure, or a task interruption. @@ -491,7 +574,8 @@ class RocksDB( val iter = db.newIterator(colFamilyNameToHandleMap(colFamilyName)) try { - logInfo(s"Counting keys - getting iterator from version $loadedVersion") + logInfo(log"Counting keys - getting iterator from version " + + log"${MDC(LogKeys.LOADED_VERSION, loadedVersion)}") iter.seekToFirst() @@ -545,7 +629,7 @@ class RocksDB( val newVersion = loadedVersion + 1 try { - logInfo(s"Flushing updates for $newVersion") + logInfo(log"Flushing updates for ${MDC(LogKeys.VERSION_NUM, newVersion)}") var compactTimeMs = 0L var flushTimeMs = 0L @@ -553,7 +637,7 @@ class RocksDB( if (shouldCreateSnapshot()) { // Need to flush the change to disk before creating a checkpoint // because rocksdb wal is disabled. - logInfo(s"Flushing updates for $newVersion") + logInfo(log"Flushing updates for ${MDC(LogKeys.VERSION_NUM, newVersion)}") flushTimeMs = timeTakenMs { // Flush updates to all available column families assert(!colFamilyNameToHandleMap.isEmpty) @@ -571,7 +655,8 @@ class RocksDB( checkpointTimeMs = timeTakenMs { val checkpointDir = createTempDir("checkpoint") - logInfo(s"Creating checkpoint for $newVersion in $checkpointDir") + logInfo(log"Creating checkpoint for ${MDC(LogKeys.VERSION_NUM, newVersion)} " + + log"in ${MDC(LogKeys.PATH, checkpointDir)}") // Make sure the directory does not exist. Native RocksDB fails if the directory to // checkpoint exists. Utils.deleteRecursively(checkpointDir) @@ -584,14 +669,21 @@ class RocksDB( // inside the uploadSnapshot() called below. // If changelog checkpointing is enabled, snapshot will be uploaded asynchronously // during state store maintenance. 
- latestSnapshot.foreach(_.close()) - latestSnapshot = Some( - RocksDBSnapshot(checkpointDir, newVersion, numKeysOnWritingVersion)) - lastSnapshotVersion = newVersion + synchronized { + if (latestSnapshot.isDefined) { + oldSnapshots += latestSnapshot.get + } + latestSnapshot = Some( + RocksDBSnapshot(checkpointDir, + newVersion, + numKeysOnWritingVersion, + fileManager.captureFileMapReference())) + lastSnapshotVersion = newVersion + } } } - logInfo(s"Syncing checkpoint for $newVersion to DFS") + logInfo(log"Syncing checkpoint for ${MDC(LogKeys.VERSION_NUM, newVersion)} to DFS") val fileSyncTimeMs = timeTakenMs { if (enableChangelogCheckpointing) { try { @@ -615,7 +707,8 @@ class RocksDB( "fileSync" -> fileSyncTimeMs ) recordedMetrics = Some(metrics) - logInfo(s"Committed $newVersion, stats = ${recordedMetrics.get.json}") + logInfo(log"Committed ${MDC(LogKeys.VERSION_NUM, newVersion)}, " + + log"stats = ${MDC(LogKeys.METRICS_JSON, recordedMetrics.get.json)}") loadedVersion } catch { case t: Throwable => @@ -638,22 +731,36 @@ class RocksDB( } private def uploadSnapshot(): Unit = { + var oldSnapshotsImmutable: List[RocksDBSnapshot] = Nil val localCheckpoint = synchronized { val checkpoint = latestSnapshot latestSnapshot = None + + // Convert mutable list buffer to immutable to prevent + // race condition with commit where old snapshot is added + oldSnapshotsImmutable = oldSnapshots.toList + oldSnapshots.clear() + checkpoint } localCheckpoint match { - case Some(RocksDBSnapshot(localDir, version, numKeys)) => + case Some(RocksDBSnapshot(localDir, version, numKeys, capturedFileMappings)) => try { val uploadTime = timeTakenMs { - fileManager.saveCheckpointToDfs(localDir, version, numKeys) + fileManager.saveCheckpointToDfs(localDir, version, numKeys, capturedFileMappings) fileManagerMetrics = fileManager.latestSaveCheckpointMetrics } - logInfo(s"$loggingId: Upload snapshot of version $version," + - s" time taken: $uploadTime ms") + logInfo(log"${MDC(LogKeys.LOG_ID, loggingId)}: Upload snapshot of version " + + log"${MDC(LogKeys.VERSION_NUM, version)}," + + log" time taken: ${MDC(LogKeys.TIME_UNITS, uploadTime)} ms") } finally { localCheckpoint.foreach(_.close()) + + // Clean up old latestSnapshots + for (snapshot <- oldSnapshotsImmutable) { + snapshot.close() + } + } case _ => } @@ -670,30 +777,17 @@ class RocksDB( // Make sure changelogWriter gets recreated next time. changelogWriter = None release(RollbackStore) - logInfo(s"Rolled back to $loadedVersion") + logInfo(log"Rolled back to ${MDC(LogKeys.VERSION_NUM, loadedVersion)}") } def doMaintenance(): Unit = { if (enableChangelogCheckpointing) { - // There is race to update latestSnapshot between load(), commit() - // and uploadSnapshot(). - // The load method will reset latestSnapshot to discard any snapshots taken - // from newer versions (when a old version is reloaded). - // commit() method deletes the existing snapshot while creating a new snapshot. - // In order to ensure that the snapshot being uploaded would not be modified - // concurrently, we need to synchronize the snapshot access between task thread - // and maintenance thread. 
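With the coarse StoreMaintenance lock removed in the hunk that follows, the coordination moves into commit() and uploadSnapshot() themselves: commit() parks the superseded snapshot in oldSnapshots instead of closing it, and the maintenance thread closes parked snapshots only after its upload has finished. A rough sketch of that hand-off, not the patch's exact code, with any AutoCloseable standing in for RocksDBSnapshot:

    import scala.collection.mutable.ListBuffer

    // Illustrative hand-off between a committing task thread and a maintenance
    // thread: publish() defers closing the previous snapshot; drainAndUpload()
    // uploads the latest one and only then closes everything it took over.
    class SnapshotHandOff[T <: AutoCloseable] {
      private var latest: Option[T] = None
      private val parked = new ListBuffer[T]

      def publish(snapshot: T): Unit = synchronized {
        latest.foreach(parked += _)   // superseded snapshot is parked, not closed
        latest = Some(snapshot)
      }

      def drainAndUpload(upload: T => Unit): Unit = {
        val (toUpload, toClose) = synchronized {
          val taken = (latest, parked.toList)
          latest = None
          parked.clear()
          taken
        }
        try toUpload.foreach(upload)
        finally (toUpload.toList ++ toClose).foreach(_.close())
      }
    }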
- acquire(StoreMaintenance) - try { - uploadSnapshot() - } finally { - release(StoreMaintenance) - } + uploadSnapshot() } val cleanupTime = timeTakenMs { fileManager.deleteOldVersions(conf.minVersionsToRetain) } - logInfo(s"Cleaned old data, time taken: $cleanupTime ms") + logInfo(log"Cleaned old data, time taken: ${MDC(LogKeys.TIME_UNITS, cleanupTime)} ms") } /** Release all resources */ @@ -771,10 +865,19 @@ class RocksDB( .keys.filter(checkInternalColumnFamilies(_)).size val numExternalColFamilies = colFamilyNameToHandleMap.keys.size - numInternalColFamilies + // if bounded memory usage is enabled, we share the block cache across all state providers + // running on the same node and account the usage to this single cache. In this case, its not + // possible to provide partition level or query level memory usage. + val memoryUsage = if (conf.boundedMemoryUsage) { + 0L + } else { + readerMemUsage + memTableMemUsage + blockCacheUsage + } + RocksDBMetrics( numKeysOnLoadedVersion, numKeysOnWritingVersion, - readerMemUsage + memTableMemUsage + blockCacheUsage, + memoryUsage, pinnedBlocksMemUsage, totalSSTFilesBytes, nativeOpsLatencyMicros, @@ -800,7 +903,7 @@ class RocksDB( rocksDBMetricsOpt = recordedMetrics } catch { case ex: Exception => - logInfo(s"Failed to acquire metrics with exception=$ex") + logInfo(log"Failed to acquire metrics with exception=${MDC(LogKeys.ERROR, ex)}") } finally { release(ReportStoreMetrics) } @@ -838,7 +941,8 @@ class RocksDB( Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit] { _ => this.release(StoreTaskCompletionListener) }) - logInfo(s"RocksDB instance was acquired by $acquiredThreadInfo for opType=${opType.toString}") + logInfo(log"RocksDB instance was acquired by ${MDC(LogKeys.THREAD, acquiredThreadInfo)} " + + log"for opType=${MDC(LogKeys.OP_TYPE, opType.toString)}") } } @@ -849,7 +953,8 @@ class RocksDB( * @param opType - operation type releasing the lock */ private def release(opType: RocksDBOpType): Unit = acquireLock.synchronized { - logInfo(s"RocksDB instance was released by $acquiredThreadInfo for opType=${opType.toString}") + logInfo(log"RocksDB instance was released by ${MDC(LogKeys.THREAD, acquiredThreadInfo)} " + + log"for opType=${MDC(LogKeys.OP_TYPE, opType.toString)}") acquiredThreadInfo = null acquireLock.notifyAll() } @@ -887,7 +992,7 @@ class RocksDB( colFamilyHandles.asScala.toList.foreach { handle => colFamilyNameToHandleMap(handle.getName.map(_.toChar).mkString) = handle } - logInfo(s"Opened DB with conf ${conf}") + logInfo(log"Opened DB with conf ${MDC(LogKeys.CONFIG, conf)}") } private def closeDB(): Unit = { @@ -911,13 +1016,14 @@ class RocksDB( // Map DB log level to log4j levels // Warn is mapped to info because RocksDB warn is too verbose // (e.g. dumps non-warning stuff like stats) - val loggingFunc: ( => String) => Unit = infoLogLevel match { + val loggingFunc: ( => LogEntry) => Unit = infoLogLevel match { case InfoLogLevel.FATAL_LEVEL | InfoLogLevel.ERROR_LEVEL => logError(_) case InfoLogLevel.WARN_LEVEL | InfoLogLevel.INFO_LEVEL => logInfo(_) case InfoLogLevel.DEBUG_LEVEL => logDebug(_) case _ => logTrace(_) } - loggingFunc(s"[NativeRocksDB-${infoLogLevel.getValue}] $logMsg") + loggingFunc(log"[NativeRocksDB-${MDC(LogKeys.ROCKS_DB_LOG_LEVEL, infoLogLevel.getValue)}]" + + log" ${MDC(LogKeys.ROCKS_DB_LOG_MESSAGE, logMsg)}") } } @@ -930,7 +1036,7 @@ class RocksDB( // customized logger. We still set it as it might show up in RocksDB config file or logging. 
dbOptions.setInfoLogLevel(dbLogLevel) dbOptions.setLogger(dbLogger) - logInfo(s"Set RocksDB native logging level to $dbLogLevel") + logInfo(log"Set RocksDB native logging level to ${MDC(LogKeys.ROCKS_DB_LOG_LEVEL, dbLogLevel)}") dbLogger } @@ -945,7 +1051,8 @@ class RocksDB( Utils.deleteRecursively(file) } catch { case e: Exception => - logWarning(s"Error recursively deleting local dir $file while $msg", e) + logWarning(log"Error recursively deleting local dir ${MDC(LogKeys.PATH, file)} " + + log"while ${MDC(LogKeys.ERROR, msg)}", e) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala index bd1daa48f809b..fe7aeeb6fd3f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBFileManager.scala @@ -38,7 +38,7 @@ import org.json4s.{Formats, NoTypeHints} import org.json4s.jackson.Serialization import org.apache.spark.{SparkConf, SparkEnv} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC, MessageWithContext} import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.streaming.CheckpointFileManager @@ -133,16 +133,6 @@ class RocksDBFileManager( import RocksDBImmutableFile._ - private val versionToRocksDBFiles = new ConcurrentHashMap[Long, Seq[RocksDBImmutableFile]] - - - // used to keep a mapping of the exact Dfs file that was used to create a local SST file. - // The reason this is a separate map because versionToRocksDBFiles can contain multiple similar - // SST files to a particular local file (for example 1.sst can map to 1-UUID1.sst in v1 and - // 1-UUID2.sst in v2). We need to capture the exact file used to ensure Version ID compatibility - // across SST files and RocksDB manifest. 
- private[sql] val localFilesToDfsFiles = new ConcurrentHashMap[String, RocksDBImmutableFile] - private lazy val fm = CheckpointFileManager.create(new Path(dfsRootDir), hadoopConf) private val fs = new Path(dfsRootDir).getFileSystem(hadoopConf) private val onlyZipFiles = new PathFilter { @@ -157,6 +147,29 @@ class RocksDBFileManager( private def codec = CompressionCodec.createCodec(sparkConf, codecName) @volatile private var rootDirChecked: Boolean = false + @volatile private var fileMappings = RocksDBFileMappings( + new ConcurrentHashMap[Long, Seq[RocksDBImmutableFile]], + new ConcurrentHashMap[String, RocksDBImmutableFile] + ) + + /** + * Make a deep copy of versionToRocksDBFiles and localFilesToDfsFiles to avoid + * current task thread from overwriting the file mapping whenever background maintenance + * thread attempts to upload a snapshot + */ + def copyFileMapping() : Unit = { + val newVersionToRocksDBFiles = new ConcurrentHashMap[Long, Seq[RocksDBImmutableFile]] + val newLocalFilesToDfsFiles = new ConcurrentHashMap[String, RocksDBImmutableFile] + + newVersionToRocksDBFiles.putAll(fileMappings.versionToRocksDBFiles) + newLocalFilesToDfsFiles.putAll(fileMappings.localFilesToDfsFiles) + + fileMappings = RocksDBFileMappings(newVersionToRocksDBFiles, newLocalFilesToDfsFiles) + } + + def captureFileMapReference(): RocksDBFileMappings = { + fileMappings + } def getChangeLogWriter( version: Long, @@ -204,14 +217,20 @@ class RocksDBFileManager( def latestSaveCheckpointMetrics: RocksDBFileManagerMetrics = saveCheckpointMetrics /** Save all the files in given local checkpoint directory as a committed version in DFS */ - def saveCheckpointToDfs(checkpointDir: File, version: Long, numKeys: Long): Unit = { - logFilesInDir(checkpointDir, s"Saving checkpoint files for version $version") + def saveCheckpointToDfs( + checkpointDir: File, + version: Long, + numKeys: Long, + capturedFileMappings: RocksDBFileMappings): Unit = { + logFilesInDir(checkpointDir, log"Saving checkpoint files " + + log"for version ${MDC(LogKeys.VERSION_NUM, version)}") val (localImmutableFiles, localOtherFiles) = listRocksDBFiles(checkpointDir) - val rocksDBFiles = saveImmutableFilesToDfs(version, localImmutableFiles) + val rocksDBFiles = saveImmutableFilesToDfs(version, localImmutableFiles, capturedFileMappings) val metadata = RocksDBCheckpointMetadata(rocksDBFiles, numKeys) val metadataFile = localMetadataFile(checkpointDir) metadata.writeToFile(metadataFile) - logInfo(s"Written metadata for version $version:\n${metadata.prettyJson}") + logInfo(log"Written metadata for version ${MDC(LogKeys.VERSION_NUM, version)}:\n" + + log"${MDC(LogKeys.METADATA_JSON, metadata.prettyJson)}") if (version <= 1 && numKeys <= 0) { // If we're writing the initial version and there's no data, we have to explicitly initialize @@ -227,7 +246,7 @@ class RocksDBFileManager( } } zipToDfsFile(localOtherFiles :+ metadataFile, dfsBatchZipFile(version)) - logInfo(s"Saved checkpoint file for version $version") + logInfo(log"Saved checkpoint file for version ${MDC(LogKeys.VERSION_NUM, version)}") } /** @@ -237,14 +256,14 @@ class RocksDBFileManager( * local directory. */ def loadCheckpointFromDfs(version: Long, localDir: File): RocksDBCheckpointMetadata = { - logInfo(s"Loading checkpoint files for version $version") + logInfo(log"Loading checkpoint files for version ${MDC(LogKeys.VERSION_NUM, version)}") // The unique ids of SST files are checked when opening a rocksdb instance. 
The SST files // in larger versions can't be reused even if they have the same size and name because // they belong to another rocksdb instance. - versionToRocksDBFiles.keySet().removeIf(_ >= version) + fileMappings.versionToRocksDBFiles.keySet().removeIf(_ >= version) val metadata = if (version == 0) { if (localDir.exists) Utils.deleteRecursively(localDir) - localFilesToDfsFiles.clear() + fileMappings.localFilesToDfsFiles.clear() localDir.mkdirs() RocksDBCheckpointMetadata(Seq.empty, 0) } else { @@ -255,13 +274,15 @@ class RocksDBFileManager( // Copy the necessary immutable files val metadataFile = localMetadataFile(localDir) val metadata = RocksDBCheckpointMetadata.readFromFile(metadataFile) - logInfo(s"Read metadata for version $version:\n${metadata.prettyJson}") + logInfo(log"Read metadata for version ${MDC(LogKeys.VERSION_NUM, version)}:\n" + + log"${MDC(LogKeys.METADATA_JSON, metadata.prettyJson)}") loadImmutableFilesFromDfs(metadata.immutableFiles, localDir) - versionToRocksDBFiles.put(version, metadata.immutableFiles) + fileMappings.versionToRocksDBFiles.put(version, metadata.immutableFiles) metadataFile.delete() metadata } - logFilesInDir(localDir, s"Loaded checkpoint files for version $version") + logFilesInDir(localDir, log"Loaded checkpoint files " + + log"for version ${MDC(LogKeys.VERSION_NUM, version)}") metadata } @@ -327,8 +348,9 @@ class RocksDBFileManager( val orphanFiles = fileModificationTimes .filter(_._2 < oldestTrackedFileModificationTime).keys.toSeq if (orphanFiles.nonEmpty) { - logInfo(s"Found ${orphanFiles.size} orphan files: ${orphanFiles.take(20).mkString(", ")}" + - "... (display at most 20 filenames) that should be deleted.") + logInfo(log"Found ${MDC(LogKeys.NUM_FILES, orphanFiles.size)} orphan files: " + + log"${MDC(LogKeys.FILE_MODIFICATION_TIME, orphanFiles.take(20).mkString(", "))}" + + log"... 
(display at most 20 filenames) that should be deleted.") } orphanFiles } else { @@ -340,10 +362,11 @@ class RocksDBFileManager( versionsToDelete.foreach { version => try { fm.delete(dfsChangelogFile(version)) - logInfo(s"Deleted changelog file $version") + logInfo(log"Deleted changelog file ${MDC(LogKeys.VERSION_NUM, version)}") } catch { case e: Exception => - logWarning(s"Error deleting changelog file for version $version", e) + logWarning( + log"Error deleting changelog file for version ${MDC(LogKeys.FILE_VERSION, version)}", e) } } } @@ -411,9 +434,9 @@ class RocksDBFileManager( // Resolve RocksDB files for all the versions and find the max version each file is used val fileToMaxUsedVersion = new mutable.HashMap[String, Long] sortedSnapshotVersions.foreach { version => - val files = Option(versionToRocksDBFiles.get(version)).getOrElse { + val files = Option(fileMappings.versionToRocksDBFiles.get(version)).getOrElse { val newResolvedFiles = getImmutableFilesFromVersionZip(version) - versionToRocksDBFiles.put(version, newResolvedFiles) + fileMappings.versionToRocksDBFiles.put(version, newResolvedFiles) newResolvedFiles } files.foreach(f => fileToMaxUsedVersion(f.dfsFileName) = @@ -431,7 +454,8 @@ class RocksDBFileManager( val allLogFiles = if (fm.exists(logDir)) fm.list(logDir).toImmutableArraySeq else Seq.empty filesToDelete ++= findOrphanFiles(fileToMaxUsedVersion.keys.toSeq, allSstFiles ++ allLogFiles) .map(_ -> -1L) - logInfo(s"Deleting ${filesToDelete.size} files not used in versions >= $minVersionToRetain") + logInfo(log"Deleting ${MDC(LogKeys.NUM_FILES, filesToDelete.size)} " + + log"files not used in versions >= ${MDC(LogKeys.VERSION_NUM, minVersionToRetain)}") var failedToDelete = 0 filesToDelete.foreach { case (dfsFileName, maxUsedVersion) => try { @@ -446,9 +470,10 @@ class RocksDBFileManager( case e: Exception => failedToDelete += 1 if (maxUsedVersion == -1) { - logWarning(s"Error deleting orphan file $dfsFileName", e) + logWarning(log"Error deleting orphan file ${MDC(LogKeys.PATH, dfsFileName)}", e) } else { - logWarning(s"Error deleting file $dfsFileName, last used in version $maxUsedVersion", e) + logWarning(log"Error deleting file ${MDC(LogKeys.PATH, dfsFileName)}, " + + log"last used in version ${MDC(LogKeys.MAX_FILE_VERSION, maxUsedVersion)}", e) } } } @@ -458,16 +483,18 @@ class RocksDBFileManager( val versionFile = dfsBatchZipFile(version) try { fm.delete(versionFile) - versionToRocksDBFiles.remove(version) + fileMappings.versionToRocksDBFiles.remove(version) logDebug(s"Deleted version $version") } catch { case e: Exception => - logWarning(s"Error deleting version file $versionFile for version $version", e) + logWarning(log"Error deleting version file ${MDC(LogKeys.PATH, versionFile)} for " + + log"version ${MDC(LogKeys.FILE_VERSION, version)}", e) } } - logInfo(s"Deleted ${filesToDelete.size - failedToDelete} files (failed to delete" + - s"$failedToDelete files) not used in versions >= $minVersionToRetain") - + logInfo(log"Deleted ${MDC(LogKeys.NUM_FILES, filesToDelete.size - failedToDelete)} files " + + log"(failed to delete" + + log"${MDC(LogKeys.NUM_FILES_FAILED_TO_DELETE, failedToDelete)} files) " + + log"not used in versions >= ${MDC(LogKeys.MIN_VERSION_NUM, minVersionToRetain)}") val changelogVersionsToDelete = changelogFiles .map(_.getName.stripSuffix(".changelog")).map(_.toLong) .filter(_ < minVersionToRetain) @@ -477,21 +504,24 @@ class RocksDBFileManager( /** Save immutable files to DFS directory */ private def saveImmutableFilesToDfs( version: Long, - 
localFiles: Seq[File]): Seq[RocksDBImmutableFile] = { + localFiles: Seq[File], + capturedFileMappings: RocksDBFileMappings): Seq[RocksDBImmutableFile] = { // Get the immutable files used in previous versions, as some of those uploaded files can be // reused for this version - logInfo(s"Saving RocksDB files to DFS for $version") + logInfo(log"Saving RocksDB files to DFS for ${MDC(LogKeys.VERSION_NUM, version)}") var bytesCopied = 0L var filesCopied = 0L var filesReused = 0L val immutableFiles = localFiles.map { localFile => - val existingDfsFile = localFilesToDfsFiles.asScala.get(localFile.getName) + val existingDfsFile = + capturedFileMappings.localFilesToDfsFiles.asScala.get(localFile.getName) if (existingDfsFile.isDefined && existingDfsFile.get.sizeBytes == localFile.length()) { val dfsFile = existingDfsFile.get filesReused += 1 - logInfo(s"reusing file $dfsFile for $localFile") + logInfo(log"reusing file ${MDC(LogKeys.DFS_FILE, dfsFile)} for " + + log"${MDC(LogKeys.FILE_NAME, localFile)}") RocksDBImmutableFile(localFile.getName, dfsFile.dfsFileName, dfsFile.sizeBytes) } else { val localFileName = localFile.getName @@ -504,19 +534,22 @@ class RocksDBFileManager( fs.copyFromLocalFile( new Path(localFile.getAbsoluteFile.toURI), dfsFile) val localFileSize = localFile.length() - logInfo(s"Copied $localFile to $dfsFile - $localFileSize bytes") + logInfo(log"Copied ${MDC(LogKeys.FILE_NAME, localFile)} to " + + log"${MDC(LogKeys.DFS_FILE, dfsFile)} - ${MDC(LogKeys.NUM_BYTES, localFileSize)} bytes") filesCopied += 1 bytesCopied += localFileSize val immutableDfsFile = RocksDBImmutableFile(localFile.getName, dfsFileName, localFileSize) - localFilesToDfsFiles.put(localFileName, immutableDfsFile) + capturedFileMappings.localFilesToDfsFiles.put(localFileName, immutableDfsFile) immutableDfsFile } } - logInfo(s"Copied $filesCopied files ($bytesCopied bytes) from local to" + - s" DFS for version $version. $filesReused files reused without copying.") - versionToRocksDBFiles.put(version, immutableFiles) + logInfo(log"Copied ${MDC(LogKeys.NUM_FILES_COPIED, filesCopied)} files " + + log"(${MDC(LogKeys.NUM_BYTES, bytesCopied)} bytes) from local to" + + log" DFS for version ${MDC(LogKeys.VERSION_NUM, version)}. " + + log"${MDC(LogKeys.NUM_FILES_REUSED, filesReused)} files reused without copying.") + capturedFileMappings.versionToRocksDBFiles.put(version, immutableFiles) // Cleanup locally deleted files from the localFilesToDfsFiles map // Locally, SST Files can be deleted due to RocksDB compaction. 
These files need @@ -556,7 +589,7 @@ class RocksDBFileManager( .foreach { existingFile => val existingFileSize = existingFile.length() val requiredFile = requiredFileNameToFileDetails.get(existingFile.getName) - val prevDfsFile = localFilesToDfsFiles.asScala.get(existingFile.getName) + val prevDfsFile = fileMappings.localFilesToDfsFiles.asScala.get(existingFile.getName) val isSameFile = if (requiredFile.isDefined && prevDfsFile.isDefined) { requiredFile.get.dfsFileName == prevDfsFile.get.dfsFileName && existingFile.length() == requiredFile.get.sizeBytes @@ -566,11 +599,14 @@ class RocksDBFileManager( if (!isSameFile) { existingFile.delete() - localFilesToDfsFiles.remove(existingFile.getName) - logInfo(s"Deleted local file $existingFile with size $existingFileSize mapped" + - s" to previous dfsFile ${prevDfsFile.getOrElse("null")}") + fileMappings.localFilesToDfsFiles.remove(existingFile.getName) + logInfo(log"Deleted local file ${MDC(LogKeys.FILE_NAME, existingFile)} " + + log"with size ${MDC(LogKeys.NUM_BYTES, existingFileSize)} mapped" + + log" to previous dfsFile ${MDC(LogKeys.DFS_FILE, prevDfsFile.getOrElse("null"))}") } else { - logInfo(s"reusing $prevDfsFile present at $existingFile for $requiredFile") + logInfo(log"reusing ${MDC(LogKeys.DFS_FILE, prevDfsFile)} present at " + + log"${MDC(LogKeys.EXISTING_FILE, existingFile)} " + + log"for ${MDC(LogKeys.FILE_NAME, requiredFile)}") } } @@ -595,14 +631,17 @@ class RocksDBFileManager( } filesCopied += 1 bytesCopied += localFileSize - localFilesToDfsFiles.put(localFileName, file) - logInfo(s"Copied $dfsFile to $localFile - $localFileSize bytes") + fileMappings.localFilesToDfsFiles.put(localFileName, file) + logInfo(log"Copied ${MDC(LogKeys.DFS_FILE, dfsFile)} to " + + log"${MDC(LogKeys.FILE_NAME, localFile)} - " + + log"${MDC(LogKeys.NUM_BYTES, localFileSize)} bytes") } else { filesReused += 1 } } - logInfo(s"Copied $filesCopied files ($bytesCopied bytes) from DFS to local with " + - s"$filesReused files reused.") + logInfo(log"Copied ${MDC(LogKeys.NUM_FILES_COPIED, filesCopied)} files " + + log"(${MDC(LogKeys.NUM_BYTES, bytesCopied)} bytes) from DFS to local with " + + log"${MDC(LogKeys.NUM_FILES_REUSED, filesReused)} files reused.") loadCheckpointMetrics = RocksDBFileManagerMetrics( bytesCopied = bytesCopied, @@ -613,13 +652,13 @@ class RocksDBFileManager( private def removeLocallyDeletedSSTFilesFromDfsMapping(localFiles: Seq[File]): Unit = { // clean up deleted SST files from the localFilesToDfsFiles Map val currentLocalFiles = localFiles.map(_.getName).toSet - val mappingsToClean = localFilesToDfsFiles.asScala + val mappingsToClean = fileMappings.localFilesToDfsFiles.asScala .keys .filterNot(currentLocalFiles.contains) mappingsToClean.foreach { f => - logInfo(s"cleaning $f from the localFilesToDfsFiles map") - localFilesToDfsFiles.remove(f) + logInfo(log"cleaning ${MDC(LogKeys.FILE_NAME, f)} from the localFilesToDfsFiles map") + fileMappings.localFilesToDfsFiles.remove(f) } } @@ -653,7 +692,8 @@ class RocksDBFileManager( totalBytes += bytes } zout.close() // so that any error in closing also cancels the output stream - logInfo(s"Zipped $totalBytes bytes (before compression) to $filesStr") + logInfo(log"Zipped ${MDC(LogKeys.NUM_BYTES, totalBytes)} bytes (before compression) to " + + log"${MDC(LogKeys.FILE_NAME, filesStr)}") // The other fields saveCheckpointMetrics should have been filled saveCheckpointMetrics = saveCheckpointMetrics.copy(zipFileBytesUncompressed = Some(totalBytes)) @@ -661,7 +701,7 @@ class RocksDBFileManager( case e: 
Exception => // Cancel the actual output stream first, so that zout.close() does not write the file out.cancel() - logError(s"Error zipping to $filesStr", e) + logError(log"Error zipping to ${MDC(LogKeys.FILE_NAME, filesStr)}", e) throw e } finally { // Close everything no matter what happened @@ -671,11 +711,12 @@ class RocksDBFileManager( } /** Log the files present in a directory. This is useful for debugging. */ - private def logFilesInDir(dir: File, msg: String): Unit = { + private def logFilesInDir(dir: File, msg: MessageWithContext): Unit = { lazy val files = Option(Utils.recursiveList(dir)).getOrElse(Array.empty).map { f => s"${f.getAbsolutePath} - ${f.length()} bytes" } - logInfo(s"$msg - ${files.length} files\n\t${files.mkString("\n\t")}") + logInfo(msg + log" - ${MDC(LogKeys.NUM_FILES, files.length)} files\n\t" + + log"${MDC(LogKeys.FILE_NAME, files.mkString("\n\t"))}") } private def newDFSFileName(localFileName: String): String = { @@ -727,6 +768,20 @@ class RocksDBFileManager( } } +/** + * Track file mappings in RocksDB across local and remote directories + * @param versionToRocksDBFiles Mapping of RocksDB files used across versions for maintenance + * @param localFilesToDfsFiles Mapping of the exact Dfs file used to create a local SST file + * The reason localFilesToDfsFiles is a separate map because versionToRocksDBFiles can contain + * multiple similar SST files to a particular local file (for example 1.sst can map to 1-UUID1.sst + * in v1 and 1-UUID2.sst in v2). We need to capture the exact file used to ensure Version ID + * compatibility across SST files and RocksDB manifest. + */ + +case class RocksDBFileMappings( + versionToRocksDBFiles: ConcurrentHashMap[Long, Seq[RocksDBImmutableFile]], + localFilesToDfsFiles: ConcurrentHashMap[String, RocksDBImmutableFile]) + /** * Metrics regarding RocksDB file sync between local and DFS. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBMemoryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBMemoryManager.scala index 38b9dc56838ee..273cbbc5e87d5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBMemoryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBMemoryManager.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.execution.streaming.state import org.rocksdb._ -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ /** * Singleton responsible for managing cache and write buffer manager associated with all RocksDB @@ -47,8 +48,8 @@ object RocksDBMemoryManager extends Logging { } val totalMemoryUsageInBytes: Long = conf.totalMemoryUsageMB * 1024 * 1024 - logInfo(s"Creating RocksDB state store LRU cache with " + - s"total_size=$totalMemoryUsageInBytes") + logInfo(log"Creating RocksDB state store LRU cache with " + + log"total_size=${MDC(NUM_BYTES, totalMemoryUsageInBytes)}") // SPARK-44878 - avoid using strict limit to prevent insertion exception on cache full. 
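// Illustrative sketch, not part of the patch: the local-file reuse rule applied in the
// RocksDBFileManager hunk above. ImmutableDfsFile and FileMappings are simplified
// stand-ins for the patch's RocksDBImmutableFile and RocksDBFileMappings.
import java.io.File
import java.util.concurrent.ConcurrentHashMap
import scala.jdk.CollectionConverters._

case class ImmutableDfsFile(dfsFileName: String, sizeBytes: Long)
case class FileMappings(
    versionToDfsFiles: ConcurrentHashMap[Long, Seq[ImmutableDfsFile]],
    localFilesToDfsFiles: ConcurrentHashMap[String, ImmutableDfsFile])

// Reuse a local SST file only when it still maps to the exact DFS file required by the
// version being loaded and its size on disk is unchanged; otherwise it is deleted and
// re-downloaded, which is what the isSameFile check above decides.
def canReuseLocalFile(
    mappings: FileMappings,
    localFile: File,
    required: ImmutableDfsFile): Boolean = {
  mappings.localFilesToDfsFiles.asScala.get(localFile.getName).exists { prev =>
    prev.dfsFileName == required.dfsFileName && localFile.length() == required.sizeBytes
  }
}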
// Please refer to RocksDB issue here - https://github.com/facebook/rocksdb/issues/8670 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala index e05f9c24f7193..a555f9a40044a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala @@ -24,14 +24,16 @@ import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkConf, SparkEnv} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils private[sql] class RocksDBStateStoreProvider - extends StateStoreProvider with Logging with Closeable { + extends StateStoreProvider with Logging with Closeable + with SupportsFineGrainedReplay { import RocksDBStateStoreProvider._ class RocksDBStateStore(lastVersion: Long) extends StateStore { @@ -164,7 +166,8 @@ private[sql] class RocksDBStateStoreProvider verify(state == UPDATING, "Cannot commit after already committed or aborted") val newVersion = rocksDB.commit() state = COMMITTED - logInfo(s"Committed $newVersion for $id") + logInfo(log"Committed ${MDC(VERSION_NUM, newVersion)} " + + log"for ${MDC(STATE_STORE_ID, id)}") newVersion } catch { case e: Throwable => @@ -174,7 +177,8 @@ private[sql] class RocksDBStateStoreProvider override def abort(): Unit = { verify(state == UPDATING || state == ABORTED, "Cannot abort after already committed") - logInfo(s"Aborting ${version + 1} for $id") + logInfo(log"Aborting ${MDC(VERSION_NUM, version + 1)} " + + log"for ${MDC(STATE_STORE_ID, id)}") rocksDB.rollback() state = ABORTED } @@ -238,7 +242,8 @@ private[sql] class RocksDBStateStoreProvider rocksDBMetrics.totalMemUsageBytes, stateStoreCustomMetrics) } else { - logInfo(s"Failed to collect metrics for store_id=$id and version=$version") + logInfo(log"Failed to collect metrics for store_id=${MDC(STATE_STORE_ID, id)} " + + log"and version=${MDC(VERSION_NUM, version)}") StateStoreMetrics(0, 0, Map.empty) } } @@ -363,6 +368,30 @@ private[sql] class RocksDBStateStoreProvider private def verify(condition: => Boolean, msg: String): Unit = { if (!condition) { throw new IllegalStateException(msg) } } + + /** + * Get the state store of endVersion by applying delta files on the snapshot of snapshotVersion. + * If snapshot for snapshotVersion does not exist, an error will be thrown. 
+ * + * @param snapshotVersion checkpoint version of the snapshot to start with + * @param endVersion checkpoint version to end with + * @return [[StateStore]] + */ + override def replayStateFromSnapshot(snapshotVersion: Long, endVersion: Long): StateStore = { + try { + if (snapshotVersion < 1) { + throw QueryExecutionErrors.unexpectedStateStoreVersion(snapshotVersion) + } + if (endVersion < snapshotVersion) { + throw QueryExecutionErrors.unexpectedStateStoreVersion(endVersion) + } + rocksDB.loadFromSnapshot(snapshotVersion, endVersion) + new RocksDBStateStore(endVersion) + } + catch { + case e: Throwable => throw QueryExecutionErrors.cannotLoadStore(e) + } + } } object RocksDBStateStoreProvider { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SchemaHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SchemaHelper.scala index 2eef3d9fc22ed..0a8021ab3de2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SchemaHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SchemaHelper.scala @@ -20,6 +20,11 @@ package org.apache.spark.sql.execution.streaming.state import java.io.StringReader import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream} +import org.json4s.DefaultFormats +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods +import org.json4s.jackson.JsonMethods.{compact, render} import org.apache.spark.sql.execution.streaming.MetadataVersionUtil import org.apache.spark.sql.types.StructType @@ -28,6 +33,57 @@ import org.apache.spark.util.Utils /** * Helper classes for reading/writing state schema. */ +sealed trait ColumnFamilySchema extends Serializable { + def jsonValue: JValue + + def json: String + + def columnFamilyName: String +} + +case class ColumnFamilySchemaV1( + columnFamilyName: String, + keySchema: StructType, + valueSchema: StructType, + keyStateEncoderSpec: KeyStateEncoderSpec, + userKeyEncoder: Option[StructType] = None) extends ColumnFamilySchema { + def jsonValue: JValue = { + ("columnFamilyName" -> JString(columnFamilyName)) ~ + ("keySchema" -> JString(keySchema.json)) ~ + ("valueSchema" -> JString(valueSchema.json)) ~ + ("keyStateEncoderSpec" -> keyStateEncoderSpec.jsonValue) ~ + ("userKeyEncoder" -> userKeyEncoder.map(s => JString(s.json)).getOrElse(JNothing)) + } + + def json: String = { + compact(render(jsonValue)) + } +} + +object ColumnFamilySchemaV1 { + + /** + * Create a ColumnFamilySchemaV1 object from the Json string + * This function is to read the StateSchemaV3 file + */ + def fromJson(json: String): ColumnFamilySchema = { + implicit val formats: DefaultFormats.type = DefaultFormats + val colFamilyMap = JsonMethods.parse(json).extract[Map[String, Any]] + assert(colFamilyMap.isInstanceOf[Map[_, _]], + s"Expected Map but got ${colFamilyMap.getClass}") + val keySchema = StructType.fromString(colFamilyMap("keySchema").asInstanceOf[String]) + val valueSchema = StructType.fromString(colFamilyMap("valueSchema").asInstanceOf[String]) + ColumnFamilySchemaV1( + colFamilyMap("columnFamilyName").asInstanceOf[String], + keySchema, + valueSchema, + KeyStateEncoderSpec.fromJson(keySchema, colFamilyMap("keyStateEncoderSpec") + .asInstanceOf[Map[String, Any]]), + colFamilyMap.get("userKeyEncoder").map(_.asInstanceOf[String]).map(StructType.fromString) + ) + } +} + object SchemaHelper { sealed trait SchemaReader { diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala index a385c09b38fc3..8aabc0846fe61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala @@ -17,17 +17,19 @@ package org.apache.spark.sql.execution.streaming.state +import scala.util.Try + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.internal.Logging -import org.apache.spark.sql.execution.streaming.CheckpointFileManager +import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.sql.catalyst.util.UnsafeRowUtils +import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, StatefulOperatorStateInfo} import org.apache.spark.sql.execution.streaming.state.SchemaHelper.{SchemaReader, SchemaWriter} -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.{DataType, StructType} -case class StateSchemaNotCompatible(message: String) extends Exception(message) - class StateSchemaCompatibilityChecker( providerId: StateStoreProviderId, hadoopConf: Configuration) extends Logging { @@ -40,54 +42,6 @@ class StateSchemaCompatibilityChecker( fm.mkdirs(schemaFileLocation.getParent) - def check(keySchema: StructType, valueSchema: StructType): Unit = { - check(keySchema, valueSchema, ignoreValueSchema = false) - } - - def check(keySchema: StructType, valueSchema: StructType, ignoreValueSchema: Boolean): Unit = { - if (fm.exists(schemaFileLocation)) { - logDebug(s"Schema file for provider $providerId exists. Comparing with provided schema.") - val (storedKeySchema, storedValueSchema) = readSchemaFile() - if (storedKeySchema.equals(keySchema) && - (ignoreValueSchema || storedValueSchema.equals(valueSchema))) { - // schema is exactly same - } else if (!schemasCompatible(storedKeySchema, keySchema) || - (!ignoreValueSchema && !schemasCompatible(storedValueSchema, valueSchema))) { - val errorMsgForKeySchema = s"- Provided key schema: $keySchema\n" + - s"- Existing key schema: $storedKeySchema\n" - - // If it is requested to skip checking the value schema, we also don't expose the value - // schema information to the error message. - val errorMsgForValueSchema = if (!ignoreValueSchema) { - s"- Provided value schema: $valueSchema\n" + - s"- Existing value schema: $storedValueSchema\n" - } else { - "" - } - val errorMsg = "Provided schema doesn't match to the schema for existing state! " + - "Please note that Spark allow difference of field name: check count of fields " + - "and data type of each field.\n" + - errorMsgForKeySchema + - errorMsgForValueSchema + - s"If you want to force running query without schema validation, please set " + - s"${SQLConf.STATE_SCHEMA_CHECK_ENABLED.key} to false.\n" + - "Please note running query with incompatible schema could cause indeterministic" + - " behavior." - logError(errorMsg) - throw StateSchemaNotCompatible(errorMsg) - } else { - logInfo("Detected schema change which is compatible. Allowing to put rows.") - } - } else { - // schema doesn't exist, create one now - logDebug(s"Schema file for provider $providerId doesn't exist. 
Creating one.") - createSchemaFile(keySchema, valueSchema) - } - } - - private def schemasCompatible(storedSchema: StructType, schema: StructType): Boolean = - DataType.equalsIgnoreNameAndCompatibleNullability(schema, storedSchema) - def readSchemaFile(): (StructType, StructType) = { val inStream = fm.open(schemaFileLocation) try { @@ -96,14 +50,27 @@ class StateSchemaCompatibilityChecker( schemaReader.read(inStream) } catch { case e: Throwable => - logError(s"Fail to read schema file from $schemaFileLocation", e) + logError(log"Fail to read schema file from ${MDC(LogKeys.PATH, schemaFileLocation)}", e) throw e } finally { inStream.close() } } - def createSchemaFile(keySchema: StructType, valueSchema: StructType): Unit = { + /** + * Function to read and return the existing key and value schema from the schema file, if it + * exists + * @return - Option of (keySchema, valueSchema) if the schema file exists, None otherwise + */ + private def getExistingKeyAndValueSchema(): Option[(StructType, StructType)] = { + if (fm.exists(schemaFileLocation)) { + Some(readSchemaFile()) + } else { + None + } + } + + private def createSchemaFile(keySchema: StructType, valueSchema: StructType): Unit = { createSchemaFile(keySchema, valueSchema, schemaWriter) } @@ -118,16 +85,123 @@ class StateSchemaCompatibilityChecker( outStream.close() } catch { case e: Throwable => - logError(s"Fail to write schema file to $schemaFileLocation", e) + logError(log"Fail to write schema file to ${MDC(LogKeys.PATH, schemaFileLocation)}", e) outStream.cancel() throw e } } + def validateAndMaybeEvolveStateSchema( + newKeySchema: StructType, + newValueSchema: StructType, + ignoreValueSchema: Boolean): Unit = { + val existingSchema = getExistingKeyAndValueSchema() + if (existingSchema.isEmpty) { + // write the schema file if it doesn't exist + createSchemaFile(newKeySchema, newValueSchema) + } else { + // validate if the new schema is compatible with the existing schema + StateSchemaCompatibilityChecker. + check(existingSchema.get, (newKeySchema, newValueSchema), ignoreValueSchema) + } + } + private def schemaFile(storeCpLocation: Path): Path = new Path(new Path(storeCpLocation, "_metadata"), "schema") } -object StateSchemaCompatibilityChecker { +object StateSchemaCompatibilityChecker extends Logging { val VERSION = 2 + + /** + * Function to check if new state store schema is compatible with the existing schema. + * @param oldSchema - old state schema + * @param newSchema - new state schema + * @param ignoreValueSchema - whether to ignore value schema or not + */ + def check( + oldSchema: (StructType, StructType), + newSchema: (StructType, StructType), + ignoreValueSchema: Boolean) : Unit = { + val (storedKeySchema, storedValueSchema) = oldSchema + val (keySchema, valueSchema) = newSchema + + if (storedKeySchema.equals(keySchema) && + (ignoreValueSchema || storedValueSchema.equals(valueSchema))) { + // schema is exactly same + } else if (!schemasCompatible(storedKeySchema, keySchema)) { + throw StateStoreErrors.stateStoreKeySchemaNotCompatible(storedKeySchema.toString, + keySchema.toString) + } else if (!ignoreValueSchema && !schemasCompatible(storedValueSchema, valueSchema)) { + throw StateStoreErrors.stateStoreValueSchemaNotCompatible(storedValueSchema.toString, + valueSchema.toString) + } else { + logInfo("Detected schema change which is compatible. 
Allowing to put rows.") + } + } + + private def schemasCompatible(storedSchema: StructType, schema: StructType): Boolean = + DataType.equalsIgnoreNameAndCompatibleNullability(schema, storedSchema) + + private def disallowBinaryInequalityColumn(schema: StructType): Unit = { + if (!UnsafeRowUtils.isBinaryStable(schema)) { + throw new SparkUnsupportedOperationException( + errorClass = "STATE_STORE_UNSUPPORTED_OPERATION_BINARY_INEQUALITY", + messageParameters = Map("schema" -> schema.json) + ) + } + } + + /** + * Function to validate the schema of the state store and maybe evolve it if needed. + * We also verify for binary inequality columns in the schema and disallow them. We then perform + * key and value schema validation. Depending on the passed configs, a warning might be logged + * or an exception might be thrown if the schema is not compatible. + * + * @param stateInfo - StatefulOperatorStateInfo containing the state store information + * @param hadoopConf - Hadoop configuration + * @param newKeySchema - New key schema + * @param newValueSchema - New value schema + * @param sessionState - session state used to retrieve session config + * @param extraOptions - any extra options to be passed for StateStoreConf creation + * @param storeName - optional state store name + */ + def validateAndMaybeEvolveStateSchema( + stateInfo: StatefulOperatorStateInfo, + hadoopConf: Configuration, + newKeySchema: StructType, + newValueSchema: StructType, + sessionState: SessionState, + extraOptions: Map[String, String] = Map.empty, + storeName: String = StateStoreId.DEFAULT_STORE_NAME): Array[String] = { + // SPARK-47776: collation introduces the concept of binary (in)equality, which means + // in some collation we no longer be able to just compare the binary format of two + // UnsafeRows to determine equality. For example, 'aaa' and 'AAA' can be "semantically" + // same in case insensitive collation. + // State store is basically key-value storage, and the most provider implementations + // rely on the fact that all the columns in the key schema support binary equality. + // We need to disallow using binary inequality column in the key schema, before we + // could support this in majority of state store providers (or high-level of state + // store.) + disallowBinaryInequalityColumn(newKeySchema) + + val storeConf = new StateStoreConf(sessionState.conf, extraOptions) + val providerId = StateStoreProviderId(StateStoreId(stateInfo.checkpointLocation, + stateInfo.operatorId, 0, storeName), stateInfo.queryRunId) + val checker = new StateSchemaCompatibilityChecker(providerId, hadoopConf) + // regardless of configuration, we check compatibility to at least write schema file + // if necessary + // if the format validation for value schema is disabled, we also disable the schema + // compatibility checker for value schema as well. 
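// Illustrative sketch, not part of the patch: what the relocated check above accepts and
// rejects. A schema that differs only in field names is treated as compatible, while a
// data type change raises the new error class. The schemas below are made up.
import org.apache.spark.sql.execution.streaming.state.StateSchemaCompatibilityChecker
import org.apache.spark.sql.types._

val storedKey = new StructType().add("groupKey", StringType)
val storedValue = new StructType().add("count", LongType)

// Same shape, different field name: compatible, returns normally.
StateSchemaCompatibilityChecker.check(
  oldSchema = (storedKey, storedValue),
  newSchema = (storedKey, new StructType().add("total", LongType)),
  ignoreValueSchema = false)

// Changed value data type: throws StateStoreValueSchemaNotCompatible.
StateSchemaCompatibilityChecker.check(
  oldSchema = (storedKey, storedValue),
  newSchema = (storedKey, new StructType().add("count", StringType)),
  ignoreValueSchema = false)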
+ val result = Try( + checker.validateAndMaybeEvolveStateSchema(newKeySchema, newValueSchema, + ignoreValueSchema = !storeConf.formatValidationCheckValue) + ).toEither.fold(Some(_), _ => None) + + // if schema validation is enabled and an exception is thrown, we re-throw it and fail the query + if (storeConf.stateSchemaCheckEnabled && result.isDefined) { + throw result.get + } + Array(checker.schemaFileLocation.toString) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaV3File.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaV3File.scala new file mode 100644 index 0000000000000..38e6484728126 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaV3File.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.state + +import java.io.{InputStream, OutputStream} +import java.nio.charset.StandardCharsets.UTF_8 +import java.util.UUID + +import scala.io.{Source => IOSource} + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path + +import org.apache.spark.sql.execution.streaming.CheckpointFileManager +import org.apache.spark.sql.execution.streaming.MetadataVersionUtil.validateVersion + +/** + * The StateSchemaV3File is used to write the schema of multiple column families. + * Right now, this is primarily used for the TransformWithState operator, which supports + * multiple column families to keep the data for multiple state variables. + * We only expect ColumnFamilySchemaV1 to be written and read from this file. + * @param hadoopConf Hadoop configuration that is used to read / write metadata files. + * @param path Path to the directory that will be used for writing metadata. 
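// Illustrative sketch, not part of the patch: the per-column-family JSON that the
// StateSchemaV3File described here persists, built with the ColumnFamilySchemaV1
// json/fromJson pair added earlier in this diff. "countState" is a made-up
// column family name.
import org.apache.spark.sql.execution.streaming.state.{ColumnFamilySchemaV1, NoPrefixKeyStateEncoderSpec}
import org.apache.spark.sql.types._

val keySchema = new StructType().add("key", StringType)
val valueSchema = new StructType().add("value", LongType)

val cfSchema = ColumnFamilySchemaV1(
  columnFamilyName = "countState",
  keySchema = keySchema,
  valueSchema = valueSchema,
  keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(keySchema))

val json = cfSchema.json                          // one JSON line per column family
val roundTripped = ColumnFamilySchemaV1.fromJson(json)
assert(roundTripped.columnFamilyName == "countState")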
+ */ +class StateSchemaV3File( + hadoopConf: Configuration, + path: String) { + + val metadataPath = new Path(path) + + protected val fileManager: CheckpointFileManager = + CheckpointFileManager.create(metadataPath, hadoopConf) + + if (!fileManager.exists(metadataPath)) { + fileManager.mkdirs(metadataPath) + } + + private def deserialize(in: InputStream): List[ColumnFamilySchema] = { + val lines = IOSource.fromInputStream(in, UTF_8.name()).getLines() + + if (!lines.hasNext) { + throw new IllegalStateException("Incomplete log file in the offset commit log") + } + + val version = lines.next().trim + validateVersion(version, StateSchemaV3File.VERSION) + + lines.map(ColumnFamilySchemaV1.fromJson).toList + } + + private def serialize(schemas: List[ColumnFamilySchema], out: OutputStream): Unit = { + out.write(s"v${StateSchemaV3File.VERSION}".getBytes(UTF_8)) + out.write('\n') + out.write(schemas.map(_.json).mkString("\n").getBytes(UTF_8)) + } + + def addWithUUID(batchId: Long, metadata: List[ColumnFamilySchema]): Path = { + val schemaFilePath = new Path(metadataPath, s"${batchId}_${UUID.randomUUID().toString}") + write(schemaFilePath, out => serialize(metadata, out)) + schemaFilePath + } + + def getWithPath(schemaFilePath: Path): List[ColumnFamilySchema] = { + deserialize(fileManager.open(schemaFilePath)) + } + + protected def write( + batchMetadataFile: Path, + fn: OutputStream => Unit): Unit = { + val output = fileManager.createAtomic(batchMetadataFile, overwriteIfPossible = false) + try { + fn(output) + output.close() + } catch { + case e: Throwable => + output.cancel() + throw e + } + } +} + +object StateSchemaV3File { + val VERSION = 3 +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 85f7fed90c6c8..484a6850ce79e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -23,14 +23,17 @@ import java.util.concurrent.atomic.AtomicReference import javax.annotation.concurrent.GuardedBy import scala.collection.mutable -import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import org.json4s.{JInt, JString} +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} -import org.apache.spark.{SparkContext, SparkEnv, SparkUnsupportedOperationException} -import org.apache.spark.internal.Logging +import org.apache.spark.{SparkContext, SparkEnv, SparkException} +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.UnsafeRowUtils import org.apache.spark.sql.errors.QueryExecutionErrors @@ -279,19 +282,33 @@ case class StateStoreCustomTimingMetric(name: String, desc: String) extends Stat SQLMetrics.createTimingMetric(sparkContext, desc) } -/** - * An exception thrown when an invalid UnsafeRow is detected in state store. - */ -class InvalidUnsafeRowException(error: String) - extends RuntimeException("The streaming query failed by state format invalidation. " + - "The following reasons may cause this: 1. An old Spark version wrote the checkpoint that is " + - "incompatible with the current one; 2. Broken checkpoint files; 3. The query is changed " + - "among restart. 
For the first case, you can try to restart the application without " + - s"checkpoint or use the legacy Spark version to process the streaming state.\n$error", null) +sealed trait KeyStateEncoderSpec { + def jsonValue: JValue + def json: String = compact(render(jsonValue)) +} -sealed trait KeyStateEncoderSpec +object KeyStateEncoderSpec { + def fromJson(keySchema: StructType, m: Map[String, Any]): KeyStateEncoderSpec = { + // match on type + m("keyStateEncoderType").asInstanceOf[String] match { + case "NoPrefixKeyStateEncoderSpec" => + NoPrefixKeyStateEncoderSpec(keySchema) + case "RangeKeyScanStateEncoderSpec" => + val orderingOrdinals = m("orderingOrdinals"). + asInstanceOf[List[_]].map(_.asInstanceOf[BigInt].toInt) + RangeKeyScanStateEncoderSpec(keySchema, orderingOrdinals) + case "PrefixKeyScanStateEncoderSpec" => + val numColsPrefixKey = m("numColsPrefixKey").asInstanceOf[BigInt].toInt + PrefixKeyScanStateEncoderSpec(keySchema, numColsPrefixKey) + } + } +} -case class NoPrefixKeyStateEncoderSpec(keySchema: StructType) extends KeyStateEncoderSpec +case class NoPrefixKeyStateEncoderSpec(keySchema: StructType) extends KeyStateEncoderSpec { + override def jsonValue: JValue = { + ("keyStateEncoderType" -> JString("NoPrefixKeyStateEncoderSpec")) + } +} case class PrefixKeyScanStateEncoderSpec( keySchema: StructType, @@ -299,6 +316,11 @@ case class PrefixKeyScanStateEncoderSpec( if (numColsPrefixKey == 0 || numColsPrefixKey >= keySchema.length) { throw StateStoreErrors.incorrectNumOrderingColsForPrefixScan(numColsPrefixKey.toString) } + + override def jsonValue: JValue = { + ("keyStateEncoderType" -> JString("PrefixKeyScanStateEncoderSpec")) ~ + ("numColsPrefixKey" -> JInt(numColsPrefixKey)) + } } /** Encodes rows so that they can be range-scanned based on orderingOrdinals */ @@ -308,6 +330,11 @@ case class RangeKeyScanStateEncoderSpec( if (orderingOrdinals.isEmpty || orderingOrdinals.length > keySchema.length) { throw StateStoreErrors.incorrectNumOrderingColsForRangeScan(orderingOrdinals.length.toString) } + + override def jsonValue: JValue = { + ("keyStateEncoderType" -> JString("RangeKeyScanStateEncoderSpec")) ~ + ("orderingOrdinals" -> orderingOrdinals.map(JInt(_))) + } } /** @@ -396,6 +423,12 @@ object StateStoreProvider { */ def create(providerClassName: String): StateStoreProvider = { val providerClass = Utils.classForName(providerClassName) + if (!classOf[StateStoreProvider].isAssignableFrom(providerClass)) { + throw new SparkException( + errorClass = "STATE_STORE_INVALID_PROVIDER", + messageParameters = Map("inputClass" -> providerClassName), + cause = null) + } providerClass.getConstructor().newInstance().asInstanceOf[StateStoreProvider] } @@ -428,17 +461,54 @@ object StateStoreProvider { conf: StateStoreConf): Unit = { if (conf.formatValidationEnabled) { val validationError = UnsafeRowUtils.validateStructuralIntegrityWithReason(keyRow, keySchema) - validationError.foreach { error => throw new InvalidUnsafeRowException(error) } + validationError.foreach { error => + throw StateStoreErrors.keyRowFormatValidationFailure(error) + } if (conf.formatValidationCheckValue) { val validationError = UnsafeRowUtils.validateStructuralIntegrityWithReason(valueRow, valueSchema) - validationError.foreach { error => throw new InvalidUnsafeRowException(error) } + validationError.foreach { error => + throw StateStoreErrors.valueRowFormatValidationFailure(error) + } } } } } +/** + * This is an optional trait to be implemented by [[StateStoreProvider]]s that can read fine + * grained state data which 
is replayed from a specific snapshot version. It is used by the + * snapshotStartBatchId option in state data source. + */ +trait SupportsFineGrainedReplay { + + /** + * Return an instance of [[StateStore]] representing state data of the given version. + * The State Store will be constructed from the snapshot at snapshotVersion, and applying delta + * files up to the endVersion. If there is no snapshot file at snapshotVersion, an exception will + * be thrown. + * + * @param snapshotVersion checkpoint version of the snapshot to start with + * @param endVersion checkpoint version to end with + */ + def replayStateFromSnapshot(snapshotVersion: Long, endVersion: Long): StateStore + + /** + * Return an instance of [[ReadStateStore]] representing state data of the given version. + * The State Store will be constructed from the snapshot at snapshotVersion, and applying delta + * files up to the endVersion. If there is no snapshot file at snapshotVersion, an exception will + * be thrown. + * Only implement this if there is read-only optimization for the state store. + * + * @param snapshotVersion checkpoint version of the snapshot to start with + * @param endVersion checkpoint version to end with + */ + def replayReadStateFromSnapshot(snapshotVersion: Long, endVersion: Long): ReadStateStore = { + new WrappedReadStateStore(replayStateFromSnapshot(snapshotVersion, endVersion)) + } +} + /** * Unique identifier for a provider, used to identify when providers can be reused. * Note that `queryRunId` is used uniquely identify a provider, so that the same provider @@ -524,9 +594,6 @@ object StateStore extends Logging { @GuardedBy("loadedProviders") private val loadedProviders = new mutable.HashMap[StateStoreProviderId, StateStoreProvider]() - @GuardedBy("loadedProviders") - private val schemaValidated = new mutable.HashMap[StateStoreProviderId, Option[Throwable]]() - private val maintenanceThreadPoolLock = new Object // Shared exception between threads in thread pool that the scheduling thread @@ -584,7 +651,21 @@ object StateStore extends Logging { } def stop(): Unit = { - threadPool.shutdown() + logInfo("Shutting down MaintenanceThreadPool") + threadPool.shutdown() // Disable new tasks from being submitted + + // Wait a while for existing tasks to terminate + if (!threadPool.awaitTermination(5 * 60, TimeUnit.SECONDS)) { + logWarning( + s"MaintenanceThreadPool is not able to be terminated within 300 seconds," + + " forcefully shutting down now.") + threadPool.shutdownNow() // Cancel currently executing tasks + + // Wait a while for tasks to respond to being cancelled + if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) { + logError("MaintenanceThreadPool did not terminate") + } + } } } @@ -635,15 +716,6 @@ object StateStore extends Logging { storeProvider.getStore(version) } - private def disallowBinaryInequalityColumn(schema: StructType): Unit = { - if (!UnsafeRowUtils.isBinaryStable(schema)) { - throw new SparkUnsupportedOperationException( - errorClass = "STATE_STORE_UNSUPPORTED_OPERATION_BINARY_INEQUALITY", - messageParameters = Map("schema" -> schema.json) - ) - } - } - private def getStateStoreProvider( storeProviderId: StateStoreProviderId, keySchema: StructType, @@ -656,40 +728,6 @@ object StateStore extends Logging { loadedProviders.synchronized { startMaintenanceIfNeeded(storeConf) - if (storeProviderId.storeId.partitionId == PARTITION_ID_TO_CHECK_SCHEMA) { - val result = schemaValidated.getOrElseUpdate(storeProviderId, { - // SPARK-47776: collation introduces the concept of binary 
(in)equality, which means - // in some collation we no longer be able to just compare the binary format of two - // UnsafeRows to determine equality. For example, 'aaa' and 'AAA' can be "semantically" - // same in case insensitive collation. - // State store is basically key-value storage, and the most provider implementations - // rely on the fact that all the columns in the key schema support binary equality. - // We need to disallow using binary inequality column in the key schema, before we - // could support this in majority of state store providers (or high-level of state - // store.) - disallowBinaryInequalityColumn(keySchema) - - val checker = new StateSchemaCompatibilityChecker(storeProviderId, hadoopConf) - // regardless of configuration, we check compatibility to at least write schema file - // if necessary - // if the format validation for value schema is disabled, we also disable the schema - // compatibility checker for value schema as well. - val ret = Try( - checker.check(keySchema, valueSchema, - ignoreValueSchema = !storeConf.formatValidationCheckValue) - ).toEither.fold(Some(_), _ => None) - if (storeConf.stateSchemaCheckEnabled) { - ret - } else { - None - } - }) - - if (result.isDefined) { - throw result.get - } - } - // SPARK-42567 - Track load time for state store provider and log warning if takes longer // than 2s. val (provider, loadTimeMs) = Utils.timeTakenMs { @@ -702,9 +740,10 @@ object StateStore extends Logging { } if (loadTimeMs > 2000L) { - logWarning(s"Loaded state store provider in loadTimeMs=$loadTimeMs " + - s"for storeId=${storeProviderId.storeId.toString} and " + - s"queryRunId=${storeProviderId.queryRunId}") + logWarning(log"Loaded state store provider in loadTimeMs=" + + log"${MDC(LogKeys.LOAD_TIME, loadTimeMs)} " + + log"for storeId=${MDC(LogKeys.STORE_ID, storeProviderId.storeId.toString)} and " + + log"queryRunId=${MDC(LogKeys.QUERY_RUN_ID, storeProviderId.queryRunId)}") } val otherProviderIds = loadedProviders.keys.filter(_ != storeProviderId).toSeq @@ -820,16 +859,18 @@ object StateStore extends Logging { provider.doMaintenance() if (!verifyIfStoreInstanceActive(id)) { unload(id) - logInfo(s"Unloaded $provider") + logInfo(log"Unloaded ${MDC(LogKeys.STATE_STORE_PROVIDER, provider)}") } } catch { case NonFatal(e) => - logWarning(s"Error managing $provider, stopping management thread", e) + logWarning(log"Error managing ${MDC(LogKeys.STATE_STORE_PROVIDER, provider)}, " + + log"stopping management thread", e) threadPoolException.set(e) } finally { val duration = System.currentTimeMillis() - startTime - val logMsg = s"Finished maintenance task for provider=$id" + - s" in elapsed_time=$duration\n" + val logMsg = + log"Finished maintenance task for provider=${MDC(LogKeys.STATE_STORE_PROVIDER, id)}" + + log" in elapsed_time=${MDC(LogKeys.TIME_UNITS, duration)}\n" if (duration > 5000) { logInfo(logMsg) } else { @@ -841,8 +882,9 @@ object StateStore extends Logging { } }) } else { - logInfo(s"Not processing partition ${id} for maintenance because it is currently " + - s"being processed") + logInfo(log"Not processing partition ${MDC(LogKeys.PARTITION_ID, id)} " + + log"for maintenance because it is currently " + + log"being processed") } } } @@ -856,8 +898,10 @@ object StateStore extends Logging { val providerIdsToUnload = coordinatorRef .map(_.reportActiveInstance(storeProviderId, host, executorId, otherProviderIds)) .getOrElse(Seq.empty[StateStoreProviderId]) - logInfo(s"Reported that the loaded instance $storeProviderId is active") - logDebug(s"The 
loaded instances are going to unload: ${providerIdsToUnload.mkString(", ")}") + logInfo(log"Reported that the loaded instance " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, storeProviderId)} is active") + logDebug(log"The loaded instances are going to unload: " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, providerIdsToUnload.mkString(", "))}") providerIdsToUnload } else { Seq.empty[StateStoreProviderId] @@ -888,7 +932,8 @@ object StateStore extends Logging { logDebug("Getting StateStoreCoordinatorRef") _coordRef = StateStoreCoordinatorRef.forExecutor(env) } - logInfo(s"Retrieved reference to StateStoreCoordinator: ${_coordRef}") + logInfo(log"Retrieved reference to StateStoreCoordinator: " + + log"${MDC(LogKeys.STATE_STORE_PROVIDER, _coordRef)}") Some(_coordRef) } else { _coordRef = null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala index 30cf49d8e56d4..b1860be41ac44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala @@ -25,7 +25,8 @@ import com.google.common.io.ByteStreams import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FSError, Path} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.streaming.CheckpointFileManager @@ -108,8 +109,9 @@ abstract class StateStoreChangelogWriter( // IOException into FSError. 
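// Illustrative sketch, not part of the patch: the two-phase shutdown pattern that the
// MaintenanceThreadPool.stop() change a few hunks above follows. The pool here is a plain
// ExecutorService stand-in.
import java.util.concurrent.{Executors, TimeUnit}

val pool = Executors.newFixedThreadPool(2)

def stop(): Unit = {
  pool.shutdown()                                        // reject new maintenance tasks
  if (!pool.awaitTermination(5 * 60, TimeUnit.SECONDS)) {
    pool.shutdownNow()                                   // interrupt in-flight tasks
    if (!pool.awaitTermination(60, TimeUnit.SECONDS)) {
      // the pool could not be terminated; log and give up rather than block forever
    }
  }
}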
case e: FSError if e.getCause.isInstanceOf[IOException] => case NonFatal(ex) => - logInfo(s"Failed to cancel changelog file $file for state store provider " + - s"with exception=$ex") + logInfo(log"Failed to cancel changelog file ${MDC(FILE_NAME, file)} " + + log"for state store provider " + + log"with exception=${MDC(ERROR, ex)}") } finally { backingFileStream = null compressedStream = null @@ -174,7 +176,7 @@ class StateStoreChangelogWriterV1( } catch { case e: Throwable => abort() - logError(s"Fail to commit changelog file $file because of exception $e") + logError(log"Fail to commit changelog file ${MDC(PATH, file)} because of exception", e) throw e } finally { backingFileStream = null @@ -253,7 +255,7 @@ class StateStoreChangelogWriterV2( } catch { case e: Throwable => abort() - logError(s"Fail to commit changelog file $file because of exception $e") + logError(log"Fail to commit changelog file ${MDC(PATH, file)} because of exception", e) throw e } finally { backingFileStream = null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreErrors.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreErrors.scala index b8ab32a00851f..4ac813291c00b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreErrors.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreErrors.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.streaming.state -import org.apache.spark.{SparkException, SparkUnsupportedOperationException} +import org.apache.spark.{SparkException, SparkRuntimeException, SparkUnsupportedOperationException} /** * Object for grouping error messages from (most) exceptions thrown from State API V2 @@ -39,6 +39,16 @@ object StateStoreErrors { ) } + def keyRowFormatValidationFailure(errorMsg: String): + StateStoreKeyRowFormatValidationFailure = { + new StateStoreKeyRowFormatValidationFailure(errorMsg) + } + + def valueRowFormatValidationFailure(errorMsg: String): + StateStoreValueRowFormatValidationFailure = { + new StateStoreValueRowFormatValidationFailure(errorMsg) + } + def unsupportedOperationOnMissingColumnFamily(operationName: String, colFamilyName: String): StateStoreUnsupportedOperationOnMissingColumnFamily = { new StateStoreUnsupportedOperationOnMissingColumnFamily(operationName, colFamilyName) @@ -127,6 +137,42 @@ object StateStoreErrors { stateName: String): StatefulProcessorTTLMustBePositive = { new StatefulProcessorTTLMustBePositive(operationType, stateName) } + + def stateStoreKeySchemaNotCompatible( + storedKeySchema: String, + newKeySchema: String): StateStoreKeySchemaNotCompatible = { + new StateStoreKeySchemaNotCompatible(storedKeySchema, newKeySchema) + } + + def stateStoreValueSchemaNotCompatible( + storedValueSchema: String, + newValueSchema: String): StateStoreValueSchemaNotCompatible = { + new StateStoreValueSchemaNotCompatible(storedValueSchema, newValueSchema) + } + + def stateStoreColumnFamilyMismatch( + columnFamilyName: String, + oldColumnFamilySchema: String, + newColumnFamilySchema: String): StateStoreColumnFamilyMismatch = { + new StateStoreColumnFamilyMismatch( + columnFamilyName, oldColumnFamilySchema, newColumnFamilySchema) + } + + def stateStoreSnapshotFileNotFound(fileToRead: String, clazz: String): + StateStoreSnapshotFileNotFound = { + new StateStoreSnapshotFileNotFound(fileToRead, clazz) + } + + def stateStoreSnapshotPartitionNotFound( + snapshotPartitionId: Long, operatorId: Int, 
checkpointLocation: String): + StateStoreSnapshotPartitionNotFound = { + new StateStoreSnapshotPartitionNotFound(snapshotPartitionId, operatorId, checkpointLocation) + } + + def stateStoreProviderDoesNotSupportFineGrainedReplay(inputClass: String): + StateStoreProviderDoesNotSupportFineGrainedReplay = { + new StateStoreProviderDoesNotSupportFineGrainedReplay(inputClass) + } } class StateStoreMultipleColumnFamiliesNotSupportedException(stateStoreProvider: String) @@ -156,6 +202,17 @@ class StateStoreUnsupportedOperationException(operationType: String, entity: Str messageParameters = Map("operationType" -> operationType, "entity" -> entity) ) +class StateStoreColumnFamilyMismatch( + columnFamilyName: String, + oldColumnFamilySchema: String, + newColumnFamilySchema: String) + extends SparkUnsupportedOperationException( + errorClass = "STATE_STORE_COLUMN_FAMILY_SCHEMA_INCOMPATIBLE", + messageParameters = Map( + "columnFamilyName" -> columnFamilyName, + "oldColumnFamilySchema" -> oldColumnFamilySchema, + "newColumnFamilySchema" -> newColumnFamilySchema)) + class StatefulProcessorCannotPerformOperationWithInvalidTimeMode( operationType: String, timeMode: String) @@ -214,3 +271,52 @@ class StatefulProcessorTTLMustBePositive( extends SparkUnsupportedOperationException( errorClass = "STATEFUL_PROCESSOR_TTL_DURATION_MUST_BE_POSITIVE", messageParameters = Map("operationType" -> operationType, "stateName" -> stateName)) + +class StateStoreKeySchemaNotCompatible( + storedKeySchema: String, + newKeySchema: String) + extends SparkUnsupportedOperationException( + errorClass = "STATE_STORE_KEY_SCHEMA_NOT_COMPATIBLE", + messageParameters = Map( + "storedKeySchema" -> storedKeySchema, + "newKeySchema" -> newKeySchema)) + +class StateStoreValueSchemaNotCompatible( + storedValueSchema: String, + newValueSchema: String) + extends SparkUnsupportedOperationException( + errorClass = "STATE_STORE_VALUE_SCHEMA_NOT_COMPATIBLE", + messageParameters = Map( + "storedValueSchema" -> storedValueSchema, + "newValueSchema" -> newValueSchema)) + +class StateStoreSnapshotFileNotFound(fileToRead: String, clazz: String) + extends SparkRuntimeException( + errorClass = "CANNOT_LOAD_STATE_STORE.CANNOT_READ_MISSING_SNAPSHOT_FILE", + messageParameters = Map( + "fileToRead" -> fileToRead, + "clazz" -> clazz)) + +class StateStoreSnapshotPartitionNotFound( + snapshotPartitionId: Long, operatorId: Int, checkpointLocation: String) + extends SparkRuntimeException( + errorClass = "CANNOT_LOAD_STATE_STORE.SNAPSHOT_PARTITION_ID_NOT_FOUND", + messageParameters = Map( + "snapshotPartitionId" -> snapshotPartitionId.toString, + "operatorId" -> operatorId.toString, + "checkpointLocation" -> checkpointLocation)) + +class StateStoreKeyRowFormatValidationFailure(errorMsg: String) + extends SparkRuntimeException( + errorClass = "STATE_STORE_KEY_ROW_FORMAT_VALIDATION_FAILURE", + messageParameters = Map("errorMsg" -> errorMsg)) + +class StateStoreValueRowFormatValidationFailure(errorMsg: String) + extends SparkRuntimeException( + errorClass = "STATE_STORE_VALUE_ROW_FORMAT_VALIDATION_FAILURE", + messageParameters = Map("errorMsg" -> errorMsg)) + +class StateStoreProviderDoesNotSupportFineGrainedReplay(inputClass: String) + extends SparkUnsupportedOperationException( + errorClass = "STATE_STORE_PROVIDER_DOES_NOT_SUPPORT_FINE_GRAINED_STATE_REPLAY", + messageParameters = Map("inputClass" -> inputClass)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StreamingSessionWindowStateManager.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StreamingSessionWindowStateManager.scala index 5130933f52efa..71df9dc65b419 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StreamingSessionWindowStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StreamingSessionWindowStateManager.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.streaming.state -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection @@ -261,7 +262,7 @@ class StreamingSessionWindowStateManagerImplV1( override def abortIfNeeded(store: StateStore): Unit = { if (!store.hasCommitted) { - logInfo(s"Aborted store ${store.id}") + logInfo(log"Aborted store ${MDC(STATE_STORE_ID, store.id)}") store.abort() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index 9802a4dce4e5c..4de3170f5db33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -24,7 +24,8 @@ import scala.annotation.tailrec import org.apache.hadoop.conf.Configuration import org.apache.spark.TaskContext -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{END_INDEX, START_INDEX, STATE_STORE_ID} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, JoinedRow, Literal, SafeProjection, SpecificInternalRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes @@ -86,7 +87,8 @@ class SymmetricHashJoinStateManager( partitionId: Int, stateFormatVersion: Int, skippedNullValueCount: Option[SQLMetric] = None, - useStateStoreCoordinator: Boolean = true) extends Logging { + useStateStoreCoordinator: Boolean = true, + snapshotStartVersion: Option[Long] = None) extends Logging { import SymmetricHashJoinStateManager._ /* @@ -366,9 +368,9 @@ class SymmetricHashJoinStateManager( // If nulls were found at the end, log a warning for the range of null indices. 
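// Illustrative sketch, not part of the patch: the dispatch shape behind the
// snapshotStartVersion option introduced above and used in the hunk below.
// getStoreForReplay is a hypothetical helper written only for illustration.
import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreErrors, StateStoreProvider, SupportsFineGrainedReplay}

def getStoreForReplay(
    provider: StateStoreProvider,
    snapshotVersion: Long,
    endVersion: Long): StateStore = {
  provider match {
    case replayable: SupportsFineGrainedReplay =>
      // Rebuild state from the snapshot at snapshotVersion, then apply delta files up to
      // endVersion.
      replayable.replayStateFromSnapshot(snapshotVersion, endVersion)
    case other =>
      throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay(
        other.getClass.toString)
  }
}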
if (nonNullIndex != numValues - 1) { - logWarning(s"`keyWithIndexToValue` returns a null value for indices " + - s"with range from startIndex=${nonNullIndex + 1} " + - s"and endIndex=${numValues - 1}.") + logWarning(log"`keyWithIndexToValue` returns a null value for indices " + + log"with range from startIndex=${MDC(START_INDEX, nonNullIndex + 1)} " + + log"and endIndex=${MDC(END_INDEX, numValues - 1)}.") } // Remove all null values from nonNullIndex + 1 onwards @@ -462,7 +464,7 @@ class SymmetricHashJoinStateManager( def abortIfNeeded(): Unit = { if (!stateStore.hasCommitted) { - logInfo(s"Aborted store ${stateStore.id}") + logInfo(log"Aborted store ${MDC(STATE_STORE_ID, stateStore.id)}") stateStore.abort() } // If this class manages a state store provider by itself, it should take care of closing @@ -479,6 +481,8 @@ class SymmetricHashJoinStateManager( val storeProviderId = StateStoreProviderId( stateInfo.get, partitionId, getStateStoreName(joinSide, stateStoreType)) val store = if (useStateStoreCoordinator) { + assert(snapshotStartVersion.isEmpty, "Should not use state store coordinator " + + "when reading state as data source.") StateStore.get( storeProviderId, keySchema, valueSchema, NoPrefixKeyStateEncoderSpec(keySchema), stateInfo.get.storeVersion, useColumnFamilies = false, storeConf, hadoopConf) @@ -488,9 +492,18 @@ class SymmetricHashJoinStateManager( storeProviderId, keySchema, valueSchema, NoPrefixKeyStateEncoderSpec(keySchema), useColumnFamilies = false, storeConf, hadoopConf, useMultipleValuesPerKey = false) - stateStoreProvider.getStore(stateInfo.get.storeVersion) + if (snapshotStartVersion.isDefined) { + if (!stateStoreProvider.isInstanceOf[SupportsFineGrainedReplay]) { + throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay( + stateStoreProvider.getClass.toString) + } + stateStoreProvider.asInstanceOf[SupportsFineGrainedReplay] + .replayStateFromSnapshot(snapshotStartVersion.get, stateInfo.get.storeVersion) + } else { + stateStoreProvider.getStore(stateInfo.get.storeVersion) + } } - logInfo(s"Loaded store ${store.id}") + logInfo(log"Loaded store ${MDC(STATE_STORE_ID, store.id)}") store } } @@ -766,6 +779,35 @@ object SymmetricHashJoinStateManager { } } + def getSchemaForStateStores( + joinSide: JoinSide, + inputValueAttributes: Seq[Attribute], + joinKeys: Seq[Expression], + stateFormatVersion: Int): Map[String, (StructType, StructType)] = { + var result: Map[String, (StructType, StructType)] = Map.empty + + // get the key and value schema for the KeyToNumValues state store + val keySchema = StructType( + joinKeys.zipWithIndex.map { case (k, i) => StructField(s"field$i", k.dataType, k.nullable) }) + val longValueSchema = new StructType().add("value", "long") + result += (getStateStoreName(joinSide, KeyToNumValuesType) -> (keySchema, longValueSchema)) + + // get the key and value schema for the KeyWithIndexToValue state store + val keyWithIndexSchema = keySchema.add("index", LongType) + val valueSchema = if (stateFormatVersion == 1) { + inputValueAttributes + } else if (stateFormatVersion == 2) { + inputValueAttributes :+ AttributeReference("matched", BooleanType)() + } else { + throw new IllegalArgumentException("Incorrect state format version! 
" + + s"version=$stateFormatVersion") + } + result += (getStateStoreName(joinSide, KeyWithIndexToValueType) -> + (keyWithIndexSchema, valueSchema.toStructType)) + + result + } + private sealed trait StateStoreType private case object KeyToNumValuesType extends StateStoreType { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index 3bf833816bcc4..94d976b568a5e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -23,6 +23,8 @@ import java.util.concurrent.TimeUnit._ import scala.collection.mutable import scala.jdk.CollectionConverters._ +import org.apache.hadoop.conf.Configuration + import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.AnalysisException @@ -70,6 +72,13 @@ trait StatefulOperator extends SparkPlan { throw new IllegalStateException("State location not present for execution") } } + + // Function used to record state schema for the first time and validate it against proposed + // schema changes in the future. Runs as part of a planning rule on the driver. + // Returns the schema file path for operators that write this to the metadata file, + // otherwise None + def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): Array[String] } /** @@ -359,7 +368,7 @@ object WatermarkSupport { if (optionalWatermarkExpression.isEmpty || optionalWatermarkMs.isEmpty) return None val watermarkAttribute = optionalWatermarkExpression.get - // If we are evicting based on a window, use the end of the window. Otherwise just + // If we are evicting based on a window, use the end of the window. Otherwise just // use the attribute itself. 
val evictionExpression = if (watermarkAttribute.dataType.isInstanceOf[StructType]) { @@ -424,6 +433,13 @@ case class StateStoreRestoreExec( private[sql] val stateManager = StreamingAggregationStateManager.createStateManager( keyExpressions, child.output, stateFormatVersion) + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + keyExpressions.toStructType, stateManager.getStateValueSchema, session.sessionState) + } + override protected def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") @@ -486,6 +502,13 @@ case class StateStoreSaveExec( private[sql] val stateManager = StreamingAggregationStateManager.createStateManager( keyExpressions, child.output, stateFormatVersion) + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + keyExpressions.toStructType, stateManager.getStateValueSchema, session.sessionState) + } + override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.nonEmpty, @@ -690,6 +713,13 @@ case class SessionWindowStateStoreRestoreExec( private val stateManager = StreamingSessionWindowStateManager.createStateManager( keyWithoutSessionExpressions, sessionExpression, child.output, stateFormatVersion) + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + stateManager.getStateKeySchema, stateManager.getStateValueSchema, session.sessionState) + } + override protected def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") @@ -772,6 +802,13 @@ case class SessionWindowStateStoreSaveExec( private val stateManager = StreamingSessionWindowStateManager.createStateManager( keyWithoutSessionExpressions, sessionExpression, child.output, stateFormatVersion) + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + stateManager.getStateKeySchema, stateManager.getStateValueSchema, session.sessionState) + } + override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.nonEmpty, @@ -1079,6 +1116,13 @@ case class StreamingDeduplicateExec( override protected def withNewChildInternal(newChild: SparkPlan): StreamingDeduplicateExec = copy(child = newChild) + + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + keyExpressions.toStructType, schemaForValueRow, session.sessionState, extraOptionOnStateStore) + } } object StreamingDeduplicateExec { @@ -1150,6 +1194,13 @@ case class StreamingDeduplicateWithinWatermarkExec( override def shortName: String = "dedupeWithinWatermark" + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + 
StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + keyExpressions.toStructType, schemaForValueRow, session.sessionState, extraOptionOnStateStore) + } + override protected def withNewChildInternal( newChild: SparkPlan): StreamingDeduplicateWithinWatermarkExec = copy(child = newChild) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala index e0e3ee582bef0..7b3d393ec75d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS +import org.apache.hadoop.conf.Configuration + import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, SortOrder, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.execution.{LimitExec, SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateStoreOps} +import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateSchemaCompatibilityChecker, StateStoreOps} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.{CompletionIterator, NextIterator} @@ -45,6 +47,13 @@ case class StreamingGlobalLimitExec( private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) + override def validateAndMaybeEvolveStateSchema( + hadoopConf: Configuration, batchId: Long, stateSchemaVersion: Int): + Array[String] = { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(getStateInfo, hadoopConf, + keySchema, valueSchema, session.sessionState) + } + override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 8a2a5282b69d9..bf33ba2c96f19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -25,7 +25,8 @@ import scala.jdk.CollectionConverters._ import scala.util.control.NonFatal import org.apache.spark.{JobExecutionStatus, SparkConf} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.CLASS_NAME import org.apache.spark.internal.config.Status._ import org.apache.spark.scheduler._ import org.apache.spark.sql.connector.metric.CustomMetric @@ -222,9 +223,9 @@ class SQLAppStatusListener( method } catch { case NonFatal(e) => - logWarning(s"Unable to load custom metric object for class `$className`. " + - "Please make sure that the custom metric class is in the classpath and " + - "it has 0-arg constructor.", e) + logWarning(log"Unable to load custom metric object for class " + + log"`${MDC(CLASS_NAME, className)}`. 
Please make sure that the custom metric " + + log"class is in the classpath and it has 0-arg constructor.", e) // Cannot initialize custom metric object, we might be in history server that does // not have the custom metric class. val defaultMethod = (_: Array[Long], _: Array[Long]) => "N/A" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala index c48c8bbe4697b..88550fac7303f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala @@ -49,6 +49,7 @@ import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression * @tparam OUT The type of the final output result. * @since 1.6.0 */ +@SerialVersionUID(2093413866369130093L) abstract class Aggregator[-IN, BUF, OUT] extends Serializable { /** @@ -89,8 +90,7 @@ abstract class Aggregator[-IN, BUF, OUT] extends Serializable { def outputEncoder: Encoder[OUT] /** - * Returns this `Aggregator` as a `TypedColumn` that can be used in `Dataset`. - * operations. + * Returns this `Aggregator` as a `TypedColumn` that can be used in `Dataset` operations. * @since 1.6.0 */ def toColumn: TypedColumn[IN, OUT] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index e21375713b8a1..882918eb78c7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -80,6 +80,7 @@ import org.apache.spark.util.Utils * @groupname struct_funcs Struct functions * @groupname csv_funcs CSV functions * @groupname json_funcs JSON functions + * @groupname variant_funcs VARIANT functions * @groupname xml_funcs XML functions * @groupname url_funcs URL functions * @groupname partition_transforms Partition transform functions @@ -1936,6 +1937,15 @@ object functions { */ def try_divide(left: Column, right: Column): Column = Column.fn("try_divide", left, right) + /** + * Returns the remainder of `dividend``/``divisor`. Its result is + * always null if `divisor` is 0. + * + * @group math_funcs + * @since 4.0.0 + */ + def try_remainder(left: Column, right: Column): Column = Column.fn("try_remainder", left, right) + /** * Returns `left``*``right` and the result is null on overflow. The acceptable input types are * the same with the `*` operator. @@ -4144,9 +4154,11 @@ object functions { /** * Splits str around matches of the given pattern. * - * @param str a string expression to split - * @param pattern a string representing a regular expression. The regex string should be - * a Java regular expression. + * @param str + * a string expression to split + * @param pattern + * a string representing a regular expression. The regex string should be a Java regular + * expression. * * @group string_funcs * @since 1.5.0 @@ -4156,17 +4168,31 @@ object functions { /** * Splits str around matches of the given pattern. * - * @param str a string expression to split - * @param pattern a string representing a regular expression. The regex string should be - * a Java regular expression. - * @param limit an integer expression which controls the number of times the regex is applied. - *

- *        <ul>
- *          <li>limit greater than 0: The resulting array's length will not be more than limit,
- *          and the resulting array's last entry will contain all input beyond the last
- *          matched regex.</li>
- *          <li>limit less than or equal to 0: `regex` will be applied as many times as
- *          possible, and the resulting array can be of any size.</li>
- *        </ul>
      + * @param str + * a string expression to split + * @param pattern + * a column of string representing a regular expression. The regex string should be a Java + * regular expression. + * + * @group string_funcs + * @since 4.0.0 + */ + def split(str: Column, pattern: Column): Column = Column.fn("split", str, pattern) + + /** + * Splits str around matches of the given pattern. + * + * @param str + * a string expression to split + * @param pattern + * a string representing a regular expression. The regex string should be a Java regular + * expression. + * @param limit + * an integer expression which controls the number of times the regex is applied.
+ *   <ul> <li>limit greater than 0: The resulting array's length will not be more than limit, and
+ *   the resulting array's last entry will contain all input beyond the last matched regex.</li>
+ *   <li>limit less than or equal to 0: `regex` will be applied as many times as possible, and
+ *   the resulting array can be of any size.</li> </ul>
      * * @group string_funcs * @since 3.0.0 @@ -4174,6 +4200,27 @@ object functions { def split(str: Column, pattern: String, limit: Int): Column = Column.fn("split", str, lit(pattern), lit(limit)) + /** + * Splits str around matches of the given pattern. + * + * @param str + * a string expression to split + * @param pattern + * a column of string representing a regular expression. The regex string should be a Java + * regular expression. + * @param limit + * a column of integer expression which controls the number of times the regex is applied. + *
+ *   <ul> <li>limit greater than 0: The resulting array's length will not be more than limit,
+ *   and the resulting array's last entry will contain all input beyond the last matched
+ *   regex.</li> <li>limit less than or equal to 0: `regex` will be applied as many times as
+ *   possible, and the resulting array can be of any size.</li> </ul>
      + * + * @group string_funcs + * @since 4.0.0 + */ + def split(str: Column, pattern: Column, limit: Column): Column = + Column.fn("split", str, pattern, limit) + /** * Substring starts at `pos` and is of length `len` when str is String type or * returns the slice of byte array that starts at `pos` in byte and is of length `len` @@ -4187,6 +4234,19 @@ object functions { def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) + /** + * Substring starts at `pos` and is of length `len` when str is String type or + * returns the slice of byte array that starts at `pos` in byte and is of length `len` + * when str is Binary type + * + * @note The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Column, len: Column): Column = + Column.fn("substring", str, pos, len) + /** * Returns the substring from string str before count occurrences of the delimiter delim. * If count is positive, everything the left of the final delimiter (counting from left) is @@ -5694,6 +5754,27 @@ object functions { */ def timestamp_micros(e: Column): Column = Column.fn("timestamp_micros", e) + /** + * Gets the difference between the timestamps in the specified units by truncating + * the fraction part. + * + * @group datetime_funcs + * @since 4.0.0 + */ + def timestamp_diff(unit: String, start: Column, end: Column): Column = withExpr { + TimestampDiff(unit, start.expr, end.expr) + } + + /** + * Adds the specified number of units to the given timestamp. + * + * @group datetime_funcs + * @since 4.0.0 + */ + def timestamp_add(unit: String, quantity: Column, ts: Column): Column = withExpr { + TimestampAdd(unit, quantity.expr, ts.expr) + } + /** * Parses the `timestamp` expression with the `format` expression * to a timestamp without time zone. Returns null with invalid input. @@ -6595,15 +6676,91 @@ object functions { } /** - * Parses a JSON string and constructs a Variant value. + * Parses a JSON string and constructs a Variant value. Returns null if the input string is not + * a valid JSON value. * * @param json a string column that contains JSON data. * - * @group json_funcs + * @group variant_funcs + * @since 4.0.0 + */ + def try_parse_json(json: Column): Column = Column.fn("try_parse_json", json) + + /** + * Parses a JSON string and constructs a Variant value. + * + * @param json + * a string column that contains JSON data. + * @group variant_funcs * @since 4.0.0 */ def parse_json(json: Column): Column = Column.fn("parse_json", json) + /** + * Check if a variant value is a variant null. Returns true if and only if the input is a + * variant null and false otherwise (including in the case of SQL NULL). + * + * @param v + * a variant column. + * @group variant_funcs + * @since 4.0.0 + */ + def is_variant_null(v: Column): Column = Column.fn("is_variant_null", v) + + /** + * Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to + * `targetType`. Returns null if the path does not exist. Throws an exception if the cast fails. + * + * @param v + * a variant column. + * @param path + * the extraction path. A valid path should start with `$` and is followed by zero or more + * segments like `[123]`, `.name`, `['name']`, or `["name"]`. + * @param targetType + * the target data type to cast into, in a DDL-formatted string. 
+ * @group variant_funcs + * @since 4.0.0 + */ + def variant_get(v: Column, path: String, targetType: String): Column = + Column.fn("variant_get", v, lit(path), lit(targetType)) + + /** + * Extracts a sub-variant from `v` according to `path`, and then cast the sub-variant to + * `targetType`. Returns null if the path does not exist or the cast fails.. + * + * @param v + * a variant column. + * @param path + * the extraction path. A valid path should start with `$` and is followed by zero or more + * segments like `[123]`, `.name`, `['name']`, or `["name"]`. + * @param targetType + * the target data type to cast into, in a DDL-formatted string. + * @group variant_funcs + * @since 4.0.0 + */ + def try_variant_get(v: Column, path: String, targetType: String): Column = + Column.fn("try_variant_get", v, lit(path), lit(targetType)) + + /** + * Returns schema in the SQL format of a variant. + * + * @param v + * a variant column. + * @group variant_funcs + * @since 4.0.0 + */ + def schema_of_variant(v: Column): Column = Column.fn("schema_of_variant", v) + + /** + * Returns the merged schema in the SQL format of a variant column. + * + * @param v + * a variant column. + * @group variant_funcs + * @since 4.0.0 + */ + def schema_of_variant_agg(v: Column): Column = Column.fn("schema_of_variant_agg", v) + /** * Parses a JSON string and infers its schema in DDL format. * @@ -6817,9 +6974,9 @@ object functions { /** * Returns length of array or map. * - * The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or - * spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. - * With the default settings, the function returns -1 for null input. + * This function returns -1 for null input only if spark.sql.ansi.enabled is false and + * spark.sql.legacy.sizeOfNull is true. Otherwise, it returns null for null input. + * With the default settings, the function returns null for null input. * * @group collection_funcs * @since 1.5.0 @@ -6829,9 +6986,9 @@ object functions { /** * Returns length of array or map. This is an alias of `size` function. * - * The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or - * spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. - * With the default settings, the function returns -1 for null input. + * This function returns -1 for null input only if spark.sql.ansi.enabled is false and + * spark.sql.legacy.sizeOfNull is true. Otherwise, it returns null for null input. + * With the default settings, the function returns null for null input. 
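[Editor's usage sketch] The functions.scala hunk above adds several 4.0.0 Column helpers: try_remainder, Column-based split/substring overloads, timestamp_diff/timestamp_add, and the VARIANT family (parse_json, try_parse_json, variant_get, try_variant_get, schema_of_variant, schema_of_variant_agg). A minimal usage sketch, assuming a running SparkSession named `spark`; the literals and column names are illustrative only:

    import org.apache.spark.sql.functions._

    // Illustrative data: a JSON string, a comma-separated string, and two integers.
    val df = spark.sql(
      """SELECT '{"a": 1, "b": "x"}' AS js, 'a,b,c' AS csv, 7 AS n, 0 AS d""")

    df.select(
      try_remainder(col("n"), col("d")),                 // null instead of a divide-by-zero error
      split(col("csv"), lit(","), lit(2)),               // pattern and limit may now be Columns
      substring(col("csv"), lit(1), lit(3)),             // pos/len may now be Columns too
      variant_get(parse_json(col("js")), "$.a", "int"),  // extract $.a from a VARIANT and cast to int
      schema_of_variant(parse_json(col("js")))           // SQL-formatted schema of the variant value
    ).show(truncate = false)

Per the scaladoc above, try_parse_json and try_variant_get behave like their non-try counterparts but return null instead of failing on malformed input or a failed cast.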
* * @group collection_funcs * @since 3.5.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 63c0d116ba3a4..4660970814e21 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -224,6 +224,7 @@ abstract class BaseSessionStateBuilder( TableCapabilityCheck +: CommandCheck +: CollationCheck +: + ViewSyncSchemaToMetaStore +: customCheckRules } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index df7c4ab1a0c7d..3e20a23a0a066 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -734,9 +734,8 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { // same way as how a permanent view is handled. This also avoids a potential issue where a // dependent view becomes invalid because of the above while its data is still cached. val viewText = viewDef.desc.viewText - val plan = sparkSession.sessionState.executePlan(viewDef) - sparkSession.sharedState.cacheManager.uncacheQuery( - sparkSession, plan.analyzed, cascade = viewText.isDefined) + val df = Dataset.ofRows(sparkSession, viewDef) + sparkSession.sharedState.cacheManager.uncacheQuery(df, cascade = viewText.isDefined) } catch { case NonFatal(_) => // ignore } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index 164710cdd8839..2b1451493398f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -29,7 +29,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FsUrlStreamHandlerFactory, Path} import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CONFIG, CONFIG2, PATH, VALUE} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.CacheManager @@ -168,11 +169,12 @@ private[sql] class SharedState( wrapped } + val globalTempDB = conf.get(GLOBAL_TEMP_DATABASE) + /** * A manager for global temporary views. 
*/ lazy val globalTempViewManager: GlobalTempViewManager = { - val globalTempDB = conf.get(GLOBAL_TEMP_DATABASE) if (externalCatalog.databaseExists(globalTempDB)) { throw QueryExecutionErrors.databaseNameConflictWithSystemPreservedDatabaseError(globalTempDB) } @@ -258,8 +260,9 @@ object SharedState extends Logging { val sparkWarehouseOption = initialConfigs.get(WAREHOUSE_PATH.key).orElse(sparkConf.getOption(WAREHOUSE_PATH.key)) if (initialConfigs.contains(HIVE_WAREHOUSE_CONF_NAME)) { - logWarning(s"Not allowing to set $HIVE_WAREHOUSE_CONF_NAME in SparkSession's " + - s"options, please use ${WAREHOUSE_PATH.key} to set statically for cross-session usages") + logWarning(log"Not allowing to set ${MDC(CONFIG, HIVE_WAREHOUSE_CONF_NAME)} in " + + log"SparkSession's options, please use ${MDC(CONFIG2, WAREHOUSE_PATH.key)} to " + + log"set statically for cross-session usages") } // hive.metastore.warehouse.dir only stay in hadoopConf sparkConf.remove(HIVE_WAREHOUSE_CONF_NAME) @@ -268,8 +271,10 @@ object SharedState extends Logging { if (hiveWarehouseDir != null && sparkWarehouseOption.isEmpty) { // If hive.metastore.warehouse.dir is set and spark.sql.warehouse.dir is not set, // we will respect the value of hive.metastore.warehouse.dir. - logInfo(s"${WAREHOUSE_PATH.key} is not set, but $HIVE_WAREHOUSE_CONF_NAME is set. " + - s"Setting ${WAREHOUSE_PATH.key} to the value of $HIVE_WAREHOUSE_CONF_NAME.") + logInfo(log"${MDC(CONFIG, WAREHOUSE_PATH.key)} is not set, but " + + log"${MDC(CONFIG2, HIVE_WAREHOUSE_CONF_NAME)} is set. " + + log"Setting ${MDC(CONFIG, WAREHOUSE_PATH.key)} to " + + log"the value of ${MDC(CONFIG2, HIVE_WAREHOUSE_CONF_NAME)}.") hiveWarehouseDir } else { // If spark.sql.warehouse.dir is set, we will override hive.metastore.warehouse.dir using @@ -277,8 +282,9 @@ object SharedState extends Logging { // When neither spark.sql.warehouse.dir nor hive.metastore.warehouse.dir is set // we will set hive.metastore.warehouse.dir to the default value of spark.sql.warehouse.dir. val sparkWarehouseDir = sparkWarehouseOption.getOrElse(WAREHOUSE_PATH.defaultValueString) - logInfo(s"Setting $HIVE_WAREHOUSE_CONF_NAME ('$hiveWarehouseDir') to the value of " + - s"${WAREHOUSE_PATH.key}.") + logInfo(log"Setting ${MDC(CONFIG, HIVE_WAREHOUSE_CONF_NAME)} " + + log"('${MDC(VALUE, hiveWarehouseDir)}') to the value of " + + log"${MDC(CONFIG2, WAREHOUSE_PATH.key)}.") sparkWarehouseDir } } @@ -286,7 +292,7 @@ object SharedState extends Logging { def qualifyWarehousePath(hadoopConf: Configuration, warehousePath: String): String = { val tempPath = new Path(warehousePath) val qualified = tempPath.getFileSystem(hadoopConf).makeQualified(tempPath).toString - logInfo(s"Warehouse path is '$qualified'.") + logInfo(log"Warehouse path is '${MDC(PATH, qualified)}'.") qualified } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala index 8f537aacebe5f..5e79dbbb4d72e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala @@ -26,7 +26,8 @@ import org.apache.spark.sql.types.{DataType, MetadataBuilder} * * @param dialects List of dialects. 
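[Editor's note] The SharedState, SQLAppStatusListener, and StreamingQueryManager hunks migrate plain string interpolation to the structured logging API, where each interpolated value is wrapped in an MDC tied to a stable LogKeys entry. A minimal sketch of the same pattern, assuming code compiled inside Spark with access to the internal Logging trait; the class and message below are illustrative only:

    import org.apache.spark.internal.{Logging, MDC}
    import org.apache.spark.internal.LogKeys.{CONFIG, PATH}

    class WarehouseSetup extends Logging {
      def report(confKey: String, resolvedPath: String): Unit = {
        // MDC(<log key>, <value>) tags the value so JSON log layouts can emit it
        // as a structured field instead of burying it in the message text.
        logInfo(log"Resolved ${MDC(CONFIG, confKey)} to warehouse path " +
          log"'${MDC(PATH, resolvedPath)}'.")
      }
    }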
*/ -private class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect { +private class AggregatedDialect(dialects: List[JdbcDialect]) + extends JdbcDialect with NoLegacyJDBCError { require(dialects.nonEmpty) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala index 31a7c783ba60e..8ccf94166a70e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala @@ -24,13 +24,14 @@ import scala.util.control.NonFatal import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.NonEmptyNamespaceException import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.connector.expressions.Expression import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.types._ -private case class DB2Dialect() extends JdbcDialect { +private case class DB2Dialect() extends JdbcDialect with SQLConfHelper with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:db2") @@ -86,6 +87,8 @@ private case class DB2Dialect() extends JdbcDialect { typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = sqlType match { + case Types.SMALLINT if !conf.legacyDB2numericMappingEnabled => + Option(ShortType) case Types.REAL => Option(FloatType) case Types.OTHER => typeName match { @@ -99,7 +102,9 @@ private case class DB2Dialect() extends JdbcDialect { override def getJDBCType(dt: DataType): Option[JdbcType] = dt match { case StringType => Option(JdbcType("CLOB", java.sql.Types.CLOB)) - case BooleanType => Option(JdbcType("CHAR(1)", java.sql.Types.CHAR)) + case BooleanType if conf.legacyDB2BooleanMappingEnabled => + Option(JdbcType("CHAR(1)", java.sql.Types.CHAR)) + case BooleanType => Option(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN)) case ShortType | ByteType => Some(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DatabricksDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DatabricksDialect.scala index 54b8c2622827e..af77f8575dd86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DatabricksDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DatabricksDialect.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.execution.datasources.v2.TableSampleInfo import org.apache.spark.sql.types._ -private case class DatabricksDialect() extends JdbcDialect { +private case class DatabricksDialect() extends JdbcDialect with NoLegacyJDBCError { override def canHandle(url: String): Boolean = { url.startsWith("jdbc:databricks") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala index 36af0e6aeaf14..7b65a01b5e702 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors import org.apache.spark.sql.types._ -private case class DerbyDialect() extends JdbcDialect { +private case class DerbyDialect() 
extends JdbcDialect with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:derby") @@ -48,9 +48,15 @@ private case class DerbyDialect() extends JdbcDialect { case ByteType => Option(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) case ShortType => Option(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) case BooleanType => Option(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN)) - // 31 is the maximum precision and 5 is the default scale for a Derby DECIMAL - case t: DecimalType if t.precision > 31 => - Option(JdbcType("DECIMAL(31,5)", java.sql.Types.DECIMAL)) + // 31 is the maximum precision + // https://db.apache.org/derby/docs/10.13/ref/rrefsqlj15260.html + case t: DecimalType => + val (p, s) = if (t.precision > 31) { + (31, math.max(t.scale - (t.precision - 31), 0)) + } else { + (t.precision, t.scale) + } + Option(JdbcType(s"DECIMAL($p,$s)", java.sql.Types.DECIMAL)) case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala index ebfc6093dc167..3ece44ece9e6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.expressions.{Expression, FieldReference, N import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types.{BooleanType, ByteType, DataType, DecimalType, MetadataBuilder, ShortType, StringType, TimestampType} -private[sql] case class H2Dialect() extends JdbcDialect { +private[sql] case class H2Dialect() extends JdbcDialect with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:h2") @@ -259,13 +259,6 @@ private[sql] case class H2Dialect() extends JdbcDialect { } class H2SQLBuilder extends JDBCSQLBuilder { - override def escapeSpecialCharsForLikePattern(str: String): String = { - str.map { - case '_' => "\\_" - case '%' => "\\%" - case c => c.toString - }.mkString - } override def visitAggregateFunction( funcName: String, isDistinct: Boolean, inputs: Array[String]): String = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 5f69d18cad756..290665020f883 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -391,10 +391,10 @@ abstract class JdbcDialect extends Serializable with Logging { quoteIdentifier(namedRef.fieldNames.head) } - override def visitCast(l: String, dataType: DataType): String = { + override def visitCast(expr: String, exprDataType: DataType, dataType: DataType): String = { val databaseTypeDefinition = getJDBCType(dataType).map(_.databaseTypeDefinition).getOrElse(dataType.typeName) - s"CAST($l AS $databaseTypeDefinition)" + s"CAST($expr AS $databaseTypeDefinition)" } override def visitSQLFunction(funcName: String, inputs: Array[String]): String = { @@ -841,6 +841,23 @@ abstract class JdbcDialect extends Serializable with Logging { metadata: MetadataBuilder): Unit = {} } +/** + * Make the `classifyException` method throw out the original exception + */ +trait NoLegacyJDBCError extends JdbcDialect { + + override def classifyException( + e: Throwable, + errorClass: String, + messageParameters: Map[String, String], + description: String): 
AnalysisException = { + new AnalysisException( + errorClass = errorClass, + messageParameters = messageParameters, + cause = Some(e)) + } +} + /** * :: DeveloperApi :: * Registry of dialects that apply to every new jdbc `org.apache.spark.sql.DataFrame`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala index 862e99adc3b0d..d03602b0338c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.jdbc.MsSqlServerDialect.{GEOGRAPHY, GEOMETRY} import org.apache.spark.sql.types._ -private case class MsSqlServerDialect() extends JdbcDialect { +private case class MsSqlServerDialect() extends JdbcDialect with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:sqlserver") @@ -86,9 +86,13 @@ private case class MsSqlServerDialect() extends JdbcDialect { // We shouldn't propagate these queries to MsSqlServer expr match { case e: Predicate => e.name() match { - case "=" | "<>" | "<=>" | "<" | "<=" | ">" | ">=" - if e.children().exists(_.isInstanceOf[Predicate]) => - super.visitUnexpectedExpr(expr) + case "=" | "<>" | "<=>" | "<" | "<=" | ">" | ">=" => + val Array(l, r) = e.children().map { + case p: Predicate => s"CASE WHEN ${inputToSQL(p)} THEN 1 ELSE 0 END" + case o => inputToSQL(o) + } + visitBinaryComparison(e.name(), l, r) + case "CASE_WHEN" => visitCaseWhen(expressionsToStringArray(e.children())) + " = 1" case _ => super.build(expr) } case _ => super.build(expr) @@ -109,22 +113,22 @@ private case class MsSqlServerDialect() extends JdbcDialect { override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { - if (typeName.contains("datetimeoffset")) { - // String is recommend by Microsoft SQL Server for datetimeoffset types in non-MS clients - Option(StringType) - } else { - if (SQLConf.get.legacyMsSqlServerNumericMappingEnabled) { - None - } else { - sqlType match { - // Data range of TINYINT is 0-255 so it needs to be stored in ShortType. - // Reference doc: https://learn.microsoft.com/en-us/sql/t-sql/data-types - case java.sql.Types.SMALLINT | java.sql.Types.TINYINT => Some(ShortType) - case java.sql.Types.REAL => Some(FloatType) - case GEOMETRY | GEOGRAPHY => Some(BinaryType) - case _ => None + sqlType match { + case _ if typeName.contains("datetimeoffset") => + if (SQLConf.get.legacyMsSqlServerDatetimeOffsetMappingEnabled) { + Some(StringType) + } else { + Some(TimestampType) } - } + case java.sql.Types.SMALLINT | java.sql.Types.TINYINT + if !SQLConf.get.legacyMsSqlServerNumericMappingEnabled => + // Data range of TINYINT is 0-255 so it needs to be stored in ShortType. 
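[Editor's usage sketch] NoLegacyJDBCError, defined in the JdbcDialects.scala hunk above, overrides classifyException so the original driver exception is kept as the cause of the AnalysisException instead of going through the legacy error path; every built-in dialect in this diff mixes it in. A sketch of how a hypothetical external dialect could opt in (the dialect name and JDBC prefix are made up):

    import java.util.Locale

    import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects, NoLegacyJDBCError}

    // Hypothetical dialect; only canHandle must be implemented, error
    // classification comes from the NoLegacyJDBCError mix-in.
    case class AcmeDbDialect() extends JdbcDialect with NoLegacyJDBCError {
      override def canHandle(url: String): Boolean =
        url.toLowerCase(Locale.ROOT).startsWith("jdbc:acmedb")
    }

    // Registered the same way as any other dialect:
    // JdbcDialects.registerDialect(AcmeDbDialect())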
+ // Reference doc: https://learn.microsoft.com/en-us/sql/t-sql/data-types + Some(ShortType) + case java.sql.Types.REAL if !SQLConf.get.legacyMsSqlServerNumericMappingEnabled => + Some(FloatType) + case GEOMETRY | GEOGRAPHY => Some(BinaryType) + case _ => None } } @@ -136,6 +140,7 @@ private case class MsSqlServerDialect() extends JdbcDialect { case BinaryType => Some(JdbcType("VARBINARY(MAX)", java.sql.Types.VARBINARY)) case ShortType if !SQLConf.get.legacyMsSqlServerNumericMappingEnabled => Some(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) + case ByteType => Some(JdbcType("SMALLINT", java.sql.Types.TINYINT)) case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala index d98fcdfd0b23f..0f1bccbb01d51 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types._ -private case class MySQLDialect() extends JdbcDialect with SQLConfHelper { +private case class MySQLDialect() extends JdbcDialect with SQLConfHelper with NoLegacyJDBCError { override def canHandle(url : String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:mysql") @@ -66,6 +66,21 @@ private case class MySQLDialect() extends JdbcDialect with SQLConfHelper { } } + override def visitStartsWith(l: String, r: String): String = { + val value = r.substring(1, r.length() - 1) + s"$l LIKE '${escapeSpecialCharsForLikePattern(value)}%' ESCAPE '\\\\'" + } + + override def visitEndsWith(l: String, r: String): String = { + val value = r.substring(1, r.length() - 1) + s"$l LIKE '%${escapeSpecialCharsForLikePattern(value)}' ESCAPE '\\\\'" + } + + override def visitContains(l: String, r: String): String = { + val value = r.substring(1, r.length() - 1) + s"$l LIKE '%${escapeSpecialCharsForLikePattern(value)}%' ESCAPE '\\\\'" + } + override def visitAggregateFunction( funcName: String, isDistinct: Boolean, inputs: Array[String]): String = if (isDistinct && distinctUnsupportedAggregateFunctions.contains(funcName)) { @@ -142,7 +157,7 @@ private case class MySQLDialect() extends JdbcDialect with SQLConfHelper { // https://github.com/mysql/mysql-connector-j/blob/8.3.0/src/main/core-api/java/com/mysql/cj/MysqlType.java#L251 // scalastyle:on line.size.limit Some(getTimestampType(md.build())) - case Types.TIMESTAMP => Some(TimestampType) + case Types.TIMESTAMP if !conf.legacyMySqlTimestampNTZMappingEnabled => Some(TimestampType) case _ => None } } @@ -228,7 +243,8 @@ private case class MySQLDialect() extends JdbcDialect with SQLConfHelper { // In MYSQL, DATETIME is TIMESTAMP WITHOUT TIME ZONE // https://github.com/mysql/mysql-connector-j/blob/8.3.0/src/main/core-api/java/com/mysql/cj/MysqlType.java#L251 // scalastyle:on line.size.limit - case TimestampNTZType => Option(JdbcType("DATETIME", java.sql.Types.TIMESTAMP)) + case TimestampNTZType if !conf.legacyMySqlTimestampNTZMappingEnabled => + Option(JdbcType("DATETIME", java.sql.Types.TIMESTAMP)) case _ => JdbcUtils.getCommonJDBCType(dt) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index 26c816294b52d..627007e275599 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.jdbc.OracleDialect._ import org.apache.spark.sql.types._ -private case class OracleDialect() extends JdbcDialect with SQLConfHelper { +private case class OracleDialect() extends JdbcDialect with SQLConfHelper with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:oracle") @@ -121,6 +121,7 @@ private case class OracleDialect() extends JdbcDialect with SQLConfHelper { case ByteType => Some(JdbcType("NUMBER(3)", java.sql.Types.SMALLINT)) case ShortType => Some(JdbcType("NUMBER(5)", java.sql.Types.SMALLINT)) case StringType => Some(JdbcType("VARCHAR2(255)", java.sql.Types.VARCHAR)) + case VarcharType(n) => Some(JdbcType(s"VARCHAR2($n)", java.sql.Types.VARCHAR)) case TimestampType if !conf.legacyOracleTimestampMappingEnabled => Some(JdbcType("TIMESTAMP WITH LOCAL TIME ZONE", TIMESTAMP_LTZ)) case _ => None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index c2c430a7b39d7..03fefd82802ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -24,6 +24,8 @@ import java.util.Locale import scala.util.Using +import org.apache.spark.internal.LogKeys.COLUMN_NAME +import org.apache.spark.internal.MDC import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NonEmptyNamespaceException, NoSuchIndexException} @@ -35,7 +37,8 @@ import org.apache.spark.sql.execution.datasources.v2.TableSampleInfo import org.apache.spark.sql.types._ -private case class PostgresDialect() extends JdbcDialect with SQLConfHelper { +private case class PostgresDialect() + extends JdbcDialect with SQLConfHelper with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:postgresql") @@ -59,8 +62,8 @@ private case class PostgresDialect() extends JdbcDialect with SQLConfHelper { // money type seems to be broken but one workaround is to handle it as string. // See SPARK-34333 and https://github.com/pgjdbc/pgjdbc/issues/100 Some(StringType) - case Types.TIMESTAMP - if "timestamptz".equalsIgnoreCase(typeName) => + case Types.TIMESTAMP if "timestamptz".equalsIgnoreCase(typeName) && + !conf.legacyPostgresDatetimeMappingEnabled => // timestamptz represents timestamp with time zone, currently it maps to Types.TIMESTAMP. // We need to change to Types.TIMESTAMP_WITH_TIMEZONE if the upstream changes. 
Some(TimestampType) @@ -147,6 +150,8 @@ private case class PostgresDialect() extends JdbcDialect with SQLConfHelper { case FloatType => Some(JdbcType("FLOAT4", Types.FLOAT)) case DoubleType => Some(JdbcType("FLOAT8", Types.DOUBLE)) case ShortType | ByteType => Some(JdbcType("SMALLINT", Types.SMALLINT)) + case TimestampType if !conf.legacyPostgresDatetimeMappingEnabled => + Some(JdbcType("TIMESTAMP WITH TIME ZONE", Types.TIMESTAMP)) case t: DecimalType => Some( JdbcType(s"NUMERIC(${t.precision},${t.scale})", java.sql.Types.NUMERIC)) case ArrayType(et, _) if et.isInstanceOf[AtomicType] || et.isInstanceOf[ArrayType] => @@ -368,7 +373,8 @@ private case class PostgresDialect() extends JdbcDialect with SQLConfHelper { } } catch { case e: SQLException => - logWarning(s"Failed to get array dimension for column $columnName", e) + logWarning( + log"Failed to get array dimension for column ${MDC(COLUMN_NAME, columnName)}", e) } case _ => } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/SnowflakeDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/SnowflakeDialect.scala index 276364d5d89ed..a443a798db7c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/SnowflakeDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/SnowflakeDialect.scala @@ -22,7 +22,7 @@ import java.util.Locale import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils import org.apache.spark.sql.types.{BooleanType, DataType} -private case class SnowflakeDialect() extends JdbcDialect { +private case class SnowflakeDialect() extends JdbcDialect with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:snowflake") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala index 7acd22a3f10be..322b259485f56 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.types._ -private case class TeradataDialect() extends JdbcDialect { +private case class TeradataDialect() extends JdbcDialect with NoLegacyJDBCError { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:teradata") @@ -42,6 +42,7 @@ private case class TeradataDialect() extends JdbcDialect { override def getJDBCType(dt: DataType): Option[JdbcType] = dt match { case StringType => Some(JdbcType("VARCHAR(255)", java.sql.Types.VARCHAR)) case BooleanType => Option(JdbcType("CHAR(1)", java.sql.Types.CHAR)) + case ByteType => Option(JdbcType("BYTEINT", java.sql.Types.TINYINT)) case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala index 1444eea09b27e..96b5e2193f270 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala @@ -20,7 +20,7 @@ package org.apache.spark import java.util.regex.Pattern import org.apache.spark.annotation.{DeveloperApi, Unstable} -import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin, PySparkCurrentOrigin} import org.apache.spark.sql.execution.SparkStrategy import org.apache.spark.sql.internal.SQLConf @@ -78,31 +78,6 @@ package object sql { */ 
private[sql] val SPARK_LEGACY_INT96_METADATA_KEY = "org.apache.spark.legacyINT96" - /** - * Captures the current Java stack trace up to a specified depth defined by the - * `spark.sql.stackTracesInDataFrameContext` configuration. This method helps in identifying - * the call sites in Spark code by filtering out the stack frames until it reaches the - * user code calling into Spark. This method is intended to be used for enhancing debuggability - * by providing detailed context about where in the Spark source code a particular operation - * was called from. - * - * This functionality is crucial for both debugging purposes and for providing more insightful - * logging and error messages. By capturing the stack trace up to a certain depth, it enables - * a more precise pinpointing of the execution flow, especially useful when troubleshooting - * complex interactions within Spark. - * - * @return An array of `StackTraceElement` representing the filtered stack trace. - */ - private def captureStackTrace(): Array[StackTraceElement] = { - val st = Thread.currentThread().getStackTrace - var i = 0 - // Find the beginning of Spark code traces - while (i < st.length && !sparkCode(st(i))) i += 1 - // Stop at the end of the first Spark code traces - while (i < st.length && sparkCode(st(i))) i += 1 - st.slice(from = i - 1, until = i + SQLConf.get.stackTracesInDataFrameContext) - } - /** * This helper function captures the Spark API and its call site in the user code from the current * stacktrace. @@ -123,45 +98,16 @@ package object sql { if (CurrentOrigin.get.stackTrace.isDefined) { f } else { - val origin = Origin(stackTrace = Some(captureStackTrace())) - CurrentOrigin.withOrigin(origin)(f) - } - } - - /** - * This overloaded helper function captures the call site information specifically for PySpark, - * using provided PySpark logging information instead of capturing the current Java stack trace. - * - * This method is designed to enhance the debuggability of PySpark by including PySpark-specific - * logging information (e.g., method names and call sites within PySpark scripts) in debug logs, - * without the overhead of capturing and processing Java stack traces that are less relevant - * to PySpark developers. - * - * The `pysparkErrorContext` parameter allows for passing PySpark call site information, which - * is then included in the Origin context. This facilitates more precise and useful logging for - * troubleshooting PySpark applications. - * - * This method should be used in places where PySpark API calls are made, and PySpark logging - * information is available and beneficial for debugging purposes. - * - * @param pysparkErrorContext Optional PySpark logging information including the call site, - * represented as a (String, String). - * This may contain keys like "fragment" and "callSite" to provide - * detailed context about the PySpark call site. - * @param f The function that can utilize the modified Origin context with - * PySpark logging information. - * @return The result of executing `f` within the context of the provided PySpark logging - * information. 
- */ - private[sql] def withOrigin[T]( - pysparkErrorContext: Option[(String, String)] = None)(f: => T): T = { - if (CurrentOrigin.get.stackTrace.isDefined) { - f - } else { - val origin = Origin( - stackTrace = Some(captureStackTrace()), - pysparkErrorContext = pysparkErrorContext - ) + val st = Thread.currentThread().getStackTrace + var i = 0 + // Find the beginning of Spark code traces + while (i < st.length && !sparkCode(st(i))) i += 1 + // Stop at the end of the first Spark code traces + while (i < st.length && sparkCode(st(i))) i += 1 + val origin = Origin(stackTrace = Some(st.slice( + from = i - 1, + until = i + SQLConf.get.stackTracesInDataFrameContext)), + pysparkErrorContext = PySparkCurrentOrigin.get()) CurrentOrigin.withOrigin(origin)(f) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala index 484ed0245ddf6..c1ceed048ae2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.streaming import java.util.UUID +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} +import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} import org.json4s.{JObject, JString} import org.json4s.JsonAST.JValue import org.json4s.JsonDSL.{jobject2assoc, pair2Assoc} @@ -140,6 +142,21 @@ object StreamingQueryListener extends Serializable { } } + private[spark] object QueryStartedEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryStartedEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryStartedEvent = + mapper.readValue[QueryStartedEvent](json) + } + /** * Event representing any progress updates in a query. * @param progress The query progress updates. @@ -154,6 +171,21 @@ object StreamingQueryListener extends Serializable { private def jsonValue: JValue = JObject("progress" -> progress.jsonValue) } + private[spark] object QueryProgressEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryProgressEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryProgressEvent = + mapper.readValue[QueryProgressEvent](json) + } + /** * Event representing that query is idle and waiting for new data to process. * @@ -177,6 +209,21 @@ object StreamingQueryListener extends Serializable { } } + private[spark] object QueryIdleEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryTerminatedEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryTerminatedEvent = + mapper.readValue[QueryTerminatedEvent](json) + } + /** * Event representing that termination of a query. 
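[Editor's note] The StreamingQueryListener hunk above gives each event a private companion object that round-trips it through Jackson, with the Scala module registered and unknown properties ignored so serialized events tolerate fields added in other Spark versions. A standalone sketch of the same mapper setup, using an illustrative payload class rather than the real listener events:

    import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
    import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule}

    // Illustrative payload; the real code serializes the listener events themselves.
    case class DemoEvent(id: String, runId: String, name: Option[String])

    object DemoEvent {
      private val mapper = {
        val ret = new ObjectMapper() with ClassTagExtensions
        ret.registerModule(DefaultScalaModule)
        // Tolerate fields added by newer builds when deserializing.
        ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
        ret
      }

      def jsonString(event: DemoEvent): String = mapper.writeValueAsString(event)
      def fromJson(json: String): DemoEvent = mapper.readValue[DemoEvent](json)
    }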
* @@ -211,4 +258,19 @@ object StreamingQueryListener extends Serializable { ("errorClassOnException" -> JString(errorClassOnException.orNull)) } } + + private[spark] object QueryTerminatedEvent { + private val mapper = { + val ret = new ObjectMapper() with ClassTagExtensions + ret.registerModule(DefaultScalaModule) + ret.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + ret + } + + private[spark] def jsonString(event: QueryTerminatedEvent): String = + mapper.writeValueAsString(event) + + private[spark] def fromJson(json: String): QueryTerminatedEvent = + mapper.readValue[QueryTerminatedEvent](json) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 225f9d1f19a55..55d2e639a56b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -25,7 +25,8 @@ import scala.collection.mutable import scala.jdk.CollectionConverters._ import org.apache.spark.annotation.Evolving -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{CLASS_NAME, QUERY_ID, RUN_ID} import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.streaming.{WriteToStream, WriteToStreamStatement} @@ -77,7 +78,7 @@ class StreamingQueryManager private[sql] ( Utils.loadExtensions(classOf[StreamingQueryListener], classNames, sparkSession.sparkContext.conf).foreach { listener => addListener(listener) - logInfo(s"Registered listener ${listener.getClass.getName}") + logInfo(log"Registered listener ${MDC(CLASS_NAME, listener.getClass.getName)}") } } } @@ -367,8 +368,8 @@ class StreamingQueryManager private[sql] ( if (activeOption.isDefined) { if (shouldStopActiveRun) { val oldQuery = activeOption.get - logWarning(s"Stopping existing streaming query [id=${query.id}, " + - s"runId=${oldQuery.runId}], as a new run is being started.") + logWarning(log"Stopping existing streaming query [id=${MDC(QUERY_ID, query.id)}, " + + log"runId=${MDC(RUN_ID, oldQuery.runId)}], as a new run is being started.") Some(oldQuery) } else { throw new IllegalStateException( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala index 117daea7d1971..05323d9d03811 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala @@ -294,7 +294,7 @@ private object SafeJsonSerializer { /** Convert map to JValue while handling empty maps. Also, this sorts the keys. 
*/ def safeMapToJValue[T](map: ju.Map[String, T], valueToJValue: T => JValue): JValue = { - if (map.isEmpty) return JNothing + if (map == null || map.isEmpty) return JNothing val keys = map.asScala.keySet.toSeq.sorted keys.map { k => k -> valueToJValue(map.get(k)) : JObject }.reduce(_ ~ _) } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java index 237ca0ba88092..2a0c8c00574a1 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package test.org.apache.spark.sql; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessor.java b/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessor.java index e53e977da1494..b9841ee0f9735 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessor.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessor.java @@ -24,6 +24,8 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.streaming.*; +import static org.junit.jupiter.api.Assertions.*; + /** * A test stateful processor used with transformWithState arbitrary stateful operator in * Structured Streaming. 
The processor primarily aims to test various functionality of the Java API @@ -74,7 +76,7 @@ public scala.collection.Iterator handleInputRows( } else { keyCountMap.updateValue(value, 1L); } - assert(keyCountMap.containsKey(value)); + assertTrue(keyCountMap.containsKey(value)); keysList.appendValue(value); sb.append(value); } @@ -82,13 +84,13 @@ public scala.collection.Iterator handleInputRows( scala.collection.Iterator keys = keysList.get(); while (keys.hasNext()) { String keyVal = keys.next(); - assert(keyCountMap.containsKey(keyVal)); - assert(keyCountMap.getValue(keyVal) > 0); + assertTrue(keyCountMap.containsKey(keyVal)); + assertTrue(keyCountMap.getValue(keyVal) > 0); } count += numRows; countState.update(count); - assert (countState.get() == count); + assertEquals(count, (long) countState.get()); result.add(sb.toString()); } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessorWithInitialState.java b/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessorWithInitialState.java index bfa542e81e354..55046a7c0d3df 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessorWithInitialState.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/TestStatefulProcessorWithInitialState.java @@ -24,6 +24,8 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.streaming.*; +import static org.junit.jupiter.api.Assertions.assertFalse; + /** * A test stateful processor concatenates all input rows for a key and emits the result. * Primarily used for testing the Java API for arbitrary stateful operator in structured streaming @@ -71,7 +73,7 @@ public scala.collection.Iterator handleInputRows( } testState.clear(); - assert(testState.exists() == false); + assertFalse(testState.exists()); result.add(sb.toString()); } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2WithV2Filter.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2WithV2Filter.java index 0e3f6aed3b681..07bef16cdf2da 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2WithV2Filter.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2WithV2Filter.java @@ -34,6 +34,8 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; + public class JavaAdvancedDataSourceV2WithV2Filter implements TestingV2Source { @Override @@ -66,9 +68,9 @@ public StructType readSchema() { public Predicate[] pushPredicates(Predicate[] predicates) { Predicate[] supported = Arrays.stream(predicates).filter(f -> { if (f.name().equals(">")) { - assert(f.children()[0] instanceof FieldReference); + assertInstanceOf(FieldReference.class, f.children()[0]); FieldReference column = (FieldReference) f.children()[0]; - assert(f.children()[1] instanceof LiteralValue); + assertInstanceOf(LiteralValue.class, f.children()[1]); Literal value = (Literal) f.children()[1]; return column.describe().equals("i") && value.value() instanceof Integer; } else { @@ -78,9 +80,9 @@ public Predicate[] pushPredicates(Predicate[] predicates) { Predicate[] unsupported = Arrays.stream(predicates).filter(f -> { if (f.name().equals(">")) { - assert(f.children()[0] instanceof FieldReference); + assertInstanceOf(FieldReference.class, f.children()[0]); FieldReference column = (FieldReference) f.children()[0]; - assert(f.children()[1] 
instanceof LiteralValue); + assertInstanceOf(LiteralValue.class, f.children()[1]); Literal value = (LiteralValue) f.children()[1]; return !column.describe().equals("i") || !(value.value() instanceof Integer); } else { @@ -125,9 +127,9 @@ public InputPartition[] planInputPartitions() { Integer lowerBound = null; for (Predicate predicate : predicates) { if (predicate.name().equals(">")) { - assert(predicate.children()[0] instanceof FieldReference); + assertInstanceOf(FieldReference.class, predicate.children()[0]); FieldReference column = (FieldReference) predicate.children()[0]; - assert(predicate.children()[1] instanceof LiteralValue); + assertInstanceOf(LiteralValue.class, predicate.children()[1]); Literal value = (Literal) predicate.children()[1]; if ("i".equals(column.describe()) && value.value() instanceof Integer integer) { lowerBound = integer; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/execution/datasources/orc/FakeKeyProvider.java b/sql/core/src/test/java/test/org/apache/spark/sql/execution/datasources/orc/FakeKeyProvider.java index 70c9269962089..00cd8e5478a25 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/execution/datasources/orc/FakeKeyProvider.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/execution/datasources/orc/FakeKeyProvider.java @@ -1,13 +1,12 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/sql/core/src/test/resources/collations/ICU-collations-map.md b/sql/core/src/test/resources/collations/ICU-collations-map.md new file mode 100644 index 0000000000000..a704034c694aa --- /dev/null +++ b/sql/core/src/test/resources/collations/ICU-collations-map.md @@ -0,0 +1,144 @@ + +## ICU locale ids to name map +| Locale id | Locale name | +| --------- | ----------- | +| 0 | UNICODE | +| 1 | af | +| 2 | am | +| 3 | ar | +| 4 | ar_SAU | +| 5 | as | +| 6 | az | +| 7 | be | +| 8 | bg | +| 9 | bn | +| 10 | bo | +| 11 | br | +| 12 | bs | +| 13 | bs_Cyrl | +| 14 | ca | +| 15 | ceb | +| 16 | chr | +| 17 | cs | +| 18 | cy | +| 19 | da | +| 20 | de | +| 21 | de_AUT | +| 22 | dsb | +| 23 | dz | +| 24 | ee | +| 25 | el | +| 26 | en | +| 27 | en_USA | +| 28 | eo | +| 29 | es | +| 30 | et | +| 31 | fa | +| 32 | fa_AFG | +| 33 | ff | +| 34 | ff_Adlm | +| 35 | fi | +| 36 | fil | +| 37 | fo | +| 38 | fr | +| 39 | fr_CAN | +| 40 | fy | +| 41 | ga | +| 42 | gl | +| 43 | gu | +| 44 | ha | +| 45 | haw | +| 46 | he | +| 47 | he_ISR | +| 48 | hi | +| 49 | hr | +| 50 | hsb | +| 51 | hu | +| 52 | hy | +| 53 | id | +| 54 | id_IDN | +| 55 | ig | +| 56 | is | +| 57 | it | +| 58 | ja | +| 59 | ka | +| 60 | kk | +| 61 | kl | +| 62 | km | +| 63 | kn | +| 64 | ko | +| 65 | kok | +| 66 | ku | +| 67 | ky | +| 68 | lb | +| 69 | lij | +| 70 | lkt | +| 71 | ln | +| 72 | lo | +| 73 | lt | +| 74 | lv | +| 75 | mk | +| 76 | ml | +| 77 | mn | +| 78 | mr | +| 79 | ms | +| 80 | mt | +| 81 | my | +| 82 | nb | +| 83 | nb_NOR | +| 84 | ne | +| 85 | nl | +| 86 | nn | +| 87 | no | +| 88 | om | +| 89 | or | +| 90 | pa | +| 91 | pa_Guru | +| 92 | pa_Guru_IND | +| 93 | pl | +| 94 | ps | +| 95 | pt | +| 96 | ro | +| 97 | ru | +| 98 | sa | +| 99 | se | +| 100 | si | +| 101 | sk | +| 102 | sl | +| 103 | smn | +| 104 | sq | +| 105 | sr | +| 106 | sr_Cyrl | +| 107 | sr_Cyrl_BIH | +| 108 | sr_Cyrl_MNE | +| 109 | sr_Cyrl_SRB | +| 110 | sr_Latn | +| 111 | sr_Latn_BIH | +| 112 | sr_Latn_SRB | +| 113 | sv | +| 114 | sw | +| 115 | ta | +| 116 | te | +| 117 | th | +| 118 | tk | +| 119 | to | +| 120 | tr | +| 121 | ug | +| 122 | uk | +| 123 | ur | +| 124 | uz | +| 125 | vi | +| 126 | wae | +| 127 | wo | +| 128 | xh | +| 129 | yi | +| 130 | yo | +| 131 | zh | +| 132 | zh_Hans | +| 133 | zh_Hans_CHN | +| 134 | zh_Hans_SGP | +| 135 | zh_Hant | +| 136 | zh_Hant_HKG | +| 137 | zh_Hant_MAC | +| 138 | zh_Hant_TWN | +| 139 | zu | diff --git a/sql/core/src/test/resources/log4j2.properties b/sql/core/src/test/resources/log4j2.properties index 7ab47c16d4f94..fdbc35e70ab44 100644 --- a/sql/core/src/test/resources/log4j2.properties +++ b/sql/core/src/test/resources/log4j2.properties @@ -29,6 +29,12 @@ appender.console.layout.pattern = %d{HH:mm:ss.SSS} %p %c: %maxLen{%m}{512}%n%ex{ appender.console.filter.threshold.type = ThresholdFilter appender.console.filter.threshold.level = warn +appender.structured.type = File +appender.structured.name = structured +appender.structured.fileName = target/LogQuerySuite.log +appender.structured.layout.type = JsonTemplateLayout +appender.structured.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json + #File Appender appender.file.type = File appender.file.name = File @@ -72,3 +78,9 @@ logger.parquet1.level = error 
logger.parquet2.name = parquet.CorruptStatistics logger.parquet2.level = error + +# Custom loggers +logger.structured.name = org.apache.spark.sql.LogQuerySuite +logger.structured.level = trace +logger.structured.appenderRefs = structured +logger.structured.appenderRef.structured.ref = structured diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 8b70c88332dfb..cf218becdf1d4 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -81,7 +81,7 @@ | org.apache.spark.sql.catalyst.expressions.Chr | char | SELECT char(65) | struct | | org.apache.spark.sql.catalyst.expressions.Chr | chr | SELECT chr(65) | struct | | org.apache.spark.sql.catalyst.expressions.Coalesce | coalesce | SELECT coalesce(NULL, 1, NULL) | struct | -| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate | SELECT COLLATION('Spark SQL' collate UTF8_BINARY_LCASE) | struct | +| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate | SELECT COLLATION('Spark SQL' collate UTF8_LCASE) | struct | | org.apache.spark.sql.catalyst.expressions.Collation | collation | SELECT collation('Spark SQL') | struct | | org.apache.spark.sql.catalyst.expressions.Concat | concat | SELECT concat('Spark', 'SQL') | struct | | org.apache.spark.sql.catalyst.expressions.ConcatWs | concat_ws | SELECT concat_ws(' ', 'Spark', 'SQL') | struct | @@ -174,6 +174,7 @@ | org.apache.spark.sql.catalyst.expressions.IsNaN | isnan | SELECT isnan(cast('NaN' as double)) | struct | | org.apache.spark.sql.catalyst.expressions.IsNotNull | isnotnull | SELECT isnotnull(1) | struct<(1 IS NOT NULL):boolean> | | org.apache.spark.sql.catalyst.expressions.IsNull | isnull | SELECT isnull(1) | struct<(1 IS NULL):boolean> | +| org.apache.spark.sql.catalyst.expressions.IsValidUTF8 | is_valid_utf8 | SELECT is_valid_utf8('Spark') | struct | | org.apache.spark.sql.catalyst.expressions.JsonObjectKeys | json_object_keys | SELECT json_object_keys('{}') | struct> | | org.apache.spark.sql.catalyst.expressions.JsonToStructs | from_json | SELECT from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE') | struct> | | org.apache.spark.sql.catalyst.expressions.JsonTuple | json_tuple | SELECT json_tuple('{"a":1, "b":2}', 'a', 'b') | struct | @@ -207,6 +208,7 @@ | org.apache.spark.sql.catalyst.expressions.MakeTimestamp | make_timestamp | SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887) | struct | | org.apache.spark.sql.catalyst.expressions.MakeTimestampLTZExpressionBuilder | make_timestamp_ltz | SELECT make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887) | struct | | org.apache.spark.sql.catalyst.expressions.MakeTimestampNTZExpressionBuilder | make_timestamp_ntz | SELECT make_timestamp_ntz(2014, 12, 28, 6, 30, 45.887) | struct | +| org.apache.spark.sql.catalyst.expressions.MakeValidUTF8 | make_valid_utf8 | SELECT make_valid_utf8('Spark') | struct | | org.apache.spark.sql.catalyst.expressions.MakeYMInterval | make_ym_interval | SELECT make_ym_interval(1, 2) | struct | | org.apache.spark.sql.catalyst.expressions.MapConcat | map_concat | SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c')) | struct> | | org.apache.spark.sql.catalyst.expressions.MapContainsKey | map_contains_key | SELECT map_contains_key(map(1, 'a', 2, 'b'), 1) | struct | @@ -289,8 +291,11 @@ | org.apache.spark.sql.catalyst.expressions.Sha1 | sha | SELECT sha('Spark') | struct | | 
org.apache.spark.sql.catalyst.expressions.Sha1 | sha1 | SELECT sha1('Spark') | struct | | org.apache.spark.sql.catalyst.expressions.Sha2 | sha2 | SELECT sha2('Spark', 256) | struct | +| org.apache.spark.sql.catalyst.expressions.ShiftLeft | << | SELECT shiftleft(2, 1) | struct | | org.apache.spark.sql.catalyst.expressions.ShiftLeft | shiftleft | SELECT shiftleft(2, 1) | struct | +| org.apache.spark.sql.catalyst.expressions.ShiftRight | >> | SELECT shiftright(4, 1) | struct | | org.apache.spark.sql.catalyst.expressions.ShiftRight | shiftright | SELECT shiftright(4, 1) | struct | +| org.apache.spark.sql.catalyst.expressions.ShiftRightUnsigned | >>> | SELECT shiftrightunsigned(4, 1) | struct | | org.apache.spark.sql.catalyst.expressions.ShiftRightUnsigned | shiftrightunsigned | SELECT shiftrightunsigned(4, 1) | struct | | org.apache.spark.sql.catalyst.expressions.Shuffle | shuffle | SELECT shuffle(array(1, 20, 3, 5)) | struct> | | org.apache.spark.sql.catalyst.expressions.Signum | sign | SELECT sign(40) | struct | @@ -349,10 +354,12 @@ | org.apache.spark.sql.catalyst.expressions.TryElementAt | try_element_at | SELECT try_element_at(array(1, 2, 3), 2) | struct | | org.apache.spark.sql.catalyst.expressions.TryMultiply | try_multiply | SELECT try_multiply(2, 3) | struct | | org.apache.spark.sql.catalyst.expressions.TryReflect | try_reflect | SELECT try_reflect('java.util.UUID', 'randomUUID') | struct | +| org.apache.spark.sql.catalyst.expressions.TryRemainder | try_remainder | SELECT try_remainder(3, 2) | struct | | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct | | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct | | org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number | SELECT try_to_number('454', '999') | struct | | org.apache.spark.sql.catalyst.expressions.TryToTimestampExpressionBuilder | try_to_timestamp | SELECT try_to_timestamp('2016-12-31 00:12:00') | struct | +| org.apache.spark.sql.catalyst.expressions.TryValidateUTF8 | try_validate_utf8 | SELECT try_validate_utf8('Spark') | struct | | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct | | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct | | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct | @@ -368,6 +375,7 @@ | org.apache.spark.sql.catalyst.expressions.UrlDecode | url_decode | SELECT url_decode('https%3A%2F%2Fspark.apache.org') | struct | | org.apache.spark.sql.catalyst.expressions.UrlEncode | url_encode | SELECT url_encode('https://spark.apache.org') | struct | | org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct | +| org.apache.spark.sql.catalyst.expressions.ValidateUTF8 | validate_utf8 | SELECT validate_utf8('Spark') | struct | | org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct | | org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct | | org.apache.spark.sql.catalyst.expressions.WidthBucket | width_bucket | SELECT width_bucket(5.3, 0.2, 10.6, 5) | struct | @@ -436,9 +444,11 @@ | org.apache.spark.sql.catalyst.expressions.aggregate.VariancePop | var_pop | SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.VarianceSamp | var_samp | SELECT var_samp(col) 
FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.VarianceSamp | variance | SELECT variance(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | -| org.apache.spark.sql.catalyst.expressions.variant.ParseJson | parse_json | SELECT parse_json('{"a":1,"b":0.8}') | struct | +| org.apache.spark.sql.catalyst.expressions.variant.IsVariantNull | is_variant_null | SELECT is_variant_null(parse_json('null')) | struct | +| org.apache.spark.sql.catalyst.expressions.variant.ParseJsonExpressionBuilder | parse_json | SELECT parse_json('{"a":1,"b":0.8}') | struct | | org.apache.spark.sql.catalyst.expressions.variant.SchemaOfVariant | schema_of_variant | SELECT schema_of_variant(parse_json('null')) | struct | | org.apache.spark.sql.catalyst.expressions.variant.SchemaOfVariantAgg | schema_of_variant_agg | SELECT schema_of_variant_agg(parse_json(j)) FROM VALUES ('1'), ('2'), ('3') AS tab(j) | struct | +| org.apache.spark.sql.catalyst.expressions.variant.TryParseJsonExpressionBuilder | try_parse_json | SELECT try_parse_json('{"a":1,"b":0.8}') | struct | | org.apache.spark.sql.catalyst.expressions.variant.TryVariantGetExpressionBuilder | try_variant_get | SELECT try_variant_get(parse_json('{"a": 1}'), '$.a', 'int') | struct | | org.apache.spark.sql.catalyst.expressions.variant.VariantGetExpressionBuilder | variant_get | SELECT variant_get(parse_json('{"a": 1}'), '$.a', 'int') | struct | | org.apache.spark.sql.catalyst.expressions.xml.XPathBoolean | xpath_boolean | SELECT xpath_boolean('
      1','a/b') | struct1, a/b):boolean> | diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out index 2998803698c35..57108c4582f45 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out @@ -8,7 +8,7 @@ create temporary view data as select * from values CreateViewCommand `data`, select * from values ("one", array(11, 12, 13), array(array(111, 112, 113), array(121, 122, 123))), ("two", array(21, 22, 23), array(array(211, 212, 213), array(221, 222, 223))) - as data(a, b, c), false, false, LocalTempView, true + as data(a, b, c), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x, c#x] +- SubqueryAlias data +- LocalRelation [a#x, b#x, c#x] @@ -97,7 +97,7 @@ CreateViewCommand `primitive_arrays`, select * from values ( float_array, date_array, timestamp_array -), false, false, LocalTempView, true +), false, false, LocalTempView, UNSUPPORTED, true +- Project [boolean_array#x, tinyint_array#x, smallint_array#x, int_array#x, bigint_array#x, decimal_array#x, double_array#x, float_array#x, date_array#x, timestamp_array#x] +- SubqueryAlias primitive_arrays +- LocalRelation [boolean_array#x, tinyint_array#x, smallint_array#x, int_array#x, bigint_array#x, decimal_array#x, double_array#x, float_array#x, date_array#x, timestamp_array#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out index 7acfc9277679e..fd927b99c6456 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out @@ -2,7 +2,7 @@ -- !query create temporary view date_view as select '2011-11-11' date_str, '1' int_str -- !query analysis -CreateViewCommand `date_view`, select '2011-11-11' date_str, '1' int_str, false, false, LocalTempView, true +CreateViewCommand `date_view`, select '2011-11-11' date_str, '1' int_str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 AS date_str#x, 1 AS int_str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-disabled.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-disabled.sql.out index d73b72eca3e21..f7b0e3370f9f4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-disabled.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-disabled.sql.out @@ -276,7 +276,7 @@ Project [hello AS hello#x] -- !query CREATE TEMPORARY VIEW v(c1 COMMENT "hello") AS SELECT 1 -- !query analysis -CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, true +CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation @@ -304,7 +304,7 @@ Project [hello AS hello#x] -- !query CREATE TEMPORARY VIEW v(c1 COMMENT 'hello') AS SELECT 1 -- !query analysis -CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, true +CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-enabled.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-enabled.sql.out index c37acb7879c4a..f241f9bd6867c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-enabled.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/double-quoted-identifiers-enabled.sql.out @@ -380,7 +380,7 @@ Project [hello AS hello#x] -- !query CREATE TEMPORARY VIEW v(c1 COMMENT 'hello') AS SELECT 1 -- !query analysis -CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, true +CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation @@ -409,7 +409,7 @@ CreateNamespace false CREATE TEMPORARY VIEW "myview"("c1") AS WITH "v"("a") AS (SELECT 1) SELECT "a" FROM "v" -- !query analysis -CreateViewCommand `myview`, [(c1,None)], WITH "v"("a") AS (SELECT 1) SELECT "a" FROM "v", false, false, LocalTempView, true +CreateViewCommand `myview`, [(c1,None)], WITH "v"("a") AS (SELECT 1) SELECT "a" FROM "v", false, false, LocalTempView, UNSUPPORTED, true +- WithCTE :- CTERelationDef xxxx, false : +- SubqueryAlias v diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out index 693cb2a046319..c06d1e5534aed 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/higher-order-functions.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nested`, values (1, array(32, 97), array(array(12, 99), array(123, 42), array(1))), (2, array(77, -76), array(array(6, 96, 65), array(-1, -2))), (3, array(12), array(array(17))) - as t(x, ys, zs), false, true, LocalTempView, true + as t(x, ys, zs), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [x#x, ys#x, zs#x] @@ -35,6 +35,26 @@ org.apache.spark.sql.AnalysisException } +-- !query +select ceil(x -> x) as v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_LAMBDA_FUNCTION_CALL.NON_HIGHER_ORDER_FUNCTION", + "sqlState" : "42K0D", + "messageParameters" : { + "class" : "org.apache.spark.sql.catalyst.expressions.CeilExpressionBuilder$" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 19, + "fragment" : "ceil(x -> x)" + } ] +} + + -- !query select transform(zs, z -> z) as v from nested -- !query analysis @@ -258,7 +278,7 @@ create or replace temporary view nested as values CreateViewCommand `nested`, values (1, map(1, 1, 2, 2, 3, 3)), (2, map(4, 4, 5, 5, 6, 6)) - as t(x, ys), false, true, LocalTempView, true + as t(x, ys), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [x#x, ys#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out index 03183e6c40005..12756576ded9b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out @@ -1605,7 +1605,7 @@ Project [cast(cast(4 12:12:12 as timestamp) + INTERVAL '4 22:12' DAY TO MINUTE a -- !query create temporary view interval_view 
as select '1' str -- !query analysis -CreateViewCommand `interval_view`, select '1' str, false, false, LocalTempView, true +CreateViewCommand `interval_view`, select '1' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/math.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/math.sql.out index 7eb7fcff356a4..1fa7b7513993d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/math.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/math.sql.out @@ -431,3 +431,80 @@ SELECT conv('-9223372036854775807', 36, 10) -- !query analysis Project [conv(-9223372036854775807, 36, 10, true) AS conv(-9223372036854775807, 36, 10)#x] +- OneRowRelation + + +-- !query +SELECT BIN(0) +-- !query analysis +Project [bin(cast(0 as bigint)) AS bin(0)#x] ++- OneRowRelation + + +-- !query +SELECT BIN(25) +-- !query analysis +Project [bin(cast(25 as bigint)) AS bin(25)#x] ++- OneRowRelation + + +-- !query +SELECT BIN(25L) +-- !query analysis +Project [bin(25) AS bin(25)#x] ++- OneRowRelation + + +-- !query +SELECT BIN(25.5) +-- !query analysis +Project [bin(cast(25.5 as bigint)) AS bin(25.5)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(0Y) +-- !query analysis +Project [positive(0) AS (+ 0)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(25) +-- !query analysis +Project [positive(25) AS (+ 25)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(-25L) +-- !query analysis +Project [positive(-25) AS (+ -25)#xL] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(25.5) +-- !query analysis +Project [positive(25.5) AS (+ 25.5)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE("25.5") +-- !query analysis +Project [positive(cast(25.5 as double)) AS (+ 25.5)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE("invalid") +-- !query analysis +Project [positive(cast(invalid as double)) AS (+ invalid)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(null) +-- !query analysis +Project [positive(cast(null as double)) AS (+ NULL)#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 7ffd3cbd8bac6..98664dedf820c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), 
encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,18 +685,95 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query analysis +Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query analysis +Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 
빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + -- !query select decode() -- !query analysis @@ -746,7 +823,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] ++- OneRowRelation + + +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query analysis +Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation @@ -856,6 +940,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query analysis +Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query analysis +Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis @@ -1428,7 +1554,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat') -- !query analysis -CreateViewCommand `fmtTable`, [(fmtField,None)], SELECT * FROM VALUES ('invalidFormat'), false, false, LocalTempView, true +CreateViewCommand `fmtTable`, [(fmtField,None)], SELECT * 
FROM VALUES ('invalidFormat'), false, false, LocalTempView, UNSUPPORTED, true +- Project [col1#x] +- LocalRelation [col1#x] @@ -1586,3 +1712,87 @@ select luhn_check(123.456) -- !query analysis Project [luhn_check(cast(123.456 as string)) AS luhn_check(123.456)#x] +- OneRowRelation + + +-- !query +select is_valid_utf8('') +-- !query analysis +Project [is_valid_utf8() AS is_valid_utf8()#x] ++- OneRowRelation + + +-- !query +select is_valid_utf8('abc') +-- !query analysis +Project [is_valid_utf8(abc) AS is_valid_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select is_valid_utf8(x'80') +-- !query analysis +Project [is_valid_utf8(cast(0x80 as string)) AS is_valid_utf8(X'80')#x] ++- OneRowRelation + + +-- !query +select make_valid_utf8('') +-- !query analysis +Project [make_valid_utf8() AS make_valid_utf8()#x] ++- OneRowRelation + + +-- !query +select make_valid_utf8('abc') +-- !query analysis +Project [make_valid_utf8(abc) AS make_valid_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select make_valid_utf8(x'80') +-- !query analysis +Project [make_valid_utf8(cast(0x80 as string)) AS make_valid_utf8(X'80')#x] ++- OneRowRelation + + +-- !query +select validate_utf8('') +-- !query analysis +Project [validate_utf8() AS validate_utf8()#x] ++- OneRowRelation + + +-- !query +select validate_utf8('abc') +-- !query analysis +Project [validate_utf8(abc) AS validate_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select validate_utf8(x'80') +-- !query analysis +Project [validate_utf8(cast(0x80 as string)) AS validate_utf8(X'80')#x] ++- OneRowRelation + + +-- !query +select try_validate_utf8('') +-- !query analysis +Project [try_validate_utf8() AS try_validate_utf8()#x] ++- OneRowRelation + + +-- !query +select try_validate_utf8('abc') +-- !query analysis +Project [try_validate_utf8(abc) AS try_validate_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select try_validate_utf8(x'80') +-- !query analysis +Project [try_validate_utf8(cast(0x80 as string)) AS try_validate_utf8(X'80')#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out index ecfc286b79435..bf34490d657e3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out @@ -236,7 +236,7 @@ create temporary view ttf1 as select * from values CreateViewCommand `ttf1`, select * from values (1, 2), (2, 3) - as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, true + as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, UNSUPPORTED, true +- Project [current_date#x, current_timestamp#x] +- SubqueryAlias ttf1 +- LocalRelation [current_date#x, current_timestamp#x] @@ -263,7 +263,7 @@ create temporary view ttf2 as select * from values CreateViewCommand `ttf2`, select * from values (1, 2), (2, 3) - as ttf2(a, b), false, false, LocalTempView, true + as ttf2(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias ttf2 +- LocalRelation [a#x, b#x] @@ -575,7 +575,7 @@ select null - timestamp'2011-11-11 11:11:11' -- !query create temporary view ts_view as select '2011-11-11 11:11:11' str -- !query analysis -CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, true +CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 
11:11:11 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/try_arithmetic.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/try_arithmetic.sql.out index ef17f6b50b90a..30654d1d71e2b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/try_arithmetic.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/try_arithmetic.sql.out @@ -13,6 +13,20 @@ Project [try_add(2147483647, 1) AS try_add(2147483647, 1)#x] +- OneRowRelation +-- !query +SELECT try_add(2147483647, decimal(1)) +-- !query analysis +Project [try_add(2147483647, cast(1 as decimal(10,0))) AS try_add(2147483647, 1)#x] ++- OneRowRelation + + +-- !query +SELECT try_add(2147483647, "1") +-- !query analysis +Project [try_add(2147483647, 1) AS try_add(2147483647, 1)#xL] ++- OneRowRelation + + -- !query SELECT try_add(-2147483648, -1) -- !query analysis @@ -211,6 +225,20 @@ Project [try_divide(1, (1.0 / 0.0)) AS try_divide(1, (1.0 / 0.0))#x] +- OneRowRelation +-- !query +SELECT try_divide(1, decimal(0)) +-- !query analysis +Project [try_divide(1, cast(0 as decimal(10,0))) AS try_divide(1, 0)#x] ++- OneRowRelation + + +-- !query +SELECT try_divide(1, "0") +-- !query analysis +Project [try_divide(1, 0) AS try_divide(1, 0)#x] ++- OneRowRelation + + -- !query SELECT try_divide(interval 2 year, 2) -- !query analysis @@ -267,6 +295,20 @@ Project [try_subtract(2147483647, -1) AS try_subtract(2147483647, -1)#x] +- OneRowRelation +-- !query +SELECT try_subtract(2147483647, decimal(-1)) +-- !query analysis +Project [try_subtract(2147483647, cast(-1 as decimal(10,0))) AS try_subtract(2147483647, -1)#x] ++- OneRowRelation + + +-- !query +SELECT try_subtract(2147483647, "-1") +-- !query analysis +Project [try_subtract(2147483647, -1) AS try_subtract(2147483647, -1)#xL] ++- OneRowRelation + + -- !query SELECT try_subtract(-2147483648, 1) -- !query analysis @@ -351,6 +393,20 @@ Project [try_multiply(2147483647, -2) AS try_multiply(2147483647, -2)#x] +- OneRowRelation +-- !query +SELECT try_multiply(2147483647, decimal(-2)) +-- !query analysis +Project [try_multiply(2147483647, cast(-2 as decimal(10,0))) AS try_multiply(2147483647, -2)#x] ++- OneRowRelation + + +-- !query +SELECT try_multiply(2147483647, "-2") +-- !query analysis +Project [try_multiply(2147483647, -2) AS try_multiply(2147483647, -2)#xL] ++- OneRowRelation + + -- !query SELECT try_multiply(-2147483648, 2) -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out index 459c5613e9196..fb331089d7545 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out @@ -8,7 +8,7 @@ create temporary view data as select * from values CreateViewCommand `data`, select * from values ("one", array(11, 12, 13), array(array(111, 112, 113), array(121, 122, 123))), ("two", array(21, 22, 23), array(array(211, 212, 213), array(221, 222, 223))) - as data(a, b, c), false, false, LocalTempView, true + as data(a, b, c), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x, c#x] +- SubqueryAlias data +- LocalRelation [a#x, b#x, c#x] @@ -97,7 +97,7 @@ CreateViewCommand `primitive_arrays`, select * from values ( float_array, date_array, timestamp_array -), false, false, LocalTempView, true +), false, false, LocalTempView, UNSUPPORTED, true +- Project [boolean_array#x, tinyint_array#x, 
smallint_array#x, int_array#x, bigint_array#x, decimal_array#x, double_array#x, float_array#x, date_array#x, timestamp_array#x] +- SubqueryAlias primitive_arrays +- LocalRelation [boolean_array#x, tinyint_array#x, smallint_array#x, int_array#x, bigint_array#x, decimal_array#x, double_array#x, float_array#x, date_array#x, timestamp_array#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out new file mode 100644 index 0000000000000..fe61e684a7ff5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out @@ -0,0 +1,34 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query analysis +Project [0x AS X''#x] ++- OneRowRelation + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query analysis +Project [0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 AS X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'#x] ++- OneRowRelation + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query analysis +Project [cast(Spark as binary) AS CAST(Spark AS BINARY)#x] ++- OneRowRelation + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query analysis +Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out new file mode 100644 index 0000000000000..fe61e684a7ff5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out @@ -0,0 +1,34 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query analysis +Project [0x AS X''#x] ++- OneRowRelation + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query analysis +Project [0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 AS X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'#x] ++- OneRowRelation + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query analysis +Project [cast(Spark as binary) AS CAST(Spark AS BINARY)#x] ++- OneRowRelation + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query analysis +Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, 
X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out new file mode 100644 index 0000000000000..fe61e684a7ff5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out @@ -0,0 +1,34 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query analysis +Project [0x AS X''#x] ++- OneRowRelation + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query analysis +Project [0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 AS X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'#x] ++- OneRowRelation + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query analysis +Project [cast(Spark as binary) AS CAST(Spark AS BINARY)#x] ++- OneRowRelation + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query analysis +Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out new file mode 100644 index 0000000000000..fe61e684a7ff5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out @@ -0,0 +1,34 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query analysis +Project [0x AS X''#x] ++- OneRowRelation + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query analysis +Project [0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 AS X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'#x] ++- OneRowRelation + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query analysis +Project [cast(Spark as binary) AS CAST(Spark AS BINARY)#x] ++- OneRowRelation + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query analysis +Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out new 
file mode 100644 index 0000000000000..fe61e684a7ff5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out @@ -0,0 +1,34 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query analysis +Project [0x AS X''#x] ++- OneRowRelation + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query analysis +Project [0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 AS X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'#x] ++- OneRowRelation + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query analysis +Project [cast(Spark as binary) AS CAST(Spark AS BINARY)#x] ++- OneRowRelation + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query analysis +Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/bitwise.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/bitwise.sql.out index 8220aa4bd25bd..1267a984565ad 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/bitwise.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/bitwise.sql.out @@ -182,7 +182,7 @@ CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES CreateViewCommand `bitwise_test`, SELECT * FROM VALUES (1, 1, 1, 1L), (2, 3, 4, null), - (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4), false, true, LocalTempView, true + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4), false, true, LocalTempView, UNSUPPORTED, true +- Project [b1#x, b2#x, b3#x, b4#xL] +- SubqueryAlias bitwise_test +- LocalRelation [b1#x, b2#x, b3#x, b4#xL] @@ -306,3 +306,136 @@ select getbit(11L, 64) -- !query analysis Project [getbit(11, 64) AS getbit(11, 64)#x] +- OneRowRelation + + +-- !query +SELECT 20181117 >> 2 +-- !query analysis +Project [(20181117 >> 2) AS (20181117 >> 2)#x] ++- OneRowRelation + + +-- !query +SELECT 20181117 << 2 +-- !query analysis +Project [(20181117 << 2) AS (20181117 << 2)#x] ++- OneRowRelation + + +-- !query +SELECT 20181117 >>> 2 +-- !query analysis +Project [(20181117 >>> 2) AS (20181117 >>> 2)#x] ++- OneRowRelation + + +-- !query +SELECT 20181117 > > 2 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'>'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 < < 2 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'<'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 > >> 2 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'>>'", + "hint" : "" + } +} + + +-- !query +SELECT 
20181117 <<< 2 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'<'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 >>>> 2 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'>'", + "hint" : "" + } +} + + +-- !query +select cast(null as array>), 20181117 >> 2 +-- !query analysis +Project [cast(null as array>) AS NULL#x, (20181117 >> 2) AS (20181117 >> 2)#x] ++- OneRowRelation + + +-- !query +select cast(null as array>), 20181117 >>> 2 +-- !query analysis +Project [cast(null as array>) AS NULL#x, (20181117 >>> 2) AS (20181117 >>> 2)#x] ++- OneRowRelation + + +-- !query +select cast(null as map>), 20181117 >> 2 +-- !query analysis +Project [cast(null as map>) AS NULL#x, (20181117 >> 2) AS (20181117 >> 2)#x] ++- OneRowRelation + + +-- !query +select 1 << 1 + 2 as plus_over_shift +-- !query analysis +Project [(1 << (1 + 2)) AS plus_over_shift#x] ++- OneRowRelation + + +-- !query +select 2 >> 1 << 1 as left_to_right +-- !query analysis +Project [((2 >> 1) << 1) AS left_to_right#x] ++- OneRowRelation + + +-- !query +select 1 & 2 >> 1 as shift_over_ampersand +-- !query analysis +Project [(1 & (2 >> 1)) AS shift_over_ampersand#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/change-column.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/change-column.sql.out index 07edfa5e95e1b..645057b85c000 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/change-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/change-column.sql.out @@ -204,7 +204,7 @@ DescribeTableCommand `spark_catalog`.`default`.`test_change`, false, [col_name#x -- !query CREATE TEMPORARY VIEW temp_view(a, b) AS SELECT 1, "one" -- !query analysis -CreateViewCommand `temp_view`, [(a,None), (b,None)], SELECT 1, "one", false, false, LocalTempView, true +CreateViewCommand `temp_view`, [(a,None), (b,None)], SELECT 1, "one", false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x, one AS one#x] +- OneRowRelation @@ -233,7 +233,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE GLOBAL TEMPORARY VIEW global_temp_view(a, b) AS SELECT 1, "one" -- !query analysis -CreateViewCommand `global_temp_view`, [(a,None), (b,None)], SELECT 1, "one", false, false, GlobalTempView, true +CreateViewCommand `global_temp_view`, [(a,None), (b,None)], SELECT 1, "one", false, false, GlobalTempView, UNSUPPORTED, true +- Project [1 AS 1#x, one AS one#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out index 02f09e0831d25..5c1417f7c0aae 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/charvarchar.sql.out @@ -174,7 +174,7 @@ alter view char_view as select * from char_tbl2 AlterViewAsCommand `spark_catalog`.`default`.`char_view`, select * from char_tbl2, true +- Project [c#x, v#x] +- SubqueryAlias spark_catalog.default.char_tbl2 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c#x, 5, true, false, true) AS c#x, v#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c#x, 5)) AS 
c#x, v#x] +- Relation spark_catalog.default.char_tbl2[c#x,v#x] parquet @@ -348,7 +348,7 @@ CreateViewCommand `str_view`, select c, v from values ('NetE', 'Spar'), ('NetEa ', 'Spark '), ('NetEas ', 'Spark'), - ('NetEase', 'Spark-') t(c, v), false, false, LocalTempView, true + ('NetEase', 'Spark-') t(c, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [c#x, v#x] +- SubqueryAlias t +- LocalRelation [c#x, v#x] @@ -364,7 +364,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`char_tbl4`, false insert into char_tbl4 select c, c, v, c from str_view -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/char_tbl4, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/char_tbl4], Append, `spark_catalog`.`default`.`char_tbl4`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/char_tbl4), [c7, c8, v, s] -+- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, charTypeWriteSideCheck, cast(c#x as string), 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, charTypeWriteSideCheck, cast(c#x as string), 8, true, false, true) AS c8#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, varcharTypeWriteSideCheck, cast(v#x as string), 6, true, false, true) AS v#x, cast(c#x as string) AS s#x] ++- Project [static_invoke(CharVarcharCodegenUtils.charTypeWriteSideCheck(cast(c#x as string), 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.charTypeWriteSideCheck(cast(c#x as string), 8)) AS c8#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(v#x as string), 6)) AS v#x, cast(c#x as string) AS s#x] +- Project [c#x, c#x, v#x, c#x] +- SubqueryAlias str_view +- View (`str_view`, [c#x, v#x]) @@ -379,7 +379,7 @@ select c7, c8, v, s from char_tbl4 -- !query analysis Project [c7#x, c8#x, v#x, s#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -389,7 +389,7 @@ select c7, c8, v, s from char_tbl4 where c7 = c8 Project [c7#x, c8#x, v#x, s#x] +- Filter (rpad(c7#x, 8, ) = c8#x) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -399,7 +399,7 @@ select c7, c8, v, s from char_tbl4 where c7 = v Project [c7#x, c8#x, v#x, s#x] +- Filter (c7#x = v#x) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project 
[staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -409,7 +409,7 @@ select c7, c8, v, s from char_tbl4 where c7 = s Project [c7#x, c8#x, v#x, s#x] +- Filter (c7#x = s#x) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -419,7 +419,7 @@ select c7, c8, v, s from char_tbl4 where c7 = 'NetEase ' Project [c7#x, c8#x, v#x, s#x] +- Filter (rpad(c7#x, 22, ) = NetEase ) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -429,7 +429,7 @@ select c7, c8, v, s from char_tbl4 where v = 'Spark ' Project [c7#x, c8#x, v#x, s#x] +- Filter (v#x = Spark ) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -439,7 +439,7 @@ select c7, c8, v, s from char_tbl4 order by c7 Sort [c7#x ASC NULLS FIRST], true +- Project [c7#x, c8#x, v#x, s#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -449,7 +449,7 @@ select c7, c8, v, s from char_tbl4 order 
by v Sort [v#x ASC NULLS FIRST], true +- Project [c7#x, c8#x, v#x, s#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -458,7 +458,7 @@ select ascii(c7), ascii(c8), ascii(v), ascii(s) from char_tbl4 -- !query analysis Project [ascii(c7#x) AS ascii(c7)#x, ascii(c8#x) AS ascii(c8)#x, ascii(v#x) AS ascii(v)#x, ascii(s#x) AS ascii(s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -467,7 +467,7 @@ select base64(c7), base64(c8), base64(v), ascii(s) from char_tbl4 -- !query analysis Project [base64(cast(c7#x as binary)) AS base64(c7)#x, base64(cast(c8#x as binary)) AS base64(c8)#x, base64(cast(v#x as binary)) AS base64(v)#x, ascii(s#x) AS ascii(s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -476,7 +476,7 @@ select bit_length(c7), bit_length(c8), bit_length(v), bit_length(s) from char_tb -- !query analysis Project [bit_length(c7#x) AS bit_length(c7)#x, bit_length(c8#x) AS bit_length(c8)#x, bit_length(v#x) AS bit_length(v)#x, bit_length(s#x) AS bit_length(s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -485,7 +485,7 @@ select char_length(c7), char_length(c8), char_length(v), char_length(s) from cha -- !query analysis Project [char_length(c7#x) AS char_length(c7)#x, char_length(c8#x) AS char_length(c8)#x, char_length(v#x) AS char_length(v)#x, char_length(s#x) AS char_length(s)#x] +- 
SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -494,7 +494,7 @@ select octet_length(c7), octet_length(c8), octet_length(v), octet_length(s) from -- !query analysis Project [octet_length(c7#x) AS octet_length(c7)#x, octet_length(c8#x) AS octet_length(c8)#x, octet_length(v#x) AS octet_length(v)#x, octet_length(s#x) AS octet_length(s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -503,7 +503,7 @@ select concat_ws('|', c7, c8), concat_ws('|', c7, v), concat_ws('|', c7, s), con -- !query analysis Project [concat_ws(|, c7#x, c8#x) AS concat_ws(|, c7, c8)#x, concat_ws(|, c7#x, v#x) AS concat_ws(|, c7, v)#x, concat_ws(|, c7#x, s#x) AS concat_ws(|, c7, s)#x, concat_ws(|, v#x, s#x) AS concat_ws(|, v, s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -512,7 +512,7 @@ select concat(c7, c8), concat(c7, v), concat(c7, s), concat(v, s) from char_tbl4 -- !query analysis Project [concat(c7#x, c8#x) AS concat(c7, c8)#x, concat(c7#x, v#x) AS concat(c7, v)#x, concat(c7#x, s#x) AS concat(c7, s)#x, concat(v#x, s#x) AS concat(v, s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -521,7 +521,7 @@ select like(c7, 'Ne _'), like(c8, 'Ne _') from char_tbl4 -- !query analysis Project [c7#x LIKE Ne _ AS c7 LIKE Ne _#x, c8#x LIKE Ne _ AS c8 LIKE Ne _#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project 
[staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -530,7 +530,7 @@ select like(v, 'Spark_') from char_tbl4 -- !query analysis Project [v#x LIKE Spark_ AS v LIKE Spark_#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -540,7 +540,7 @@ select c7 = c8, upper(c7) = upper(c8), lower(c7) = lower(c8) from char_tbl4 wher Project [(rpad(c7#x, 8, ) = c8#x) AS (c7 = c8)#x, (upper(c7#x) = upper(c8#x)) AS (upper(c7) = upper(c8))#x, (lower(c7#x) = lower(c8#x)) AS (lower(c7) = lower(c8))#x] +- Filter (s#x = NetEase) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -550,7 +550,7 @@ select c7 = s, upper(c7) = upper(s), lower(c7) = lower(s) from char_tbl4 where s Project [(c7#x = s#x) AS (c7 = s)#x, (upper(c7#x) = upper(s#x)) AS (upper(c7) = upper(s))#x, (lower(c7#x) = lower(s#x)) AS (lower(c7) = lower(s))#x] +- Filter (s#x = NetEase) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -560,7 +560,7 @@ select c7 = 'NetEase', upper(c7) = upper('NetEase'), lower(c7) = lower('NetEase' Project [(c7#x = NetEase) AS (c7 = NetEase)#x, (upper(c7#x) = upper(NetEase)) AS (upper(c7) = upper(NetEase))#x, (lower(c7#x) = lower(NetEase)) AS (lower(c7) = lower(NetEase))#x] +- Filter (s#x = NetEase) +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS 
c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -569,7 +569,7 @@ select printf('Hey, %s%s%s%s', c7, c8, v, s) from char_tbl4 -- !query analysis Project [printf(Hey, %s%s%s%s, c7#x, c8#x, v#x, s#x) AS printf(Hey, %s%s%s%s, c7, c8, v, s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -578,7 +578,7 @@ select repeat(c7, 2), repeat(c8, 2), repeat(v, 2), repeat(s, 2) from char_tbl4 -- !query analysis Project [repeat(c7#x, 2) AS repeat(c7, 2)#x, repeat(c8#x, 2) AS repeat(c8, 2)#x, repeat(v#x, 2) AS repeat(v, 2)#x, repeat(s#x, 2) AS repeat(s, 2)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -587,7 +587,7 @@ select replace(c7, 'Net', 'Apache'), replace(c8, 'Net', 'Apache'), replace(v, 'S -- !query analysis Project [replace(c7#x, Net, Apache) AS replace(c7, Net, Apache)#x, replace(c8#x, Net, Apache) AS replace(c8, Net, Apache)#x, replace(v#x, Spark, Kyuubi) AS replace(v, Spark, Kyuubi)#x, replace(s#x, Net, Apache) AS replace(s, Net, Apache)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -596,7 +596,7 @@ select rpad(c7, 10), rpad(c8, 5), rpad(v, 5), rpad(s, 5) from char_tbl4 -- !query analysis Project [rpad(c7#x, 10, ) AS rpad(c7, 10, )#x, rpad(c8#x, 5, ) AS rpad(c8, 5, )#x, rpad(v#x, 5, ) AS rpad(v, 5, )#x, rpad(s#x, 5, ) AS rpad(s, 5, )#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class 
org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -605,7 +605,7 @@ select rtrim(c7), rtrim(c8), rtrim(v), rtrim(s) from char_tbl4 -- !query analysis Project [rtrim(c7#x, None) AS rtrim(c7)#x, rtrim(c8#x, None) AS rtrim(c8)#x, rtrim(v#x, None) AS rtrim(v)#x, rtrim(s#x, None) AS rtrim(s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -614,7 +614,7 @@ select split(c7, 'e'), split(c8, 'e'), split(v, 'a'), split(s, 'e') from char_tb -- !query analysis Project [split(c7#x, e, -1) AS split(c7, e, -1)#x, split(c8#x, e, -1) AS split(c8, e, -1)#x, split(v#x, a, -1) AS split(v, a, -1)#x, split(s#x, e, -1) AS split(s, e, -1)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -623,7 +623,7 @@ select substring(c7, 2), substring(c8, 2), substring(v, 3), substring(s, 2) from -- !query analysis Project [substring(c7#x, 2, 2147483647) AS substring(c7, 2, 2147483647)#x, substring(c8#x, 2, 2147483647) AS substring(c8, 2, 2147483647)#x, substring(v#x, 3, 2147483647) AS substring(v, 3, 2147483647)#x, substring(s#x, 2, 2147483647) AS substring(s, 2, 2147483647)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -632,7 +632,7 @@ select left(c7, 2), left(c8, 2), left(v, 3), left(s, 2) from char_tbl4 -- !query analysis Project [left(c7#x, 2) AS left(c7, 2)#x, left(c8#x, 2) AS left(c8, 2)#x, left(v#x, 3) AS left(v, 3)#x, left(s#x, 2) AS left(s, 2)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, 
true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -641,7 +641,7 @@ select right(c7, 2), right(c8, 2), right(v, 3), right(s, 2) from char_tbl4 -- !query analysis Project [right(c7#x, 2) AS right(c7, 2)#x, right(c8#x, 2) AS right(c8, 2)#x, right(v#x, 3) AS right(v, 3)#x, right(s#x, 2) AS right(s, 2)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -652,7 +652,7 @@ GlobalLimit 1 +- LocalLimit 1 +- Project [typeof(c7#x) AS typeof(c7)#x, typeof(c8#x) AS typeof(c8)#x, typeof(v#x) AS typeof(v)#x, typeof(s#x) AS typeof(s)#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet @@ -661,7 +661,7 @@ select cast(c7 as char(1)), cast(c8 as char(10)), cast(v as char(1)), cast(v as -- !query analysis Project [cast(c7#x as string) AS c7#x, cast(c8#x as string) AS c8#x, cast(v#x as string) AS v#x, cast(v#x as string) AS v#x, cast(s#x as string) AS s#x] +- SubqueryAlias spark_catalog.default.char_tbl4 - +- Project [staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c7#x, 7, true, false, true) AS c7#x, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, c8#x, 8, true, false, true) AS c8#x, v#x, s#x] + +- Project [static_invoke(CharVarcharCodegenUtils.readSidePadding(c7#x, 7)) AS c7#x, static_invoke(CharVarcharCodegenUtils.readSidePadding(c8#x, 8)) AS c8#x, v#x, s#x] +- Relation spark_catalog.default.char_tbl4[c7#x,c8#x,v#x,s#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out index d242a60a17c18..e6409806bad7a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out @@ -1,6 +1,6 @@ -- Automatically generated by SQLQueryTestSuite -- !query -create table t1(utf8_binary string collate utf8_binary, utf8_binary_lcase string collate utf8_binary_lcase) using parquet +create table t1(utf8_binary string collate utf8_binary, 
utf8_binary_lcase string collate utf8_binary_lcase) using parquet +create table t1(utf8_binary string collate utf8_binary,
utf8_lcase string collate utf8_lcase) using parquet -- !query analysis CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false @@ -8,32 +8,32 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false -- !query insert into t1 values('aaa', 'aaa') -- !query analysis -InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_lcase] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x] +- LocalRelation [col1#x, col2#x] -- !query insert into t1 values('AAA', 'AAA') -- !query analysis -InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_lcase] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x] +- LocalRelation [col1#x, col2#x] -- !query insert into t1 values('bbb', 'bbb') -- !query analysis -InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_lcase] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x] +- LocalRelation [col1#x, col2#x] -- !query insert into t1 values('BBB', 'BBB') -- !query analysis -InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_lcase] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -48,68 +48,68 @@ select count(*) from t1 group by utf8_binary -- !query analysis Aggregate [utf8_binary#x], [count(1) AS count(1)#xL] +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query -select count(*) from t1 group by utf8_binary_lcase +select count(*) from t1 group by utf8_lcase -- !query analysis -Aggregate [utf8_binary_lcase#x], [count(1) AS count(1)#xL] +Aggregate [utf8_lcase#x], [count(1) AS count(1)#xL] +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query select * from t1 where utf8_binary = 'aaa' -- !query analysis -Project [utf8_binary#x, utf8_binary_lcase#x] +Project [utf8_binary#x, utf8_lcase#x] +- Filter (utf8_binary#x = aaa) +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query -select * from t1 where utf8_binary_lcase = 'aaa' collate utf8_binary_lcase +select * from t1 where utf8_lcase = 'aaa' collate utf8_lcase -- !query analysis -Project [utf8_binary#x, utf8_binary_lcase#x] -+- Filter (utf8_binary_lcase#x = collate(aaa, utf8_binary_lcase)) +Project [utf8_binary#x, utf8_lcase#x] ++- Filter (utf8_lcase#x = collate(aaa, utf8_lcase)) +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query select * from t1 where utf8_binary < 'bbb' -- !query analysis -Project [utf8_binary#x, utf8_binary_lcase#x] +Project [utf8_binary#x, utf8_lcase#x] +- Filter (utf8_binary#x < bbb) +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query -select * from t1 where utf8_binary_lcase < 'bbb' collate utf8_binary_lcase +select * from t1 where utf8_lcase < 'bbb' collate utf8_lcase -- !query analysis -Project [utf8_binary#x, utf8_binary_lcase#x] -+- Filter (utf8_binary_lcase#x < collate(bbb, utf8_binary_lcase)) +Project [utf8_binary#x, utf8_lcase#x] ++- Filter (utf8_lcase#x < collate(bbb, utf8_lcase)) +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query -select 
l.utf8_binary, r.utf8_binary_lcase from t1 l join t1 r on l.utf8_binary_lcase = r.utf8_binary_lcase +select l.utf8_binary, r.utf8_lcase from t1 l join t1 r on l.utf8_lcase = r.utf8_lcase -- !query analysis -Project [utf8_binary#x, utf8_binary_lcase#x] -+- Join Inner, (utf8_binary_lcase#x = utf8_binary_lcase#x) +Project [utf8_binary#x, utf8_lcase#x] ++- Join Inner, (utf8_lcase#x = utf8_lcase#x) :- SubqueryAlias l : +- SubqueryAlias spark_catalog.default.t1 - : +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + : +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet +- SubqueryAlias r +- SubqueryAlias spark_catalog.default.t1 - +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet -- !query -create table t2(utf8_binary string collate utf8_binary, utf8_binary_lcase string collate utf8_binary_lcase) using parquet +create table t2(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet -- !query analysis CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false @@ -117,28 +117,28 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false -- !query insert into t2 values('aaa', 'aaa') -- !query analysis -InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_lcase] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x] +- LocalRelation [col1#x, col2#x] -- !query insert into t2 values('bbb', 'bbb') -- !query analysis -InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_lcase] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x] +- LocalRelation [col1#x, col2#x] -- !query -select * from t1 anti join t2 on t1.utf8_binary_lcase = t2.utf8_binary_lcase +select * from t1 anti join t2 on t1.utf8_lcase = t2.utf8_lcase -- !query analysis -Project [utf8_binary#x, 
utf8_binary_lcase#x] -+- Join LeftAnti, (utf8_binary_lcase#x = utf8_binary_lcase#x) +Project [utf8_binary#x, utf8_lcase#x] ++- Join LeftAnti, (utf8_lcase#x = utf8_lcase#x) :- SubqueryAlias spark_catalog.default.t1 - : +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_binary_lcase#x] parquet + : +- Relation spark_catalog.default.t1[utf8_binary#x,utf8_lcase#x] parquet +- SubqueryAlias spark_catalog.default.t2 - +- Relation spark_catalog.default.t2[utf8_binary#x,utf8_binary_lcase#x] parquet + +- Relation spark_catalog.default.t2[utf8_binary#x,utf8_lcase#x] parquet -- !query @@ -156,75 +156,75 @@ DropTable false, false -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Except false -:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +- LocalRelation [col1#x] -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Except All true -:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +- LocalRelation [col1#x] -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Distinct +- Union false, false - :- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + :- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] : +- LocalRelation [col1#x] - +- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + +- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +- LocalRelation [col1#x] -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Union false, false -:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +- LocalRelation [col1#x] -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), 
('ZZZ') intersect select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Intersect false -:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +- LocalRelation [col1#x] -- !query -create table t1 (c1 struct<utf8_binary: string collate utf8_binary, utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET +create table t1 (c1 struct<utf8_binary: string collate utf8_binary, utf8_lcase: string collate utf8_lcase>) USING PARQUET -- !query analysis CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false -- !query -insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_binary_lcase', 'aaa')) +insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_lcase', 'aaa')) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [c1] -+- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, cast(col1#x.utf8_binary_lcase as string collate UTF8_BINARY_LCASE)) AS c1#x] ++- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_lcase, cast(col1#x.utf8_lcase as string collate UTF8_LCASE)) AS c1#x] +- LocalRelation [col1#x] -- !query -insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_binary_lcase', 'AAA')) +insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_lcase', 'AAA')) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [c1] -+- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, cast(col1#x.utf8_binary_lcase as string collate UTF8_BINARY_LCASE)) AS c1#x] ++- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_lcase, cast(col1#x.utf8_lcase as string collate UTF8_LCASE)) AS c1#x] +- LocalRelation [col1#x] @@ -237,9 +237,9 @@ Aggregate [c1#x.utf8_binary], [count(1) AS count(1)#xL] -- !query -select count(*) from t1 group by c1.utf8_binary_lcase +select count(*) from t1 group by c1.utf8_lcase -- !query analysis -Aggregate [c1#x.utf8_binary_lcase], [count(1) AS count(1)#xL] +Aggregate [c1#x.utf8_lcase], [count(1) AS count(1)#xL] +- SubqueryAlias spark_catalog.default.t1 +- Relation spark_catalog.default.t1[c1#x] parquet @@ -252,63 +252,140 @@ DropTable false, false -- !query -select array_contains(ARRAY('aaa' collate utf8_binary_lcase),'AAA' collate utf8_binary_lcase) +select array_contains(ARRAY('aaa' collate utf8_lcase),'AAA' collate utf8_lcase) -- !query analysis -Project [array_contains(array(collate(aaa, utf8_binary_lcase)), collate(AAA, utf8_binary_lcase)) AS array_contains(array(collate(aaa)), collate(AAA))#x] +Project [array_contains(array(collate(aaa, utf8_lcase)), collate(AAA, utf8_lcase)) AS array_contains(array(collate(aaa)), collate(AAA))#x] +- OneRowRelation -- !query -select array_position(ARRAY('aaa' collate utf8_binary_lcase, 'bbb' collate utf8_binary_lcase),'BBB' collate
utf8_binary_lcase) +select array_position(ARRAY('aaa' collate utf8_lcase, 'bbb' collate utf8_lcase),'BBB' collate utf8_lcase) -- !query analysis -Project [array_position(array(collate(aaa, utf8_binary_lcase), collate(bbb, utf8_binary_lcase)), collate(BBB, utf8_binary_lcase)) AS array_position(array(collate(aaa), collate(bbb)), collate(BBB))#xL] +Project [array_position(array(collate(aaa, utf8_lcase), collate(bbb, utf8_lcase)), collate(BBB, utf8_lcase)) AS array_position(array(collate(aaa), collate(bbb)), collate(BBB))#xL] +- OneRowRelation -- !query -select nullif('aaa' COLLATE utf8_binary_lcase, 'AAA' COLLATE utf8_binary_lcase) +select nullif('aaa' COLLATE utf8_lcase, 'AAA' COLLATE utf8_lcase) -- !query analysis -Project [nullif(collate(aaa, utf8_binary_lcase), collate(AAA, utf8_binary_lcase)) AS nullif(collate(aaa), collate(AAA))#x] +Project [nullif(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase)) AS nullif(collate(aaa), collate(AAA))#x] +- OneRowRelation -- !query -select least('aaa' COLLATE utf8_binary_lcase, 'AAA' collate utf8_binary_lcase, 'a' collate utf8_binary_lcase) +select least('aaa' COLLATE utf8_lcase, 'AAA' collate utf8_lcase, 'a' collate utf8_lcase) -- !query analysis -Project [least(collate(aaa, utf8_binary_lcase), collate(AAA, utf8_binary_lcase), collate(a, utf8_binary_lcase)) AS least(collate(aaa), collate(AAA), collate(a))#x] +Project [least(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase), collate(a, utf8_lcase)) AS least(collate(aaa), collate(AAA), collate(a))#x] +- OneRowRelation -- !query -select arrays_overlap(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select arrays_overlap(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [arrays_overlap(array(collate(aaa, utf8_binary_lcase)), array(collate(AAA, utf8_binary_lcase))) AS arrays_overlap(array(collate(aaa)), array(collate(AAA)))#x] +Project [arrays_overlap(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS arrays_overlap(array(collate(aaa)), array(collate(AAA)))#x] +- OneRowRelation -- !query -select array_distinct(array('aaa' collate utf8_binary_lcase, 'AAA' collate utf8_binary_lcase)) +select array_distinct(array('aaa' collate utf8_lcase, 'AAA' collate utf8_lcase)) -- !query analysis -Project [array_distinct(array(collate(aaa, utf8_binary_lcase), collate(AAA, utf8_binary_lcase))) AS array_distinct(array(collate(aaa), collate(AAA)))#x] +Project [array_distinct(array(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase))) AS array_distinct(array(collate(aaa), collate(AAA)))#x] +- OneRowRelation -- !query -select array_union(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select array_union(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [array_union(array(collate(aaa, utf8_binary_lcase)), array(collate(AAA, utf8_binary_lcase))) AS array_union(array(collate(aaa)), array(collate(AAA)))#x] +Project [array_union(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_union(array(collate(aaa)), array(collate(AAA)))#x] +- OneRowRelation -- !query -select array_intersect(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select array_intersect(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [array_intersect(array(collate(aaa, utf8_binary_lcase)), array(collate(AAA, utf8_binary_lcase))) AS array_intersect(array(collate(aaa)), array(collate(AAA)))#x] 
+Project [array_intersect(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_intersect(array(collate(aaa)), array(collate(AAA)))#x] +- OneRowRelation -- !query -select array_except(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select array_except(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [array_except(array(collate(aaa, utf8_binary_lcase)), array(collate(AAA, utf8_binary_lcase))) AS array_except(array(collate(aaa)), array(collate(AAA)))#x] +Project [array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_except(array(collate(aaa)), array(collate(AAA)))#x] ++- OneRowRelation + + +-- !query +select 'a' collate unicode < 'A' +-- !query analysis +Project [(collate(a, unicode) < cast(A as string collate UNICODE)) AS (collate(a) < A)#x] ++- OneRowRelation + + +-- !query +select 'a' collate unicode_ci = 'A' +-- !query analysis +Project [(collate(a, unicode_ci) = cast(A as string collate UNICODE_CI)) AS (collate(a) = A)#x] ++- OneRowRelation + + +-- !query +select 'a' collate unicode_ai = 'å' +-- !query analysis +Project [(collate(a, unicode_ai) = cast(å as string collate UNICODE_AI)) AS (collate(a) = å)#x] ++- OneRowRelation + + +-- !query +select 'a' collate unicode_ci_ai = 'Å' +-- !query analysis +Project [(collate(a, unicode_ci_ai) = cast(Å as string collate UNICODE_CI_AI)) AS (collate(a) = Å)#x] ++- OneRowRelation + + +-- !query +select 'a' collate en < 'A' +-- !query analysis +Project [(collate(a, en) < cast(A as string collate en)) AS (collate(a) < A)#x] ++- OneRowRelation + + +-- !query +select 'a' collate en_ci = 'A' +-- !query analysis +Project [(collate(a, en_ci) = cast(A as string collate en_CI)) AS (collate(a) = A)#x] ++- OneRowRelation + + +-- !query +select 'a' collate en_ai = 'å' +-- !query analysis +Project [(collate(a, en_ai) = cast(å as string collate en_AI)) AS (collate(a) = å)#x] ++- OneRowRelation + + +-- !query +select 'a' collate en_ci_ai = 'Å' +-- !query analysis +Project [(collate(a, en_ci_ai) = cast(Å as string collate en_CI_AI)) AS (collate(a) = Å)#x] ++- OneRowRelation + + +-- !query +select 'Kypper' collate sv < 'Köpfe' +-- !query analysis +Project [(collate(Kypper, sv) < cast(Köpfe as string collate sv)) AS (collate(Kypper) < Köpfe)#x] ++- OneRowRelation + + +-- !query +select 'Kypper' collate de > 'Köpfe' +-- !query analysis +Project [(collate(Kypper, de) > cast(Köpfe as string collate de)) AS (collate(Kypper) > Köpfe)#x] ++- OneRowRelation + + +-- !query +select 'I' collate tr_ci = 'ı' +-- !query analysis +Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I) = ı)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-aggregate.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-aggregate.sql.out index 39ab8aa835c48..b3bfec1fe3a8e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-aggregate.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-aggregate.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW v1 AS VALUES (1, 1, 1), (2, 2, 1) AS t(a, b, k) -- !query analysis -CreateViewCommand `v1`, VALUES (1, 1, 1), (2, 2, 1) AS t(a, b, k), false, false, LocalTempView, true +CreateViewCommand `v1`, VALUES (1, 1, 1), (2, 2, 1) AS t(a, b, k), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [a#x, b#x, k#x] @@ -10,7 +10,7 
@@ CreateViewCommand `v1`, VALUES (1, 1, 1), (2, 2, 1) AS t(a, b, k), false, false, -- !query CREATE TEMPORARY VIEW v2 AS VALUES (1, 1, 1), (2, 2, 1) AS t(x, y, all) -- !query analysis -CreateViewCommand `v2`, VALUES (1, 1, 1), (2, 2, 1) AS t(x, y, all), false, false, LocalTempView, true +CreateViewCommand `v2`, VALUES (1, 1, 1), (2, 2, 1) AS t(x, y, all), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [x#x, y#x, all#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-sort.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-sort.sql.out index e4f6a5935dc66..e6fc6b3cf0c8a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-sort.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/column-resolution-sort.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW v1 AS VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, k) -- !query analysis -CreateViewCommand `v1`, VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, k), false, false, LocalTempView, true +CreateViewCommand `v1`, VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, k), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [a#x, b#x, k#x] @@ -10,7 +10,7 @@ CreateViewCommand `v1`, VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, k), false, false, -- !query CREATE TEMPORARY VIEW v2 AS VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, all) -- !query analysis -CreateViewCommand `v2`, VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, all), false, false, LocalTempView, true +CreateViewCommand `v2`, VALUES (1, 2, 2), (2, 1, 1) AS t(a, b, all), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [a#x, b#x, all#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-negative.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-negative.sql.out index 7348313c970c4..f16d42fac7226 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-negative.sql.out @@ -387,7 +387,7 @@ SetCatalogAndNamespace -- !query CREATE VIEW v1 AS SELECT * FROM t1 -- !query analysis -CreateViewCommand `spark_catalog`.`mydb1`.`v1`, SELECT * FROM t1, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`mydb1`.`v1`, SELECT * FROM t1, false, false, PersistedView, COMPENSATION, true +- Project [i1#x] +- SubqueryAlias spark_catalog.mydb1.t1 +- Relation spark_catalog.mydb1.t1[i1#x] parquet @@ -435,7 +435,7 @@ SetCatalogAndNamespace -- !query CREATE TEMP VIEW v2 AS SELECT * FROM t1 -- !query analysis -CreateViewCommand `v2`, SELECT * FROM t1, false, false, LocalTempView, true +CreateViewCommand `v2`, SELECT * FROM t1, false, false, LocalTempView, UNSUPPORTED, true +- Project [i1#x] +- SubqueryAlias spark_catalog.mydb2.t1 +- Relation spark_catalog.mydb2.t1[i1#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-views.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-views.sql.out index ea852537903ee..96f6469a8fcb5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-views.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/columnresolution-views.sql.out @@ -2,7 +2,7 @@ -- !query CREATE OR REPLACE TEMPORARY VIEW view1 AS SELECT 2 AS i1 -- !query analysis -CreateViewCommand `view1`, SELECT 2 AS i1, false, true, 
LocalTempView, true +CreateViewCommand `view1`, SELECT 2 AS i1, false, true, LocalTempView, UNSUPPORTED, true +- Project [2 AS i1#x] +- OneRowRelation @@ -84,7 +84,7 @@ DropTempViewCommand view1 -- !query CREATE OR REPLACE GLOBAL TEMPORARY VIEW view1 as SELECT 1 as i1 -- !query analysis -CreateViewCommand `view1`, SELECT 1 as i1, false, true, GlobalTempView, true +CreateViewCommand `view1`, SELECT 1 as i1, false, true, GlobalTempView, UNSUPPORTED, true +- Project [1 AS i1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/count.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/count.sql.out index acfe447d8bf4e..732b714615792 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/count.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/count.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (1, 1), (null, 2), (1, null), (null, null) -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cross-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cross-join.sql.out index 6a1fec839af2b..818093ab4ca66 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cross-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cross-join.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nt1`, select * from values ("one", 1), ("two", 2), ("three", 3) - as nt1(k, v1), false, false, LocalTempView, true + as nt1(k, v1), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- LocalRelation [k#x, v1#x] @@ -27,7 +27,7 @@ CreateViewCommand `nt2`, select * from values ("one", 1), ("two", 22), ("one", 5) - as nt2(k, v2), false, false, LocalTempView, true + as nt2(k, v2), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v2#x] +- SubqueryAlias nt2 +- LocalRelation [k#x, v2#x] @@ -142,7 +142,7 @@ Project [key#x, key#x] -- !query create temporary view A(a, va) as select * from nt1 -- !query analysis -CreateViewCommand `A`, [(a,None), (va,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `A`, [(a,None), (va,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) @@ -155,7 +155,7 @@ CreateViewCommand `A`, [(a,None), (va,None)], select * from nt1, false, false, L -- !query create temporary view B(b, vb) as select * from nt1 -- !query analysis -CreateViewCommand `B`, [(b,None), (vb,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `B`, [(b,None), (vb,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) @@ -168,7 +168,7 @@ CreateViewCommand `B`, [(b,None), (vb,None)], select * from nt1, false, false, L -- !query create temporary view C(c, vc) as select * from nt1 -- !query analysis -CreateViewCommand `C`, [(c,None), (vc,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `C`, [(c,None), (vc,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) @@ -181,7 +181,7 @@ CreateViewCommand `C`, [(c,None), 
(vc,None)], select * from nt1, false, false, L -- !query create temporary view D(d, vd) as select * from nt1 -- !query analysis -CreateViewCommand `D`, [(d,None), (vd,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `D`, [(d,None), (vd,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/csv-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/csv-functions.sql.out index d12644b907123..4149f5f09947c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/csv-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/csv-functions.sql.out @@ -173,7 +173,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query CREATE TEMPORARY VIEW csvTable(csvField, a) AS SELECT * FROM VALUES ('1,abc', 'a') -- !query analysis -CreateViewCommand `csvTable`, [(csvField,None), (a,None)], SELECT * FROM VALUES ('1,abc', 'a'), false, false, LocalTempView, true +CreateViewCommand `csvTable`, [(csvField,None), (a,None)], SELECT * FROM VALUES ('1,abc', 'a'), false, false, LocalTempView, UNSUPPORTED, true +- Project [col1#x, col2#x] +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out index 4aac75ec45a93..0b539267e720f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out @@ -24,7 +24,7 @@ Project [col#x] -- !query CREATE TEMPORARY VIEW cte_view AS WITH s AS (SELECT 42 AS col) SELECT * FROM s -- !query analysis -CreateViewCommand `cte_view`, WITH s AS (SELECT 42 AS col) SELECT * FROM s, false, false, LocalTempView, true +CreateViewCommand `cte_view`, WITH s AS (SELECT 42 AS col) SELECT * FROM s, false, false, LocalTempView, UNSUPPORTED, true +- WithCTE :- CTERelationDef xxxx, false : +- SubqueryAlias s diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-legacy.sql.out index 594a30b054edd..f9b78e94236fb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-legacy.sql.out @@ -43,6 +43,30 @@ Project [scalar-subquery#x [] AS scalarsubquery()#x] +- OneRowRelation +-- !query +SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1) +-- !query analysis +Project [scalar-subquery#x [] AS scalarsubquery()#x] +: +- Project [1 AS 1#x] +: +- OneRowRelation ++- Range (0, 1, step=1) + + +-- !query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1) +-- !query analysis +Project [scalar-subquery#x [id#xL] AS scalarsubquery(id)#xL] +: +- Project [outer(id#xL)] +: +- OneRowRelation ++- Range (0, 1, step=1) + + -- !query SELECT * FROM ( diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out index f1a302b06f2a8..3a9fc5ea1297f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nested.sql.out @@ -58,6 +58,40 @@ Project [scalar-subquery#x [] AS scalarsubquery()#x] +- OneRowRelation +-- !query 
+SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1) +-- !query analysis +Project [scalar-subquery#x [id#xL] AS scalarsubquery(id)#x] +: +- WithCTE +: :- CTERelationDef xxxx, false +: : +- SubqueryAlias unreferenced +: : +- Project [outer(id#xL)] +: : +- OneRowRelation +: +- Project [1 AS 1#x] +: +- OneRowRelation ++- Range (0, 1, step=1) + + +-- !query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1) +-- !query analysis +Project [scalar-subquery#x [id#xL] AS scalarsubquery(id)#xL] +: +- WithCTE +: :- CTERelationDef xxxx, false +: : +- SubqueryAlias unreferenced +: : +- Project [1 AS 1#x] +: : +- OneRowRelation +: +- Project [outer(id#xL)] +: +- OneRowRelation ++- Range (0, 1, step=1) + + -- !query SELECT * FROM ( diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out index 6e55c6fa83cd9..e8640c3cbb6bd 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-nonlegacy.sql.out @@ -58,6 +58,40 @@ Project [scalar-subquery#x [] AS scalarsubquery()#x] +- OneRowRelation +-- !query +SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1) +-- !query analysis +Project [scalar-subquery#x [id#xL] AS scalarsubquery(id)#x] +: +- WithCTE +: :- CTERelationDef xxxx, false +: : +- SubqueryAlias unreferenced +: : +- Project [outer(id#xL)] +: : +- OneRowRelation +: +- Project [1 AS 1#x] +: +- OneRowRelation ++- Range (0, 1, step=1) + + +-- !query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1) +-- !query analysis +Project [scalar-subquery#x [id#xL] AS scalarsubquery(id)#xL] +: +- WithCTE +: :- CTERelationDef xxxx, false +: : +- SubqueryAlias unreferenced +: : +- Project [1 AS 1#x] +: : +- OneRowRelation +: +- Project [outer(id#xL)] +: +- OneRowRelation ++- Range (0, 1, step=1) + + -- !query SELECT * FROM ( diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out index 1d9d812875c44..155308ee0d7ea 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out @@ -2,7 +2,7 @@ -- !query create temporary view t as select * from values 0, 1, 2 as t(id) -- !query analysis -CreateViewCommand `t`, select * from values 0, 1, 2 as t(id), false, false, LocalTempView, true +CreateViewCommand `t`, select * from values 0, 1, 2 as t(id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias t +- LocalRelation [id#x] @@ -11,7 +11,7 @@ CreateViewCommand `t`, select * from values 0, 1, 2 as t(id), false, false, Loca -- !query create temporary view t2 as select * from values 0, 1 as t(id) -- !query analysis -CreateViewCommand `t2`, select * from values 0, 1 as t(id), false, false, LocalTempView, true +CreateViewCommand `t2`, select * from values 0, 1 as t(id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias t +- LocalRelation [id#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out index 3d20b9641f99d..48137e06467e8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out @@ -2,7 +2,7 @@ -- !query 
create temporary view date_view as select '2011-11-11' date_str, '1' int_str -- !query analysis -CreateViewCommand `date_view`, select '2011-11-11' date_str, '1' int_str, false, false, LocalTempView, true +CreateViewCommand `date_view`, select '2011-11-11' date_str, '1' int_str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 AS date_str#x, 1 AS int_str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting-legacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting-legacy.sql.out index bc33537b3a8e5..7c5f1260b6487 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting-legacy.sql.out @@ -16,7 +16,7 @@ CreateViewCommand `v`, select col from values (timestamp '1996-04-01 00:33:33.123Australia/Darwin'), (timestamp '2018-11-17 13:33:33.123Z'), (timestamp '2020-01-01 01:33:33.123Asia/Shanghai'), - (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col), false, false, LocalTempView, true + (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col), false, false, LocalTempView, UNSUPPORTED, true +- Project [col#x] +- SubqueryAlias t +- LocalRelation [col#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting.sql.out index bc33537b3a8e5..7c5f1260b6487 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-formatting.sql.out @@ -16,7 +16,7 @@ CreateViewCommand `v`, select col from values (timestamp '1996-04-01 00:33:33.123Australia/Darwin'), (timestamp '2018-11-17 13:33:33.123Z'), (timestamp '2020-01-01 01:33:33.123Asia/Shanghai'), - (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col), false, false, LocalTempView, true + (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col), false, false, LocalTempView, UNSUPPORTED, true +- Project [col#x] +- SubqueryAlias t +- LocalRelation [col#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out index 12a15cc9b8967..1e49f4df8267a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out @@ -2,7 +2,7 @@ -- !query create temporary view date_view as select '2011-11-11' date_str, '1' int_str -- !query analysis -CreateViewCommand `date_view`, select '2011-11-11' date_str, '1' int_str, false, false, LocalTempView, true +CreateViewCommand `date_view`, select '2011-11-11' date_str, '1' int_str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 AS date_str#x, 1 AS int_str#x] +- OneRowRelation @@ -1267,7 +1267,7 @@ create temporary view ttf1 as select * from values CreateViewCommand `ttf1`, select * from values (1, 2), (2, 3) - as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, true + as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, UNSUPPORTED, true +- Project [current_date#x, current_timestamp#x] +- SubqueryAlias ttf1 +- LocalRelation [current_date#x, current_timestamp#x] @@ -1294,7 +1294,7 @@ create temporary view ttf2 as select * from values CreateViewCommand `ttf2`, 
select * from values (1, 2), (2, 3) - as ttf2(a, b), false, false, LocalTempView, true + as ttf2(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias ttf2 +- LocalRelation [a#x, b#x] @@ -1642,7 +1642,7 @@ select null - timestamp'2011-11-11 11:11:11' -- !query create temporary view ts_view as select '2011-11-11 11:11:11' str -- !query analysis -CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, true +CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 11:11:11 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/decimalArithmeticOperations.sql.out index 5e6b27bc84411..4a2199033f819 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/decimalArithmeticOperations.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b -- !query analysis -CreateViewCommand `t`, SELECT 1.0 as a, 0.0 as b, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1.0 as a, 0.0 as b, false, false, LocalTempView, UNSUPPORTED, true +- Project [1.0 AS a#x, 0.0 AS b#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out index b9fe5c1d74c1d..ff0935bfd03ec 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/describe.sql.out @@ -12,7 +12,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false -- !query CREATE TEMPORARY VIEW temp_v AS SELECT * FROM t -- !query analysis -CreateViewCommand `temp_v`, SELECT * FROM t, false, false, LocalTempView, true +CreateViewCommand `temp_v`, SELECT * FROM t, false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x, c#x, d#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[a#x,b#x,c#x,d#x] parquet @@ -32,7 +32,7 @@ CreateTempViewUsing [tableIdent:`temp_Data_Source_View` replace:false provider:o -- !query CREATE VIEW v AS SELECT * FROM t -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x, c#x, d#x] +- SubqueryAlias spark_catalog.default.t +- Relation spark_catalog.default.t[a#x,b#x,c#x,d#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers.sql.out index d73b72eca3e21..f7b0e3370f9f4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/double-quoted-identifiers.sql.out @@ -276,7 +276,7 @@ Project [hello AS hello#x] -- !query CREATE TEMPORARY VIEW v(c1 COMMENT "hello") AS SELECT 1 -- !query analysis -CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, true +CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation @@ -304,7 
+304,7 @@ Project [hello AS hello#x] -- !query CREATE TEMPORARY VIEW v(c1 COMMENT 'hello') AS SELECT 1 -- !query analysis -CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, true +CreateViewCommand `v`, [(c1,Some(hello))], SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/except-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/except-all.sql.out index d16fc0beaf5da..3972cde51bb72 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/except-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/except-all.sql.out @@ -4,7 +4,7 @@ CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1) -- !query analysis CreateViewCommand `tab1`, SELECT * FROM VALUES - (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1), false, false, LocalTempView, true + (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1), false, false, LocalTempView, UNSUPPORTED, true +- Project [c1#x] +- SubqueryAlias tab1 +- LocalRelation [c1#x] @@ -15,7 +15,7 @@ CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (1), (2), (2), (3), (5), (5), (null) AS tab2(c1) -- !query analysis CreateViewCommand `tab2`, SELECT * FROM VALUES - (1), (2), (2), (3), (5), (5), (null) AS tab2(c1), false, false, LocalTempView, true + (1), (2), (2), (3), (5), (5), (null) AS tab2(c1), false, false, LocalTempView, UNSUPPORTED, true +- Project [c1#x] +- SubqueryAlias tab2 +- LocalRelation [c1#x] @@ -36,7 +36,7 @@ CreateViewCommand `tab3`, SELECT * FROM VALUES (1, 3), (2, 3), (2, 2) - AS tab3(k, v), false, false, LocalTempView, true + AS tab3(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab3 +- LocalRelation [k#x, v#x] @@ -57,7 +57,7 @@ CreateViewCommand `tab4`, SELECT * FROM VALUES (2, 2), (2, 2), (2, 20) - AS tab4(k, v), false, false, LocalTempView, true + AS tab4(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab4 +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/except.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/except.sql.out index 6244a8bed5609..9828956ab843d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/except.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/except.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `t1`, select * from values ("two", 2), ("three", 3), ("one", NULL) - as t1(k, v), false, false, LocalTempView, true + as t1(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias t1 +- LocalRelation [k#x, v#x] @@ -33,7 +33,7 @@ CreateViewCommand `t2`, select * from values ("one", 5), ("one", NULL), (NULL, 5) - as t2(k, v), false, false, LocalTempView, true + as t2(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias t2 +- LocalRelation [k#x, v#x] @@ -250,7 +250,7 @@ Except false -- !query CREATE OR REPLACE TEMPORARY VIEW t3 AS VALUES (decimal(1)) tbl(v) -- !query analysis -CreateViewCommand `t3`, VALUES (decimal(1)) tbl(v), false, true, LocalTempView, true +CreateViewCommand `t3`, VALUES (decimal(1)) tbl(v), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl +- LocalRelation [v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/execute-immediate.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/execute-immediate.sql.out index 1597a29a4d6c4..78bf1ccb1678c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/execute-immediate.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/execute-immediate.sql.out @@ -18,7 +18,7 @@ CreateViewCommand `tbl_view`, SELECT * FROM VALUES (50, 'name5', named_struct('f1', 5, 's2', named_struct('f2', 505, 'f3', 'e'))), (60, 'name6', named_struct('f1', 6, 's2', named_struct('f2', 606, 'f3', 'f'))), (70, 'name7', named_struct('f1', 7, 's2', named_struct('f2', 707, 'f3', 'g'))) -AS tbl_view(id, name, data), false, false, LocalTempView, true +AS tbl_view(id, name, data), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, name#x, data#x] +- SubqueryAlias tbl_view +- LocalRelation [id#x, name#x, data#x] @@ -54,7 +54,7 @@ SetCommand (spark.sql.ansi.enabled,Some(true)) -- !query EXECUTE IMMEDIATE 'CREATE TEMPORARY VIEW IDENTIFIER(:tblName) AS SELECT id, name FROM tbl_view' USING 'tbl_view_tmp' as tblName -- !query analysis -CreateViewCommand `tbl_view_tmp`, SELECT id, name FROM tbl_view, false, false, LocalTempView, true +CreateViewCommand `tbl_view_tmp`, SELECT id, name FROM tbl_view, false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, name#x] +- SubqueryAlias tbl_view +- View (`tbl_view`, [id#x, name#x, data#x]) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/explain-aqe.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/explain-aqe.sql.out index 26f40c1011140..3aea86b232cba 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/explain-aqe.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/explain-aqe.sql.out @@ -162,7 +162,7 @@ EXPLAIN FORMATTED CREATE VIEW explain_view AS SELECT key, val FROM explain_temp1 -- !query analysis -ExplainCommand 'CreateView SELECT key, val FROM explain_temp1, false, false, FormattedMode +ExplainCommand 'CreateView SELECT key, val FROM explain_temp1, false, false, COMPENSATION, FormattedMode -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/explain.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/explain.sql.out index 26f40c1011140..3aea86b232cba 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/explain.sql.out @@ -162,7 +162,7 @@ EXPLAIN FORMATTED CREATE VIEW explain_view AS SELECT key, val FROM explain_temp1 -- !query analysis -ExplainCommand 'CreateView SELECT key, val FROM explain_temp1, false, false, FormattedMode +ExplainCommand 'CreateView SELECT key, val FROM explain_temp1, false, false, COMPENSATION, FormattedMode -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/extract.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/extract.sql.out index 886d1f8616c77..c42f2db3f0f9f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/extract.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/extract.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c, to_timestamp_ntz('2011-05-06 07:08:09.1234567') as ntz, interval 10 year 20 month as i, interval 30 day 40 hour 50 minute 6.7890 second as j -- !query analysis -CreateViewCommand `t`, select '2011-05-06 07:08:09.1234567' as c, to_timestamp_ntz('2011-05-06 07:08:09.1234567') as ntz, interval 10 year 20 month as i, interval 30 day 40 hour 50 
minute 6.7890 second as j, false, false, LocalTempView, true +CreateViewCommand `t`, select '2011-05-06 07:08:09.1234567' as c, to_timestamp_ntz('2011-05-06 07:08:09.1234567') as ntz, interval 10 year 20 month as i, interval 30 day 40 hour 50 minute 6.7890 second as j, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-05-06 07:08:09.1234567 AS c#x, to_timestamp_ntz(2011-05-06 07:08:09.1234567, None, TimestampNTZType, Some(America/Los_Angeles), false) AS ntz#x, INTERVAL '11-8' YEAR TO MONTH AS i#x, INTERVAL '31 16:50:06.789' DAY TO SECOND AS j#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out index b692c5c374632..cdb6372bec099 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2) -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] @@ -75,7 +75,7 @@ AS courseSales(course, year, earnings) -- !query analysis CreateViewCommand `courseSales`, SELECT * FROM VALUES ("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) -AS courseSales(course, year, earnings), false, true, LocalTempView, true +AS courseSales(course, year, earnings), false, true, LocalTempView, UNSUPPORTED, true +- Project [course#x, year#x, earnings#x] +- SubqueryAlias courseSales +- LocalRelation [course#x, year#x, earnings#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-duckdb.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-duckdb.sql.out index 13649989b4127..5f1cbea709891 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-duckdb.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-duckdb.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `integers`, select * from values (0, 2), (1, 3), (1, NULL) - as integers(g, i), false, false, LocalTempView, true + as integers(g, i), false, false, LocalTempView, UNSUPPORTED, true +- Project [g#x, i#x] +- SubqueryAlias integers +- LocalRelation [g#x, i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out index 76ef52ecd8b63..da3f3de3fb448 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all-mosha.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `stuff`, select * from values (42, 9.75, 'hello world', '1970-08-07', '13.37', array(1,20,300)), (1337, 1.2345, 'oh no', '2000-01-01', '42.0', array(4000,50000,600000)), (42, 13.37, 'test', '1970-08-07', '1234567890', array(7000000,80000000,900000000)) - as stuff(i, f, s, t, d, a), false, false, LocalTempView, true + as stuff(i, f, s, t, d, a), false, false, LocalTempView, UNSUPPORTED, true +- Project [i#x, f#x, s#x, t#x, d#x, a#x] +- SubqueryAlias stuff +- LocalRelation [i#x, f#x, s#x, t#x, d#x, a#x] diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all.sql.out index 79de44697b5f6..c2c77db6c3b1c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-all.sql.out @@ -18,7 +18,7 @@ CreateViewCommand `data`, select * from values ("China", "Shanghai", "Shanghaiese", 5, 15.0), ("Korea", "Seoul", "Hyukjin", 6, 16.0), ("UK", "London", "Sean", 7, 17.0) - as data(country, city, name, id, power), false, false, LocalTempView, true + as data(country, city, name, id, power), false, false, LocalTempView, UNSUPPORTED, true +- Project [country#x, city#x, name#x, id#x, power#x] +- SubqueryAlias data +- LocalRelation [country#x, city#x, name#x, id#x, power#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-filter.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-filter.sql.out index 1e06a01dc07ea..94d39111b29ed 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-filter.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-filter.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] @@ -35,7 +35,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, true, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -58,7 +58,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, true, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, true, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -75,7 +75,7 @@ CreateViewCommand `FilterExpressionTestData`, SELECT * FROM VALUES (1, 2, "asd"), (3, 4, "fgh"), (5, 6, "jkl") -AS FilterExpressionTestData(num1, num2, str), false, true, LocalTempView, true +AS FilterExpressionTestData(num1, num2, str), false, true, LocalTempView, UNSUPPORTED, true +- Project [num1#x, num2#x, str#x] +- SubqueryAlias FilterExpressionTestData +- LocalRelation [num1#x, num2#x, str#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out index 6aa32ed3a8866..904b35559ced8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out @@ -16,7 +16,7 @@ CreateViewCommand `data`, select * from values (2, 2), (3, 1), (3, 2) - as data(a, b), false, false, LocalTempView, true + as 
data(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias data +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out index 324a0a366d85e..46b40e4515260 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] @@ -273,7 +273,7 @@ CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM V (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) -- !query analysis CreateViewCommand `testDataHasSameNameWithAlias`, SELECT * FROM VALUES -(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v), false, true, LocalTempView, true +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, a#x, v#x] +- SubqueryAlias testDataHasSameNameWithAlias +- LocalRelation [k#x, a#x, v#x] @@ -395,7 +395,7 @@ SELECT 1 FROM range(10) HAVING true -- !query analysis Filter cast(true as boolean) +- Aggregate [1 AS 1#x] - +- Range (0, 10, step=1, splits=None) + +- Range (0, 10, step=1) -- !query @@ -404,7 +404,7 @@ SELECT 1 FROM range(10) HAVING MAX(id) > 0 Project [1#x] +- Filter (max(id#xL)#xL > cast(0 as bigint)) +- Aggregate [1 AS 1#x, max(id#xL) AS max(id#xL)#xL] - +- Range (0, 10, step=1, splits=None) + +- Range (0, 10, step=1) -- !query @@ -435,7 +435,7 @@ SELECT 1 FROM range(10) HAVING true -- !query analysis Filter cast(true as boolean) +- Project [1 AS 1#x] - +- Range (0, 10, step=1, splits=None) + +- Range (0, 10, step=1) -- !query @@ -464,7 +464,7 @@ SELECT id FROM range(10) HAVING id > 0 -- !query analysis Filter (id#xL > cast(0 as bigint)) +- Project [id#xL] - +- Range (0, 10, step=1, splits=None) + +- Range (0, 10, step=1) -- !query @@ -486,7 +486,7 @@ CreateViewCommand `test_agg`, SELECT * FROM VALUES (2, true), (3, false), (3, null), (4, null), (4, null), - (5, null), (5, true), (5, false) AS test_agg(k, v), false, true, LocalTempView, true + (5, null), (5, true), (5, false) AS test_agg(k, v), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias test_agg +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out index b82275b5e3129..b73ee16c8bdef 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `grouping`, SELECT * FROM VALUES ("1", "2", "3", 1), ("4", "5", "6", 1), ("7", "8", "9", 1) - as grouping(a, b, c, d), false, false, LocalTempView, true + as grouping(a, b, c, d), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x, c#x, d#x] +- SubqueryAlias grouping +- LocalRelation [a#x, b#x, c#x, d#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out index 889ca3d244ccd..78cf1223da50d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `hav`, select * from values ("two", 2), ("three", 3), ("one", 5) - as hav(k, v), false, false, LocalTempView, true + as hav(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias hav +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/higher-order-functions.sql.out index ec6d7271cc235..1281b19eb2f86 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/higher-order-functions.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nested`, values (1, array(32, 97), array(array(12, 99), array(123, 42), array(1))), (2, array(77, -76), array(array(6, 96, 65), array(-1, -2))), (3, array(12), array(array(17))) - as t(x, ys, zs), false, true, LocalTempView, true + as t(x, ys, zs), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [x#x, ys#x, zs#x] @@ -35,6 +35,26 @@ org.apache.spark.sql.AnalysisException } +-- !query +select ceil(x -> x) as v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_LAMBDA_FUNCTION_CALL.NON_HIGHER_ORDER_FUNCTION", + "sqlState" : "42K0D", + "messageParameters" : { + "class" : "org.apache.spark.sql.catalyst.expressions.CeilExpressionBuilder$" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 19, + "fragment" : "ceil(x -> x)" + } ] +} + + -- !query select transform(zs, z -> z) as v from nested -- !query analysis @@ -258,7 +278,7 @@ create or replace temporary view nested as values CreateViewCommand `nested`, values (1, map(1, 1, 2, 2, 3, 3)), (2, map(4, 4, 5, 5, 6, 6)) - as t(x, ys), false, true, LocalTempView, true + as t(x, ys), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [x#x, ys#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out index c4221dd5773de..b3e2cd5ada950 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out @@ -198,7 +198,7 @@ Project [abs(c1#x) AS abs(c1)#x] SELECT * FROM IDENTIFIER('ra' || 'nge')(0, 1) -- !query analysis Project [id#xL] -+- Range (0, 1, step=1, splits=None) ++- Range (0, 1, step=1) -- !query @@ -394,7 +394,7 @@ DropTable true, false -- !query CREATE OR REPLACE VIEW IDENTIFIER('v')(c1) AS VALUES(1) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`v`, [(c1,None)], VALUES(1), false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`v`, [(c1,None)], VALUES(1), false, true, PersistedView, COMPENSATION, true +- LocalRelation [col1#x] @@ -424,7 +424,7 @@ DropTableCommand `spark_catalog`.`default`.`v`, false, true, false -- !query CREATE TEMPORARY VIEW IDENTIFIER('v')(c1) AS VALUES(1) -- !query analysis -CreateViewCommand `v`, [(c1,None)], VALUES(1), false, false, LocalTempView, true +CreateViewCommand `v`, [(c1,None)], VALUES(1), false, false, 
LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x] @@ -732,7 +732,7 @@ org.apache.spark.sql.AnalysisException -- !query -CREATE TABLE IDENTIFIER(1)(c1 INT) +CREATE TABLE IDENTIFIER(1)(c1 INT) USING csv -- !query analysis org.apache.spark.sql.AnalysisException { @@ -754,7 +754,7 @@ org.apache.spark.sql.AnalysisException -- !query -CREATE TABLE IDENTIFIER('a.b.c')(c1 INT) +CREATE TABLE IDENTIFIER('a.b.c')(c1 INT) USING csv -- !query analysis org.apache.spark.sql.AnalysisException { @@ -926,6 +926,65 @@ org.apache.spark.sql.catalyst.parser.ParseException } +-- !query +create temporary view identifier('v1') as (select my_col from (values (1), (2), (1) as (my_col)) group by 1) +-- !query analysis +CreateViewCommand `v1`, (select my_col from (values (1), (2), (1) as (my_col)) group by 1), false, false, LocalTempView, UNSUPPORTED, true + +- Aggregate [my_col#x], [my_col#x] + +- SubqueryAlias __auto_generated_subquery_name + +- SubqueryAlias as + +- LocalRelation [my_col#x] + + +-- !query +cache table identifier('t1') as (select my_col from (values (1), (2), (1) as (my_col)) group by 1) +-- !query analysis +CacheTableAsSelect t1, (select my_col from (values (1), (2), (1) as (my_col)) group by 1), false, true + +- Aggregate [my_col#x], [my_col#x] + +- SubqueryAlias __auto_generated_subquery_name + +- SubqueryAlias as + +- LocalRelation [my_col#x] + + +-- !query +create table identifier('t2') using csv as (select my_col from (values (1), (2), (1) as (my_col)) group by 1) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t2`, ErrorIfExists, [my_col] + +- Aggregate [my_col#x], [my_col#x] + +- SubqueryAlias __auto_generated_subquery_name + +- SubqueryAlias as + +- LocalRelation [my_col#x] + + +-- !query +insert into identifier('t2') select my_col from (values (3) as (my_col)) group by 1 +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [my_col] ++- Aggregate [my_col#x], [my_col#x] + +- SubqueryAlias __auto_generated_subquery_name + +- SubqueryAlias as + +- LocalRelation [my_col#x] + + +-- !query +drop view v1 +-- !query analysis +DropTempViewCommand v1 + + +-- !query +drop table t1 +-- !query analysis +DropTempViewCommand t1 + + +-- !query +drop table t2 +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t2 + + -- !query SELECT row_number() OVER IDENTIFIER('x.win') FROM VALUES(1) AS T(c1) WINDOW win AS (ORDER BY c1) -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-all.sql.out index dc36fc2be1a27..cf9c2e12cc72e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-all.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `ilike_all_table`, SELECT * FROM (VALUES ('gOOgle', '%oo%'), ('facebook', '%OO%'), ('liNkedin', '%In')) - as t1(company, pat), false, true, LocalTempView, true + as t1(company, pat), false, true, LocalTempView, UNSUPPORTED, true +- Project [company#x, pat#x] +- SubqueryAlias t1 +- Project [col1#x AS company#x, col2#x AS pat#x] diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-any.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-any.sql.out index ac25d786cda57..e20000f1463d1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-any.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ilike-any.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `ilike_any_table`, SELECT * FROM (VALUES ('Google', '%Oo%'), ('FaceBook', '%oO%'), ('linkedIn', '%IN')) - as t1(company, pat), false, true, LocalTempView, true + as t1(company, pat), false, true, LocalTempView, UNSUPPORTED, true +- Project [company#x, pat#x] +- SubqueryAlias t1 +- Project [col1#x AS company#x, col2#x AS pat#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/inner-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/inner-join.sql.out index 698b1056f73ee..ae123fa61e3d1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/inner-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/inner-join.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -11,7 +11,7 @@ CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -20,7 +20,7 @@ CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t3`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t3`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -29,7 +29,7 @@ CreateViewCommand `t3`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, fal -- !query CREATE TEMPORARY VIEW t4 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t4`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t4`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -43,7 +43,7 @@ SELECT a, 'b' AS tag FROM t2 -- !query analysis CreateViewCommand `ta`, SELECT a, 'a' AS tag FROM t1 UNION ALL -SELECT a, 'b' AS tag FROM t2, false, false, LocalTempView, true +SELECT a, 'b' AS tag FROM t2, false, false, LocalTempView, UNSUPPORTED, true +- Union false, false :- Project [a#x, a AS tag#x] : +- SubqueryAlias t1 @@ -69,7 +69,7 @@ SELECT a, 'b' AS tag FROM t4 -- !query analysis CreateViewCommand `tb`, SELECT a, 'a' AS tag FROM t3 UNION ALL -SELECT a, 'b' AS tag FROM t4, false, false, LocalTempView, true +SELECT a, 'b' AS tag FROM t4, false, false, LocalTempView, UNSUPPORTED, true +- Union 
false, false :- Project [a#x, a AS tag#x] : +- SubqueryAlias t3 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out index b4d791c2772b2..69b4001ff3481 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out @@ -18,7 +18,7 @@ CreateViewCommand `tab1`, SELECT * FROM VALUES (2, 3), (null, null), (null, null) - AS tab1(k, v), false, false, LocalTempView, true + AS tab1(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab1 +- LocalRelation [k#x, v#x] @@ -41,7 +41,7 @@ CreateViewCommand `tab2`, SELECT * FROM VALUES (3, 4), (null, null), (null, null) - AS tab2(k, v), false, false, LocalTempView, true + AS tab2(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab2 +- LocalRelation [k#x, v#x] @@ -593,7 +593,7 @@ SetCommand (spark.sql.legacy.setopsPrecedence.enabled,Some(false)) -- !query CREATE OR REPLACE TEMPORARY VIEW tab3 AS VALUES (decimal(1)), (decimal(2)) tbl3(v) -- !query analysis -CreateViewCommand `tab3`, VALUES (decimal(1)), (decimal(2)) tbl3(v), false, true, LocalTempView, true +CreateViewCommand `tab3`, VALUES (decimal(1)), (decimal(2)) tbl3(v), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl3 +- LocalRelation [v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out index 783255db337a1..290e55052931d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out @@ -1605,7 +1605,7 @@ Project [cast(cast(4 12:12:12 as timestamp) + INTERVAL '4 22:12' DAY TO MINUTE a -- !query create temporary view interval_view as select '1' str -- !query analysis -CreateViewCommand `interval_view`, select '1' str, false, false, LocalTempView, true +CreateViewCommand `interval_view`, select '1' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/join-empty-relation.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/join-empty-relation.sql.out index a4fae64952048..850255a065b3e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/join-empty-relation.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/join-empty-relation.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -11,7 +11,7 @@ CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -20,7 +20,7 @@ CreateViewCommand 
`t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW empty_table as SELECT a FROM t2 WHERE false -- !query analysis -CreateViewCommand `empty_table`, SELECT a FROM t2 WHERE false, false, false, LocalTempView, true +CreateViewCommand `empty_table`, SELECT a FROM t2 WHERE false, false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- Filter false +- SubqueryAlias t2 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out index a7bc1de1530a9..e81ee769f57d6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out @@ -2,28 +2,28 @@ -- !query CREATE VIEW t1(c1, c2) AS VALUES (0, 1), (1, 2) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t1`, [(c1,None), (c2,None)], VALUES (0, 1), (1, 2), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t1`, [(c1,None), (c2,None)], VALUES (0, 1), (1, 2), false, false, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] -- !query CREATE VIEW t2(c1, c2) AS VALUES (0, 2), (0, 3) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t2`, [(c1,None), (c2,None)], VALUES (0, 2), (0, 3), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t2`, [(c1,None), (c2,None)], VALUES (0, 2), (0, 3), false, false, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] -- !query CREATE VIEW t3(c1, c2) AS VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t3`, [(c1,None), (c2,None)], VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t3`, [(c1,None), (c2,None)], VALUES (0, ARRAY(0, 1)), (1, ARRAY(2)), (2, ARRAY()), (null, ARRAY(4)), false, false, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] -- !query CREATE VIEW t4(c1, c2) AS VALUES (0, 1), (0, 2), (1, 1), (1, 3) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t4`, [(c1,None), (c2,None)], VALUES (0, 1), (0, 2), (1, 1), (1, 3), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t4`, [(c1,None), (c2,None)], VALUES (0, 1), (0, 2), (1, 1), (1, 3), false, false, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] @@ -2315,7 +2315,7 @@ SELECT * FROM t1, LATERAL RANGE(3) -- !query analysis Project [c1#x, c2#x, id#xL] +- LateralJoin lateral-subquery#x [], Inner - : +- Range (0, 3, step=1, splits=None) + : +- Range (0, 3, step=1) +- SubqueryAlias spark_catalog.default.t1 +- View (`spark_catalog`.`default`.`t1`, [c1#x, c2#x]) +- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] @@ -2507,7 +2507,7 @@ CREATE OR REPLACE TEMPORARY VIEW array_struct(id, arr) AS VALUES CreateViewCommand `array_struct`, [(id,None), (arr,None)], VALUES (1, ARRAY(STRUCT(1, 'a'), STRUCT(2, 'b'))), (2, ARRAY()), - (3, ARRAY(STRUCT(3, 'c'))), false, true, LocalTempView, true + (3, ARRAY(STRUCT(3, 'c'))), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -2714,7 +2714,7 @@ CreateViewCommand `json_table`, [(key,None), (jstring,None)], VALUES ('3', '{"f1": 3, "f4": "4", "f3": "3", "f2": 2, "f5": 5.01}'), ('4', cast(null as string)), ('5', '{"f1": null, "f5": ""}'), - ('6', 
'[invalid JSON string]'), false, true, LocalTempView, true + ('6', '[invalid JSON string]'), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out index 48b3bb07ef37b..0d7c6b2056231 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out @@ -241,7 +241,7 @@ Project [c0#x, c1#x, c2#x, c3#x] -- !query CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a') -- !query analysis -CreateViewCommand `jsonTable`, [(jsonField,None), (a,None)], SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'), false, false, LocalTempView, true +CreateViewCommand `jsonTable`, [(jsonField,None), (a,None)], SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'), false, false, LocalTempView, UNSUPPORTED, true +- Project [col1#x, col2#x] +- LocalRelation [col1#x, col2#x] @@ -449,7 +449,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a') -- !query analysis -CreateViewCommand `jsonTable`, [(jsonField,None), (a,None)], SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'), false, false, LocalTempView, true +CreateViewCommand `jsonTable`, [(jsonField,None), (a,None)], SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'), false, false, LocalTempView, UNSUPPORTED, true +- Project [col1#x, col2#x] +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/like-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/like-all.sql.out index cc5967a3fc59c..dab3e27be69aa 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/like-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/like-all.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `like_all_table`, SELECT * FROM (VALUES ('google', '%oo%'), ('facebook', '%oo%'), ('linkedin', '%in')) - as t1(company, pat), false, true, LocalTempView, true + as t1(company, pat), false, true, LocalTempView, UNSUPPORTED, true +- Project [company#x, pat#x] +- SubqueryAlias t1 +- Project [col1#x AS company#x, col2#x AS pat#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/like-any.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/like-any.sql.out index 91adb137e3679..e075fe6c0a69c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/like-any.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/like-any.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `like_any_table`, SELECT * FROM (VALUES ('google', '%oo%'), ('facebook', '%oo%'), ('linkedin', '%in')) - as t1(company, pat), false, true, LocalTempView, true + as t1(company, pat), false, true, LocalTempView, UNSUPPORTED, true +- Project [company#x, pat#x] +- SubqueryAlias t1 +- Project [col1#x AS company#x, col2#x AS pat#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/limit.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/limit.sql.out index 19307fc859927..e92dcfbc069a7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/limit.sql.out @@ -191,7 +191,7 @@ Project [id#xL] +- GlobalLimit 5 +- LocalLimit 5 +- Project [id#xL] - +- Range (0, 10, 
step=1, splits=None) + +- Range (0, 10, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/linear-regression.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/linear-regression.sql.out index fa7d0d331eda3..5f130cd1d422c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/linear-regression.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/linear-regression.sql.out @@ -6,7 +6,7 @@ AS testRegression(k, y, x) -- !query analysis CreateViewCommand `testRegression`, SELECT * FROM VALUES (1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35) -AS testRegression(k, y, x), false, true, LocalTempView, true +AS testRegression(k, y, x), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, y#x, x#x] +- SubqueryAlias testRegression +- LocalRelation [k#x, y#x, x#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/math.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/math.sql.out index e4dd1994b2c9e..5fe1b69352f57 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/math.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/math.sql.out @@ -431,3 +431,80 @@ SELECT conv('-9223372036854775807', 36, 10) -- !query analysis Project [conv(-9223372036854775807, 36, 10, false) AS conv(-9223372036854775807, 36, 10)#x] +- OneRowRelation + + +-- !query +SELECT BIN(0) +-- !query analysis +Project [bin(cast(0 as bigint)) AS bin(0)#x] ++- OneRowRelation + + +-- !query +SELECT BIN(25) +-- !query analysis +Project [bin(cast(25 as bigint)) AS bin(25)#x] ++- OneRowRelation + + +-- !query +SELECT BIN(25L) +-- !query analysis +Project [bin(25) AS bin(25)#x] ++- OneRowRelation + + +-- !query +SELECT BIN(25.5) +-- !query analysis +Project [bin(cast(25.5 as bigint)) AS bin(25.5)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(0Y) +-- !query analysis +Project [positive(0) AS (+ 0)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(25) +-- !query analysis +Project [positive(25) AS (+ 25)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(-25L) +-- !query analysis +Project [positive(-25) AS (+ -25)#xL] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(25.5) +-- !query analysis +Project [positive(25.5) AS (+ 25.5)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE("25.5") +-- !query analysis +Project [positive(cast(25.5 as double)) AS (+ 25.5)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE("invalid") +-- !query analysis +Project [positive(cast(invalid as double)) AS (+ invalid)#x] ++- OneRowRelation + + +-- !query +SELECT POSITIVE(null) +-- !query analysis +Project [positive(cast(null as double)) AS (+ NULL)#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/misc-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/misc-functions.sql.out index 539a348584217..e30bdf12f4a36 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/misc-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/misc-functions.sql.out @@ -92,7 +92,7 @@ Project [assert_true(false, custom error message) AS assert_true(false, custom e -- !query CREATE TEMPORARY VIEW tbl_misc AS SELECT * FROM (VALUES (1), (8), (2)) AS T(v) -- !query analysis -CreateViewCommand `tbl_misc`, SELECT * FROM (VALUES (1), (8), (2)) AS T(v), false, false, LocalTempView, true +CreateViewCommand `tbl_misc`, SELECT * FROM (VALUES (1), (8), (2)) AS T(v), false, false, LocalTempView, UNSUPPORTED, true +- 
Project [v#x] +- SubqueryAlias T +- Project [col1#x AS v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out index 2508b9b5fdd98..d6ecbc72a7178 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/mode.sql.out @@ -38,7 +38,7 @@ CreateViewCommand `basic_pays`, SELECT * FROM VALUES ('Pamela Castillo','SCM',11303), ('Larry Bott','SCM',11798), ('Barry Jones','SCM',10586) -AS basic_pays(employee_name, department, salary), false, true, LocalTempView, true +AS basic_pays(employee_name, department, salary), false, true, LocalTempView, UNSUPPORTED, true +- Project [employee_name#x, department#x, salary#x] +- SubqueryAlias basic_pays +- LocalRelation [employee_name#x, department#x, salary#x] @@ -471,7 +471,7 @@ CreateViewCommand `intervals`, SELECT * FROM VALUES (2, INTERVAL '30' MONTH, INTERVAL '30' SECOND, INTERVAL '30' MINUTE), (3, INTERVAL '60' MONTH, INTERVAL '60' SECOND, INTERVAL '60' MINUTE), (4, null, null, null) -AS intervals(k, dt, ym, dt2), false, true, LocalTempView, true +AS intervals(k, dt, ym, dt2), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, dt#x, ym#x, dt2#x] +- SubqueryAlias intervals +- LocalRelation [k#x, dt#x, ym#x, dt2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out index f0b78dc46873b..72d9b454036d6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out @@ -58,7 +58,7 @@ CreateViewCommand `t2`, select * from values ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -192,9 +192,9 @@ org.apache.spark.sql.AnalysisException -- !query CREATE OR REPLACE TEMPORARY VIEW v AS SELECT id FROM range(0, 8) -- !query analysis -CreateViewCommand `v`, SELECT id FROM range(0, 8), false, true, LocalTempView, true +CreateViewCommand `v`, SELECT id FROM range(0, 8), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#xL] - +- Range (0, 8, step=1, splits=None) + +- Range (0, 8, step=1) -- !query @@ -205,7 +205,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", "sqlState" : "0A000", "messageParameters" : { - "treeNode" : "'Generate explode(table-argument#x []), false\n: +- SubqueryAlias v\n: +- View (`v`, [id#xL])\n: +- Project [cast(id#xL as bigint) AS id#xL]\n: +- Project [id#xL]\n: +- Range (0, 8, step=1, splits=None)\n+- OneRowRelation\n" + "treeNode" : "'Generate explode(table-argument#x []), false\n: +- SubqueryAlias v\n: +- View (`v`, [id#xL])\n: +- Project [cast(id#xL as bigint) AS id#xL]\n: +- 
Project [id#xL]\n: +- Range (0, 8, step=1)\n+- OneRowRelation\n" }, "queryContext" : [ { "objectType" : "", diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/natural-join.sql.out index f8da5c5196357..857c574af3d23 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/natural-join.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nt1`, select * from values ("one", 1), ("two", 2), ("three", 3) - as nt1(k, v1), false, false, LocalTempView, true + as nt1(k, v1), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- LocalRelation [k#x, v1#x] @@ -27,7 +27,7 @@ CreateViewCommand `nt2`, select * from values ("one", 1), ("two", 22), ("one", 5) - as nt2(k, v2), false, false, LocalTempView, true + as nt2(k, v2), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v2#x] +- SubqueryAlias nt2 +- LocalRelation [k#x, v2#x] @@ -44,7 +44,7 @@ CreateViewCommand `nt3`, select * from values ("one", 4), ("two", 5), ("one", 6) - as nt3(k, v3), false, false, LocalTempView, true + as nt3(k, v3), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v3#x] +- SubqueryAlias nt3 +- LocalRelation [k#x, v3#x] @@ -61,7 +61,7 @@ CreateViewCommand `nt4`, select * from values ("one", 7), ("two", 8), ("one", 9) - as nt4(k, v4), false, false, LocalTempView, true + as nt4(k, v4), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v4#x] +- SubqueryAlias nt4 +- LocalRelation [k#x, v4#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out index b80bed6f7c2aa..6b2c60f25bae3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/non-excludable-rule.sql.out @@ -13,11 +13,11 @@ SELECT -- !query analysis Project [scalar-subquery#x [] AS scalarsubquery()#xL, scalar-subquery#x [] AS scalarsubquery()#xL, scalar-subquery#x [] AS scalarsubquery()#xL] : :- Aggregate [min(id#xL) AS min(id)#xL] -: : +- Range (0, 10, step=1, splits=None) +: : +- Range (0, 10, step=1) : :- Aggregate [sum(id#xL) AS sum(id)#xL] -: : +- Range (0, 10, step=1, splits=None) +: : +- Range (0, 10, step=1) : +- Aggregate [count(distinct id#xL) AS count(DISTINCT id)#xL] -: +- Range (0, 10, step=1, splits=None) +: +- Range (0, 10, step=1) +- OneRowRelation @@ -40,15 +40,15 @@ WithCTE : +- SubqueryAlias tmp : +- Intersect false : :- Project [id#xL] -: : +- Range (0, 2, step=1, splits=None) +: : +- Range (0, 2, step=1) : +- Project [id#xL] -: +- Range (0, 4, step=1, splits=None) +: +- Range (0, 4, step=1) +- Project [id#xL] +- Filter (id#xL > scalar-subquery#x []) : +- Aggregate [max(id#xL) AS max(id)#xL] : +- SubqueryAlias tmp : +- CTERelationRef xxxx, true, [id#xL], false - +- Range (0, 3, step=1, splits=None) + +- Range (0, 3, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-all.sql.out index 729fea0cf858b..b1447307d8549 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-all.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `data`, select * from values (0, 2), (1, 
3), (1, NULL) - as data(g, i), false, false, LocalTempView, true + as data(g, i), false, false, LocalTempView, UNSUPPORTED, true +- Project [g#x, i#x] +- SubqueryAlias data +- LocalRelation [g#x, i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-ordinal.sql.out index 4bf21d02f9a88..f1f189517dea7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-ordinal.sql.out @@ -16,7 +16,7 @@ CreateViewCommand `data`, select * from values (2, 2), (3, 1), (3, 2) - as data(a, b), false, false, LocalTempView, true + as data(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias data +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/outer-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/outer-join.sql.out index 32088db51ac6c..1ee1cb1fb8fcd 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/outer-join.sql.out @@ -6,7 +6,7 @@ as t1(int_col1) -- !query analysis CreateViewCommand `t1`, SELECT * FROM VALUES (-234), (145), (367), (975), (298) -as t1(int_col1), false, true, LocalTempView, true +as t1(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t1 +- LocalRelation [int_col1#x] @@ -19,7 +19,7 @@ as t2(int_col0, int_col1) -- !query analysis CreateViewCommand `t2`, SELECT * FROM VALUES (-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158) -as t2(int_col0, int_col1), false, true, LocalTempView, true +as t2(int_col0, int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col0#x, int_col1#x] +- SubqueryAlias t2 +- LocalRelation [int_col0#x, int_col1#x] @@ -57,7 +57,7 @@ Filter (sum(coalesce(int_col1, int_col0))#xL > cast((coalesce(int_col1, int_col0 -- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1) -- !query analysis -CreateViewCommand `t1`, SELECT * FROM VALUES (97) as t1(int_col1), false, true, LocalTempView, true +CreateViewCommand `t1`, SELECT * FROM VALUES (97) as t1(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t1 +- LocalRelation [int_col1#x] @@ -66,7 +66,7 @@ CreateViewCommand `t1`, SELECT * FROM VALUES (97) as t1(int_col1), false, true, -- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1) -- !query analysis -CreateViewCommand `t2`, SELECT * FROM VALUES (0) as t2(int_col1), false, true, LocalTempView, true +CreateViewCommand `t2`, SELECT * FROM VALUES (0) as t2(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t2 +- LocalRelation [int_col1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out index e763ed48b1364..4a31cff8c7d0f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out @@ -6,7 +6,7 @@ AS aggr(k, v) -- !query analysis CreateViewCommand `aggr`, SELECT * FROM VALUES (0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null) -AS aggr(k, v), false, 
true, LocalTempView, true +AS aggr(k, v), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias aggr +- LocalRelation [k#x, v#x] @@ -51,7 +51,7 @@ CreateViewCommand `basic_pays`, SELECT * FROM VALUES ('Pamela Castillo','SCM',11303), ('Larry Bott','SCM',11798), ('Barry Jones','SCM',10586) -AS basic_pays(employee_name, department, salary), false, true, LocalTempView, true +AS basic_pays(employee_name, department, salary), false, true, LocalTempView, UNSUPPORTED, true +- Project [employee_name#x, department#x, salary#x] +- SubqueryAlias basic_pays +- LocalRelation [employee_name#x, department#x, salary#x] @@ -885,7 +885,7 @@ CreateViewCommand `intervals`, SELECT * FROM VALUES (2, INTERVAL '30' MONTH, INTERVAL '30' SECOND, INTERVAL '30' MINUTE), (3, INTERVAL '60' MONTH, INTERVAL '60' SECOND, INTERVAL '60' MINUTE), (4, null, null, null) -AS intervals(k, dt, ym, dt2), false, true, LocalTempView, true +AS intervals(k, dt, ym, dt2), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, dt#x, ym#x, dt2#x] +- SubqueryAlias intervals +- LocalRelation [k#x, dt#x, ym#x, dt2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/pivot.sql.out index c0a02dc29d606..93f2e240a0191 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/pivot.sql.out @@ -14,7 +14,7 @@ CreateViewCommand `courseSales`, select * from values ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) - as courseSales(course, year, earnings), false, false, LocalTempView, true + as courseSales(course, year, earnings), false, false, LocalTempView, UNSUPPORTED, true +- Project [course#x, year#x, earnings#x] +- SubqueryAlias courseSales +- LocalRelation [course#x, year#x, earnings#x] @@ -29,7 +29,7 @@ create temporary view years as select * from values CreateViewCommand `years`, select * from values (2012, 1), (2013, 2) - as years(y, s), false, false, LocalTempView, true + as years(y, s), false, false, LocalTempView, UNSUPPORTED, true +- Project [y#x, s#x] +- SubqueryAlias years +- LocalRelation [y#x, s#x] @@ -44,7 +44,7 @@ create temporary view yearsWithComplexTypes as select * from values CreateViewCommand `yearsWithComplexTypes`, select * from values (2012, array(1, 1), map('1', 1), struct(1, 'a')), (2013, array(2, 2), map('2', 2), struct(2, 'b')) - as yearsWithComplexTypes(y, a, m, s), false, false, LocalTempView, true + as yearsWithComplexTypes(y, a, m, s), false, false, LocalTempView, UNSUPPORTED, true +- Project [y#x, a#x, m#x, s#x] +- SubqueryAlias yearsWithComplexTypes +- LocalRelation [y#x, a#x, m#x, s#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out index 75eb7df4e2121..0577d73ea6a3c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out @@ -154,70 +154,70 @@ Aggregate [stddev_pop(cast(cast(3.0 as decimal(38,0)) as double)) AS stddev_pop( select sum(CAST(null AS int)) from range(1,4) -- !query analysis Aggregate [sum(cast(null as int)) AS sum(CAST(NULL AS INT))#xL] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(CAST(null AS long)) from range(1,4) -- !query analysis Aggregate 
[sum(cast(null as bigint)) AS sum(CAST(NULL AS BIGINT))#xL] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(CAST(null AS Decimal(38,0))) from range(1,4) -- !query analysis Aggregate [sum(cast(null as decimal(38,0))) AS sum(CAST(NULL AS DECIMAL(38,0)))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(CAST(null AS DOUBLE)) from range(1,4) -- !query analysis Aggregate [sum(cast(null as double)) AS sum(CAST(NULL AS DOUBLE))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(CAST(null AS int)) from range(1,4) -- !query analysis Aggregate [avg(cast(null as int)) AS avg(CAST(NULL AS INT))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(CAST(null AS long)) from range(1,4) -- !query analysis Aggregate [avg(cast(null as bigint)) AS avg(CAST(NULL AS BIGINT))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(CAST(null AS Decimal(38,0))) from range(1,4) -- !query analysis Aggregate [avg(cast(null as decimal(38,0))) AS avg(CAST(NULL AS DECIMAL(38,0)))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(CAST(null AS DOUBLE)) from range(1,4) -- !query analysis Aggregate [avg(cast(null as double)) AS avg(CAST(NULL AS DOUBLE))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(CAST('NaN' AS DOUBLE)) from range(1,4) -- !query analysis Aggregate [sum(cast(NaN as double)) AS sum(CAST(NaN AS DOUBLE))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(CAST('NaN' AS DOUBLE)) from range(1,4) -- !query analysis Aggregate [avg(cast(NaN as double)) AS avg(CAST(NaN AS DOUBLE))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query @@ -355,7 +355,7 @@ Aggregate [corr(cast(b#x as double), cast(a#x as double)) AS corr(b, a)#x] -- !query CREATE TEMPORARY VIEW regr_test AS SELECT * FROM VALUES (10,150),(20,250),(30,350),(80,540),(100,200) AS regr_test (x, y) -- !query analysis -CreateViewCommand `regr_test`, SELECT * FROM VALUES (10,150),(20,250),(30,350),(80,540),(100,200) AS regr_test (x, y), false, false, LocalTempView, true +CreateViewCommand `regr_test`, SELECT * FROM VALUES (10,150),(20,250),(30,350),(80,540),(100,200) AS regr_test (x, y), false, false, LocalTempView, UNSUPPORTED, true +- Project [x#x, y#x] +- SubqueryAlias regr_test +- LocalRelation [x#x, y#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part2.sql.out index 84c5a88e6b2aa..98b0a27f04505 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part2.sql.out @@ -14,7 +14,7 @@ CreateViewCommand `int4_tbl`, select * from values (-123456), (2147483647), (-2147483647) - as int4_tbl(f1), false, false, LocalTempView, true + as int4_tbl(f1), false, false, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias int4_tbl +- LocalRelation [f1#x] @@ -29,7 +29,7 @@ CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES CreateViewCommand `bitwise_test`, SELECT * FROM VALUES (1, 1, 1, 1L), (3, 3, 3, null), - (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4), false, true, LocalTempView, true + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4), false, true, LocalTempView, 
UNSUPPORTED, true +- Project [b1#x, b2#x, b3#x, b4#xL] +- SubqueryAlias bitwise_test +- LocalRelation [b1#x, b2#x, b3#x, b4#xL] @@ -178,7 +178,7 @@ CREATE OR REPLACE TEMPORARY VIEW bool_test AS SELECT * FROM VALUES CreateViewCommand `bool_test`, SELECT * FROM VALUES (TRUE, null, FALSE, null), (FALSE, TRUE, null, null), - (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4), false, true, LocalTempView, true + (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4), false, true, LocalTempView, UNSUPPORTED, true +- Project [b1#x, b2#x, b3#x, b4#x] +- SubqueryAlias bool_test +- LocalRelation [b1#x, b2#x, b3#x, b4#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/create_view.sql.out index b200f255ff13e..e4e4be8fee7d3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/create_view.sql.out @@ -16,7 +16,7 @@ CREATE VIEW toyemp AS FROM emp -- !query analysis CreateViewCommand `spark_catalog`.`default`.`toyemp`, SELECT name, age, /* location ,*/ 12*salary AS annualsal - FROM emp, false, false, PersistedView, true + FROM emp, false, false, PersistedView, COMPENSATION, true +- Project [name#x, age#x, (12 * salary#x) AS annualsal#x] +- SubqueryAlias spark_catalog.default.emp +- Relation spark_catalog.default.emp[name#x,age#x,salary#x,manager#x] parquet @@ -102,7 +102,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d CREATE OR REPLACE VIEW viewtest AS SELECT * FROM viewtest_tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT * FROM viewtest_tbl, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT * FROM viewtest_tbl, false, true, PersistedView, COMPENSATION, true +- Project [a#x, b#x] +- SubqueryAlias spark_catalog.default.viewtest_tbl +- Relation spark_catalog.default.viewtest_tbl[a#x,b#x] parquet @@ -112,7 +112,7 @@ CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT * FROM viewtest_t CREATE OR REPLACE VIEW viewtest AS SELECT * FROM viewtest_tbl WHERE a > 10 -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT * FROM viewtest_tbl WHERE a > 10, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT * FROM viewtest_tbl WHERE a > 10, false, true, PersistedView, COMPENSATION, true +- Project [a#x, b#x] +- Filter (a#x > 10) +- SubqueryAlias spark_catalog.default.viewtest_tbl @@ -136,7 +136,7 @@ Project [a#x, b#x] CREATE OR REPLACE VIEW viewtest AS SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC, false, true, PersistedView, COMPENSATION, true +- Sort [b#x DESC NULLS LAST], true +- Project [a#x, b#x] +- Filter (a#x > 5) @@ -162,7 +162,7 @@ Project [a#x, b#x] CREATE OR REPLACE VIEW viewtest AS SELECT a FROM viewtest_tbl WHERE a <> 20 -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a FROM viewtest_tbl WHERE a <> 20, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a FROM viewtest_tbl WHERE a <> 20, false, true, 
PersistedView, COMPENSATION, true +- Project [a#x] +- Filter NOT (a#x = 20) +- SubqueryAlias spark_catalog.default.viewtest_tbl @@ -173,7 +173,7 @@ CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a FROM viewtest_t CREATE OR REPLACE VIEW viewtest AS SELECT 1, * FROM viewtest_tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT 1, * FROM viewtest_tbl, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT 1, * FROM viewtest_tbl, false, true, PersistedView, COMPENSATION, true +- Project [1 AS 1#x, a#x, b#x] +- SubqueryAlias spark_catalog.default.viewtest_tbl +- Relation spark_catalog.default.viewtest_tbl[a#x,b#x] parquet @@ -183,7 +183,7 @@ CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT 1, * FROM viewtes CREATE OR REPLACE VIEW viewtest AS SELECT a, decimal(b) FROM viewtest_tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, decimal(b) FROM viewtest_tbl, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, decimal(b) FROM viewtest_tbl, false, true, PersistedView, COMPENSATION, true +- Project [a#x, cast(b#x as decimal(10,0)) AS b#x] +- SubqueryAlias spark_catalog.default.viewtest_tbl +- Relation spark_catalog.default.viewtest_tbl[a#x,b#x] parquet @@ -193,7 +193,7 @@ CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, decimal(b) FRO CREATE OR REPLACE VIEW viewtest AS SELECT a, b, 0 AS c FROM viewtest_tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, b, 0 AS c FROM viewtest_tbl, false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`viewtest`, SELECT a, b, 0 AS c FROM viewtest_tbl, false, true, PersistedView, COMPENSATION, true +- Project [a#x, b#x, 0 AS c#x] +- SubqueryAlias spark_catalog.default.viewtest_tbl +- Relation spark_catalog.default.viewtest_tbl[a#x,b#x] parquet @@ -243,7 +243,7 @@ CREATE TEMPORARY VIEW temp_table AS SELECT * FROM VALUES (1, 1) as temp_table(a, id) -- !query analysis CreateViewCommand `temp_table`, SELECT * FROM VALUES - (1, 1) as temp_table(a, id), false, false, LocalTempView, true + (1, 1) as temp_table(a, id), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, id#x] +- SubqueryAlias temp_table +- LocalRelation [a#x, id#x] @@ -252,7 +252,7 @@ CreateViewCommand `temp_table`, SELECT * FROM VALUES -- !query CREATE VIEW v1 AS SELECT * FROM base_table -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v1`, SELECT * FROM base_table, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v1`, SELECT * FROM base_table, false, false, PersistedView, COMPENSATION, true +- Project [a#x, id#x] +- SubqueryAlias spark_catalog.temp_view_test.base_table +- Relation spark_catalog.temp_view_test.base_table[a#x,id#x] parquet @@ -283,7 +283,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE TEMP VIEW v2_temp AS SELECT * FROM base_table -- !query analysis -CreateViewCommand `v2_temp`, SELECT * FROM base_table, false, false, LocalTempView, true +CreateViewCommand `v2_temp`, SELECT * FROM base_table, false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, id#x] +- SubqueryAlias spark_catalog.temp_view_test.base_table +- Relation spark_catalog.temp_view_test.base_table[a#x,id#x] parquet @@ -298,7 +298,7 @@ DescribeTableCommand `v2_temp`, true, [col_name#x, data_type#x, comment#x] -- !query CREATE VIEW temp_view_test.v2 AS SELECT * FROM 
base_table -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v2`, SELECT * FROM base_table, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v2`, SELECT * FROM base_table, false, false, PersistedView, COMPENSATION, true +- Project [a#x, id#x] +- SubqueryAlias spark_catalog.temp_view_test.base_table +- Relation spark_catalog.temp_view_test.base_table[a#x,id#x] parquet @@ -334,7 +334,7 @@ CREATE VIEW v3 AS -- !query analysis CreateViewCommand `spark_catalog`.`temp_view_test`.`v3`, SELECT t1.a AS t1_a, t2.a AS t2_a FROM base_table t1, base_table2 t2 - WHERE t1.id = t2.id, false, false, PersistedView, true + WHERE t1.id = t2.id, false, false, PersistedView, COMPENSATION, true +- Project [a#x AS t1_a#x, a#x AS t2_a#x] +- Filter (id#x = id#x) +- Join Inner @@ -393,7 +393,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE VIEW v4 AS SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v4`, SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v4`, SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2), false, false, PersistedView, COMPENSATION, true +- Project [a#x, id#x] +- Filter id#x IN (list#x []) : +- Project [id#x] @@ -412,7 +412,7 @@ DescribeTableCommand `spark_catalog`.`temp_view_test`.`v4`, true, [col_name#x, d -- !query CREATE VIEW v5 AS SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v5`, SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v5`, SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2, false, false, PersistedView, COMPENSATION, true +- Project [id#x, a#x] +- Join Inner :- SubqueryAlias t1 @@ -433,7 +433,7 @@ DescribeTableCommand `spark_catalog`.`temp_view_test`.`v5`, true, [col_name#x, d -- !query CREATE VIEW v6 AS SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v6`, SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v6`, SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2), false, false, PersistedView, COMPENSATION, true +- Project [a#x, id#x] +- Filter exists#x [] : +- Project [1 AS 1#x] @@ -452,7 +452,7 @@ DescribeTableCommand `spark_catalog`.`temp_view_test`.`v6`, true, [col_name#x, d -- !query CREATE VIEW v7 AS SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v7`, SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v7`, SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2), false, false, PersistedView, COMPENSATION, true +- Project [a#x, id#x] +- Filter NOT exists#x [] : +- Project [1 AS 1#x] @@ -471,7 +471,7 @@ DescribeTableCommand `spark_catalog`.`temp_view_test`.`v7`, true, [col_name#x, d -- !query CREATE VIEW v8 AS SELECT * FROM base_table WHERE EXISTS (SELECT 1) -- !query analysis -CreateViewCommand `spark_catalog`.`temp_view_test`.`v8`, SELECT * FROM base_table WHERE EXISTS 
(SELECT 1), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`temp_view_test`.`v8`, SELECT * FROM base_table WHERE EXISTS (SELECT 1), false, false, PersistedView, COMPENSATION, true +- Project [a#x, id#x] +- Filter exists#x [] : +- Project [1 AS 1#x] @@ -641,7 +641,7 @@ CREATE TEMP VIEW tt AS SELECT * FROM VALUES (1, 'a') AS tt(num2, value) -- !query analysis CreateViewCommand `tt`, SELECT * FROM VALUES - (1, 'a') AS tt(num2, value), false, false, LocalTempView, true + (1, 'a') AS tt(num2, value), false, false, LocalTempView, UNSUPPORTED, true +- Project [num2#x, value#x] +- SubqueryAlias tt +- LocalRelation [num2#x, value#x] @@ -650,7 +650,7 @@ CreateViewCommand `tt`, SELECT * FROM VALUES -- !query CREATE VIEW nontemp1 AS SELECT * FROM t1 CROSS JOIN t2 -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp1`, SELECT * FROM t1 CROSS JOIN t2, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp1`, SELECT * FROM t1 CROSS JOIN t2, false, false, PersistedView, COMPENSATION, true +- Project [num#x, name#x, num2#x, value#x] +- Join Cross :- SubqueryAlias spark_catalog.testviewschm2.t1 @@ -684,7 +684,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE VIEW nontemp2 AS SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp2`, SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp2`, SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2, false, false, PersistedView, COMPENSATION, true +- Project [num#x, name#x, num2#x, value#x] +- Join Inner, (num#x = num2#x) :- SubqueryAlias spark_catalog.testviewschm2.t1 @@ -718,7 +718,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE VIEW nontemp3 AS SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp3`, SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp3`, SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2, false, false, PersistedView, COMPENSATION, true +- Project [num#x, name#x, num2#x, value#x] +- Join LeftOuter, (num#x = num2#x) :- SubqueryAlias spark_catalog.testviewschm2.t1 @@ -752,7 +752,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE VIEW nontemp4 AS SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp4`, SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx', false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`nontemp4`, SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx', false, false, PersistedView, COMPENSATION, true +- Project [num#x, name#x, num2#x, value#x] +- Join LeftOuter, ((num#x = num2#x) AND (value#x = xxx)) :- SubqueryAlias spark_catalog.testviewschm2.t1 @@ -844,7 +844,7 @@ AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`pubview`, SELECT * FROM tbl1 WHERE tbl1.a BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) -AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f), false, false, PersistedView, true +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f), false, false, PersistedView, COMPENSATION, 
true +- Project [a#x, b#x] +- Filter (between(a#x, scalar-subquery#x [], scalar-subquery#x []) AND exists#x []) : :- Project [d#x] @@ -880,7 +880,7 @@ AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) CreateViewCommand `spark_catalog`.`testviewschm2`.`mytempview`, SELECT * FROM tbl1 WHERE tbl1.a BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) -AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j), false, false, PersistedView, true +AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j), false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x] +- Filter ((between(a#x, scalar-subquery#x [], scalar-subquery#x []) AND exists#x []) AND NOT exists#x []) : :- Project [d#x] @@ -925,7 +925,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`tt1`, SELECT * FROM ( VALUES ('abc', '0123456789', 42, 'abcd'), ('0123456789', 'abc', 42.12, 'abc') - ) vv(a,b,c,d), false, false, PersistedView, true + ) vv(a,b,c,d), false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x, c#x, d#x] +- SubqueryAlias vv +- Project [col1#x AS a#x, col2#x AS b#x, col3#x AS c#x, col4#x AS d#x] @@ -988,7 +988,7 @@ CREATE VIEW aliased_view_1 AS where exists (select 1 from tx1 where tt1.f1 = tx1.x1) -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`aliased_view_1`, select * from tt1 - where exists (select 1 from tx1 where tt1.f1 = tx1.x1), false, false, PersistedView, true + where exists (select 1 from tx1 where tt1.f1 = tx1.x1), false, false, PersistedView, COMPENSATION, true +- Project [f1#x, f2#x, f3#x] +- Filter exists#x [f1#x] : +- Project [1 AS 1#x] @@ -1005,7 +1005,7 @@ CREATE VIEW aliased_view_2 AS where exists (select 1 from tx1 where a1.f1 = tx1.x1) -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`aliased_view_2`, select * from tt1 a1 - where exists (select 1 from tx1 where a1.f1 = tx1.x1), false, false, PersistedView, true + where exists (select 1 from tx1 where a1.f1 = tx1.x1), false, false, PersistedView, COMPENSATION, true +- Project [f1#x, f2#x, f3#x] +- Filter exists#x [f1#x] : +- Project [1 AS 1#x] @@ -1023,7 +1023,7 @@ CREATE VIEW aliased_view_3 AS where exists (select 1 from tx1 a2 where tt1.f1 = a2.x1) -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`aliased_view_3`, select * from tt1 - where exists (select 1 from tx1 a2 where tt1.f1 = a2.x1), false, false, PersistedView, true + where exists (select 1 from tx1 a2 where tt1.f1 = a2.x1), false, false, PersistedView, COMPENSATION, true +- Project [f1#x, f2#x, f3#x] +- Filter exists#x [f1#x] : +- Project [1 AS 1#x] @@ -1041,7 +1041,7 @@ CREATE VIEW aliased_view_4 AS where exists (select 1 from tt1 where temp_view_test.tt1.y1 = tt1.f1) -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`aliased_view_4`, select * from temp_view_test.tt1 - where exists (select 1 from tt1 where temp_view_test.tt1.y1 = tt1.f1), false, false, PersistedView, true + where exists (select 1 from tt1 where temp_view_test.tt1.y1 = tt1.f1), false, false, PersistedView, COMPENSATION, true +- Project [y1#x, f2#x, f3#x] +- Filter exists#x [y1#x] : +- Project [1 AS 1#x] @@ -1180,7 +1180,7 @@ select * from -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`view_of_joins`, select * from (select * from (tbl1 cross join tbl2) same) ss, - (tbl3 cross join tbl4) same, false, false, PersistedView, true + (tbl3 cross join 
tbl4) same, false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x, c#x, d#x, e#x, f#x, g#x, h#x] +- Join Inner :- SubqueryAlias ss @@ -1220,7 +1220,7 @@ CreateDataSourceTableCommand `spark_catalog`.`testviewschm2`.`tt4`, false -- !query create view v1 as select * from tt2 natural join tt3 -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`v1`, select * from tt2 natural join tt3, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`v1`, select * from tt2 natural join tt3, false, false, PersistedView, COMPENSATION, true +- Project [b#x, c#x, a#x, ax#xL] +- Project [b#x, c#x, a#x, ax#xL] +- Join Inner, ((b#x = cast(b#x as int)) AND (cast(c#x as decimal(10,0)) = c#x)) @@ -1233,7 +1233,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`v1`, select * from tt2 natura -- !query create view v1a as select * from (tt2 natural join tt3) j -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`v1a`, select * from (tt2 natural join tt3) j, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`v1a`, select * from (tt2 natural join tt3) j, false, false, PersistedView, COMPENSATION, true +- Project [b#x, c#x, a#x, ax#xL] +- SubqueryAlias j +- Project [b#x, c#x, a#x, ax#xL] @@ -1247,7 +1247,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`v1a`, select * from (tt2 natu -- !query create view v2 as select * from tt2 join tt3 using (b,c) join tt4 using (b) -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`v2`, select * from tt2 join tt3 using (b,c) join tt4 using (b), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`v2`, select * from tt2 join tt3 using (b,c) join tt4 using (b), false, false, PersistedView, COMPENSATION, true +- Project [b#x, c#x, a#x, ax#xL, ay#x, q#x] +- Project [b#x, c#x, a#x, ax#xL, ay#x, q#x] +- Join Inner, (b#x = b#x) @@ -1264,7 +1264,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`v2`, select * from tt2 join t -- !query create view v2a as select * from (tt2 join tt3 using (b,c) join tt4 using (b)) j -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`v2a`, select * from (tt2 join tt3 using (b,c) join tt4 using (b)) j, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`v2a`, select * from (tt2 join tt3 using (b,c) join tt4 using (b)) j, false, false, PersistedView, COMPENSATION, true +- Project [b#x, c#x, a#x, ax#xL, ay#x, q#x] +- SubqueryAlias j +- Project [b#x, c#x, a#x, ax#xL, ay#x, q#x] @@ -1282,7 +1282,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`v2a`, select * from (tt2 join -- !query create view v3 as select * from tt2 join tt3 using (b,c) full join tt4 using (b) -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`v3`, select * from tt2 join tt3 using (b,c) full join tt4 using (b), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`v3`, select * from tt2 join tt3 using (b,c) full join tt4 using (b), false, false, PersistedView, COMPENSATION, true +- Project [b#x, c#x, a#x, ax#xL, ay#x, q#x] +- Project [coalesce(b#x, b#x) AS b#x, c#x, a#x, ax#xL, ay#x, q#x] +- Join FullOuter, (b#x = b#x) @@ -1438,7 +1438,7 @@ CreateDataSourceTableCommand `spark_catalog`.`testviewschm2`.`tt6`, false -- !query create view vv1 as select * from (tt5 cross join tt6) j(aa,bb,cc,dd) -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`vv1`, select * from (tt5 cross join tt6) 
j(aa,bb,cc,dd), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`vv1`, select * from (tt5 cross join tt6) j(aa,bb,cc,dd), false, false, PersistedView, COMPENSATION, true +- Project [aa#x, bb#x, cc#x, dd#x] +- SubqueryAlias j +- Project [a#x AS aa#x, b#x AS bb#x, c#x AS cc#x, d#x AS dd#x] @@ -1499,7 +1499,7 @@ select * from tt7 full join tt8 using (x), tt8 tt8x -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`vv2`, select * from (values(1,2,3,4,5)) v(a,b,c,d,e) union all -select * from tt7 full join tt8 using (x), tt8 tt8x, false, false, PersistedView, true +select * from tt7 full join tt8 using (x), tt8 tt8x, false, false, PersistedView, COMPENSATION, true +- Union false, false :- Project [a#x, b#x, c#x, d#x, e#x] : +- SubqueryAlias v @@ -1536,7 +1536,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`vv3`, select * from (values(1 union all select * from tt7 full join tt8 using (x), - tt7 tt7x full join tt8 tt8x using (x), false, false, PersistedView, true + tt7 tt7x full join tt8 tt8x using (x), false, false, PersistedView, COMPENSATION, true +- Union false, false :- Project [a#x, b#x, c#x, x#x, e#x, f#x] : +- SubqueryAlias v @@ -1578,7 +1578,7 @@ CreateViewCommand `spark_catalog`.`testviewschm2`.`vv4`, select * from (values(1 union all select * from tt7 full join tt8 using (x), - tt7 tt7x full join tt8 tt8x using (x) full join tt8 tt8y using (x), false, false, PersistedView, true + tt7 tt7x full join tt8 tt8x using (x) full join tt8 tt8y using (x), false, false, PersistedView, COMPENSATION, true +- Union false, false :- Project [a#x, b#x, c#x, x#x, e#x, f#x, g#x] : +- SubqueryAlias v @@ -1669,7 +1669,7 @@ select * from tt7a left join tt8a using (x), tt8a tt8ax -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`vv2a`, select * from (values(now(),2,3,now(),5)) v(a,b,c,d,e) union all -select * from tt7a left join tt8a using (x), tt8a tt8ax, false, false, PersistedView, true +select * from tt7a left join tt8a using (x), tt8a tt8ax, false, false, PersistedView, COMPENSATION, true +- Union false, false :- Project [a#x, b#x, c#x, d#x, e#x] : +- SubqueryAlias v @@ -1716,7 +1716,7 @@ CreateDataSourceTableCommand `spark_catalog`.`testviewschm2`.`tt10`, false -- !query create view vv5 as select x,y,z from tt9 join tt10 using(x) -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`vv5`, select x,y,z from tt9 join tt10 using(x), false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`vv5`, select x,y,z from tt9 join tt10 using(x), false, false, PersistedView, COMPENSATION, true +- Project [x#x, y#x, z#x] +- Project [x#x, xx#x, y#x, z#x] +- Join Inner, (x#x = x#x) @@ -1761,7 +1761,7 @@ create view vv6 as select x,y,z,q from (tt11 join tt12 using(x)) join tt13 using(z) -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`vv6`, select x,y,z,q from - (tt11 join tt12 using(x)) join tt13 using(z), false, false, PersistedView, true + (tt11 join tt12 using(x)) join tt13 using(z), false, false, PersistedView, COMPENSATION, true +- Project [x#x, y#x, z#x, q#x] +- Project [z#x, x#x, y#x, q#x] +- Join Inner, (z#x = z#x) @@ -1807,7 +1807,7 @@ create view tt18v as -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`tt18v`, select * from int8_tbl xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy union all - select * from int8_tbl xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz, false, false, PersistedView, true + select * 
from int8_tbl xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz, false, false, PersistedView, COMPENSATION, true +- Union false, false :- Project [q1#x, q2#x] : +- SubqueryAlias xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy @@ -1829,7 +1829,7 @@ DescribeTableCommand `spark_catalog`.`testviewschm2`.`tt18v`, false, [col_name#x create view tt21v as select * from tt5 natural inner join tt6 -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`tt21v`, select * from tt5 natural inner join tt6, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`tt21v`, select * from tt5 natural inner join tt6, false, false, PersistedView, COMPENSATION, true +- Project [c#x, a#x, b#x, cc#x, d#x] +- Project [c#x, a#x, b#x, cc#x, d#x] +- Join Inner, (c#x = c#x) @@ -1849,7 +1849,7 @@ DescribeTableCommand `spark_catalog`.`testviewschm2`.`tt21v`, false, [col_name#x create view tt22v as select * from tt5 natural left join tt6 -- !query analysis -CreateViewCommand `spark_catalog`.`testviewschm2`.`tt22v`, select * from tt5 natural left join tt6, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`testviewschm2`.`tt22v`, select * from tt5 natural left join tt6, false, false, PersistedView, COMPENSATION, true +- Project [c#x, a#x, b#x, cc#x, d#x] +- Project [c#x, a#x, b#x, cc#x, d#x] +- Join LeftOuter, (c#x = c#x) @@ -1873,7 +1873,7 @@ select 42, 43 -- !query analysis CreateViewCommand `spark_catalog`.`testviewschm2`.`tt23v`, [(col_a,None), (col_b,None)], select q1 as other_name1, q2 as other_name2 from int8_tbl union -select 42, 43, false, false, PersistedView, true +select 42, 43, false, false, PersistedView, COMPENSATION, true +- Distinct +- Union false, false :- Project [q1#x AS other_name1#x, q2#x AS other_name2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/float8.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/float8.sql.out index d30f971628e4d..2f2beda4f1cec 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/float8.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/float8.sql.out @@ -403,7 +403,7 @@ FROM FLOAT8_TBL -- !query analysis CreateViewCommand `UPDATED_FLOAT8_TBL`, SELECT CASE WHEN FLOAT8_TBL.f1 > '0.0' THEN FLOAT8_TBL.f1 * '-1' ELSE FLOAT8_TBL.f1 END AS f1 -FROM FLOAT8_TBL, false, false, LocalTempView, true +FROM FLOAT8_TBL, false, false, LocalTempView, UNSUPPORTED, true +- Project [CASE WHEN (f1#x > cast(0.0 as double)) THEN (f1#x * cast(-1 as double)) ELSE f1#x END AS f1#x] +- SubqueryAlias spark_catalog.default.float8_tbl +- Relation spark_catalog.default.float8_tbl[f1#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/groupingsets.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/groupingsets.sql.out index 173a324dc456c..27e9707425833 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/groupingsets.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/groupingsets.sql.out @@ -9,7 +9,7 @@ create temp view gstest1(a,b,v) CreateViewCommand `gstest1`, [(a,None), (b,None), (v,None)], values (1,1,10),(1,1,11),(1,2,12),(1,2,13),(1,3,14), (2,3,15), (3,3,16),(3,4,17), - (4,1,18),(4,1,19), false, false, LocalTempView, true + (4,1,18),(4,1,19), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x] @@ -320,7 +320,7 @@ CreateViewCommand 
`int8_tbl`, SELECT * FROM VALUES (123L, 4567890123456789L), (4567890123456789L, 123L), (4567890123456789L, 4567890123456789L), - (4567890123456789L, -4567890123456789L) as int8_tbl(q1, q2), false, false, LocalTempView, true + (4567890123456789L, -4567890123456789L) as int8_tbl(q1, q2), false, false, LocalTempView, UNSUPPORTED, true +- Project [q1#xL, q2#xL] +- SubqueryAlias int8_tbl +- LocalRelation [q1#xL, q2#xL] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/int8.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/int8.sql.out index c7f3f7bdbbb80..72972469fa6ef 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/int8.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/int8.sql.out @@ -626,7 +626,7 @@ Project [q1#xL, q2#xL, (q1#xL & q2#xL) AS and#xL, (q1#xL | q2#xL) AS or#xL, ~q1# SELECT * FROM range(bigint('+4567890123456789'), bigint('+4567890123456799')) -- !query analysis Project [id#xL] -+- Range (4567890123456789, 4567890123456799, step=1, splits=None) ++- Range (4567890123456789, 4567890123456799, step=1) -- !query @@ -653,7 +653,7 @@ org.apache.spark.sql.AnalysisException SELECT * FROM range(bigint('+4567890123456789'), bigint('+4567890123456799'), 2) -- !query analysis Project [id#xL] -+- Range (4567890123456789, 4567890123456799, step=2, splits=None) ++- Range (4567890123456789, 4567890123456799, step=2) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out index 0147e84cb5a9c..37ec8291c4e4b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/join.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `INT2_TBL`, [(f1,None)], VALUES (smallint(trim(' 1234 '))), (smallint(trim(' -1234'))), (smallint('32767')), - (smallint('-32767')), false, true, LocalTempView, true + (smallint('-32767')), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x] @@ -23,7 +23,7 @@ CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM -- !query analysis CreateViewCommand `INT4_TBL`, SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -47,7 +47,7 @@ CreateViewCommand `INT8_TBL`, SELECT * FROM (4567890123456789, 123), (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) - AS v(q1, q2), false, true, LocalTempView, true + AS v(q1, q2), false, true, LocalTempView, UNSUPPORTED, true +- Project [q1#xL, q2#xL] +- SubqueryAlias v +- Project [col1#xL AS q1#xL, col2#xL AS q2#xL] @@ -63,7 +63,7 @@ CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM CreateViewCommand `FLOAT8_TBL`, SELECT * FROM (VALUES (0.0), (1004.30), (-34.84), (cast('1.2345678901234e+200' as double)), (cast('1.2345678901234e-200' as double))) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -77,7 +77,7 @@ CREATE OR REPLACE TEMPORARY VIEW TEXT_TBL AS SELECT * FROM -- !query analysis CreateViewCommand `TEXT_TBL`, SELECT * FROM (VALUES ('doh!'), ('hi de ho neighbor')) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, 
LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -87,7 +87,7 @@ CreateViewCommand `TEXT_TBL`, SELECT * FROM -- !query CREATE OR REPLACE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 -- !query analysis -CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, true, LocalTempView, true +CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, true, LocalTempView, UNSUPPORTED, true +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x] +- SubqueryAlias spark_catalog.default.tenk1 +- Relation spark_catalog.default.tenk1[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet @@ -1001,7 +1001,7 @@ create or replace temporary view x as select * from -- !query analysis CreateViewCommand `x`, select * from (values (1,11), (2,22), (3,null), (4,44), (5,null)) - as v(x1, x2), false, true, LocalTempView, true + as v(x1, x2), false, true, LocalTempView, UNSUPPORTED, true +- Project [x1#x, x2#x] +- SubqueryAlias v +- Project [col1#x AS x1#x, col2#x AS x2#x] @@ -1015,7 +1015,7 @@ create or replace temporary view y as select * from -- !query analysis CreateViewCommand `y`, select * from (values (1,111), (2,222), (3,333), (4,null)) - as v(y1, y2), false, true, LocalTempView, true + as v(y1, y2), false, true, LocalTempView, UNSUPPORTED, true +- Project [y1#x, y2#x] +- SubqueryAlias v +- Project [col1#x AS y1#x, col2#x AS y2#x] @@ -1625,7 +1625,7 @@ create or replace temporary view tt1 as select * from -- !query analysis CreateViewCommand `tt1`, select * from (values (1, 11), (2, NULL)) - as v(tt1_id, joincol), false, true, LocalTempView, true + as v(tt1_id, joincol), false, true, LocalTempView, UNSUPPORTED, true +- Project [tt1_id#x, joincol#x] +- SubqueryAlias v +- Project [col1#x AS tt1_id#x, col2#x AS joincol#x] @@ -1639,7 +1639,7 @@ create or replace temporary view tt2 as select * from -- !query analysis CreateViewCommand `tt2`, select * from (values (21, 11), (22, 11)) - as v(tt2_id, joincol), false, true, LocalTempView, true + as v(tt2_id, joincol), false, true, LocalTempView, UNSUPPORTED, true +- Project [tt2_id#x, joincol#x] +- SubqueryAlias v +- Project [col1#x AS tt2_id#x, col2#x AS joincol#x] @@ -1710,13 +1710,13 @@ create or replace temporary view tt3 as select * from -- !query analysis CreateViewCommand `tt3`, select * from (SELECT cast(x.id as int), repeat('xyzzy', 100) FROM range(1,10001) x) - as v(f1, f2), false, true, LocalTempView, true + as v(f1, f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x, f2#x] +- SubqueryAlias v +- Project [id#x AS f1#x, repeat(xyzzy, 100)#x AS f2#x] +- Project [cast(id#xL as int) AS id#x, repeat(xyzzy, 100) AS repeat(xyzzy, 100)#x] +- SubqueryAlias x - +- Range (1, 10001, step=1, splits=None) + +- Range (1, 10001, step=1) -- !query @@ -1726,7 +1726,7 @@ create or replace temporary view tt4 as select * from -- !query analysis CreateViewCommand `tt4`, select * from (values (0), (1), (9999)) - as v(f1), false, true, LocalTempView, true + as v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -1767,7 +1767,7 @@ Project [f1#x] : +- Project [id#x AS f1#x, repeat(xyzzy, 100)#x AS f2#x] : +- Project [cast(id#xL as int) AS id#x, repeat(xyzzy, 100) AS repeat(xyzzy, 100)#x] : +- SubqueryAlias x - : +- Range (1, 10001, 
step=1, splits=None) + : +- Range (1, 10001, step=1) +- SubqueryAlias c +- SubqueryAlias tt3 +- View (`tt3`, [f1#x, f2#x]) @@ -1777,7 +1777,7 @@ Project [f1#x] +- Project [id#x AS f1#x, repeat(xyzzy, 100)#x AS f2#x] +- Project [cast(id#xL as int) AS id#x, repeat(xyzzy, 100) AS repeat(xyzzy, 100)#x] +- SubqueryAlias x - +- Range (1, 10001, step=1, splits=None) + +- Range (1, 10001, step=1) -- !query @@ -1787,7 +1787,7 @@ create or replace temporary view tt5 as select * from -- !query analysis CreateViewCommand `tt5`, select * from (values (1, 10), (1, 11)) - as v(f1, f2), false, true, LocalTempView, true + as v(f1, f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x, f2#x] +- SubqueryAlias v +- Project [col1#x AS f1#x, col2#x AS f2#x] @@ -1801,7 +1801,7 @@ create or replace temporary view tt6 as select * from -- !query analysis CreateViewCommand `tt6`, select * from (values (1, 9), (1, 2), (2, 9)) - as v(f1, f2), false, true, LocalTempView, true + as v(f1, f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x, f2#x] +- SubqueryAlias v +- Project [col1#x AS f1#x, col2#x AS f2#x] @@ -1837,7 +1837,7 @@ create or replace temporary view xx as select * from -- !query analysis CreateViewCommand `xx`, select * from (values (1), (2), (3)) - as v(pkxx), false, true, LocalTempView, true + as v(pkxx), false, true, LocalTempView, UNSUPPORTED, true +- Project [pkxx#x] +- SubqueryAlias v +- Project [col1#x AS pkxx#x] @@ -1851,7 +1851,7 @@ create or replace temporary view yy as select * from -- !query analysis CreateViewCommand `yy`, select * from (values (101, 1), (201, 2), (301, NULL)) - as v(pkyy, pkxx), false, true, LocalTempView, true + as v(pkyy, pkxx), false, true, LocalTempView, UNSUPPORTED, true +- Project [pkyy#x, pkxx#x] +- SubqueryAlias v +- Project [col1#x AS pkyy#x, col2#x AS pkxx#x] @@ -1912,7 +1912,7 @@ create or replace temporary view zt1 as select * from -- !query analysis CreateViewCommand `zt1`, select * from (values (53)) - as v(f1), false, true, LocalTempView, true + as v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -1926,7 +1926,7 @@ create or replace temporary view zt2 as select * from -- !query analysis CreateViewCommand `zt2`, select * from (values (53)) - as v(f2), false, true, LocalTempView, true + as v(f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f2#x] +- SubqueryAlias v +- Project [col1#x AS f2#x] @@ -1971,7 +1971,7 @@ Project [f2#x, f3#x, f1#x] -- !query create temp view zv1 as select *,'dummy' AS junk from zt1 -- !query analysis -CreateViewCommand `zv1`, select *,'dummy' AS junk from zt1, false, false, LocalTempView, true +CreateViewCommand `zv1`, select *,'dummy' AS junk from zt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [f1#x, dummy AS junk#x] +- SubqueryAlias zt1 +- View (`zt1`, [f1#x]) @@ -2146,7 +2146,7 @@ create or replace temporary view a as select * from -- !query analysis CreateViewCommand `a`, select * from (values ('p'), ('q')) - as v(code), false, true, LocalTempView, true + as v(code), false, true, LocalTempView, UNSUPPORTED, true +- Project [code#x] +- SubqueryAlias v +- Project [col1#x AS code#x] @@ -2160,7 +2160,7 @@ create or replace temporary view b as select * from -- !query analysis CreateViewCommand `b`, select * from (values ('p', 1), ('p', 2)) - as v(a, num), false, true, LocalTempView, true + as v(a, num), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, num#x] +- SubqueryAlias v +- Project [col1#x AS 
a#x, col2#x AS num#x] @@ -2174,7 +2174,7 @@ create or replace temporary view c as select * from -- !query analysis CreateViewCommand `c`, select * from (values ('A', 'p'), ('B', 'q'), ('C', null)) - as v(name, a), false, true, LocalTempView, true + as v(name, a), false, true, LocalTempView, UNSUPPORTED, true +- Project [name#x, a#x] +- SubqueryAlias v +- Project [col1#x AS name#x, col2#x AS a#x] @@ -2346,7 +2346,7 @@ create or replace temporary view nt1 as select * from -- !query analysis CreateViewCommand `nt1`, select * from (values(1,true,true), (2,true,false), (3,false,false)) - as v(id, a1, a2), false, true, LocalTempView, true + as v(id, a1, a2), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, a1#x, a2#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS a1#x, col3#x AS a2#x] @@ -2360,7 +2360,7 @@ create or replace temporary view nt2 as select * from -- !query analysis CreateViewCommand `nt2`, select * from (values(1,1,true,true), (2,2,true,false), (3,3,false,false)) - as v(id, nt1_id, b1, b2), false, true, LocalTempView, true + as v(id, nt1_id, b1, b2), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, nt1_id#x, b1#x, b2#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS nt1_id#x, col3#x AS b1#x, col4#x AS b2#x] @@ -2374,7 +2374,7 @@ create or replace temporary view nt3 as select * from -- !query analysis CreateViewCommand `nt3`, select * from (values(1,1,true), (2,2,false), (3,3,true)) - as v(id, nt2_id, c1), false, true, LocalTempView, true + as v(id, nt2_id, c1), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, nt2_id#x, c1#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS nt2_id#x, col3#x AS c1#x] @@ -3050,7 +3050,7 @@ create or replace temporary view parent as select * from -- !query analysis CreateViewCommand `parent`, select * from (values (1, 10), (2, 20), (3, 30)) - as v(k, pd), false, true, LocalTempView, true + as v(k, pd), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, pd#x] +- SubqueryAlias v +- Project [col1#x AS k#x, col2#x AS pd#x] @@ -3064,7 +3064,7 @@ create or replace temporary view child as select * from -- !query analysis CreateViewCommand `child`, select * from (values (1, 100), (4, 400)) - as v(k, cd), false, true, LocalTempView, true + as v(k, cd), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, cd#x] +- SubqueryAlias v +- Project [col1#x AS k#x, col2#x AS cd#x] @@ -3189,7 +3189,7 @@ create or replace temporary view a as select * from -- !query analysis CreateViewCommand `a`, select * from (values (0), (1)) - as v(id), false, true, LocalTempView, true + as v(id), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias v +- Project [col1#x AS id#x] @@ -3203,7 +3203,7 @@ create or replace temporary view b as select * from -- !query analysis CreateViewCommand `b`, select * from (values (0, 0), (1, NULL)) - as v(id, a_id), false, true, LocalTempView, true + as v(id, a_id), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, a_id#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS a_id#x] @@ -3261,7 +3261,7 @@ create or replace temporary view innertab as select * from -- !query analysis CreateViewCommand `innertab`, select * from (values (123L, 42L)) - as v(id, dat1), false, true, LocalTempView, true + as v(id, dat1), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#xL, dat1#xL] +- SubqueryAlias v +- Project [col1#xL AS id#xL, col2#xL AS dat1#xL] diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/limit.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/limit.sql.out index c25002d7a6be6..2a5062bd65b72 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/limit.sql.out @@ -129,7 +129,7 @@ CreateViewCommand `INT8_TBL`, SELECT * FROM (4567890123456789, 123), (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) - AS v(q1, q2), false, true, LocalTempView, true + AS v(q1, q2), false, true, LocalTempView, UNSUPPORTED, true +- Project [q1#xL, q2#xL] +- SubqueryAlias v +- Project [col1#xL AS q1#xL, col2#xL AS q2#xL] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/numeric.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/numeric.sql.out index 418e3b2626f89..6c2ae23291755 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/numeric.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/numeric.sql.out @@ -4775,21 +4775,21 @@ Project [EXP(cast(1234.5678 as double)) AS EXP(1234.5678)#x] select * from range(cast(0.0 as decimal(38, 18)), cast(4.0 as decimal(38, 18))) -- !query analysis Project [id#xL] -+- Range (0, 4, step=1, splits=None) ++- Range (0, 4, step=1) -- !query select * from range(cast(0.1 as decimal(38, 18)), cast(4.0 as decimal(38, 18)), cast(1.3 as decimal(38, 18))) -- !query analysis Project [id#xL] -+- Range (0, 4, step=1, splits=None) ++- Range (0, 4, step=1) -- !query select * from range(cast(4.0 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), cast(-2.2 as decimal(38, 18))) -- !query analysis Project [id#xL] -+- Range (4, -1, step=-2, splits=None) ++- Range (4, -1, step=-2) -- !query @@ -4894,14 +4894,14 @@ Project [LOG(cast(1.000016 as double), 8.45201E18) AS LOG(1.000016, 8.45201E18)# SELECT SUM(decimal(9999)) FROM range(1, 100001) -- !query analysis Aggregate [sum(cast(9999 as decimal(10,0))) AS sum(9999)#x] -+- Range (1, 100001, step=1, splits=None) ++- Range (1, 100001, step=1) -- !query SELECT SUM(decimal(-9999)) FROM range(1, 100001) -- !query analysis Aggregate [sum(cast(-9999 as decimal(10,0))) AS sum(-9999)#x] -+- Range (1, 100001, step=1, splits=None) ++- Range (1, 100001, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select.sql.out index 875673dd72ec0..ed15cce62dc78 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select.sql.out @@ -2,7 +2,7 @@ -- !query create or replace temporary view onek2 as select * from onek -- !query analysis -CreateViewCommand `onek2`, select * from onek, false, true, LocalTempView, true +CreateViewCommand `onek2`, select * from onek, false, true, LocalTempView, UNSUPPORTED, true +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x] +- SubqueryAlias spark_catalog.default.onek +- Relation spark_catalog.default.onek[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet @@ -23,7 +23,7 @@ CreateViewCommand `INT8_TBL`, select * from values 
(cast('4567890123456789' as bigint),cast('123' as bigint)), (cast(+4567890123456789 as bigint),cast('4567890123456789' as bigint)), (cast('+4567890123456789' as bigint),cast('-4567890123456789' as bigint)) - as INT8_TBL(q1, q2), false, true, LocalTempView, true + as INT8_TBL(q1, q2), false, true, LocalTempView, UNSUPPORTED, true +- Project [q1#xL, q2#xL] +- SubqueryAlias INT8_TBL +- LocalRelation [q1#xL, q2#xL] @@ -240,7 +240,7 @@ Union false, false CREATE OR REPLACE TEMPORARY VIEW foo AS SELECT * FROM (values(42),(3),(10),(7),(null),(null),(1)) as foo (f1) -- !query analysis -CreateViewCommand `foo`, SELECT * FROM (values(42),(3),(10),(7),(null),(null),(1)) as foo (f1), false, true, LocalTempView, true +CreateViewCommand `foo`, SELECT * FROM (values(42),(3),(10),(7),(null),(null),(1)) as foo (f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias foo +- Project [col1#x AS f1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select_distinct.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select_distinct.sql.out index bd653048a2ed7..632df1ed53fd4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select_distinct.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/select_distinct.sql.out @@ -5,7 +5,7 @@ SELECT two, stringu1, ten, string4 FROM onek -- !query analysis CreateViewCommand `tmp`, SELECT two, stringu1, ten, string4 -FROM onek, false, true, LocalTempView, true +FROM onek, false, true, LocalTempView, UNSUPPORTED, true +- Project [two#x, stringu1#x, ten#x, string4#x] +- SubqueryAlias spark_catalog.default.onek +- Relation spark_catalog.default.onek[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet @@ -88,7 +88,7 @@ CREATE OR REPLACE TEMPORARY VIEW disttable AS SELECT * FROM -- !query analysis CreateViewCommand `disttable`, SELECT * FROM (VALUES (1), (2), (3), (NULL)) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/text.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/text.sql.out index 7fb0649f5e778..474c2401f40d0 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/text.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/text.sql.out @@ -137,7 +137,7 @@ Sort [i#xL ASC NULLS FIRST], true +- Project [i#xL, left(ahoj, cast(i#xL as int)) AS left(ahoj, i)#x, right(ahoj, cast(i#xL as int)) AS right(ahoj, i)#x] +- SubqueryAlias t +- Project [id#xL AS i#xL] - +- Range (-5, 6, step=1, splits=None) + +- Range (-5, 6, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/union.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/union.sql.out index 2865d35a3d6a3..05002a7c45386 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/union.sql.out @@ -6,7 +6,7 @@ CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM -- !query analysis CreateViewCommand `INT4_TBL`, SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) - AS v(f1), false, true, LocalTempView, true + AS 
v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -30,7 +30,7 @@ CreateViewCommand `INT8_TBL`, SELECT * FROM (4567890123456789, 123), (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) - AS v(q1, q2), false, true, LocalTempView, true + AS v(q1, q2), false, true, LocalTempView, UNSUPPORTED, true +- Project [q1#xL, q2#xL] +- SubqueryAlias v +- Project [col1#xL AS q1#xL, col2#xL AS q2#xL] @@ -46,7 +46,7 @@ CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM CreateViewCommand `FLOAT8_TBL`, SELECT * FROM (VALUES (0.0), (-34.84), (-1004.30), (CAST('-1.2345678901234e+200' AS DOUBLE)), (CAST('-1.2345678901234e-200' AS DOUBLE))) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -1059,9 +1059,9 @@ select * from range(1,5) union select * from range(1,3) Distinct +- Union false, false :- Project [id#xL] - : +- Range (1, 5, step=1, splits=None) + : +- Range (1, 5, step=1) +- Project [id#xL] - +- Range (1, 3, step=1, splits=None) + +- Range (1, 3, step=1) -- !query @@ -1069,9 +1069,9 @@ select * from range(1,6) union all select * from range(1,4) -- !query analysis Union false, false :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1079,9 +1079,9 @@ select * from range(1,6) intersect select * from range(1,4) -- !query analysis Intersect false :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1089,9 +1089,9 @@ select * from range(1,6) intersect all select * from range(1,4) -- !query analysis Intersect All true :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1099,9 +1099,9 @@ select * from range(1,6) except select * from range(1,4) -- !query analysis Except false :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1109,9 +1109,9 @@ select * from range(1,6) except all select * from range(1,4) -- !query analysis Except All true :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1120,9 +1120,9 @@ select * from range(1,6) union select * from range(1,4) Distinct +- Union false, false :- Project [id#xL] - : +- Range (1, 6, step=1, splits=None) + : +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1130,9 +1130,9 @@ select * from range(1,6) union all select * from range(1,4) -- !query analysis Union false, false :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1140,9 +1140,9 @@ select * from range(1,6) intersect select * from range(1,4) -- !query analysis Intersect false :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, 
step=1) -- !query @@ -1150,9 +1150,9 @@ select * from range(1,6) intersect all select * from range(1,4) -- !query analysis Intersect All true :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1160,9 +1160,9 @@ select * from range(1,6) except select * from range(1,4) -- !query analysis Except false :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1170,9 +1170,9 @@ select * from range(1,6) except all select * from range(1,4) -- !query analysis Except All true :- Project [id#xL] -: +- Range (1, 6, step=1, splits=None) +: +- Range (1, 6, step=1) +- Project [id#xL] - +- Range (1, 4, step=1, splits=None) + +- Range (1, 4, step=1) -- !query @@ -1223,7 +1223,7 @@ Sort [x#xL ASC NULLS FIRST], true +- Distinct +- Union false, false :- Project [1 AS t#x, id#xL AS x#xL] - : +- Range (1, 11, step=1, splits=None) + : +- Range (1, 11, step=1) +- Project [t#x, cast(x#x as bigint) AS x#xL] +- Project [2 AS t#x, 4 AS x#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part1.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part1.sql.out index 40355dbd7b9b9..6cdf71e33c73f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part1.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 -- !query analysis -CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, false, LocalTempView, true +CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, false, LocalTempView, UNSUPPORTED, true +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x] +- SubqueryAlias spark_catalog.default.tenk1 +- Relation spark_catalog.default.tenk1[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet @@ -343,7 +343,7 @@ CreateViewCommand `int4_tbl`, select * from values (-123456), (2147483647), (-2147483647) - as int4_tbl(f1), false, false, LocalTempView, true + as int4_tbl(f1), false, false, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias int4_tbl +- LocalRelation [f1#x] @@ -522,13 +522,13 @@ SELECT i.id, sum(i.id) over (order by i.id rows between 1 preceding and 1 follow FROM range(1, 11) i -- !query analysis CreateViewCommand `v_window`, SELECT i.id, sum(i.id) over (order by i.id rows between 1 preceding and 1 following) as sum_rows -FROM range(1, 11) i, false, false, LocalTempView, true +FROM range(1, 11) i, false, false, LocalTempView, UNSUPPORTED, true +- Project [id#xL, sum_rows#xL] +- Project [id#xL, sum_rows#xL, sum_rows#xL] +- Window [sum(id#xL) windowspecdefinition(id#xL ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, 1)) AS sum_rows#xL], [id#xL ASC NULLS FIRST] +- Project [id#xL] +- SubqueryAlias i - +- Range (1, 11, step=1, splits=None) + +- Range (1, 11, step=1) -- !query @@ -543,7 +543,7 @@ Project [id#xL, sum_rows#xL] +- Window [sum(id#xL) windowspecdefinition(id#xL ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, 1)) AS sum_rows#xL], [id#xL ASC NULLS FIRST] +- 
Project [id#xL] +- SubqueryAlias i - +- Range (1, 11, step=1, splits=None) + +- Range (1, 11, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part2.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part2.sql.out index dfc4fdde71748..cdcd563de4f6a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part2.sql.out @@ -90,7 +90,7 @@ Project [id#xL, y#xL, first(y) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN ( :- Union false, false : :- Project [id#xL, id#xL AS y#xL] : : +- SubqueryAlias x - : : +- Range (1, 6, step=1, splits=None) + : : +- Range (1, 6, step=1) : +- Project [cast(NULL#x as bigint) AS NULL#xL, cast(42#x as bigint) AS 42#xL] : +- Project [null AS NULL#x, 42 AS 42#x] : +- OneRowRelation @@ -119,7 +119,7 @@ Project [id#xL, y#xL, first(y) OVER (ORDER BY id ASC NULLS LAST RANGE BETWEEN (- :- Union false, false : :- Project [id#xL, id#xL AS y#xL] : : +- SubqueryAlias x - : : +- Range (1, 6, step=1, splits=None) + : : +- Range (1, 6, step=1) : +- Project [cast(NULL#x as bigint) AS NULL#xL, cast(42#x as bigint) AS 42#xL] : +- Project [null AS NULL#x, 42 AS 42#x] : +- OneRowRelation @@ -148,7 +148,7 @@ Project [id#xL, y#xL, first(y) OVER (ORDER BY id DESC NULLS FIRST RANGE BETWEEN :- Union false, false : :- Project [id#xL, id#xL AS y#xL] : : +- SubqueryAlias x - : : +- Range (1, 6, step=1, splits=None) + : : +- Range (1, 6, step=1) : +- Project [cast(NULL#x as bigint) AS NULL#xL, cast(42#x as bigint) AS 42#xL] : +- Project [null AS NULL#x, 42 AS 42#x] : +- OneRowRelation @@ -177,7 +177,7 @@ Project [id#xL, y#xL, first(y) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN ( :- Union false, false : :- Project [id#xL, id#xL AS y#xL] : : +- SubqueryAlias x - : : +- Range (1, 6, step=1, splits=None) + : : +- Range (1, 6, step=1) : +- Project [cast(NULL#x as bigint) AS NULL#xL, cast(42#x as bigint) AS 42#xL] : +- Project [null AS NULL#x, 42 AS 42#x] : +- OneRowRelation @@ -195,7 +195,7 @@ Project [id#xL, last(id) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN CURRENT +- Window [last(id#xL, false) windowspecdefinition(id#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, currentrow$(), cast(2147450884 as bigint))) AS last(id) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 2147450884 FOLLOWING)#xL], [id#xL ASC NULLS FIRST] +- Project [id#xL] +- SubqueryAlias x - +- Range (32764, 32767, step=1, splits=None) + +- Range (32764, 32767, step=1) -- !query @@ -207,7 +207,7 @@ Project [id#xL, last(id) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN CURRENT +- Window [last(id#xL, false) windowspecdefinition(id#xL DESC NULLS LAST, specifiedwindowframe(RangeFrame, currentrow$(), cast(2147450885 as bigint))) AS last(id) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 2147450885 FOLLOWING)#xL], [id#xL DESC NULLS LAST] +- Project [id#xL] +- SubqueryAlias x - +- Range (-32766, -32765, step=1, splits=None) + +- Range (-32766, -32765, step=1) -- !query @@ -219,7 +219,7 @@ Project [id#xL, last(id) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN CURRENT +- Window [last(id#xL, false) windowspecdefinition(id#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, currentrow$(), cast(4 as bigint))) AS last(id) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 4 FOLLOWING)#xL], [id#xL ASC NULLS FIRST] +- Project [id#xL] +- SubqueryAlias x - +- Range 
(2147483644, 2147483647, step=1, splits=None) + +- Range (2147483644, 2147483647, step=1) -- !query @@ -231,7 +231,7 @@ Project [id#xL, last(id) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN CURRENT +- Window [last(id#xL, false) windowspecdefinition(id#xL DESC NULLS LAST, specifiedwindowframe(RangeFrame, currentrow$(), cast(5 as bigint))) AS last(id) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 5 FOLLOWING)#xL], [id#xL DESC NULLS LAST] +- Project [id#xL] +- SubqueryAlias x - +- Range (-2147483646, -2147483645, step=1, splits=None) + +- Range (-2147483646, -2147483645, step=1) -- !query @@ -243,7 +243,7 @@ Project [id#xL, last(id) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN CURRENT +- Window [last(id#xL, false) windowspecdefinition(id#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, currentrow$(), cast(4 as bigint))) AS last(id) OVER (ORDER BY id ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 4 FOLLOWING)#xL], [id#xL ASC NULLS FIRST] +- Project [id#xL] +- SubqueryAlias x - +- Range (9223372036854775804, 9223372036854775807, step=1, splits=None) + +- Range (9223372036854775804, 9223372036854775807, step=1) -- !query @@ -255,7 +255,7 @@ Project [id#xL, last(id) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN CURRENT +- Window [last(id#xL, false) windowspecdefinition(id#xL DESC NULLS LAST, specifiedwindowframe(RangeFrame, currentrow$(), cast(5 as bigint))) AS last(id) OVER (ORDER BY id DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 5 FOLLOWING)#xL], [id#xL DESC NULLS LAST] +- Project [id#xL] +- SubqueryAlias x - +- Range (-9223372036854775806, -9223372036854775805, step=1, splits=None) + +- Range (-9223372036854775806, -9223372036854775805, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out index 7609f898ebf85..9f2dd9bcb1783 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/window_part3.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 -- !query analysis -CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, false, LocalTempView, true +CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, false, LocalTempView, UNSUPPORTED, true +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x] +- SubqueryAlias spark_catalog.default.tenk1 +- Relation spark_catalog.default.tenk1[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet @@ -93,7 +93,7 @@ WithCTE : +- SubqueryAlias cte : +- Project [id#xL AS x#xL] : +- Project [id#xL] -: +- Range (1, 36, step=2, splits=None) +: +- Range (1, 36, step=2) +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL] +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL] +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, 1)) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] @@ -115,7 +115,7 @@ WithCTE : +- SubqueryAlias cte 
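
The window_part2/window_part3 hunks around here record analyzed plans in which Range no longer carries a splits field. As a hedged aside, a minimal Scala sketch of how such an analyzed-plan string can be reproduced against a throwaway local session (nothing below is part of the patch; real output prints numeric expression IDs where these golden files show normalized #x/#xL placeholders):

  // Sketch only: print the analyzed (pre-optimization) plan for one of the
  // set-operation queries that appear in these .sql.out files.
  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder().master("local[1]").appName("analyzed-plan").getOrCreate()
  val df = spark.sql("select * from range(1,6) intersect select * from range(1,4)")
  println(df.queryExecution.analyzed.treeString)  // an Intersect over two Range nodes
  spark.stop()
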
: +- Project [id#xL AS x#xL] : +- Project [id#xL] -: +- Range (1, 36, step=2, splits=None) +: +- Range (1, 36, step=2) +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL] +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL] +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, cast(-1 as bigint), cast(1 as bigint))) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] @@ -148,7 +148,7 @@ WithCTE : : +- Project [1 AS 1#x] : : +- OneRowRelation : +- Project [id#xL] -: +- Range (5, 50, step=2, splits=None) +: +- Range (5, 50, step=2) +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL] +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL] +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, 1)) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] @@ -181,7 +181,7 @@ WithCTE : : +- Project [1 AS 1#x] : : +- OneRowRelation : +- Project [id#xL] -: +- Range (5, 50, step=2, splits=None) +: +- Range (5, 50, step=2) +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL] +- Project [x#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL, sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL] +- Window [sum(x#xL) windowspecdefinition(x#xL ASC NULLS FIRST, specifiedwindowframe(RangeFrame, cast(-1 as bigint), cast(1 as bigint))) AS sum(x) OVER (ORDER BY x ASC NULLS FIRST RANGE BETWEEN (- 1) FOLLOWING AND 1 FOLLOWING)#xL], [x#xL ASC NULLS FIRST] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/pred-pushdown.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/pred-pushdown.sql.out index 9e0c055db6dc7..74d1c69dcc532 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/pred-pushdown.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/pred-pushdown.sql.out @@ -2,7 +2,7 @@ -- !query CREATE OR REPLACE TEMPORARY VIEW tbl_a AS VALUES (1, 1), (2, 1), (3, 6) AS T(c1, c2) -- !query analysis -CreateViewCommand `tbl_a`, VALUES (1, 1), (2, 1), (3, 6) AS T(c1, c2), false, true, LocalTempView, true +CreateViewCommand `tbl_a`, VALUES (1, 1), (2, 1), (3, 6) AS T(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [c1#x, c2#x] @@ -10,7 +10,7 @@ CreateViewCommand `tbl_a`, VALUES (1, 1), (2, 1), (3, 6) AS T(c1, c2), false, tr -- !query CREATE OR REPLACE TEMPORARY VIEW tbl_b AS VALUES 1 AS T(c1) -- !query analysis -CreateViewCommand `tbl_b`, VALUES 1 AS T(c1), false, true, LocalTempView, true +CreateViewCommand `tbl_b`, VALUES 1 AS T(c1), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [c1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out index 772e643027b1e..7e720995c44b4 100644 --- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out @@ -1,4 +1,151 @@ -- Automatically generated by SQLQueryTestSuite +-- !query +select not true +-- !query analysis +Project [NOT true AS (NOT true)#x] ++- OneRowRelation + + +-- !query +select ! true +-- !query analysis +Project [NOT true AS (NOT true)#x] ++- OneRowRelation + + +-- !query +select not null::boolean +-- !query analysis +Project [NOT cast(null as boolean) AS (NOT CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + +-- !query +select true and true +-- !query analysis +Project [(true AND true) AS (true AND true)#x] ++- OneRowRelation + + +-- !query +select true and false +-- !query analysis +Project [(true AND false) AS (true AND false)#x] ++- OneRowRelation + + +-- !query +select false and true +-- !query analysis +Project [(false AND true) AS (false AND true)#x] ++- OneRowRelation + + +-- !query +select false and false +-- !query analysis +Project [(false AND false) AS (false AND false)#x] ++- OneRowRelation + + +-- !query +select true and null::boolean +-- !query analysis +Project [(true AND cast(null as boolean)) AS (true AND CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + +-- !query +select false and null::boolean +-- !query analysis +Project [(false AND cast(null as boolean)) AS (false AND CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + +-- !query +select null::boolean and true +-- !query analysis +Project [(cast(null as boolean) AND true) AS (CAST(NULL AS BOOLEAN) AND true)#x] ++- OneRowRelation + + +-- !query +select null::boolean and false +-- !query analysis +Project [(cast(null as boolean) AND false) AS (CAST(NULL AS BOOLEAN) AND false)#x] ++- OneRowRelation + + +-- !query +select null::boolean and null::boolean +-- !query analysis +Project [(cast(null as boolean) AND cast(null as boolean)) AS (CAST(NULL AS BOOLEAN) AND CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + +-- !query +select true or true +-- !query analysis +Project [(true OR true) AS (true OR true)#x] ++- OneRowRelation + + +-- !query +select true or false +-- !query analysis +Project [(true OR false) AS (true OR false)#x] ++- OneRowRelation + + +-- !query +select false or true +-- !query analysis +Project [(false OR true) AS (false OR true)#x] ++- OneRowRelation + + +-- !query +select false or false +-- !query analysis +Project [(false OR false) AS (false OR false)#x] ++- OneRowRelation + + +-- !query +select true or null::boolean +-- !query analysis +Project [(true OR cast(null as boolean)) AS (true OR CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + +-- !query +select false or null::boolean +-- !query analysis +Project [(false OR cast(null as boolean)) AS (false OR CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + +-- !query +select null::boolean or true +-- !query analysis +Project [(cast(null as boolean) OR true) AS (CAST(NULL AS BOOLEAN) OR true)#x] ++- OneRowRelation + + +-- !query +select null::boolean or false +-- !query analysis +Project [(cast(null as boolean) OR false) AS (CAST(NULL AS BOOLEAN) OR false)#x] ++- OneRowRelation + + +-- !query +select null::boolean or null::boolean +-- !query analysis +Project [(cast(null as boolean) OR cast(null as boolean)) AS (CAST(NULL AS BOOLEAN) OR CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation + + -- !query select 1 = 1 -- !query analysis @@ -450,3 +597,50 @@ Project [NOT between(to_timestamp(2022-12-26 00:00:01, None, TimestampType, Some select rand(123) not between 0.1 AND 0.2 -- !query 
analysis [Analyzer test output redacted due to nondeterminism] + + +-- !query +set spark.sql.legacy.bangEqualsNot=true +-- !query analysis +SetCommand (spark.sql.legacy.bangEqualsNot,Some(true)) + + +-- !query +select 1 ! between 0 and 2 +-- !query analysis +Project [NOT between(1, 0, 2) AS (NOT between(1, 0, 2))#x] ++- OneRowRelation + + +-- !query +select 1 ! in (3, 4) +-- !query analysis +Project [NOT 1 IN (3,4) AS (NOT (1 IN (3, 4)))#x] ++- OneRowRelation + + +-- !query +select 'hello' ! like 'world' +-- !query analysis +Project [NOT hello LIKE world AS (NOT hello LIKE world)#x] ++- OneRowRelation + + +-- !query +select 1 is ! null +-- !query analysis +Project [isnotnull(1) AS (1 IS NOT NULL)#x] ++- OneRowRelation + + +-- !query +select false is ! true +-- !query analysis +Project [NOT (false <=> true) AS (NOT (false <=> true))#x] ++- OneRowRelation + + +-- !query +set spark.sql.legacy.bangEqualsNot=false +-- !query analysis +SetCommand (spark.sql.legacy.bangEqualsNot,Some(false)) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/query_regex_column.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/query_regex_column.sql.out index 6c14323382889..d62e4b7921363 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/query_regex_column.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/query_regex_column.sql.out @@ -12,7 +12,7 @@ AS testData(key, value1, value2) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, "1", "11"), (2, "2", "22"), (3, "3", "33"), (4, "4", "44"), (5, "5", "55"), (6, "6", "66") -AS testData(key, value1, value2), false, true, LocalTempView, true +AS testData(key, value1, value2), false, true, LocalTempView, UNSUPPORTED, true +- Project [key#x, value1#x, value2#x] +- SubqueryAlias testData +- LocalRelation [key#x, value1#x, value2#x] @@ -25,7 +25,7 @@ AS testData2(A, B, c, d) -- !query analysis CreateViewCommand `testData2`, SELECT * FROM VALUES (1, 1, 1, 2), (1, 2, 1, 2), (2, 1, 2, 3), (2, 2, 2, 3), (3, 1, 3, 4), (3, 2, 3, 4) -AS testData2(A, B, c, d), false, true, LocalTempView, true +AS testData2(A, B, c, d), false, true, LocalTempView, UNSUPPORTED, true +- Project [A#x, B#x, c#x, d#x] +- SubqueryAlias testData2 +- LocalRelation [A#x, B#x, c#x, d#x] @@ -349,7 +349,7 @@ AS testdata3(a, b) -- !query analysis CreateViewCommand `testdata3`, SELECT * FROM VALUES (0, 1), (1, 2), (2, 3), (3, 4) -AS testdata3(a, b), false, true, LocalTempView, true +AS testdata3(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testdata3 +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out index 24119599c532d..e02562e29835f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out @@ -18,7 +18,7 @@ CreateViewCommand `tbl_view`, SELECT * FROM VALUES (50, "name5", named_struct("f1", 5, "s2", named_struct("f2", 505, "f3", "e"))), (60, "name6", named_struct("f1", 6, "s2", named_struct("f2", 606, "f3", "f"))), (70, "name7", named_struct("f1", 7, "s2", named_struct("f2", 707, "f3", "g"))) -AS tbl_view(id, name, data), false, false, LocalTempView, true +AS tbl_view(id, name, data), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, name#x, data#x] +- SubqueryAlias tbl_view +- LocalRelation [id#x, name#x, 
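
The new predicate-functions cases just above exercise '!' as a prefix synonym for NOT behind a legacy flag. A hedged sketch of driving the same flag from Scala, assuming the local `spark` session from the earlier sketch (or a spark-shell session); the configuration key and the query are taken verbatim from the SET statements and queries above:

  // Toggle the legacy flag shown above, then run one of the same queries;
  // with the flag off, the '!' spelling is expected to be rejected by the parser.
  spark.conf.set("spark.sql.legacy.bangEqualsNot", "true")
  spark.sql("select 1 ! between 0 and 2").show()
  spark.conf.set("spark.sql.legacy.bangEqualsNot", "false")
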
data#x] @@ -351,7 +351,7 @@ DropTempViewCommand tbl_view -- !query CREATE TEMPORARY VIEW v1 AS VALUES (1, 2, NULL, 4, 5) AS T(c1, c2, c3, c4, c5) -- !query analysis -CreateViewCommand `v1`, VALUES (1, 2, NULL, 4, 5) AS T(c1, c2, c3, c4, c5), false, false, LocalTempView, true +CreateViewCommand `v1`, VALUES (1, 2, NULL, 4, 5) AS T(c1, c2, c3, c4, c5), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/show-create-table.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/show-create-table.sql.out index b44edb8dc4d96..b1c3ad59e1515 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/show-create-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/show-create-table.sql.out @@ -217,7 +217,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl`, false CREATE VIEW view_SPARK_30302 (aaa, bbb) AS SELECT a, b FROM tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`view_SPARK_30302`, [(aaa,None), (bbb,None)], SELECT a, b FROM tbl, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`view_SPARK_30302`, [(aaa,None), (bbb,None)], SELECT a, b FROM tbl, false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x] +- SubqueryAlias spark_catalog.default.tbl +- Relation spark_catalog.default.tbl[a#x,b#x,c#x] parquet @@ -246,7 +246,7 @@ CREATE VIEW view_SPARK_30302 (aaa COMMENT 'comment with \'quoted text\' for aaa' COMMENT 'This is a comment with \'quoted text\' for view' AS SELECT a, b FROM tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`view_SPARK_30302`, [(aaa,Some(comment with 'quoted text' for aaa)), (bbb,None)], This is a comment with 'quoted text' for view, SELECT a, b FROM tbl, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`view_SPARK_30302`, [(aaa,Some(comment with 'quoted text' for aaa)), (bbb,None)], This is a comment with 'quoted text' for view, SELECT a, b FROM tbl, false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x] +- SubqueryAlias spark_catalog.default.tbl +- Relation spark_catalog.default.tbl[a#x,b#x,c#x] parquet @@ -275,7 +275,7 @@ CREATE VIEW view_SPARK_30302 (aaa, bbb) TBLPROPERTIES ('a' = '1', 'b' = '2') AS SELECT a, b FROM tbl -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`view_SPARK_30302`, [(aaa,None), (bbb,None)], [a=1, b=2], SELECT a, b FROM tbl, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`view_SPARK_30302`, [(aaa,None), (bbb,None)], [a=1, b=2], SELECT a, b FROM tbl, false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x] +- SubqueryAlias spark_catalog.default.tbl +- Relation spark_catalog.default.tbl[a#x,b#x,c#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out index ce5f7995f5d13..a86cc72f0863c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/show-tables.sql.out @@ -40,7 +40,7 @@ CreateTempViewUsing [tableIdent:`show_t3` StructType(StructField(e,IntegerType,t -- !query CREATE GLOBAL TEMP VIEW show_t4 AS SELECT 1 as col1 -- !query analysis -CreateViewCommand `show_t4`, SELECT 1 as col1, false, false, GlobalTempView, true +CreateViewCommand `show_t4`, SELECT 1 as col1, false, false, 
GlobalTempView, UNSUPPORTED, true +- Project [1 AS col1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/show-tblproperties.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/show-tblproperties.sql.out index 0ea52d7d1e8cf..f4b29105b58a7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/show-tblproperties.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/show-tblproperties.sql.out @@ -37,7 +37,7 @@ DropTable false, false -- !query CREATE VIEW view TBLPROPERTIES('p1'='v1', 'p2'='v2') AS SELECT 1 AS c1 -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`view`, [p1=v1, p2=v2], SELECT 1 AS c1, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`view`, [p1=v1, p2=v2], SELECT 1 AS c1, false, false, PersistedView, COMPENSATION, true +- Project [1 AS c1#x] +- OneRowRelation @@ -69,7 +69,7 @@ DropTableCommand `spark_catalog`.`default`.`view`, false, true, false -- !query CREATE TEMPORARY VIEW tv AS SELECT 1 AS c1 -- !query analysis -CreateViewCommand `tv`, SELECT 1 AS c1, false, false, LocalTempView, true +CreateViewCommand `tv`, SELECT 1 AS c1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS c1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/show-views.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/show-views.sql.out index c8f2c6f9cc029..ed3690ec5c6a3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/show-views.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/show-views.sql.out @@ -22,7 +22,7 @@ CreateDataSourceTableCommand `spark_catalog`.`showdb`.`tbl`, false -- !query CREATE VIEW view_1 AS SELECT * FROM tbl -- !query analysis -CreateViewCommand `spark_catalog`.`showdb`.`view_1`, SELECT * FROM tbl, false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`showdb`.`view_1`, SELECT * FROM tbl, false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x, c#x, d#x] +- SubqueryAlias spark_catalog.showdb.tbl +- Relation spark_catalog.showdb.tbl[a#x,b#x,c#x,d#x] parquet @@ -31,7 +31,7 @@ CreateViewCommand `spark_catalog`.`showdb`.`view_1`, SELECT * FROM tbl, false, f -- !query CREATE VIEW view_2 AS SELECT * FROM tbl WHERE c='a' -- !query analysis -CreateViewCommand `spark_catalog`.`showdb`.`view_2`, SELECT * FROM tbl WHERE c='a', false, false, PersistedView, true +CreateViewCommand `spark_catalog`.`showdb`.`view_2`, SELECT * FROM tbl WHERE c='a', false, false, PersistedView, COMPENSATION, true +- Project [a#x, b#x, c#x, d#x] +- Filter (c#x = a) +- SubqueryAlias spark_catalog.showdb.tbl @@ -41,7 +41,7 @@ CreateViewCommand `spark_catalog`.`showdb`.`view_2`, SELECT * FROM tbl WHERE c=' -- !query CREATE GLOBAL TEMP VIEW view_3 AS SELECT 1 as col1 -- !query analysis -CreateViewCommand `view_3`, SELECT 1 as col1, false, false, GlobalTempView, true +CreateViewCommand `view_3`, SELECT 1 as col1, false, false, GlobalTempView, UNSUPPORTED, true +- Project [1 AS col1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/show_columns.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/show_columns.sql.out index 55c744a8c0726..27e75187cdba7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/show_columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/show_columns.sql.out @@ -34,7 +34,7 @@ CreateTempViewUsing [tableIdent:`showColumn3` StructType(StructField(col3,Intege -- 
!query CREATE GLOBAL TEMP VIEW showColumn4 AS SELECT 1 as col1, 'abc' as `col 5` -- !query analysis -CreateViewCommand `showColumn4`, SELECT 1 as col1, 'abc' as `col 5`, false, false, GlobalTempView, true +CreateViewCommand `showColumn4`, SELECT 1 as col1, 'abc' as `col 5`, false, false, GlobalTempView, UNSUPPORTED, true +- Project [1 AS col1#x, abc AS col 5#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-compatibility-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-compatibility-functions.sql.out index f80290c5ab348..a18e4ede957cf 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-compatibility-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-compatibility-functions.sql.out @@ -102,7 +102,7 @@ org.apache.spark.sql.AnalysisException -- !query CREATE TEMPORARY VIEW tempView1 AS VALUES (1, NAMED_STRUCT('col1', 'gamma', 'col2', 'delta')) AS T(id, st) -- !query analysis -CreateViewCommand `tempView1`, VALUES (1, NAMED_STRUCT('col1', 'gamma', 'col2', 'delta')) AS T(id, st), false, false, LocalTempView, true +CreateViewCommand `tempView1`, VALUES (1, NAMED_STRUCT('col1', 'gamma', 'col2', 'delta')) AS T(id, st), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [id#x, st#x] @@ -122,4 +122,4 @@ Aggregate [nvl(st#x.col1, value)], [nvl(st#x.col1, value) AS nvl(st.col1, value) SELECT nullif(SUM(id), 0) from range(5) -- !query analysis Aggregate [nullif(sum(id#xL), 0) AS nullif(sum(id), 0)#xL] -+- Range (0, 5, step=1, splits=None) ++- Range (0, 5, step=1) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out index 6a6ffe85ad592..f5ce5ed2e8b6e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out @@ -498,7 +498,7 @@ SELECT (SELECT MAX(id) FROM RANGE(10) WHERE id < title) FROM VALUES 1, 2 AS t(ti Project [scalar-subquery#x [title#x] AS scalarsubquery(title)#xL] : +- Aggregate [max(id#xL) AS max(id)#xL] : +- Filter (id#xL < cast(outer(title#x) as bigint)) -: +- Range (0, 10, step=1, splits=None) +: +- Range (0, 10, step=1) +- SubqueryAlias t +- LocalRelation [title#x] @@ -2060,7 +2060,7 @@ WithCTE -- !query CREATE OR REPLACE TEMPORARY VIEW v AS SELECT var1 AS c1 -- !query analysis -CreateViewCommand `v`, SELECT var1 AS c1, false, true, LocalTempView, true +CreateViewCommand `v`, SELECT var1 AS c1, false, true, LocalTempView, UNSUPPORTED, true +- Project [variablereference(system.session.var1=1) AS c1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 7ffd3cbd8bac6..98664dedf820c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 
'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,18 +685,95 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query analysis +Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query analysis +Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +select 
encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + -- !query select decode() -- !query analysis @@ -746,7 +823,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] ++- OneRowRelation + + +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query analysis +Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation @@ -856,6 +940,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query analysis +Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query analysis +Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from 
values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis @@ -1428,7 +1554,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat') -- !query analysis -CreateViewCommand `fmtTable`, [(fmtField,None)], SELECT * FROM VALUES ('invalidFormat'), false, false, LocalTempView, true +CreateViewCommand `fmtTable`, [(fmtField,None)], SELECT * FROM VALUES ('invalidFormat'), false, false, LocalTempView, UNSUPPORTED, true +- Project [col1#x] +- LocalRelation [col1#x] @@ -1586,3 +1712,87 @@ select luhn_check(123.456) -- !query analysis Project [luhn_check(cast(123.456 as string)) AS luhn_check(123.456)#x] +- OneRowRelation + + +-- !query +select is_valid_utf8('') +-- !query analysis +Project [is_valid_utf8() AS is_valid_utf8()#x] ++- OneRowRelation + + +-- !query +select is_valid_utf8('abc') +-- !query analysis +Project [is_valid_utf8(abc) AS is_valid_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select is_valid_utf8(x'80') +-- !query analysis +Project [is_valid_utf8(cast(0x80 as string)) AS is_valid_utf8(X'80')#x] ++- OneRowRelation + + +-- !query +select make_valid_utf8('') +-- !query analysis +Project [make_valid_utf8() AS make_valid_utf8()#x] ++- OneRowRelation + + +-- !query +select make_valid_utf8('abc') +-- !query analysis +Project [make_valid_utf8(abc) AS make_valid_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select make_valid_utf8(x'80') +-- !query analysis +Project [make_valid_utf8(cast(0x80 as string)) AS make_valid_utf8(X'80')#x] ++- OneRowRelation + + +-- !query +select validate_utf8('') +-- !query analysis +Project [validate_utf8() AS validate_utf8()#x] ++- OneRowRelation + + +-- !query +select validate_utf8('abc') +-- !query analysis +Project [validate_utf8(abc) AS validate_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select validate_utf8(x'80') +-- !query analysis +Project [validate_utf8(cast(0x80 as string)) AS validate_utf8(X'80')#x] ++- OneRowRelation + + +-- !query +select try_validate_utf8('') +-- !query analysis +Project [try_validate_utf8() AS try_validate_utf8()#x] ++- OneRowRelation + + +-- !query +select try_validate_utf8('abc') +-- !query analysis +Project [try_validate_utf8(abc) AS try_validate_utf8(abc)#x] ++- OneRowRelation + + +-- !query +select try_validate_utf8(x'80') +-- !query analysis +Project [try_validate_utf8(cast(0x80 as string)) AS try_validate_utf8(X'80')#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/struct.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/struct.sql.out index c672353ecda6c..dba912cdff72e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/struct.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/struct.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `tbl_x`, VALUES (1, NAMED_STRUCT('C', 'gamma', 'D', 'delta')), (2, NAMED_STRUCT('C', 'epsilon', 'D', 'eta')), (3, NAMED_STRUCT('C', 'theta', 'D', 'iota')) - AS T(ID, ST), false, false, LocalTempView, true + AS T(ID, ST), false, false, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [ID#x, ST#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out 
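
The string-functions.sql.out additions above cover is_valid_utf8, make_valid_utf8, validate_utf8 and try_validate_utf8. A short usage sketch, again assuming the local `spark` session from the earlier sketch; the function names and the x'80' probe value come from the queries above, while the NULL-on-failure behaviour of the try_ variant is an assumption based on Spark's usual try_ naming convention, not something these analyzer results show:

  // x'80' is not valid UTF-8 on its own, so it exercises all of the
  // validation paths added in the tests above.
  spark.sql("select is_valid_utf8(x'80'), make_valid_utf8(x'80'), try_validate_utf8(x'80')").show()
  spark.sql("select validate_utf8('abc')").show()  // valid input; invalid input presumably raises an error here
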
index 8409212f136ad..94073f2751b3e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES ('{"a":1, "b":"2"}', '[{"a": 1, "b":2}, {"a":2, "b":2}]'), ('{"a":1, "b":"2"}', null), ('{"a":2, "b":"3"}', '[{"a": 3, "b":4}, {"a":4, "b":5}]'), ('{"a":5, "b":"6"}', '[{"a": 6, "b":7}, {"a":8, "b":9}]'), (null, '[{"a": 1, "b":2}, {"a":2, "b":2}]') -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out index b4c9632462527..4da75ae2764f5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-basic.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-basic.sql.out index 9a2f65a7377f8..81c4a15dc9f4a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-basic.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, 
dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-count-bug.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-count-bug.sql.out index 01ab697a0dc30..e003c2624009b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-count-bug.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-count-bug.sql.out @@ -2,21 +2,21 @@ -- !query create temporary view t1(c1, c2) as values (0, 1), (1, 2) -- !query analysis -CreateViewCommand `t1`, [(c1,None), (c2,None)], values (0, 1), (1, 2), false, false, LocalTempView, true +CreateViewCommand `t1`, [(c1,None), (c2,None)], values (0, 1), (1, 2), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query create temporary view t2(c1, c2) as values (0, 2), (0, 3) -- !query analysis -CreateViewCommand `t2`, [(c1,None), (c2,None)], values (0, 2), (0, 3), false, false, LocalTempView, true +CreateViewCommand `t2`, [(c1,None), (c2,None)], values (0, 2), (0, 3), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query create temporary view t3(c1, c2) as values (0, 3), (1, 4), (2, 5) -- !query analysis -CreateViewCommand `t3`, [(c1,None), (c2,None)], values (0, 3), (1, 4), (2, 5), false, false, LocalTempView, true +CreateViewCommand `t3`, [(c1,None), (c2,None)], values (0, 3), (1, 4), (2, 5), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out index b4338b34eea13..7c3678c66c117 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-cte.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * 
FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-having.sql.out index b053cb369adca..2409cd0559bd2 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-having.sql.out @@ -20,7 +20,7 @@ CreateViewCommand `EMP`, [(id,None), (emp_name,None), (hiredate,None), (salary,N (500, 'emp 5', date '2001-01-01', double(400.00), NULL), (600, 'emp 6 - no dept', date '2001-01-01', double(400.00), 100), (700, 'emp 7', date '2010-01-01', double(400.00), 100), - (800, 'emp 8', date '2016-01-01', double(150.00), 70), false, false, LocalTempView, true + (800, 'emp 8', date '2016-01-01', double(150.00), 70), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x] @@ -39,7 +39,7 @@ CreateViewCommand `DEPT`, [(dept_id,None), (dept_name,None), (state,None)], VALU (30, 'dept 3', 'TX'), (40, 'dept 4 - unassigned', 'OR'), (50, 'dept 5 - unassigned', 'NJ'), - (70, 'dept 7', 'FL'), false, false, LocalTempView, true + (70, 'dept 7', 'FL'), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x] @@ -62,7 +62,7 @@ CreateViewCommand `BONUS`, [(emp_name,None), (bonus_amt,None)], VALUES ('emp 3', double(300.00)), ('emp 4', double(100.00)), ('emp 5', double(1000.00)), - ('emp 6 - no dept', double(500.00)), false, false, LocalTempView, true + ('emp 6 - no dept', double(500.00)), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-in-join-condition.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-in-join-condition.sql.out index da8ea257cfa70..3b55a7293bcfa 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-in-join-condition.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-in-join-condition.sql.out @@ -8,7 +8,7 @@ CREATE TEMP VIEW x(x1, x2) AS VALUES CreateViewCommand `x`, [(x1,None), (x2,None)], VALUES (2, 1), (1, 1), - (3, 4), false, false, LocalTempView, true + (3, 4), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -21,7 +21,7 @@ CREATE TEMP VIEW y(y1, y2) AS VALUES CreateViewCommand `y`, [(y1,None), (y2,None)], VALUES (0, 2), (1, 4), - (4, 11), false, false, LocalTempView, true + (4, 11), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -34,7 +34,7 @@ 
CREATE TEMP VIEW z(z1, z2) AS VALUES CreateViewCommand `z`, [(z1,None), (z2,None)], VALUES (4, 2), (3, 3), - (8, 1), false, false, LocalTempView, true + (8, 1), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -1004,3 +1004,47 @@ Sort [x1#x ASC NULLS FIRST, x2#x ASC NULLS FIRST, y1#x ASC NULLS FIRST, y2#x ASC +- View (`y`, [y1#x, y2#x]) +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x join y on x1 = y1 and exists (select * from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query analysis +Sort [x1#x ASC NULLS FIRST, x2#x ASC NULLS FIRST, y1#x ASC NULLS FIRST, y2#x ASC NULLS FIRST], true ++- Project [x1#x, x2#x, y1#x, y2#x] + +- Join Inner, ((x1#x = y1#x) AND exists#x [x2#x && y2#x]) + : +- Project [z1#x, z2#x] + : +- Filter ((z2#x = outer(x2#x)) AND (z2#x = outer(y2#x))) + : +- SubqueryAlias z + : +- View (`z`, [z1#x, z2#x]) + : +- Project [cast(col1#x as int) AS z1#x, cast(col2#x as int) AS z2#x] + : +- LocalRelation [col1#x, col2#x] + :- SubqueryAlias x + : +- View (`x`, [x1#x, x2#x]) + : +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + : +- LocalRelation [col1#x, col2#x] + +- SubqueryAlias y + +- View (`y`, [y1#x, y2#x]) + +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x join y on x1 = y1 and not exists (select * from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query analysis +Sort [x1#x ASC NULLS FIRST, x2#x ASC NULLS FIRST, y1#x ASC NULLS FIRST, y2#x ASC NULLS FIRST], true ++- Project [x1#x, x2#x, y1#x, y2#x] + +- Join Inner, ((x1#x = y1#x) AND NOT exists#x [x2#x && y2#x]) + : +- Project [z1#x, z2#x] + : +- Filter ((z2#x = outer(x2#x)) AND (z2#x = outer(y2#x))) + : +- SubqueryAlias z + : +- View (`z`, [z1#x, z2#x]) + : +- Project [cast(col1#x as int) AS z1#x, cast(col2#x as int) AS z2#x] + : +- LocalRelation [col1#x, col2#x] + :- SubqueryAlias x + : +- View (`x`, [x1#x, x2#x]) + : +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + : +- LocalRelation [col1#x, col2#x] + +- SubqueryAlias y + +- View (`y`, [y1#x, y2#x]) + +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] + +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out index 99de720ac47d9..55b10125f7680 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), 
(70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-orderby-limit.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-orderby-limit.sql.out index 4bfb6c5b843cd..3f56e346d0a53 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-orderby-limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-orderby-limit.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-outside-filter.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-outside-filter.sql.out index a0e485b49096e..d15466f04f800 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-outside-filter.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-outside-filter.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- 
LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-within-and-or.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-within-and-or.sql.out index c43c920971d0e..7b846e5eae9c8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-within-and-or.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-within-and-or.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -45,7 +45,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -72,7 +72,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-basic.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-basic.sql.out index ca68b169b41f8..4d59da8c393af 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-basic.sql.out @@ -2,7 +2,7 @@ -- !query create temporary view tab_a as select * from values (1, 1) as tab_a(a1, b1) -- !query analysis -CreateViewCommand `tab_a`, select * from values (1, 1) as tab_a(a1, b1), false, false, LocalTempView, true +CreateViewCommand `tab_a`, select * from values (1, 1) as tab_a(a1, b1), false, false, LocalTempView, UNSUPPORTED, true +- Project 
[a1#x, b1#x] +- SubqueryAlias tab_a +- LocalRelation [a1#x, b1#x] @@ -11,7 +11,7 @@ CreateViewCommand `tab_a`, select * from values (1, 1) as tab_a(a1, b1), false, -- !query create temporary view tab_b as select * from values (1, 1) as tab_b(a2, b2) -- !query analysis -CreateViewCommand `tab_b`, select * from values (1, 1) as tab_b(a2, b2), false, false, LocalTempView, true +CreateViewCommand `tab_b`, select * from values (1, 1) as tab_b(a2, b2), false, false, LocalTempView, UNSUPPORTED, true +- Project [a2#x, b2#x] +- SubqueryAlias tab_b +- LocalRelation [a2#x, b2#x] @@ -22,7 +22,7 @@ create temporary view struct_tab as select struct(col1 as a, col2 as b) as recor values (1, 1), (1, 2), (2, 1), (2, 2) -- !query analysis CreateViewCommand `struct_tab`, select struct(col1 as a, col2 as b) as record from - values (1, 1), (1, 2), (2, 1), (2, 2), false, false, LocalTempView, true + values (1, 1), (1, 2), (2, 1), (2, 2), false, false, LocalTempView, UNSUPPORTED, true +- Project [struct(a, col1#x, b, col2#x) AS record#x] +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-count-bug.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-count-bug.sql.out index 390b9a886c31b..39a0a0d5997ff 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-count-bug.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-count-bug.sql.out @@ -2,21 +2,21 @@ -- !query create temporary view t1(c1, c2) as values (0, 1), (1, 2) -- !query analysis -CreateViewCommand `t1`, [(c1,None), (c2,None)], values (0, 1), (1, 2), false, false, LocalTempView, true +CreateViewCommand `t1`, [(c1,None), (c2,None)], values (0, 1), (1, 2), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query create temporary view t2(c1, c2) as values (0, 2), (0, 3) -- !query analysis -CreateViewCommand `t2`, [(c1,None), (c2,None)], values (0, 2), (0, 3), false, false, LocalTempView, true +CreateViewCommand `t2`, [(c1,None), (c2,None)], values (0, 2), (0, 3), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query create temporary view t3(c1, c2) as values (0, 3), (1, 4), (2, 5) -- !query analysis -CreateViewCommand `t3`, [(c1,None), (c2,None)], values (0, 3), (1, 4), (2, 5), false, false, LocalTempView, true +CreateViewCommand `t3`, [(c1,None), (c2,None)], values (0, 3), (1, 4), (2, 5), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out index 2c5e9eb6733d9..5a253f633bb11 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, 
t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("t1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("t1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), ("t3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("t3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-having.sql.out index ec2cc088bf8a9..bddc9d16d7eba 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-having.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- 
LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-joins.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-joins.sql.out index 0dfba13398228..46b4a78b9a745 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-joins.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-joins.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] @@ -113,7 +113,7 @@ create temporary view s1 as select * from values -- !query analysis 
CreateViewCommand `s1`, select * from values (1), (3), (5), (7), (9) - as s1(id), false, false, LocalTempView, true + as s1(id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias s1 +- LocalRelation [id#x] @@ -126,7 +126,7 @@ create temporary view s2 as select * from values -- !query analysis CreateViewCommand `s2`, select * from values (1), (3), (4), (6), (9) - as s2(id), false, false, LocalTempView, true + as s2(id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias s2 +- LocalRelation [id#x] @@ -139,7 +139,7 @@ create temporary view s3 as select * from values -- !query analysis CreateViewCommand `s3`, select * from values (3), (4), (6), (9) - as s3(id), false, false, LocalTempView, true + as s3(id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias s3 +- LocalRelation [id#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-limit.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-limit.sql.out index d19155916f4ad..a828cb92e59d3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-limit.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff 
--git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out index f4cca5fd385bb..39748a324e527 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-multiple-columns.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-null-semantics.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-null-semantics.sql.out index 7816958795d01..51fb2455c19f8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-null-semantics.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-null-semantics.sql.out @@ -2,14 +2,14 @@ -- !query create temp view v (c) as values (1), (null) -- !query analysis -CreateViewCommand `v`, [(c,None)], values (1), (null), false, false, LocalTempView, true +CreateViewCommand `v`, [(c,None)], values (1), (null), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x] -- !query create temp view v_empty (e) as select 1 where false -- !query 
analysis -CreateViewCommand `v_empty`, [(e,None)], select 1 where false, false, false, LocalTempView, true +CreateViewCommand `v_empty`, [(e,None)], select 1 where false, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- Filter false +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-nullability.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-nullability.sql.out index 34a14c6f73d15..24c8273f475a8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-nullability.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-nullability.sql.out @@ -2,7 +2,7 @@ -- !query create temp view t0 as select 1 as a_nonnullable -- !query analysis -CreateViewCommand `t0`, select 1 as a_nonnullable, false, false, LocalTempView, true +CreateViewCommand `t0`, select 1 as a_nonnullable, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS a_nonnullable#x] +- OneRowRelation @@ -10,7 +10,7 @@ CreateViewCommand `t0`, select 1 as a_nonnullable, false, false, LocalTempView, -- !query create temp view t1 as select cast(null as int) as b_nullable -- !query analysis -CreateViewCommand `t1`, select cast(null as int) as b_nullable, false, false, LocalTempView, true +CreateViewCommand `t1`, select cast(null as int) as b_nullable, false, false, LocalTempView, UNSUPPORTED, true +- Project [cast(null as int) AS b_nullable#x] +- OneRowRelation @@ -18,7 +18,7 @@ CreateViewCommand `t1`, select cast(null as int) as b_nullable, false, false, Lo -- !query create temp view t2 as select 2 as c -- !query analysis -CreateViewCommand `t2`, select 2 as c, false, false, LocalTempView, true +CreateViewCommand `t2`, select 2 as c, false, false, LocalTempView, UNSUPPORTED, true +- Project [2 AS c#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-order-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-order-by.sql.out index 3f78e0c62c03e..075e4c90d8110 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-order-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-order-by.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, 
LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-set-operations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-set-operations.sql.out index de8864ec73bbd..9239670589bcb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-set-operations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-set-operations.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, 
true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-subquery-in-join-condition.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-subquery-in-join-condition.sql.out index 712d94ce323d6..ce6a1a3d7ed53 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-subquery-in-join-condition.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-subquery-in-join-condition.sql.out @@ -8,7 +8,7 @@ CREATE TEMP VIEW x(x1, x2) AS VALUES CreateViewCommand `x`, [(x1,None), (x2,None)], VALUES (2, 1), (1, 1), - (3, 4), false, false, LocalTempView, true + (3, 4), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -21,7 +21,7 @@ CREATE TEMP VIEW y(y1, y2) AS VALUES CreateViewCommand `y`, [(y1,None), (y2,None)], VALUES (0, 2), (1, 4), - (4, 11), false, false, LocalTempView, true + (4, 11), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -34,7 +34,7 @@ CREATE TEMP VIEW z(z1, z2) AS VALUES CreateViewCommand `z`, [(z1,None), (z2,None)], VALUES (4, 2), (3, 3), - (8, 1), false, false, LocalTempView, true + (8, 1), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -916,3 +916,47 @@ Sort [x1#x ASC NULLS FIRST, x2#x ASC NULLS FIRST, y1#x ASC NULLS FIRST, y2#x ASC +- View (`y`, [y1#x, y2#x]) +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x left join y on x1 = y1 and x2 IN (select z1 from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query analysis +Sort [x1#x ASC NULLS FIRST, x2#x ASC NULLS FIRST, y1#x ASC NULLS FIRST, y2#x ASC NULLS FIRST], true ++- Project [x1#x, x2#x, y1#x, y2#x] + +- Join LeftOuter, ((x1#x = y1#x) AND x2#x IN (list#x [x2#x && y2#x])) + : +- Project [z1#x] + : +- Filter ((z2#x = outer(x2#x)) AND (z2#x = outer(y2#x))) + : +- SubqueryAlias z + : +- View (`z`, [z1#x, z2#x]) + : +- Project [cast(col1#x as int) AS z1#x, cast(col2#x as int) AS z2#x] + : +- LocalRelation [col1#x, col2#x] + :- SubqueryAlias x + : +- View (`x`, [x1#x, x2#x]) + : +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + : +- LocalRelation [col1#x, col2#x] + +- SubqueryAlias y + +- View (`y`, [y1#x, y2#x]) + +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x left join y on x1 = y1 and x2 not IN (select z1 from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query analysis +Sort [x1#x ASC NULLS FIRST, x2#x ASC NULLS FIRST, y1#x ASC NULLS FIRST, y2#x ASC NULLS FIRST], true ++- Project [x1#x, x2#x, y1#x, y2#x] + +- Join LeftOuter, ((x1#x = y1#x) AND NOT x2#x IN (list#x [x2#x && y2#x])) + : +- Project [z1#x] + : +- Filter ((z2#x = outer(x2#x)) AND (z2#x = outer(y2#x))) + : +- SubqueryAlias z + : +- View (`z`, [z1#x, z2#x]) + : +- Project [cast(col1#x as int) AS z1#x, cast(col2#x as int) AS z2#x] + : +- LocalRelation [col1#x, col2#x] + :- SubqueryAlias x + : +- View (`x`, [x1#x, x2#x]) + : +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + : +- LocalRelation [col1#x, col2#x] + +- SubqueryAlias y + +- View (`y`, [y1#x, y2#x]) + +- Project [cast(col1#x as int) AS y1#x, 
cast(col2#x as int) AS y2#x] + +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out index 5c1465d4fc136..0074991b4ea6a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-with-cte.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/nested-not-in.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/nested-not-in.sql.out index 5dd0d161aed82..a098d19da594d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/nested-not-in.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/nested-not-in.sql.out @@ -18,7 +18,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (500, "emp 5", NULL), (600, "emp 6", 100), (800, "emp 8", 70) -AS EMP(id, emp_name, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, 
dept_id#x] @@ -41,7 +41,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] @@ -68,7 +68,7 @@ CreateViewCommand `BONUS`, SELECT * FROM VALUES ("emp 4", 100.00D), ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) -AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, true +AS BONUS(emp_name, bonus_amt), false, false, LocalTempView, UNSUPPORTED, true +- Project [emp_name#x, bonus_amt#x] +- SubqueryAlias BONUS +- LocalRelation [emp_name#x, bonus_amt#x] @@ -91,7 +91,7 @@ CreateViewCommand `ADDRESS`, SELECT * FROM VALUES (null, null, "addr4"), (600, "emp 6", "addr6"), (800, "emp 8", "addr8") -AS ADDRESS(id, emp_name, address), false, false, LocalTempView, true +AS ADDRESS(id, emp_name, address), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, address#x] +- SubqueryAlias ADDRESS +- LocalRelation [id#x, emp_name#x, address#x] @@ -102,7 +102,7 @@ CREATE TEMPORARY VIEW S1 AS SELECT * FROM VALUES (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b) -- !query analysis CreateViewCommand `S1`, SELECT * FROM VALUES - (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b), false, false, LocalTempView, true + (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias s1 +- LocalRelation [a#x, b#x] @@ -113,7 +113,7 @@ CREATE TEMPORARY VIEW S2 AS SELECT * FROM VALUES (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d) -- !query analysis CreateViewCommand `S2`, SELECT * FROM VALUES - (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d), false, false, LocalTempView, true + (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d), false, false, LocalTempView, UNSUPPORTED, true +- Project [c#x, d#x] +- SubqueryAlias s2 +- LocalRelation [c#x, d#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-group-by.sql.out index b2559f63ea642..724f9f29894c1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-group-by.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2, 
timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-joins.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-joins.sql.out index 5f6a4b9d0a546..90bcb4818091f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-joins.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-joins.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, 
t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out index 0844e2540acc8..e8a8e46d1d122 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `m`, SELECT * FROM VALUES (null, 1.0), (2, 3.0), (4, 5.0) - AS m(a, b), false, false, LocalTempView, true + AS m(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias m +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out index 7578a97548e06..f7dac3e2c4675 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `m`, SELECT * FROM VALUES (null, 1.0), (2, 3.0), (4, 5.0) - AS m(a, b), false, false, LocalTempView, true + AS m(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias m +- LocalRelation [a#x, b#x] @@ -31,7 +31,7 @@ CreateViewCommand `s`, SELECT * FROM VALUES (0, 1.0), (2, 3.0), (4, null) - AS s(c, d), false, false, LocalTempView, true + AS s(c, d), false, false, LocalTempView, UNSUPPORTED, true +- Project [c#x, d#x] +- SubqueryAlias s +- LocalRelation [c#x, d#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out index f1c047fddd955..7de84bb97346a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `m`, SELECT * FROM VALUES (null, 1.0), (2, 3.0), (4, 5.0) - AS m(a, b), false, false, LocalTempView, true + AS m(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias m +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out index 0b63a774631af..8c234dd5b7d70 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `m`, SELECT * FROM VALUES (null, 1.0), (2, 3.0), (4, 5.0) - AS m(a, b), false, false, LocalTempView, true + AS m(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias m +- LocalRelation [a#x, b#x] @@ -27,7 +27,7 @@ CreateViewCommand `s`, SELECT * FROM VALUES (null, 1.0), (2, 3.0), (6, 7.0) - AS s(c, d), false, false, LocalTempView, true + AS s(c, d), false, false, LocalTempView, UNSUPPORTED, true +- Project [c#x, d#x] +- SubqueryAlias s +- LocalRelation [c#x, d#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/simple-in.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/simple-in.sql.out index e95de3f5a0f1d..f3f5aa7fcade4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/simple-in.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/simple-in.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ("t1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ("t1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), ("t3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("t3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] @@ -317,7 +317,7 @@ create temporary view a as select * from values -- !query analysis CreateViewCommand `a`, select * from values (1, 1), (2, 1), (null, 1), (1, 3), (null, 3), (1, null), (null, 2) - as a(a1, a2), false, false, 
LocalTempView, true + as a(a1, a2), false, false, LocalTempView, UNSUPPORTED, true +- Project [a1#x, a2#x] +- SubqueryAlias a +- LocalRelation [a1#x, a2#x] @@ -330,7 +330,7 @@ create temporary view b as select * from values -- !query analysis CreateViewCommand `b`, select * from values (1, 1, 2), (null, 3, 2), (1, null, 2), (1, 2, null) - as b(b1, b2, b3), false, false, LocalTempView, true + as b(b1, b2, b3), false, false, LocalTempView, UNSUPPORTED, true +- Project [b1#x, b2#x, b3#x] +- SubqueryAlias b +- LocalRelation [b1#x, b2#x, b3#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out index 62aa6be0bd118..95b38e1c7e0f5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out @@ -6,7 +6,7 @@ AS t1(t1a, t1b, t1c) -- !query analysis CreateViewCommand `t1`, SELECT * FROM VALUES (1, 2, 3) -AS t1(t1a, t1b, t1c), false, false, LocalTempView, true +AS t1(t1a, t1b, t1c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x] @@ -19,7 +19,7 @@ AS t2(t2a, t2b, t2c) -- !query analysis CreateViewCommand `t2`, SELECT * FROM VALUES (1, 0, 1) -AS t2(t2a, t2b, t2c), false, false, LocalTempView, true +AS t2(t2a, t2b, t2c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x] @@ -32,7 +32,7 @@ AS t3(t3a, t3b, t3c) -- !query analysis CreateViewCommand `t3`, SELECT * FROM VALUES (3, 1, 2) -AS t3(t3a, t3b, t3c), false, false, LocalTempView, true +AS t3(t3a, t3b, t3c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x] @@ -178,7 +178,7 @@ AS t1(t1a, t1b, t1c) -- !query analysis CreateViewCommand `t1_copy`, SELECT * FROM VALUES (1, 2, 3) -AS t1(t1a, t1b, t1c), false, false, LocalTempView, true +AS t1(t1a, t1b, t1c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/subq-input-typecheck.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/subq-input-typecheck.sql.out index e550e52bf64b0..dbbb3e2d7062b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/subq-input-typecheck.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/subq-input-typecheck.sql.out @@ -6,7 +6,7 @@ AS t1(t1a, t1b, t1c) -- !query analysis CreateViewCommand `t1`, SELECT * FROM VALUES (1, 2, 3) -AS t1(t1a, t1b, t1c), false, false, LocalTempView, true +AS t1(t1a, t1b, t1c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x] @@ -19,7 +19,7 @@ AS t2(t2a, t2b, t2c) -- !query analysis CreateViewCommand `t2`, SELECT * FROM VALUES (1, 0, 1) -AS t2(t2a, t2b, t2c), false, false, LocalTempView, true +AS t2(t2a, t2b, t2c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x] @@ -32,7 +32,7 
@@ AS t3(t3a, t3b, t3c) -- !query analysis CreateViewCommand `t3`, SELECT * FROM VALUES (3, 1, 2) -AS t3(t3a, t3b, t3c), false, false, LocalTempView, true +AS t3(t3a, t3b, t3c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x] @@ -45,7 +45,7 @@ AS t1(t4a, t4b, t4c) -- !query analysis CreateViewCommand `t4`, SELECT * FROM VALUES (CAST(1 AS DOUBLE), CAST(2 AS STRING), CAST(3 AS STRING)) -AS t1(t4a, t4b, t4c), false, false, LocalTempView, true +AS t1(t4a, t4b, t4c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t4a#x, t4b#x, t4c#x] +- SubqueryAlias t1 +- LocalRelation [t4a#x, t4b#x, t4c#x] @@ -58,7 +58,7 @@ AS t1(t5a, t5b, t5c) -- !query analysis CreateViewCommand `t5`, SELECT * FROM VALUES (CAST('2011-01-01 01:01:01' AS TIMESTAMP), CAST(2 AS STRING), CAST(3 AS BIGINT)) -AS t1(t5a, t5b, t5c), false, false, LocalTempView, true +AS t1(t5a, t5b, t5c), false, false, LocalTempView, UNSUPPORTED, true +- Project [t5a#x, t5b#x, t5c#xL] +- SubqueryAlias t1 +- LocalRelation [t5a#x, t5b#x, t5c#xL] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/nested-scalar-subquery-count-bug.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/nested-scalar-subquery-count-bug.sql.out index 0dc4fa9e4808f..2a1abc0d48871 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/nested-scalar-subquery-count-bug.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/nested-scalar-subquery-count-bug.sql.out @@ -2,21 +2,21 @@ -- !query CREATE OR REPLACE VIEW t1(a1, a2) as values (0, 1), (1, 2) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t1`, [(a1,None), (a2,None)], values (0, 1), (1, 2), false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t1`, [(a1,None), (a2,None)], values (0, 1), (1, 2), false, true, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] -- !query CREATE OR REPLACE VIEW t2(b1, b2) as values (0, 2), (0, 3) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t2`, [(b1,None), (b2,None)], values (0, 2), (0, 3), false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t2`, [(b1,None), (b2,None)], values (0, 2), (0, 3), false, true, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] -- !query CREATE OR REPLACE VIEW t3(c1, c2) as values (0, 2), (0, 3) -- !query analysis -CreateViewCommand `spark_catalog`.`default`.`t3`, [(c1,None), (c2,None)], values (0, 2), (0, 3), false, true, PersistedView, true +CreateViewCommand `spark_catalog`.`default`.`t3`, [(c1,None), (c2,None)], values (0, 2), (0, 3), false, true, PersistedView, COMPENSATION, true +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-count-bug.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-count-bug.sql.out index f40deb3ead5ab..ebe071c5261fd 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-count-bug.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-count-bug.sql.out @@ -19,7 +19,7 @@ CreateViewCommand `l`, [(a,None), (b,None)], values (3, 3.0), (null, null), (null, 5.0), - (6, null), false, false, LocalTempView, true + (6, 
null), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -41,7 +41,7 @@ CreateViewCommand `r`, [(c,None), (d,None)], values (4, 1.0), (null, null), (null, 5.0), - (6, null), false, false, LocalTempView, true + (6, null), false, false, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -171,7 +171,7 @@ Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] -- !query CREATE TEMPORARY VIEW null_view(a, b) AS SELECT CAST(null AS int), CAST(null as int) -- !query analysis -CreateViewCommand `null_view`, [(a,None), (b,None)], SELECT CAST(null AS int), CAST(null as int), false, false, LocalTempView, true +CreateViewCommand `null_view`, [(a,None), (b,None)], SELECT CAST(null AS int), CAST(null as int), false, false, LocalTempView, UNSUPPORTED, true +- Project [cast(null as int) AS CAST(NULL AS INT)#x, cast(null as int) AS CAST(NULL AS INT)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-group-by.sql.out new file mode 100644 index 0000000000000..671557aa39566 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-group-by.sql.out @@ -0,0 +1,218 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +create temp view x (x1, x2) as values (1, 1), (2, 2) +-- !query analysis +CreateViewCommand `x`, [(x1,None), (x2,None)], values (1, 1), (2, 2), false, false, LocalTempView, UNSUPPORTED, true + +- LocalRelation [col1#x, col2#x] + + +-- !query +create temp view y (y1, y2) as values (2, 0), (3, -1) +-- !query analysis +CreateViewCommand `y`, [(y1,None), (y2,None)], values (2, 0), (3, -1), false, false, LocalTempView, UNSUPPORTED, true + +- LocalRelation [col1#x, col2#x] + + +-- !query +create temp view z (z1, z2) as values (1, 0), (1, 1) +-- !query analysis +CreateViewCommand `z`, [(z1,None), (z2,None)], values (1, 0), (1, 1), false, false, LocalTempView, UNSUPPORTED, true + +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x where (select count(*) from y where y1 = x1 group by y1) = 1 +-- !query analysis +Project [x1#x, x2#x] ++- Filter (scalar-subquery#x [x1#x] = cast(1 as bigint)) + : +- Aggregate [y1#x], [count(1) AS count(1)#xL] + : +- Filter (y1#x = outer(x1#x)) + : +- SubqueryAlias y + : +- View (`y`, [y1#x, y2#x]) + : +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] + : +- LocalRelation [col1#x, col2#x] + +- SubqueryAlias x + +- View (`x`, [x1#x, x2#x]) + +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x where (select count(*) from y where y1 = x1 group by x1) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"x1\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 61, + "stopIndex" : 71, + "fragment" : "group by x1" + } ] +} + + +-- !query +select * from x where (select count(*) from y where y1 > x1 group by x1) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + 
"sqlExprs" : "\"x1\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 61, + "stopIndex" : 71, + "fragment" : "group by x1" + } ] +} + + +-- !query +select *, (select count(*) from y where x1 = y1 and y2 = 1 group by y2) from x +-- !query analysis +Project [x1#x, x2#x, scalar-subquery#x [x1#x] AS scalarsubquery(x1)#xL] +: +- Aggregate [y2#x], [count(1) AS count(1)#xL] +: +- Filter ((outer(x1#x) = y1#x) AND (y2#x = 1)) +: +- SubqueryAlias y +: +- View (`y`, [y1#x, y2#x]) +: +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] +: +- LocalRelation [col1#x, col2#x] ++- SubqueryAlias x + +- View (`x`, [x1#x, x2#x]) + +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +select *, (select count(*) from y where x1 = y1 and y2 = x1 + 1 group by y2) from x +-- !query analysis +Project [x1#x, x2#x, scalar-subquery#x [x1#x && x1#x] AS scalarsubquery(x1, x1)#xL] +: +- Aggregate [y2#x], [count(1) AS count(1)#xL] +: +- Filter ((outer(x1#x) = y1#x) AND (y2#x = (outer(x1#x) + 1))) +: +- SubqueryAlias y +: +- View (`y`, [y1#x, y2#x]) +: +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] +: +- LocalRelation [col1#x, col2#x] ++- SubqueryAlias x + +- View (`x`, [x1#x, x2#x]) + +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "y1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 23, + "stopIndex" : 72, + "fragment" : "(select count(*) from y where y1 > x1 group by y1)" + } ] +} + + +-- !query +select *, (select count(*) from y where y1 + y2 = x1 group by y1) from x +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "y1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 11, + "stopIndex" : 65, + "fragment" : "(select count(*) from y where y1 + y2 = x1 group by y1)" + } ] +} + + +-- !query +select *, (select count(*) from (select * from y where y1 = x1 union all select * from y) sub group by y1) from x +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "y1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 11, + "stopIndex" : 106, + "fragment" : "(select count(*) from (select * from y where y1 = x1 union all select * from y) sub group by y1)" + } ] +} + + +-- !query +select *, (select count(*) from y left join (select * from z where z1 = x1) sub on y2 = z2 group by z1) from x +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "z1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 11, + 
"stopIndex" : 103, + "fragment" : "(select count(*) from y left join (select * from z where z1 = x1) sub on y2 = z2 group by z1)" + } ] +} + + +-- !query +set spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate = true +-- !query analysis +SetCommand (spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate,Some(true)) + + +-- !query +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1 +-- !query analysis +Project [x1#x, x2#x] ++- Filter (scalar-subquery#x [x1#x] = cast(1 as bigint)) + : +- Aggregate [y1#x], [count(1) AS count(1)#xL] + : +- Filter (y1#x > outer(x1#x)) + : +- SubqueryAlias y + : +- View (`y`, [y1#x, y2#x]) + : +- Project [cast(col1#x as int) AS y1#x, cast(col2#x as int) AS y2#x] + : +- LocalRelation [col1#x, col2#x] + +- SubqueryAlias x + +- View (`x`, [x1#x, x2#x]) + +- Project [cast(col1#x as int) AS x1#x, cast(col2#x as int) AS x2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +reset spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate +-- !query analysis +ResetCommand spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out index fd1acb113e3a6..3648a97e9872a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out @@ -2,7 +2,7 @@ -- !query CREATE OR REPLACE TEMPORARY VIEW p AS VALUES (1, 1) AS T(pk, pv) -- !query analysis -CreateViewCommand `p`, VALUES (1, 1) AS T(pk, pv), false, true, LocalTempView, true +CreateViewCommand `p`, VALUES (1, 1) AS T(pk, pv), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [pk#x, pv#x] @@ -10,7 +10,7 @@ CreateViewCommand `p`, VALUES (1, 1) AS T(pk, pv), false, true, LocalTempView, t -- !query CREATE OR REPLACE TEMPORARY VIEW c AS VALUES (1, 1) AS T(ck, cv) -- !query analysis -CreateViewCommand `c`, VALUES (1, 1) AS T(ck, cv), false, true, LocalTempView, true +CreateViewCommand `c`, VALUES (1, 1) AS T(ck, cv), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [ck#x, cv#x] @@ -110,7 +110,7 @@ CreateViewCommand `t1`, select * from values ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -147,7 +147,7 @@ CreateViewCommand `t2`, select * from values ('val1e', 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ('val1f', 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ('val1b', null, 16, 19L, float(17), 25D, 26E2BD, timestamp 
'2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -182,7 +182,7 @@ CreateViewCommand `t3`, select * from values ('val1b', null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), ('val3b', 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ('val3b', 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] @@ -1254,21 +1254,21 @@ Project [t1a#x, t1b#x] -- !query CREATE OR REPLACE TEMP VIEW t0(t0a, t0b) AS VALUES (1, 1), (2, 0) -- !query analysis -CreateViewCommand `t0`, [(t0a,None), (t0b,None)], VALUES (1, 1), (2, 0), false, true, LocalTempView, true +CreateViewCommand `t0`, [(t0a,None), (t0b,None)], VALUES (1, 1), (2, 0), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query CREATE OR REPLACE TEMP VIEW t1(t1a, t1b, t1c) AS VALUES (1, 1, 3) -- !query analysis -CreateViewCommand `t1`, [(t1a,None), (t1b,None), (t1c,None)], VALUES (1, 1, 3), false, true, LocalTempView, true +CreateViewCommand `t1`, [(t1a,None), (t1b,None), (t1c,None)], VALUES (1, 1, 3), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x] -- !query CREATE OR REPLACE TEMP VIEW t2(t2a, t2b, t2c) AS VALUES (1, 1, 5), (2, 2, 7) -- !query analysis -CreateViewCommand `t2`, [(t2a,None), (t2b,None), (t2c,None)], VALUES (1, 1, 5), (2, 2, 7), false, true, LocalTempView, true +CreateViewCommand `t2`, [(t2a,None), (t2b,None), (t2c,None)], VALUES (1, 1, 5), (2, 2, 7), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x] @@ -1626,9 +1626,9 @@ Project [id#xL] : +- Project [id#xL AS c#xL] : +- Filter (outer(id#xL) = id#xL) : +- SubqueryAlias t2 - : +- Range (1, 2, step=1, splits=None) + : +- Range (1, 2, step=1) +- SubqueryAlias t1 - +- Range (1, 3, step=1, splits=None) + +- Range (1, 3, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out index 1b03c3b780ca5..72e230f9bb881 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-select.sql.out @@ -28,7 +28,7 @@ CreateViewCommand `t1`, select * from values ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') - as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, 
t1h, t1i), false, false, LocalTempView, true + as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] @@ -65,7 +65,7 @@ CreateViewCommand `t2`, select * from values ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) - as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, true + as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +- SubqueryAlias t2 +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] @@ -100,7 +100,7 @@ CreateViewCommand `t3`, select * from values ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') - as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, true + as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i), false, false, LocalTempView, UNSUPPORTED, true +- Project [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x, t3d#xL, t3e#x, t3f#x, t3g#x, t3h#x, t3i#x] @@ -599,7 +599,7 @@ Project [t1a#x, scalar-subquery#x [t1a#x] AS scalarsubquery(t1a)#xL] -- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (0, 1), (1, 2) t1(c1, c2) -- !query analysis -CreateViewCommand `t1`, VALUES (0, 1), (1, 2) t1(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t1`, VALUES (0, 1), (1, 2) t1(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t1 +- LocalRelation [c1#x, c2#x] @@ -607,7 +607,7 @@ CreateViewCommand `t1`, VALUES (0, 1), (1, 2) t1(c1, c2), false, true, LocalTemp -- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (0, 2), (0, 3) t2(c1, c2) -- !query analysis -CreateViewCommand `t2`, VALUES (0, 2), (0, 3) t2(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t2`, VALUES (0, 2), (0, 3) t2(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t2 +- LocalRelation [c1#x, c2#x] @@ -820,14 +820,14 @@ Project [scalar-subquery#x [] AS b#x] -- !query CREATE OR REPLACE TEMP VIEW t1(c1, c2) AS (VALUES (0, 1), (1, 2)) -- !query analysis -CreateViewCommand `t1`, [(c1,None), (c2,None)], (VALUES (0, 1), (1, 2)), false, true, LocalTempView, true +CreateViewCommand `t1`, [(c1,None), (c2,None)], (VALUES (0, 1), (1, 2)), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query CREATE OR REPLACE TEMP VIEW t2(c1, c2) AS (VALUES (0, 2), (0, 3)) -- !query analysis -CreateViewCommand `t2`, [(c1,None), (c2,None)], (VALUES (0, 2), (0, 3)), false, true, LocalTempView, true +CreateViewCommand `t2`, [(c1,None), (c2,None)], (VALUES (0, 2), (0, 3)), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] @@ -840,7 +840,7 @@ CREATE OR REPLACE TEMP VIEW students(id, name, major, year) AS (VALUES CreateViewCommand 
`students`, [(id,None), (name,None), (major,None), (year,None)], (VALUES (0, 'A', 'CS', 2022), (1, 'B', 'CS', 2022), - (2, 'C', 'Math', 2022)), false, true, LocalTempView, true + (2, 'C', 'Math', 2022)), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x, col4#x] @@ -855,7 +855,7 @@ CreateViewCommand `exams`, [(sid,None), (course,None), (curriculum,None), (grade (0, 'C1', 'CS', 4, 2020), (0, 'C2', 'CS', 3, 2021), (1, 'C1', 'CS', 2, 2020), - (1, 'C2', 'CS', 1, 2021)), false, true, LocalTempView, true + (1, 'C2', 'CS', 1, 2021)), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x] @@ -1054,9 +1054,9 @@ Project [c1#xL, c2#xL] : +- Project [id#xL AS c#xL] : +- Filter (outer(id#xL) = id#xL) : +- SubqueryAlias t2 - : +- Range (1, 2, step=1, splits=None) + : +- Range (1, 2, step=1) +- SubqueryAlias t1 - +- Range (1, 3, step=1, splits=None) + +- Range (1, 3, step=1) -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-set-op.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-set-op.sql.out index eeea71f6cd4f2..eaeff0ba9dedb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-set-op.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-set-op.sql.out @@ -2,21 +2,21 @@ -- !query CREATE OR REPLACE TEMP VIEW t0(t0a, t0b) AS VALUES (1, 1), (2, 0) -- !query analysis -CreateViewCommand `t0`, [(t0a,None), (t0b,None)], VALUES (1, 1), (2, 0), false, true, LocalTempView, true +CreateViewCommand `t0`, [(t0a,None), (t0b,None)], VALUES (1, 1), (2, 0), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] -- !query CREATE OR REPLACE TEMP VIEW t1(t1a, t1b, t1c) AS VALUES (1, 1, 3) -- !query analysis -CreateViewCommand `t1`, [(t1a,None), (t1b,None), (t1c,None)], VALUES (1, 1, 3), false, true, LocalTempView, true +CreateViewCommand `t1`, [(t1a,None), (t1b,None), (t1c,None)], VALUES (1, 1, 3), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x] -- !query CREATE OR REPLACE TEMP VIEW t2(t2a, t2b, t2c) AS VALUES (1, 1, 5), (2, 2, 7) -- !query analysis -CreateViewCommand `t2`, [(t2a,None), (t2b,None), (t2c,None)], VALUES (1, 1, 5), (2, 2, 7), false, true, LocalTempView, true +CreateViewCommand `t2`, [(t2a,None), (t2b,None), (t2c,None)], VALUES (1, 1, 5), (2, 2, 7), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x, col3#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/subquery-offset.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/subquery-offset.sql.out index 428df8e6adf8e..104c1b0f41a10 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/subquery-offset.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/subquery-offset.sql.out @@ -139,7 +139,7 @@ CreateViewCommand `EMP`, SELECT * FROM VALUES (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) -AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, true +AS EMP(id, emp_name, hiredate, salary, dept_id), false, false, LocalTempView, UNSUPPORTED, true +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- 
LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] @@ -162,7 +162,7 @@ CreateViewCommand `DEPT`, SELECT * FROM VALUES (40, "dept 4 - unassigned", "OR"), (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") -AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, true +AS DEPT(dept_id, dept_name, state), false, false, LocalTempView, UNSUPPORTED, true +- Project [dept_id#x, dept_name#x, state#x] +- SubqueryAlias DEPT +- LocalRelation [dept_id#x, dept_name#x, state#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out index a889e549d99cc..8d4bffd868753 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/table-aliases.sql.out @@ -2,7 +2,7 @@ -- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1) AS testData(a, b) -- !query analysis -CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1) AS testData(a, b), false, true, LocalTempView, true +CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1) AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] @@ -158,7 +158,7 @@ Project [col1#x, col2#x] -- !query CREATE OR REPLACE TEMPORARY VIEW src1 AS SELECT * FROM VALUES (1, "a"), (2, "b"), (3, "c") AS src1(id, v1) -- !query analysis -CreateViewCommand `src1`, SELECT * FROM VALUES (1, "a"), (2, "b"), (3, "c") AS src1(id, v1), false, true, LocalTempView, true +CreateViewCommand `src1`, SELECT * FROM VALUES (1, "a"), (2, "b"), (3, "c") AS src1(id, v1), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, v1#x] +- SubqueryAlias src1 +- LocalRelation [id#x, v1#x] @@ -167,7 +167,7 @@ CreateViewCommand `src1`, SELECT * FROM VALUES (1, "a"), (2, "b"), (3, "c") AS s -- !query CREATE OR REPLACE TEMPORARY VIEW src2 AS SELECT * FROM VALUES (2, 1.0), (3, 3.2), (1, 8.5) AS src2(id, v2) -- !query analysis -CreateViewCommand `src2`, SELECT * FROM VALUES (2, 1.0), (3, 3.2), (1, 8.5) AS src2(id, v2), false, true, LocalTempView, true +CreateViewCommand `src2`, SELECT * FROM VALUES (2, 1.0), (3, 3.2), (1, 8.5) AS src2(id, v2), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, v2#x] +- SubqueryAlias src2 +- LocalRelation [id#x, v2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/table-valued-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/table-valued-functions.sql.out index c8698f7c7cd73..438e98f559db7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/table-valued-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/table-valued-functions.sql.out @@ -23,21 +23,21 @@ org.apache.spark.sql.AnalysisException select * from range(6 + cos(3)) -- !query analysis Project [id#xL] -+- Range (0, 5, step=1, splits=None) ++- Range (0, 5, step=1) -- !query select * from range(5, 10) -- !query analysis Project [id#xL] -+- Range (5, 10, step=1, splits=None) ++- Range (5, 10, step=1) -- !query select * from range(0, 10, 2) -- !query analysis Project [id#xL] -+- Range (0, 10, step=2, splits=None) ++- Range (0, 10, step=2) -- !query @@ -142,7 +142,7 @@ org.apache.spark.sql.AnalysisException select * from RaNgE(2) -- !query analysis Project [id#xL] -+- Range (0, 2, step=1, splits=None) ++- Range (0, 2, step=1) -- !query @@ 
-151,7 +151,7 @@ select i from range(0, 2) t(i) Project [i#xL] +- SubqueryAlias t +- Project [id#xL AS i#xL] - +- Range (0, 2, step=1, splits=None) + +- Range (0, 2, step=1) -- !query @@ -430,7 +430,7 @@ select * from range(2) join explode(array(1, 2)) -- !query analysis Project [id#xL, col#x] +- Join Inner - :- Range (0, 2, step=1, splits=None) + :- Range (0, 2, step=1) +- Generate explode(array(1, 2)), false, [col#x] +- OneRowRelation @@ -440,7 +440,7 @@ select * from range(2) join explode_outer(array()) -- !query analysis Project [id#xL, col#x] +- Join Inner - :- Range (0, 2, step=1, splits=None) + :- Range (0, 2, step=1) +- Generate explode(array()), true, [col#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out index 4a48795e4cca5..6ca35b8b141dc 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out @@ -236,7 +236,7 @@ create temporary view ttf1 as select * from values CreateViewCommand `ttf1`, select * from values (1, 2), (2, 3) - as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, true + as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, UNSUPPORTED, true +- Project [current_date#x, current_timestamp#x] +- SubqueryAlias ttf1 +- LocalRelation [current_date#x, current_timestamp#x] @@ -263,7 +263,7 @@ create temporary view ttf2 as select * from values CreateViewCommand `ttf2`, select * from values (1, 2), (2, 3) - as ttf2(a, b), false, false, LocalTempView, true + as ttf2(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias ttf2 +- LocalRelation [a#x, b#x] @@ -611,7 +611,7 @@ select null - timestamp'2011-11-11 11:11:11' -- !query create temporary view ts_view as select '2011-11-11 11:11:11' str -- !query analysis -CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, true +CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 11:11:11 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out index 959eac6d5e639..e50c860270563 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out @@ -237,7 +237,7 @@ create temporary view ttf1 as select * from values CreateViewCommand `ttf1`, select * from values (1, 2), (2, 3) - as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, true + as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, UNSUPPORTED, true +- Project [current_date#x, current_timestamp#x] +- SubqueryAlias ttf1 +- LocalRelation [current_date#x, current_timestamp#x] @@ -264,7 +264,7 @@ create temporary view ttf2 as select * from values CreateViewCommand `ttf2`, select * from values (1, 2), (2, 3) - as ttf2(a, b), false, false, LocalTempView, true + as ttf2(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias ttf2 +- LocalRelation [a#x, b#x] @@ -579,7 +579,7 @@ select null - timestamp'2011-11-11 11:11:11' -- !query create temporary view ts_view as select 
'2011-11-11 11:11:11' str -- !query analysis -CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, true +CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 11:11:11 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out index 4aa1ffcbc31db..098abfb3852cf 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out @@ -237,7 +237,7 @@ create temporary view ttf1 as select * from values CreateViewCommand `ttf1`, select * from values (1, 2), (2, 3) - as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, true + as ttf1(`current_date`, `current_timestamp`), false, false, LocalTempView, UNSUPPORTED, true +- Project [current_date#x, current_timestamp#x] +- SubqueryAlias ttf1 +- LocalRelation [current_date#x, current_timestamp#x] @@ -264,7 +264,7 @@ create temporary view ttf2 as select * from values CreateViewCommand `ttf2`, select * from values (1, 2), (2, 3) - as ttf2(a, b), false, false, LocalTempView, true + as ttf2(a, b), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias ttf2 +- LocalRelation [a#x, b#x] @@ -613,7 +613,7 @@ select null - timestamp'2011-11-11 11:11:11' -- !query create temporary view ts_view as select '2011-11-11 11:11:11' str -- !query analysis -CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, true +CreateViewCommand `ts_view`, select '2011-11-11 11:11:11' str, false, false, LocalTempView, UNSUPPORTED, true +- Project [2011-11-11 11:11:11 AS str#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out new file mode 100644 index 0000000000000..951a4025d5fb2 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out @@ -0,0 +1,132 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +create table t as + select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s + from values (1, null), (null, 'a') tab(member0, member1) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t`, ErrorIfExists, [s] + +- Project [named_struct(u, named_struct(member0, member0#x, member1, member1#x)) AS s#x] + +- SubqueryAlias tab + +- LocalRelation [member0#x, member1#x] + + +-- !query +declare avro_schema string +-- !query analysis +CreateVariable defaultvalueexpression(null, null), false ++- ResolvedIdentifier org.apache.spark.sql.catalyst.analysis.FakeSystemCatalog$@xxxxxxxx, session.avro_schema + + +-- !query +set variable avro_schema = + '{ "type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }] }' +-- !query analysis +SetVariable [variablereference(system.session.avro_schema=CAST(NULL AS STRING))] ++- Project [{ "type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }] } AS avro_schema#x] + +- OneRowRelation + + +-- !query +select from_avro(s, 42, map()) from t +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : 
"DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + "sqlState" : "42K09", + "messageParameters" : { + "hint" : "", + "msg" : "The second argument of the FROM_AVRO SQL function must be a constant string containing the JSON representation of the schema to use for converting the value from AVRO format", + "sqlExpr" : "\"fromavro(s, 42, map())\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 30, + "fragment" : "from_avro(s, 42, map())" + } ] +} + + +-- !query +select from_avro(s, avro_schema, 42) from t +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + "sqlState" : "42K09", + "messageParameters" : { + "hint" : "", + "msg" : "The third argument of the FROM_AVRO SQL function must be a constant map of strings to strings containing the options to use for converting the value from AVRO format", + "sqlExpr" : "\"fromavro(s, variablereference(system.session.avro_schema='{ \"type\": \"record\", \"name\": \"struct\", \"fields\": [{ \"name\": \"u\", \"type\": [\"int\",\"string\"] }] }'), 42)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 36, + "fragment" : "from_avro(s, avro_schema, 42)" + } ] +} + + +-- !query +select to_avro(s, 42) from t +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + "sqlState" : "42K09", + "messageParameters" : { + "hint" : "", + "msg" : "The second argument of the TO_AVRO SQL function must be a constant string containing the JSON representation of the schema to use for converting the value to AVRO format", + "sqlExpr" : "\"toavro(s, 42)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 21, + "fragment" : "to_avro(s, 42)" + } ] +} + + +-- !query +select to_avro(s, avro_schema) as result from t +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE", + "sqlState" : "22KD3", + "messageParameters" : { + "functionName" : "TO_AVRO" + } +} + + +-- !query +select from_avro(result, avro_schema, map()).u from (select null as result) +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE", + "sqlState" : "22KD3", + "messageParameters" : { + "functionName" : "FROM_AVRO" + } +} + + +-- !query +drop temporary variable avro_schema +-- !query analysis +DropVariable false ++- ResolvedIdentifier org.apache.spark.sql.catalyst.analysis.FakeSystemCatalog$@xxxxxxxx, session.avro_schema + + +-- !query +drop table t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out index 173db880eb9d4..7cf8a2886069d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/transform.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `t`, SELECT * FROM VALUES ('1', true, unhex('537061726B2053514C'), tinyint(1), 1, smallint(100), bigint(1), float(1.0), 1.0, Decimal(1.0), timestamp('1997-01-02'), date('2000-04-01')), ('2', false, unhex('537061726B2053514C'), tinyint(2), 2, smallint(200), bigint(2), float(2.0), 2.0, 
Decimal(2.0), timestamp('1997-01-02 03:04:05'), date('2000-04-02')), ('3', true, unhex('537061726B2053514C'), tinyint(3), 3, smallint(300), bigint(3), float(3.0), 3.0, Decimal(3.0), timestamp('1997-02-10 17:32:01-08'), date('2000-04-03')) -AS t(a, b, c, d, e, f, g, h, i, j, k, l), false, true, LocalTempView, true +AS t(a, b, c, d, e, f, g, h, i, j, k, l), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x, c#x, d#x, e#x, f#x, g#xL, h#x, i#x, j#x, k#x, l#x] +- SubqueryAlias t +- LocalRelation [a#x, b#x, c#x, d#x, e#x, f#x, g#xL, h#x, i#x, j#x, k#x, l#x] @@ -27,7 +27,7 @@ CreateViewCommand `script_trans`, SELECT * FROM VALUES (1, 2, 3), (4, 5, 6), (7, 8, 9) -AS script_trans(a, b, c), false, true, LocalTempView, true +AS script_trans(a, b, c), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x, c#x] +- SubqueryAlias script_trans +- LocalRelation [a#x, b#x, c#x] @@ -56,7 +56,7 @@ CreateViewCommand `complex_trans`, SELECT * FROM VALUES (3, 3), (1, 1), (3, 3) -as complex_trans(a, b), false, true, LocalTempView, true +as complex_trans(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias complex_trans +- LocalRelation [a#x, b#x] @@ -1035,3 +1035,14 @@ ScriptTransformation cat, [a#x, b#x], ScriptInputOutputSchema(List(),List(),None +- Project [a#x, b#x] +- SubqueryAlias complex_trans +- LocalRelation [a#x, b#x] + + +-- !query +SELECT TRANSFORM (a, b) + USING 'cat' AS (a CHAR(10), b VARCHAR(10)) +FROM VALUES('apache', 'spark') t(a, b) +-- !query analysis +ScriptTransformation cat, [a#x, b#x], ScriptInputOutputSchema(List(),List(),None,None,List(),List(),None,None,false) ++- Project [a#x, b#x] + +- SubqueryAlias t + +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/try_arithmetic.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/try_arithmetic.sql.out index ef17f6b50b90a..caf997f6ccbb2 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/try_arithmetic.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/try_arithmetic.sql.out @@ -13,6 +13,20 @@ Project [try_add(2147483647, 1) AS try_add(2147483647, 1)#x] +- OneRowRelation +-- !query +SELECT try_add(2147483647, decimal(1)) +-- !query analysis +Project [try_add(2147483647, cast(1 as decimal(10,0))) AS try_add(2147483647, 1)#x] ++- OneRowRelation + + +-- !query +SELECT try_add(2147483647, "1") +-- !query analysis +Project [try_add(2147483647, 1) AS try_add(2147483647, 1)#x] ++- OneRowRelation + + -- !query SELECT try_add(-2147483648, -1) -- !query analysis @@ -211,6 +225,20 @@ Project [try_divide(1, (1.0 / 0.0)) AS try_divide(1, (1.0 / 0.0))#x] +- OneRowRelation +-- !query +SELECT try_divide(1, decimal(0)) +-- !query analysis +Project [try_divide(1, cast(0 as decimal(10,0))) AS try_divide(1, 0)#x] ++- OneRowRelation + + +-- !query +SELECT try_divide(1, "0") +-- !query analysis +Project [try_divide(1, 0) AS try_divide(1, 0)#x] ++- OneRowRelation + + -- !query SELECT try_divide(interval 2 year, 2) -- !query analysis @@ -267,6 +295,20 @@ Project [try_subtract(2147483647, -1) AS try_subtract(2147483647, -1)#x] +- OneRowRelation +-- !query +SELECT try_subtract(2147483647, decimal(-1)) +-- !query analysis +Project [try_subtract(2147483647, cast(-1 as decimal(10,0))) AS try_subtract(2147483647, -1)#x] ++- OneRowRelation + + +-- !query +SELECT try_subtract(2147483647, "-1") +-- !query analysis +Project [try_subtract(2147483647, -1) AS try_subtract(2147483647, -1)#x] ++- OneRowRelation + + -- !query 
SELECT try_subtract(-2147483648, 1) -- !query analysis @@ -351,6 +393,20 @@ Project [try_multiply(2147483647, -2) AS try_multiply(2147483647, -2)#x] +- OneRowRelation +-- !query +SELECT try_multiply(2147483647, decimal(-2)) +-- !query analysis +Project [try_multiply(2147483647, cast(-2 as decimal(10,0))) AS try_multiply(2147483647, -2)#x] ++- OneRowRelation + + +-- !query +SELECT try_multiply(2147483647, "-2") +-- !query analysis +Project [try_multiply(2147483647, -2) AS try_multiply(2147483647, -2)#x] ++- OneRowRelation + + -- !query SELECT try_multiply(-2147483648, 2) -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/binaryComparison.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/binaryComparison.sql.out index c2dfe61b259da..d15418c17b730 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/binaryComparison.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/binaryComparison.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/booleanEquality.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/booleanEquality.sql.out index c1aa8f3c7921e..de6c0b72c1c79 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/booleanEquality.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/booleanEquality.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/caseWhenCoercion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/caseWhenCoercion.sql.out index 7662eac61e543..4124fc57996cd 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/caseWhenCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/caseWhenCoercion.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index eb3d43a92896f..62e3a87473263 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -11,8 +11,8 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as 
string), utf-8, false) AS col3#x] - +- Range (0, 10, step=1, splits=None) + +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x] + +- Range (0, 10, step=1) -- !query @@ -29,8 +29,8 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] - +- Range (0, 10, step=1, splits=None) + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] + +- Range (0, 10, step=1) -- !query @@ -46,8 +46,8 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] - +- Range (0, 10, step=1, splits=None) + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Range (0, 10, step=1) -- !query @@ -67,8 +67,8 @@ FROM ( -- !query analysis Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Range (0, 10, step=1) -- !query @@ -84,8 +84,8 @@ FROM ( -- !query analysis Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Range (0, 10, step=1) -- !query @@ -101,8 +101,8 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL 
+ cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Range (0, 10, step=1) -- !query @@ -122,8 +122,8 @@ FROM ( -- !query analysis Project [concat(col1#x, col2#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Range (0, 10, step=1) -- !query @@ -139,8 +139,8 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Range (0, 10, step=1) -- !query @@ -156,8 +156,8 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), concat(col3#x, col4#x)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Range (0, 10, step=1) -- !query @@ -225,7 +225,7 @@ CreateViewCommand `various_arrays`, SELECT * FROM VALUES ( array_array1, array_array2, struct_array1, struct_array2, map_array1, map_array2 -), false, false, LocalTempView, true +), false, false, LocalTempView, UNSUPPORTED, true +- Project [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 
4 more fields] +- SubqueryAlias various_arrays +- LocalRelation [boolean_array1#x, boolean_array2#x, tinyint_array1#x, tinyint_array2#x, smallint_array1#x, smallint_array2#x, int_array1#x, int_array2#x, bigint_array1#x, bigint_array2#x, decimal_array1#x, decimal_array2#x, double_array1#x, double_array2#x, float_array1#x, float_array2#x, date_array1#x, data_array2#x, timestamp_array1#x, timestamp_array2#x, string_array1#x, string_array2#x, array_array1#x, array_array2#x, ... 4 more fields] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/dateTimeOperations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/dateTimeOperations.sql.out index a50c797e78c20..c2787e5816883 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/dateTimeOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/dateTimeOperations.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/decimalPrecision.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/decimalPrecision.sql.out index eebe370666edb..093297f03edb7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/decimalPrecision.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/decimalPrecision.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/division.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/division.sql.out index a034c22ffcda3..22b870bc0b420 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/division.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/division.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out index 4d897a329cfe1..f4902012f0f96 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out @@ -13,8 +13,8 @@ FROM ( -- !query analysis Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] - +- Range (0, 10, 
step=1, splits=None) + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] + +- Range (0, 10, step=1) -- !query @@ -30,8 +30,8 @@ FROM ( -- !query analysis Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] - +- Range (0, 10, step=1, splits=None) + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Range (0, 10, step=1) -- !query @@ -51,8 +51,8 @@ FROM ( -- !query analysis Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Range (0, 10, step=1) -- !query @@ -72,5 +72,5 @@ FROM ( -- !query analysis Project [elt(2, col1#x, col2#x, false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] - +- Range (0, 10, step=1, splits=None) + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Range (0, 10, step=1) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/ifCoercion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/ifCoercion.sql.out index edee343b59250..b1d07bd7be902 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/ifCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/ifCoercion.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/implicitTypeCasts.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/implicitTypeCasts.sql.out index 9818d54b809a5..43aaea63fd045 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/implicitTypeCasts.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/implicitTypeCasts.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/inConversion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/inConversion.sql.out index b248ec5997297..0db96719a3fb0 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/inConversion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/inConversion.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapZipWith.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapZipWith.sql.out index 2b8152f0cc703..0f72b0cf8a0e3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapZipWith.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapZipWith.sql.out @@ -66,7 +66,7 @@ CreateViewCommand `various_maps`, SELECT * FROM VALUES ( string_map1, string_map2, string_map3, string_map4, array_map1, array_map2, struct_map1, struct_map2 -), false, false, LocalTempView, true +), false, false, LocalTempView, UNSUPPORTED, true +- Project [boolean_map#x, tinyint_map#x, smallint_map#x, int_map#x, bigint_map#x, decimal_map1#x, decimal_map2#x, double_map#x, float_map#x, date_map#x, timestamp_map#x, string_map1#x, string_map2#x, string_map3#x, string_map4#x, array_map1#x, array_map2#x, struct_map1#x, struct_map2#x] +- SubqueryAlias various_maps +- LocalRelation [boolean_map#x, tinyint_map#x, smallint_map#x, int_map#x, bigint_map#x, decimal_map1#x, decimal_map2#x, double_map#x, float_map#x, date_map#x, timestamp_map#x, string_map1#x, string_map2#x, string_map3#x, string_map4#x, array_map1#x, array_map2#x, struct_map1#x, struct_map2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out index f4c932fa29f97..dd3e56fe9322d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/mapconcat.sql.out @@ -70,7 +70,7 @@ CreateViewCommand `various_maps`, SELECT * FROM VALUES ( struct_map1, struct_map2, string_int_map1, string_int_map2, int_string_map1, int_string_map2 -), false, false, LocalTempView, true +), false, false, LocalTempView, UNSUPPORTED, true +- Project [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 6 more fields] +- SubqueryAlias various_maps +- LocalRelation [boolean_map1#x, boolean_map2#x, tinyint_map1#x, tinyint_map2#x, smallint_map1#x, smallint_map2#x, int_map1#x, int_map2#x, bigint_map1#x, bigint_map2#x, decimal_map1#x, decimal_map2#x, double_map1#x, double_map2#x, float_map1#x, float_map2#x, date_map1#x, date_map2#x, timestamp_map1#x, timestamp_map2#x, string_map1#x, string_map2#x, array_map1#x, array_map2#x, ... 
6 more fields] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/promoteStrings.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/promoteStrings.sql.out index 0a32a7eaac474..ccd34cfaeb67f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/promoteStrings.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/promoteStrings.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out index 448ab457d3951..009e91f7ffacf 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 'aa' as a -- !query analysis -CreateViewCommand `t`, SELECT 'aa' as a, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 'aa' as a, false, false, LocalTempView, UNSUPPORTED, true +- Project [aa AS a#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/widenSetOperationTypes.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/widenSetOperationTypes.sql.out index ff009f8bd64c0..029ec4abb6faf 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/widenSetOperationTypes.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/widenSetOperationTypes.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/windowFrameCoercion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/windowFrameCoercion.sql.out index 76595c6cbded2..170e7dff38ac3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/windowFrameCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/windowFrameCoercion.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t AS SELECT 1 -- !query analysis -CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, true +CreateViewCommand `t`, SELECT 1, false, false, LocalTempView, UNSUPPORTED, true +- Project [1 AS 1#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out index e8bdf42655498..ddfe742f7ea08 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out @@ -154,70 +154,70 @@ Aggregate 
[stddev_pop(cast(cast(udf(cast(cast(3.0 as decimal(38,0)) as string)) select sum(udf(CAST(null AS int))) from range(1,4) -- !query analysis Aggregate [sum(cast(udf(cast(cast(null as int) as string)) as int)) AS sum(udf(CAST(NULL AS INT)))#xL] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(udf(CAST(null AS long))) from range(1,4) -- !query analysis Aggregate [sum(cast(udf(cast(cast(null as bigint) as string)) as bigint)) AS sum(udf(CAST(NULL AS BIGINT)))#xL] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query analysis Aggregate [sum(cast(udf(cast(cast(null as decimal(38,0)) as string)) as decimal(38,0))) AS sum(udf(CAST(NULL AS DECIMAL(38,0))))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query analysis Aggregate [sum(cast(udf(cast(cast(null as double) as string)) as double)) AS sum(udf(CAST(NULL AS DOUBLE)))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(udf(CAST(null AS int))) from range(1,4) -- !query analysis Aggregate [avg(cast(udf(cast(cast(null as int) as string)) as int)) AS avg(udf(CAST(NULL AS INT)))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(udf(CAST(null AS long))) from range(1,4) -- !query analysis Aggregate [avg(cast(udf(cast(cast(null as bigint) as string)) as bigint)) AS avg(udf(CAST(NULL AS BIGINT)))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query analysis Aggregate [avg(cast(udf(cast(cast(null as decimal(38,0)) as string)) as decimal(38,0))) AS avg(udf(CAST(NULL AS DECIMAL(38,0))))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query analysis Aggregate [avg(cast(udf(cast(cast(null as double) as string)) as double)) AS avg(udf(CAST(NULL AS DOUBLE)))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query analysis Aggregate [sum(cast(cast(udf(cast(NaN as string)) as string) as double)) AS sum(CAST(udf(NaN) AS DOUBLE))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query analysis Aggregate [avg(cast(cast(udf(cast(NaN as string)) as string) as double)) AS avg(CAST(udf(NaN) AS DOUBLE))#x] -+- Range (1, 4, step=1, splits=None) ++- Range (1, 4, step=1) -- !query @@ -345,7 +345,7 @@ Aggregate [corr(cast(b#x as double), cast(cast(udf(cast(a#x as string)) as int) -- !query CREATE TEMPORARY VIEW regr_test AS SELECT * FROM VALUES (10,150),(20,250),(30,350),(80,540),(100,200) AS regr_test (x, y) -- !query analysis -CreateViewCommand `regr_test`, SELECT * FROM VALUES (10,150),(20,250),(30,350),(80,540),(100,200) AS regr_test (x, y), false, false, LocalTempView, true +CreateViewCommand `regr_test`, SELECT * FROM VALUES (10,150),(20,250),(30,350),(80,540),(100,200) AS regr_test (x, y), false, false, LocalTempView, UNSUPPORTED, true +- Project [x#x, y#x] +- SubqueryAlias regr_test +- LocalRelation [x#x, y#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part2.sql.out index 
bb8d0824eb5b7..99bc3cbf49b2a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part2.sql.out @@ -14,7 +14,7 @@ CreateViewCommand `int4_tbl`, select * from values (-123456), (2147483647), (-2147483647) - as int4_tbl(f1), false, false, LocalTempView, true + as int4_tbl(f1), false, false, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias int4_tbl +- LocalRelation [f1#x] @@ -29,7 +29,7 @@ CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES CreateViewCommand `bitwise_test`, SELECT * FROM VALUES (1, 1, 1, 1L), (3, 3, 3, null), - (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4), false, true, LocalTempView, true + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4), false, true, LocalTempView, UNSUPPORTED, true +- Project [b1#x, b2#x, b3#x, b4#xL] +- SubqueryAlias bitwise_test +- LocalRelation [b1#x, b2#x, b3#x, b4#xL] @@ -123,7 +123,7 @@ CREATE OR REPLACE TEMPORARY VIEW bool_test AS SELECT * FROM VALUES CreateViewCommand `bool_test`, SELECT * FROM VALUES (TRUE, null, FALSE, null), (FALSE, TRUE, null, null), - (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4), false, true, LocalTempView, true + (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4), false, true, LocalTempView, UNSUPPORTED, true +- Project [b1#x, b2#x, b3#x, b4#x] +- SubqueryAlias bool_test +- LocalRelation [b1#x, b2#x, b3#x, b4#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out index cd743c0a7fefa..c74124402c554 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-join.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `INT2_TBL`, [(f1,None)], VALUES (smallint(trim(' 1234 '))), (smallint(trim(' -1234'))), (smallint('32767')), - (smallint('-32767')), false, true, LocalTempView, true + (smallint('-32767')), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x] @@ -23,7 +23,7 @@ CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM -- !query analysis CreateViewCommand `INT4_TBL`, SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -47,7 +47,7 @@ CreateViewCommand `INT8_TBL`, SELECT * FROM (4567890123456789, 123), (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) - AS v(q1, q2), false, true, LocalTempView, true + AS v(q1, q2), false, true, LocalTempView, UNSUPPORTED, true +- Project [q1#xL, q2#xL] +- SubqueryAlias v +- Project [col1#xL AS q1#xL, col2#xL AS q2#xL] @@ -63,7 +63,7 @@ CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM CreateViewCommand `FLOAT8_TBL`, SELECT * FROM (VALUES (0.0), (1004.30), (-34.84), (cast('1.2345678901234e+200' as double)), (cast('1.2345678901234e-200' as double))) - AS v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -77,7 +77,7 @@ CREATE OR REPLACE TEMPORARY VIEW TEXT_TBL AS SELECT * FROM -- !query analysis CreateViewCommand `TEXT_TBL`, SELECT * FROM (VALUES ('doh!'), ('hi de ho neighbor')) - AS 
v(f1), false, true, LocalTempView, true + AS v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -87,7 +87,7 @@ CreateViewCommand `TEXT_TBL`, SELECT * FROM -- !query CREATE OR REPLACE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 -- !query analysis -CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, true, LocalTempView, true +CreateViewCommand `tenk2`, SELECT * FROM tenk1, false, true, LocalTempView, UNSUPPORTED, true +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x] +- SubqueryAlias spark_catalog.default.tenk1 +- Relation spark_catalog.default.tenk1[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet @@ -1005,7 +1005,7 @@ create or replace temporary view x as select * from -- !query analysis CreateViewCommand `x`, select * from (values (1,11), (2,22), (3,null), (4,44), (5,null)) - as v(x1, x2), false, true, LocalTempView, true + as v(x1, x2), false, true, LocalTempView, UNSUPPORTED, true +- Project [x1#x, x2#x] +- SubqueryAlias v +- Project [col1#x AS x1#x, col2#x AS x2#x] @@ -1019,7 +1019,7 @@ create or replace temporary view y as select * from -- !query analysis CreateViewCommand `y`, select * from (values (1,111), (2,222), (3,333), (4,null)) - as v(y1, y2), false, true, LocalTempView, true + as v(y1, y2), false, true, LocalTempView, UNSUPPORTED, true +- Project [y1#x, y2#x] +- SubqueryAlias v +- Project [col1#x AS y1#x, col2#x AS y2#x] @@ -1629,7 +1629,7 @@ create or replace temporary view tt1 as select * from -- !query analysis CreateViewCommand `tt1`, select * from (values (1, 11), (2, NULL)) - as v(tt1_id, joincol), false, true, LocalTempView, true + as v(tt1_id, joincol), false, true, LocalTempView, UNSUPPORTED, true +- Project [tt1_id#x, joincol#x] +- SubqueryAlias v +- Project [col1#x AS tt1_id#x, col2#x AS joincol#x] @@ -1643,7 +1643,7 @@ create or replace temporary view tt2 as select * from -- !query analysis CreateViewCommand `tt2`, select * from (values (21, 11), (22, 11)) - as v(tt2_id, joincol), false, true, LocalTempView, true + as v(tt2_id, joincol), false, true, LocalTempView, UNSUPPORTED, true +- Project [tt2_id#x, joincol#x] +- SubqueryAlias v +- Project [col1#x AS tt2_id#x, col2#x AS joincol#x] @@ -1727,7 +1727,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d +- Project [cast(id#xL as int) AS f1#x, cast(repeat(xyzzy, 100)#x as string) AS f2#x] +- Project [id#xL, repeat(xyzzy, 100) AS repeat(xyzzy, 100)#x] +- SubqueryAlias x - +- Range (1, 10001, step=1, splits=None) + +- Range (1, 10001, step=1) -- !query @@ -1786,7 +1786,7 @@ create or replace temporary view tt5 as select * from -- !query analysis CreateViewCommand `tt5`, select * from (values (1, 10), (1, 11)) - as v(f1, f2), false, true, LocalTempView, true + as v(f1, f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x, f2#x] +- SubqueryAlias v +- Project [col1#x AS f1#x, col2#x AS f2#x] @@ -1800,7 +1800,7 @@ create or replace temporary view tt6 as select * from -- !query analysis CreateViewCommand `tt6`, select * from (values (1, 9), (1, 2), (2, 9)) - as v(f1, f2), false, true, LocalTempView, true + as v(f1, f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x, f2#x] +- SubqueryAlias v +- Project [col1#x AS f1#x, col2#x AS f2#x] @@ -1836,7 
+1836,7 @@ create or replace temporary view xx as select * from -- !query analysis CreateViewCommand `xx`, select * from (values (1), (2), (3)) - as v(pkxx), false, true, LocalTempView, true + as v(pkxx), false, true, LocalTempView, UNSUPPORTED, true +- Project [pkxx#x] +- SubqueryAlias v +- Project [col1#x AS pkxx#x] @@ -1850,7 +1850,7 @@ create or replace temporary view yy as select * from -- !query analysis CreateViewCommand `yy`, select * from (values (101, 1), (201, 2), (301, NULL)) - as v(pkyy, pkxx), false, true, LocalTempView, true + as v(pkyy, pkxx), false, true, LocalTempView, UNSUPPORTED, true +- Project [pkyy#x, pkxx#x] +- SubqueryAlias v +- Project [col1#x AS pkyy#x, col2#x AS pkxx#x] @@ -1911,7 +1911,7 @@ create or replace temporary view zt1 as select * from -- !query analysis CreateViewCommand `zt1`, select * from (values (53)) - as v(f1), false, true, LocalTempView, true + as v(f1), false, true, LocalTempView, UNSUPPORTED, true +- Project [f1#x] +- SubqueryAlias v +- Project [col1#x AS f1#x] @@ -1925,7 +1925,7 @@ create or replace temporary view zt2 as select * from -- !query analysis CreateViewCommand `zt2`, select * from (values (53)) - as v(f2), false, true, LocalTempView, true + as v(f2), false, true, LocalTempView, UNSUPPORTED, true +- Project [f2#x] +- SubqueryAlias v +- Project [col1#x AS f2#x] @@ -1970,7 +1970,7 @@ Project [f2#x, f3#x, f1#x] -- !query create temp view zv1 as select *,'dummy' AS junk from zt1 -- !query analysis -CreateViewCommand `zv1`, select *,'dummy' AS junk from zt1, false, false, LocalTempView, true +CreateViewCommand `zv1`, select *,'dummy' AS junk from zt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [f1#x, dummy AS junk#x] +- SubqueryAlias zt1 +- View (`zt1`, [f1#x]) @@ -2145,7 +2145,7 @@ create or replace temporary view a as select * from -- !query analysis CreateViewCommand `a`, select * from (values ('p'), ('q')) - as v(code), false, true, LocalTempView, true + as v(code), false, true, LocalTempView, UNSUPPORTED, true +- Project [code#x] +- SubqueryAlias v +- Project [col1#x AS code#x] @@ -2159,7 +2159,7 @@ create or replace temporary view b as select * from -- !query analysis CreateViewCommand `b`, select * from (values ('p', 1), ('p', 2)) - as v(a, num), false, true, LocalTempView, true + as v(a, num), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, num#x] +- SubqueryAlias v +- Project [col1#x AS a#x, col2#x AS num#x] @@ -2173,7 +2173,7 @@ create or replace temporary view c as select * from -- !query analysis CreateViewCommand `c`, select * from (values ('A', 'p'), ('B', 'q'), ('C', null)) - as v(name, a), false, true, LocalTempView, true + as v(name, a), false, true, LocalTempView, UNSUPPORTED, true +- Project [name#x, a#x] +- SubqueryAlias v +- Project [col1#x AS name#x, col2#x AS a#x] @@ -2346,7 +2346,7 @@ create or replace temporary view nt1 as select * from -- !query analysis CreateViewCommand `nt1`, select * from (values(1,true,true), (2,true,false), (3,false,false)) - as v(id, a1, a2), false, true, LocalTempView, true + as v(id, a1, a2), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, a1#x, a2#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS a1#x, col3#x AS a2#x] @@ -2360,7 +2360,7 @@ create or replace temporary view nt2 as select * from -- !query analysis CreateViewCommand `nt2`, select * from (values(1,1,true,true), (2,2,true,false), (3,3,false,false)) - as v(id, nt1_id, b1, b2), false, true, LocalTempView, true + as v(id, nt1_id, b1, b2), false, true, LocalTempView, 
UNSUPPORTED, true +- Project [id#x, nt1_id#x, b1#x, b2#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS nt1_id#x, col3#x AS b1#x, col4#x AS b2#x] @@ -2374,7 +2374,7 @@ create or replace temporary view nt3 as select * from -- !query analysis CreateViewCommand `nt3`, select * from (values(1,1,true), (2,2,false), (3,3,true)) - as v(id, nt2_id, c1), false, true, LocalTempView, true + as v(id, nt2_id, c1), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, nt2_id#x, c1#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS nt2_id#x, col3#x AS c1#x] @@ -2732,7 +2732,7 @@ create or replace temporary view parent as select * from -- !query analysis CreateViewCommand `parent`, select * from (values (1, 10), (2, 20), (3, 30)) - as v(k, pd), false, true, LocalTempView, true + as v(k, pd), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, pd#x] +- SubqueryAlias v +- Project [col1#x AS k#x, col2#x AS pd#x] @@ -2746,7 +2746,7 @@ create or replace temporary view child as select * from -- !query analysis CreateViewCommand `child`, select * from (values (1, 100), (4, 400)) - as v(k, cd), false, true, LocalTempView, true + as v(k, cd), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, cd#x] +- SubqueryAlias v +- Project [col1#x AS k#x, col2#x AS cd#x] @@ -2871,7 +2871,7 @@ create or replace temporary view a as select * from -- !query analysis CreateViewCommand `a`, select * from (values (0), (1)) - as v(id), false, true, LocalTempView, true + as v(id), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x] +- SubqueryAlias v +- Project [col1#x AS id#x] @@ -2885,7 +2885,7 @@ create or replace temporary view b as select * from -- !query analysis CreateViewCommand `b`, select * from (values (0, 0), (1, NULL)) - as v(id, a_id), false, true, LocalTempView, true + as v(id, a_id), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#x, a_id#x] +- SubqueryAlias v +- Project [col1#x AS id#x, col2#x AS a_id#x] @@ -2943,7 +2943,7 @@ create or replace temporary view innertab as select * from -- !query analysis CreateViewCommand `innertab`, select * from (values (123L, 42L)) - as v(id, dat1), false, true, LocalTempView, true + as v(id, dat1), false, true, LocalTempView, UNSUPPORTED, true +- Project [id#xL, dat1#xL] +- SubqueryAlias v +- Project [col1#xL AS id#xL, col2#xL AS dat1#xL] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-count.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-count.sql.out index 57c42b338c41a..2540daef71424 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-count.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-count.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (1, 1), (null, 2), (1, null), (null, null) -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-cross-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-cross-join.sql.out index c704623140d67..c5ee1742f5d7c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-cross-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-cross-join.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nt1`, select 
* from values ("one", 1), ("two", 2), ("three", 3) - as nt1(k, v1), false, false, LocalTempView, true + as nt1(k, v1), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- LocalRelation [k#x, v1#x] @@ -27,7 +27,7 @@ CreateViewCommand `nt2`, select * from values ("one", 1), ("two", 22), ("one", 5) - as nt2(k, v2), false, false, LocalTempView, true + as nt2(k, v2), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v2#x] +- SubqueryAlias nt2 +- LocalRelation [k#x, v2#x] @@ -142,7 +142,7 @@ Project [cast(udf(cast(key#x as string)) as string) AS udf(key)#x, cast(udf(cast -- !query create temporary view A(a, va) as select * from nt1 -- !query analysis -CreateViewCommand `A`, [(a,None), (va,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `A`, [(a,None), (va,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) @@ -155,7 +155,7 @@ CreateViewCommand `A`, [(a,None), (va,None)], select * from nt1, false, false, L -- !query create temporary view B(b, vb) as select * from nt1 -- !query analysis -CreateViewCommand `B`, [(b,None), (vb,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `B`, [(b,None), (vb,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) @@ -168,7 +168,7 @@ CreateViewCommand `B`, [(b,None), (vb,None)], select * from nt1, false, false, L -- !query create temporary view C(c, vc) as select * from nt1 -- !query analysis -CreateViewCommand `C`, [(c,None), (vc,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `C`, [(c,None), (vc,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) @@ -181,7 +181,7 @@ CreateViewCommand `C`, [(c,None), (vc,None)], select * from nt1, false, false, L -- !query create temporary view D(d, vd) as select * from nt1 -- !query analysis -CreateViewCommand `D`, [(d,None), (vd,None)], select * from nt1, false, false, LocalTempView, true +CreateViewCommand `D`, [(d,None), (vd,None)], select * from nt1, false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- View (`nt1`, [k#x, v1#x]) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except-all.sql.out index 74ae6c0c584a8..37c10b6fcd03c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except-all.sql.out @@ -4,7 +4,7 @@ CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1) -- !query analysis CreateViewCommand `tab1`, SELECT * FROM VALUES - (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1), false, false, LocalTempView, true + (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1), false, false, LocalTempView, UNSUPPORTED, true +- Project [c1#x] +- SubqueryAlias tab1 +- LocalRelation [c1#x] @@ -15,7 +15,7 @@ CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (1), (2), (2), (3), (5), (5), (null) AS tab2(c1) -- !query analysis CreateViewCommand `tab2`, SELECT * FROM VALUES - (1), (2), (2), (3), (5), (5), (null) AS tab2(c1), false, false, LocalTempView, true + (1), (2), (2), 
(3), (5), (5), (null) AS tab2(c1), false, false, LocalTempView, UNSUPPORTED, true +- Project [c1#x] +- SubqueryAlias tab2 +- LocalRelation [c1#x] @@ -36,7 +36,7 @@ CreateViewCommand `tab3`, SELECT * FROM VALUES (1, 3), (2, 3), (2, 2) - AS tab3(k, v), false, false, LocalTempView, true + AS tab3(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab3 +- LocalRelation [k#x, v#x] @@ -57,7 +57,7 @@ CreateViewCommand `tab4`, SELECT * FROM VALUES (2, 2), (2, 2), (2, 20) - AS tab4(k, v), false, false, LocalTempView, true + AS tab4(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab4 +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except.sql.out index 07649c3e5620b..4a7a38bd88496 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-except.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `t1`, select * from values ("two", 2), ("three", 3), ("one", NULL) - as t1(k, v), false, false, LocalTempView, true + as t1(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias t1 +- LocalRelation [k#x, v#x] @@ -33,7 +33,7 @@ CreateViewCommand `t2`, select * from values ("one", 5), ("one", NULL), (NULL, 5) - as t2(k, v), false, false, LocalTempView, true + as t2(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias t2 +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out index 8e1eb80b4bac8..fbee3e2c8c89f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2) -AS testData(a, b), false, true, LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] @@ -75,7 +75,7 @@ AS courseSales(course, year, earnings) -- !query analysis CreateViewCommand `courseSales`, SELECT * FROM VALUES ("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) -AS courseSales(course, year, earnings), false, true, LocalTempView, true +AS courseSales(course, year, earnings), false, true, LocalTempView, UNSUPPORTED, true +- Project [course#x, year#x, earnings#x] +- SubqueryAlias courseSales +- LocalRelation [course#x, year#x, earnings#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-by.sql.out index 12b903477b6c4..5811a4ff6566c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-by.sql.out @@ -6,7 +6,7 @@ AS testData(a, b) -- !query analysis CreateViewCommand `testData`, SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) -AS testData(a, b), false, true, 
LocalTempView, true +AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true +- Project [a#x, b#x] +- SubqueryAlias testData +- LocalRelation [a#x, b#x] @@ -240,7 +240,7 @@ CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM V (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) -- !query analysis CreateViewCommand `testDataHasSameNameWithAlias`, SELECT * FROM VALUES -(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v), false, true, LocalTempView, true +(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, a#x, v#x] +- SubqueryAlias testDataHasSameNameWithAlias +- LocalRelation [k#x, a#x, v#x] @@ -362,7 +362,7 @@ SELECT udf(1) FROM range(10) HAVING true -- !query analysis Filter cast(true as boolean) +- Aggregate [cast(udf(cast(1 as string)) as int) AS udf(1)#x] - +- Range (0, 10, step=1, splits=None) + +- Range (0, 10, step=1) -- !query @@ -371,7 +371,7 @@ SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0 Project [udf(udf(1))#x] +- Filter (max(id#xL)#xL > cast(0 as bigint)) +- Aggregate [cast(udf(cast(cast(udf(cast(1 as string)) as int) as string)) as int) AS udf(udf(1))#x, max(id#xL) AS max(id#xL)#xL] - +- Range (0, 10, step=1, splits=None) + +- Range (0, 10, step=1) -- !query @@ -404,7 +404,7 @@ CreateViewCommand `test_agg`, SELECT * FROM VALUES (2, true), (3, false), (3, null), (4, null), (4, null), - (5, null), (5, true), (5, false) AS test_agg(k, v), false, true, LocalTempView, true + (5, null), (5, true), (5, false) AS test_agg(k, v), false, true, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias test_agg +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-having.sql.out index 441fda84479e0..f5cbe4abf3538 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-having.sql.out @@ -12,7 +12,7 @@ CreateViewCommand `hav`, select * from values ("two", 2), ("three", 3), ("one", 5) - as hav(k, v), false, false, LocalTempView, true + as hav(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias hav +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-inner-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-inner-join.sql.out index 1454be010d9c9..129575c77e732 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-inner-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-inner-join.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -11,7 +11,7 @@ CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, 
LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -20,7 +20,7 @@ CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t3`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t3`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -29,7 +29,7 @@ CreateViewCommand `t3`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, fal -- !query CREATE TEMPORARY VIEW t4 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t4`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t4`, SELECT * FROM VALUES (1), (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -43,7 +43,7 @@ SELECT udf(a) AS a, udf('b') AS tag FROM t2 -- !query analysis CreateViewCommand `ta`, SELECT udf(a) AS a, udf('a') AS tag FROM t1 UNION ALL -SELECT udf(a) AS a, udf('b') AS tag FROM t2, false, false, LocalTempView, true, [udf] +SELECT udf(a) AS a, udf('b') AS tag FROM t2, false, false, LocalTempView, UNSUPPORTED, true, [udf] +- Union false, false :- Project [cast(udf(cast(a#x as string)) as int) AS a#x, cast(udf(cast(a as string)) as string) AS tag#x] : +- SubqueryAlias t1 @@ -69,7 +69,7 @@ SELECT udf(a) AS a, udf('b') AS tag FROM t4 -- !query analysis CreateViewCommand `tb`, SELECT udf(a) AS a, udf('a') AS tag FROM t3 UNION ALL -SELECT udf(a) AS a, udf('b') AS tag FROM t4, false, false, LocalTempView, true, [udf] +SELECT udf(a) AS a, udf('b') AS tag FROM t4, false, false, LocalTempView, UNSUPPORTED, true, [udf] +- Union false, false :- Project [cast(udf(cast(a#x as string)) as int) AS a#x, cast(udf(cast(a as string)) as string) AS tag#x] : +- SubqueryAlias t3 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-intersect-all.sql.out index 323f07a7d8339..35306746932e2 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-intersect-all.sql.out @@ -18,7 +18,7 @@ CreateViewCommand `tab1`, SELECT * FROM VALUES (2, 3), (null, null), (null, null) - AS tab1(k, v), false, false, LocalTempView, true + AS tab1(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab1 +- LocalRelation [k#x, v#x] @@ -41,7 +41,7 @@ CreateViewCommand `tab2`, SELECT * FROM VALUES (3, 4), (null, null), (null, null) - AS tab2(k, v), false, false, LocalTempView, true + AS tab2(k, v), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v#x] +- SubqueryAlias tab2 +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-join-empty-relation.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-join-empty-relation.sql.out index d5ffa3ebb2bb5..ebd5127112cc0 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-join-empty-relation.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-join-empty-relation.sql.out @@ -2,7 +2,7 @@ -- !query CREATE TEMPORARY VIEW t1 AS SELECT * 
FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -11,7 +11,7 @@ CreateViewCommand `t1`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) -- !query analysis -CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, true +CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- SubqueryAlias GROUPING +- LocalRelation [a#x] @@ -20,7 +20,7 @@ CreateViewCommand `t2`, SELECT * FROM VALUES (1) AS GROUPING(a), false, false, L -- !query CREATE TEMPORARY VIEW empty_table as SELECT a FROM t2 WHERE false -- !query analysis -CreateViewCommand `empty_table`, SELECT a FROM t2 WHERE false, false, false, LocalTempView, true +CreateViewCommand `empty_table`, SELECT a FROM t2 WHERE false, false, false, LocalTempView, UNSUPPORTED, true +- Project [a#x] +- Filter false +- SubqueryAlias t2 diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-natural-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-natural-join.sql.out index 324622e615da0..5fc413c66326b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-natural-join.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nt1`, select * from values ("one", 1), ("two", 2), ("three", 3) - as nt1(k, v1), false, false, LocalTempView, true + as nt1(k, v1), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- LocalRelation [k#x, v1#x] @@ -27,7 +27,7 @@ CreateViewCommand `nt2`, select * from values ("one", 1), ("two", 22), ("one", 5) - as nt2(k, v2), false, false, LocalTempView, true + as nt2(k, v2), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v2#x] +- SubqueryAlias nt2 +- LocalRelation [k#x, v2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-outer-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-outer-join.sql.out index 8eee8746637cd..df2169386249a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-outer-join.sql.out @@ -6,7 +6,7 @@ as t1(int_col1) -- !query analysis CreateViewCommand `t1`, SELECT * FROM VALUES (-234), (145), (367), (975), (298) -as t1(int_col1), false, true, LocalTempView, true +as t1(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t1 +- LocalRelation [int_col1#x] @@ -19,7 +19,7 @@ as t2(int_col0, int_col1) -- !query analysis CreateViewCommand `t2`, SELECT * FROM VALUES (-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158) -as t2(int_col0, int_col1), false, true, LocalTempView, true +as t2(int_col0, int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col0#x, int_col1#x] +- SubqueryAlias t2 +- LocalRelation [int_col0#x, int_col1#x] @@ -58,7 +58,7 @@ Project [udf(sum(udf(coalesce(int_col1, int_col0))))#xL, (udf(coalesce(int_col1, -- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1) 
-- !query analysis -CreateViewCommand `t1`, SELECT * FROM VALUES (97) as t1(int_col1), false, true, LocalTempView, true +CreateViewCommand `t1`, SELECT * FROM VALUES (97) as t1(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t1 +- LocalRelation [int_col1#x] @@ -67,7 +67,7 @@ CreateViewCommand `t1`, SELECT * FROM VALUES (97) as t1(int_col1), false, true, -- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1) -- !query analysis -CreateViewCommand `t2`, SELECT * FROM VALUES (0) as t2(int_col1), false, true, LocalTempView, true +CreateViewCommand `t2`, SELECT * FROM VALUES (0) as t2(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t2 +- LocalRelation [int_col1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-pivot.sql.out index e92b7003d6e58..5cfa86309f6d1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-pivot.sql.out @@ -14,7 +14,7 @@ CreateViewCommand `courseSales`, select * from values ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) - as courseSales(course, year, earnings), false, false, LocalTempView, true + as courseSales(course, year, earnings), false, false, LocalTempView, UNSUPPORTED, true +- Project [course#x, year#x, earnings#x] +- SubqueryAlias courseSales +- LocalRelation [course#x, year#x, earnings#x] @@ -29,7 +29,7 @@ create temporary view years as select * from values CreateViewCommand `years`, select * from values (2012, 1), (2013, 2) - as years(y, s), false, false, LocalTempView, true + as years(y, s), false, false, LocalTempView, UNSUPPORTED, true +- Project [y#x, s#x] +- SubqueryAlias years +- LocalRelation [y#x, s#x] @@ -44,7 +44,7 @@ create temporary view yearsWithComplexTypes as select * from values CreateViewCommand `yearsWithComplexTypes`, select * from values (2012, array(1, 1), map('1', 1), struct(1, 'a')), (2013, array(2, 2), map('2', 2), struct(2, 'b')) - as yearsWithComplexTypes(y, a, m, s), false, false, LocalTempView, true + as yearsWithComplexTypes(y, a, m, s), false, false, LocalTempView, UNSUPPORTED, true +- Project [y#x, a#x, m#x, s#x] +- SubqueryAlias yearsWithComplexTypes +- LocalRelation [y#x, a#x, m#x, s#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out index b04e49da481da..248ed95df9ded 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-udaf.sql.out @@ -6,7 +6,7 @@ as t1(int_col1) -- !query analysis CreateViewCommand `t1`, SELECT * FROM VALUES (1), (2), (3), (4) -as t1(int_col1), false, true, LocalTempView, true +as t1(int_col1), false, true, LocalTempView, UNSUPPORTED, true +- Project [int_col1#x] +- SubqueryAlias t1 +- LocalRelation [int_col1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-union.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-union.sql.out index 671c7a4765296..a1436d0a77c83 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-union.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-union.sql.out @@ -2,7 +2,7 @@ -- !query CREATE OR REPLACE 
TEMPORARY VIEW t1 AS VALUES (1, 'a'), (2, 'b') tbl(c1, c2) -- !query analysis -CreateViewCommand `t1`, VALUES (1, 'a'), (2, 'b') tbl(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t1`, VALUES (1, 'a'), (2, 'b') tbl(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl +- LocalRelation [c1#x, c2#x] @@ -10,7 +10,7 @@ CreateViewCommand `t1`, VALUES (1, 'a'), (2, 'b') tbl(c1, c2), false, true, Loca -- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (1.0, 1), (2.0, 4) tbl(c1, c2) -- !query analysis -CreateViewCommand `t2`, VALUES (1.0, 1), (2.0, 4) tbl(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t2`, VALUES (1.0, 1), (2.0, 4) tbl(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl +- LocalRelation [c1#x, c2#x] @@ -97,7 +97,7 @@ Project [cast(udf(cast(cast(udf(cast(a#xL as string)) as bigint) as string)) as -- !query CREATE OR REPLACE TEMPORARY VIEW p1 AS VALUES 1 T(col) -- !query analysis -CreateViewCommand `p1`, VALUES 1 T(col), false, true, LocalTempView, true +CreateViewCommand `p1`, VALUES 1 T(col), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [col#x] @@ -105,7 +105,7 @@ CreateViewCommand `p1`, VALUES 1 T(col), false, true, LocalTempView, true -- !query CREATE OR REPLACE TEMPORARY VIEW p2 AS VALUES 1 T(col) -- !query analysis -CreateViewCommand `p2`, VALUES 1 T(col), false, true, LocalTempView, true +CreateViewCommand `p2`, VALUES 1 T(col), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [col#x] @@ -113,7 +113,7 @@ CreateViewCommand `p2`, VALUES 1 T(col), false, true, LocalTempView, true -- !query CREATE OR REPLACE TEMPORARY VIEW p3 AS VALUES 1 T(col) -- !query analysis -CreateViewCommand `p3`, VALUES 1 T(col), false, true, LocalTempView, true +CreateViewCommand `p3`, VALUES 1 T(col), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [col#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out index e18f67055913f..c10988310c0a9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-window.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `testData`, SELECT * FROM VALUES (3, 2147483650L, 100.001D, date("2020-12-31"), timestamp_seconds(1609372800), "b"), (null, null, null, null, null, null), (3, 1L, 1.0D, date("2017-08-01"), timestamp_seconds(1501545600), null) -AS testData(val, val_long, val_double, val_date, val_timestamp, cate), false, true, LocalTempView, true +AS testData(val, val_long, val_double, val_date, val_timestamp, cate), false, true, LocalTempView, UNSUPPORTED, true +- Project [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x] +- SubqueryAlias testData +- LocalRelation [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out index cdfa4f69f6e70..4b53f1c6f19c4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udtf/udtf.sql.out @@ -14,7 +14,7 @@ DropTableCommand `spark_catalog`.`default`.`t2`, true, true, false -- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (0, 1), (1, 2) t(c1, c2) -- 
!query analysis -CreateViewCommand `t1`, VALUES (0, 1), (1, 2) t(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t1`, VALUES (0, 1), (1, 2) t(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [c1#x, c2#x] @@ -22,7 +22,7 @@ CreateViewCommand `t1`, VALUES (0, 1), (1, 2) t(c1, c2), false, true, LocalTempV -- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (0, 1), (1, 2), (1, 3) t(partition_col, input) -- !query analysis -CreateViewCommand `t2`, VALUES (0, 1), (1, 2), (1, 3) t(partition_col, input), false, true, LocalTempView, true +CreateViewCommand `t2`, VALUES (0, 1), (1, 2), (1, 3) t(partition_col, input), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias t +- LocalRelation [partition_col#x, input#x] @@ -904,6 +904,26 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +SELECT * FROM UDTFPartitionByIndexingBug( + TABLE( + SELECT + 5 AS unused_col, + 'hi' AS partition_col, + 1.0 AS double_col + + UNION ALL + + SELECT + 4 AS unused_col, + 'hi' AS partition_col, + 1.0 AS double_col + ) +) +-- !query analysis +[Analyzer test output redacted due to nondeterminism] + + -- !query DROP VIEW t1 -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out index 44dff64f3a72e..cafdd850e86d6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/union.sql.out @@ -2,7 +2,7 @@ -- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (1, 'a'), (2, 'b') tbl(c1, c2) -- !query analysis -CreateViewCommand `t1`, VALUES (1, 'a'), (2, 'b') tbl(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t1`, VALUES (1, 'a'), (2, 'b') tbl(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl +- LocalRelation [c1#x, c2#x] @@ -10,7 +10,7 @@ CreateViewCommand `t1`, VALUES (1, 'a'), (2, 'b') tbl(c1, c2), false, true, Loca -- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (1.0, 1), (2.0, 4) tbl(c1, c2) -- !query analysis -CreateViewCommand `t2`, VALUES (1.0, 1), (2.0, 4) tbl(c1, c2), false, true, LocalTempView, true +CreateViewCommand `t2`, VALUES (1.0, 1), (2.0, 4) tbl(c1, c2), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl +- LocalRelation [c1#x, c2#x] @@ -97,7 +97,7 @@ Project [a#xL] -- !query CREATE OR REPLACE TEMPORARY VIEW p1 AS VALUES 1 T(col) -- !query analysis -CreateViewCommand `p1`, VALUES 1 T(col), false, true, LocalTempView, true +CreateViewCommand `p1`, VALUES 1 T(col), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [col#x] @@ -105,7 +105,7 @@ CreateViewCommand `p1`, VALUES 1 T(col), false, true, LocalTempView, true -- !query CREATE OR REPLACE TEMPORARY VIEW p2 AS VALUES 1 T(col) -- !query analysis -CreateViewCommand `p2`, VALUES 1 T(col), false, true, LocalTempView, true +CreateViewCommand `p2`, VALUES 1 T(col), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [col#x] @@ -113,7 +113,7 @@ CreateViewCommand `p2`, VALUES 1 T(col), false, true, LocalTempView, true -- !query CREATE OR REPLACE TEMPORARY VIEW p3 AS VALUES 1 T(col) -- !query analysis -CreateViewCommand `p3`, VALUES 1 T(col), false, true, LocalTempView, true +CreateViewCommand `p3`, VALUES 1 T(col), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias T +- LocalRelation [col#x] @@ -184,7 +184,7 @@ Union false, false -- !query CREATE 
OR REPLACE TEMPORARY VIEW t3 AS VALUES (decimal(1)) tbl(v) -- !query analysis -CreateViewCommand `t3`, VALUES (decimal(1)) tbl(v), false, true, LocalTempView, true +CreateViewCommand `t3`, VALUES (decimal(1)) tbl(v), false, true, LocalTempView, UNSUPPORTED, true +- SubqueryAlias tbl +- LocalRelation [v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/unpivot.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/unpivot.sql.out index 3b3a64073cbf8..7f4d1a5b7d467 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/unpivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/unpivot.sql.out @@ -8,7 +8,7 @@ create temporary view courseEarnings as select * from values CreateViewCommand `courseEarnings`, select * from values ("dotNET", 15000, 48000, 22500), ("Java", 20000, 30000, NULL) - as courseEarnings(course, `2012`, `2013`, `2014`), false, false, LocalTempView, true + as courseEarnings(course, `2012`, `2013`, `2014`), false, false, LocalTempView, UNSUPPORTED, true +- Project [course#x, 2012#x, 2013#x, 2014#x] +- SubqueryAlias courseEarnings +- LocalRelation [course#x, 2012#x, 2013#x, 2014#x] @@ -73,7 +73,7 @@ create temporary view courseEarningsAndSales as select * from values CreateViewCommand `courseEarningsAndSales`, select * from values ("dotNET", 15000, NULL, 48000, 1, 22500, 1), ("Java", 20000, 1, 30000, 2, NULL, NULL) - as courseEarningsAndSales(course, earnings2012, sales2012, earnings2013, sales2013, earnings2014, sales2014), false, false, LocalTempView, true + as courseEarningsAndSales(course, earnings2012, sales2012, earnings2013, sales2013, earnings2014, sales2014), false, false, LocalTempView, UNSUPPORTED, true +- Project [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x] +- SubqueryAlias courseEarningsAndSales +- LocalRelation [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out index 3c9c6ec169afa..5a74c4be107e3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out @@ -10,7 +10,7 @@ CreateViewCommand `nt1`, select * from values ("one", 1), ("two", 2), ("three", 3) - as nt1(k, v1), false, false, LocalTempView, true + as nt1(k, v1), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v1#x] +- SubqueryAlias nt1 +- LocalRelation [k#x, v1#x] @@ -29,7 +29,7 @@ CreateViewCommand `nt2`, select * from values ("two", 22), ("one", 5), ("four", 4) - as nt2(k, v2), false, false, LocalTempView, true + as nt2(k, v2), false, false, LocalTempView, UNSUPPORTED, true +- Project [k#x, v2#x] +- SubqueryAlias nt2 +- LocalRelation [k#x, v2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-binding-config.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-binding-config.sql.out new file mode 100644 index 0000000000000..efa221400b0be --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-binding-config.sql.out @@ -0,0 +1,813 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SET spark.sql.legacy.viewSchemaBindingMode +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,None) + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = false +-- 
!query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,Some(false)) + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT 1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 44, + "fragment" : "WITH SCHEMA BINDING" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT 1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 49, + "fragment" : "WITH SCHEMA COMPENSATION" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT 1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 51, + "fragment" : "WITH SCHEMA TYPE EVOLUTION" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT 1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." 
+ }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 46, + "fragment" : "WITH SCHEMA EVOLUTION" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT 1 +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT 1, false, true, PersistedView, UNSUPPORTED, true + +- Project [1 AS 1#x] + +- OneRowRelation + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query analysis +ShowTablesCommand default, v, [namespace#x, tableName#x, isTemporary#x, information#x], true + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW v AS SELECT 1 +-- !query analysis +CreateViewCommand `v`, SELECT 1, false, true, LocalTempView, UNSUPPORTED, true + +- Project [1 AS 1#x] + +- OneRowRelation + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query analysis +ShowTablesCommand default, v, [namespace#x, tableName#x, isTemporary#x, information#x], true + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTempViewCommand v + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, UNSUPPORTED, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#xL as int) AS c1#x] + +- Project [c1#xL] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode 
= true +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,Some(true)) + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = false +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(false)) + + +-- !query +SET spark.sql.ansi.enabled = false +-- !query analysis +SetCommand (spark.sql.ansi.enabled,Some(false)) + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, BINDING, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +DROP TABLE t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as bigint) AS c1#xL] + +- LocalRelation [col1#x] + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"BIGINT\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = true +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(true)) + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR 
REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, COMPENSATION, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +DROP TABLE t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as bigint) AS c1#xL] + +- LocalRelation [col1#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#xL as int) AS c1#x] + +- Project [c1#xL] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +INSERT INTO t 
VALUES ('a', 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 MAP, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION", + "sqlState" : "42K09", + "messageParameters" : { + "sqlExpr" : "\"c1\"", + "srcType" : "\"MAP\"", + "targetType" : "\"INT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 15, + "fragment" : "v" + } ] +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, COMPENSATION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis 
+CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c3 INT NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = false +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,Some(false)) + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = false +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(false)) + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT 1 +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT 1, false, true, PersistedView, UNSUPPORTED, true + +- Project [1 AS 1#x] + +- OneRowRelation + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = true +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,Some(true)) + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query analysis +ShowTablesCommand default, v, [namespace#x, tableName#x, isTemporary#x, information#x], true + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = true +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(true)) + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query analysis +ShowTablesCommand default, v, [namespace#x, tableName#x, isTemporary#x, information#x], true + + +-- !query +SHOW CREATE TABLE v +-- !query analysis +ShowCreateTableCommand `spark_catalog`.`default`.`v`, [createtab_stmt#x] + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = false +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,Some(false)) + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = false +-- !query 
analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(false)) + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW v AS SELECT 1 +-- !query analysis +CreateViewCommand `v`, SELECT 1, false, true, LocalTempView, UNSUPPORTED, true + +- Project [1 AS 1#x] + +- OneRowRelation + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = true +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaBindingMode,Some(true)) + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query analysis +ShowTablesCommand default, v, [namespace#x, tableName#x, isTemporary#x, information#x], true + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = true +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(true)) + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query analysis +ShowTablesCommand default, v, [namespace#x, tableName#x, isTemporary#x, information#x], true + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTempViewCommand v + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-binding.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-binding.sql.out new file mode 100644 index 0000000000000..75cae1f19d46d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-binding.sql.out @@ -0,0 +1,256 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, BINDING, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either 
add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"BIGINT\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, BINDING, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation=false +-- !query analysis +SetCommand (spark.sql.legacy.viewSchemaCompensation,Some(false)) + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, BINDING, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, 
data_type#x, comment#x] + + +-- !query +ALTER VIEW v WITH SCHEMA BINDING +-- !query analysis +AlterViewSchemaBindingCommand `spark_catalog`.`default`.`v`, BINDING + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"BIGINT\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-compensation.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-compensation.sql.out new file mode 100644 index 0000000000000..64295a6f9bc0c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-compensation.sql.out @@ -0,0 +1,414 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SET spark.sql.ansi.enabled = false +-- !query analysis +SetCommand (spark.sql.ansi.enabled,Some(false)) + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, COMPENSATION, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE t +-- !query analysis +DropTable false, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not 
included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as bigint) AS c1#xL] + +- LocalRelation [col1#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#xL as int) AS c1#x] + +- Project [c1#xL] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +INSERT INTO t VALUES ('a', 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 MAP, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION", + "sqlState" : "42K09", + "messageParameters" : { + "sqlExpr" : "\"c1\"", + "srcType" : "\"MAP\"", + "targetType" : "\"INT\"" + }, + 
"queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 15, + "fragment" : "v" + } ] +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, COMPENSATION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c3 INT NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand 
`spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES(1) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as int) AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, BINDING, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES('1') +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as string) AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"STRING\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +ALTER VIEW v WITH SCHEMA COMPENSATION +-- !query analysis +AlterViewSchemaBindingCommand `spark_catalog`.`default`.`v`, COMPENSATION + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-evolution.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-evolution.sql.out new file mode 100644 index 0000000000000..258edf31d4c17 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-evolution.sql.out @@ -0,0 +1,781 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c4 STRING NOT NULL, c5 DOUBLE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2.0) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c4, c5] ++- Project [cast(col1#x as string) AS c4#x, cast(col2#x as double) AS c5#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c4#x, c5#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c4#x, c5#x]) + +- Project [c4#x, c5#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c4#x,c5#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c4 STRING, c5 DOUBLE, c6 DATE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01') +-- 
!query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c4, c5, c6] ++- Project [cast(col1#x as string) AS c4#x, cast(col2#x as double) AS c5#x, cast(col3#x as date) AS c6#x] + +- LocalRelation [col1#x, col2#x, col3#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c4#x, c5#x, c6#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c4#x, c5#x, c6#x]) + +- Project [c4#x, c5#x, c6#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c4#x,c5#x,c6#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand 
`spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,None), (a2,None)], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#x, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#x, a2#x]) + +- Project [c1#x AS a1#x, c2#x AS a2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 DOUBLE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2.0) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as double) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#x, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#x, a2#x]) + +- Project [c1#x AS a1#x, c2#x AS a2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING, c2 DOUBLE, c3 DATE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01') +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2, c3] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as double) AS c2#x, cast(col3#x as date) AS c3#x] + +- 
LocalRelation [col1#x, col2#x, col3#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#x, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#x, a2#x]) + +- Project [c1#x AS a1#x, c2#x AS a2#x] + +- Project [c1#x, c2#x, c3#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x,c3#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,None), (a2,None)], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#x, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#x, a2#x]) + +- Project [c1#x AS a1#x, c2#x AS a2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v (a1, a2) AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c3 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + 
"actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v (a1, a2) AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT COMMENT 'c1', c2 INT COMMENT 'c2') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,None), (a2,None)], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,None), (a2,None)], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6c', c2 STRING COMMENT 'c2 6c') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#xL, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#xL, a2#x]) + +- Project [c1#xL AS a1#xL, c2#x AS a2#x] + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +CREATE OR REPLACE VIEW v(a1 COMMENT 'a1', a2 COMMENT 'a2') WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,Some(a1)), (a2,Some(a2))], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6d', c2 STRING COMMENT 'c2 6d') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#xL, a2#x] 
++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#xL, a2#x]) + +- Project [c1#xL AS a1#xL, c2#x AS a2#x] + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, EVOLUTION, true + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6e', c2 STRING COMMENT 'c2 6e') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#xL, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#xL, c2#x]) + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t1 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1 + + +-- !query +CREATE TABLE t1(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t2 + + +-- !query +CREATE TABLE t2(c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t1, t2 +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t1, t2, false, true, PersistedView, EVOLUTION, true + +- Project [c1#x, c2#x] + +- Join Inner + :- SubqueryAlias spark_catalog.default.t1 + : +- Relation spark_catalog.default.t1[c1#x] parquet + +- SubqueryAlias spark_catalog.default.t2 + +- Relation spark_catalog.default.t2[c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x, c2#x] + +- Join Inner + :- SubqueryAlias spark_catalog.default.t1 + : +- Relation spark_catalog.default.t1[c1#x] parquet + +- SubqueryAlias spark_catalog.default.t2 + +- Relation spark_catalog.default.t2[c2#x] parquet + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t2 + + +-- !query +CREATE TABLE t2(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false + + +-- !query +SELECT * FROM v +-- !query analysis 
+org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "COLUMN_ALREADY_EXISTS", + "sqlState" : "42711", + "messageParameters" : { + "columnName" : "`c1`" + } +} + + +-- !query +DROP TABLE IF EXISTS t1 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1 + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t2 + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES(1) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as int) AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, COMPENSATION, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +ALTER VIEW v WITH SCHEMA EVOLUTION +-- !query analysis +AlterViewSchemaBindingCommand `spark_catalog`.`default`.`v`, EVOLUTION + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-type-evolution.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-type-evolution.sql.out new file mode 100644 index 0000000000000..95aa35d59fdc8 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/view-schema-type-evolution.sql.out @@ -0,0 +1,456 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x AS c1#x, c2#x AS c2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 DOUBLE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2.0) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as double) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x AS c1#x, c2#x AS c2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING, c2 DOUBLE, c3 DATE) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01') +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2, c3] ++- Project [cast(col1#x as string) AS c1#x, cast(col2#x as double) AS c2#x, cast(col3#x as date) AS c3#x] + +- LocalRelation [col1#x, col2#x, col3#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View 
(`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x AS c1#x, c2#x AS c2#x] + +- Project [c1#x, c2#x, c3#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x,c3#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1, c2] ++- Project [cast(col1#x as int) AS c1#x, cast(col2#x as int) AS c2#x] + +- LocalRelation [col1#x, col2#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x, c2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x, c2#x]) + +- Project [c1#x AS c1#x, c2#x AS c2#x] + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c3 INT, c2 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} 
+ + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT COMMENT 'c1', c2 INT COMMENT 'c2') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,None), (a2,None)], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#x, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6a', c2 STRING COMMENT 'c2 6a') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#xL, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#xL, a2#x]) + +- Project [c1#xL AS a1#xL, c2#x AS a2#x] + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +CREATE OR REPLACE VIEW v(a1 COMMENT 'a1', a2 COMMENT 'a2') WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, [(a1,Some(a1)), (a2,Some(a2))], SELECT * FROM t, false, true, PersistedView, TYPE EVOLUTION, true + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6b', c2 STRING COMMENT 'c2 6b') USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT * FROM v +-- !query analysis +Project [a1#xL, a2#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [a1#xL, a2#x]) + +- Project [c1#xL AS a1#xL, c2#x AS a2#x] + +- Project [c1#xL, c2#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#xL,c2#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT 
INTO t VALUES(1) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as int) AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT * FROM t +-- !query analysis +CreateViewCommand `spark_catalog`.`default`.`v`, SELECT * FROM t, false, true, PersistedView, COMPENSATION, true + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(c1 STRING) USING PARQUET +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +INSERT INTO t VALUES('1') +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t], Append, `spark_catalog`.`default`.`t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t), [c1] ++- Project [cast(col1#x as string) AS c1#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [cast(c1#x as int) AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +ALTER VIEW v WITH SCHEMA TYPE EVOLUTION +-- !query analysis +AlterViewSchemaBindingCommand `spark_catalog`.`default`.`v`, TYPE EVOLUTION + + +-- !query +SELECT * FROM v +-- !query analysis +Project [c1#x] ++- SubqueryAlias spark_catalog.default.v + +- View (`spark_catalog`.`default`.`v`, [c1#x]) + +- Project [c1#x AS c1#x] + +- Project [c1#x] + +- SubqueryAlias spark_catalog.default.t + +- Relation spark_catalog.default.t[c1#x] parquet + + +-- !query +DESCRIBE EXTENDED v +-- !query analysis +DescribeTableCommand `spark_catalog`.`default`.`v`, true, [col_name#x, data_type#x, comment#x] + + +-- !query +DROP VIEW IF EXISTS v +-- !query analysis +DropTableCommand `spark_catalog`.`default`.`v`, true, true, false + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out index a7168db622a66..8c129534e7d03 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out @@ -22,7 +22,7 @@ CreateViewCommand `testData`, SELECT * FROM VALUES (3, 2147483650L, 100.001D, date("2020-12-31"), timestamp_seconds(1609372800), "b"), (null, null, null, null, null, null), (3, 1L, 1.0D, date("2017-08-01"), timestamp_seconds(1501545600), null) -AS testData(val, val_long, val_double, val_date, val_timestamp, cate), false, 
true, LocalTempView, true +AS testData(val, val_long, val_double, val_date, val_timestamp, cate), false, true, LocalTempView, UNSUPPORTED, true +- Project [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x] +- SubqueryAlias testData +- LocalRelation [val#x, val_long#xL, val_double#x, val_date#x, val_timestamp#x, cate#x] @@ -67,7 +67,7 @@ CreateViewCommand `basic_pays`, SELECT * FROM VALUES ('Pamela Castillo','SCM',11303), ('Larry Bott','SCM',11798), ('Barry Jones','SCM',10586) -AS basic_pays(employee_name, department, salary), false, true, LocalTempView, true +AS basic_pays(employee_name, department, salary), false, true, LocalTempView, UNSUPPORTED, true +- Project [employee_name#x, department#x, salary#x] +- SubqueryAlias basic_pays +- LocalRelation [employee_name#x, department#x, salary#x] @@ -96,7 +96,7 @@ CreateViewCommand `test_ignore_null`, SELECT * FROM VALUES ('a', 6, 'z'), ('a', 7, 'v'), ('a', 8, null) -AS test_ignore_null(content, id, v), false, true, LocalTempView, true +AS test_ignore_null(content, id, v), false, true, LocalTempView, UNSUPPORTED, true +- Project [content#x, id#x, v#x] +- SubqueryAlias test_ignore_null +- LocalRelation [content#x, id#x, v#x] @@ -1284,7 +1284,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query create or replace temp view t1 (p, o) as values (1, 1), (1, 1), (1, 2), (2, 1), (2, 1), (2, 2) -- !query analysis -CreateViewCommand `t1`, [(p,None), (o,None)], values (1, 1), (1, 1), (1, 2), (2, 1), (2, 1), (2, 2), false, true, LocalTempView, true +CreateViewCommand `t1`, [(p,None), (o,None)], values (1, 1), (1, 1), (1, 2), (2, 1), (2, 1), (2, 2), false, true, LocalTempView, UNSUPPORTED, true +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out index 05b8eed46d1d6..de9fb2f395210 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out @@ -359,7 +359,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query CREATE TEMPORARY VIEW xmlTable(xmlField, a) AS SELECT * FROM VALUES ('

      1"2"

      ', 'a') -- !query analysis -CreateViewCommand `xmlTable`, [(xmlField,None), (a,None)], SELECT * FROM VALUES ('

      1"2"

      ', 'a'), false, false, LocalTempView, true +CreateViewCommand `xmlTable`, [(xmlField,None), (a,None)], SELECT * FROM VALUES ('

      1"2"

      ', 'a'), false, false, LocalTempView, UNSUPPORTED, true +- Project [col1#x, col2#x] +- LocalRelation [col1#x, col2#x] diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary.sql b/sql/core/src/test/resources/sql-tests/inputs/binary.sql new file mode 100644 index 0000000000000..8cd33eccaaf07 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/binary.sql @@ -0,0 +1,7 @@ +--SET spark.sql.binaryOutputStyle=UTF8 + +SELECT X''; +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'; +SELECT CAST('Spark' as BINARY); +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)); +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary_base64.sql b/sql/core/src/test/resources/sql-tests/inputs/binary_base64.sql new file mode 100644 index 0000000000000..853eedd51773f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/binary_base64.sql @@ -0,0 +1,3 @@ +--IMPORT binary.sql + +--SET spark.sql.binaryOutputStyle=BASE64 diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary_basic.sql b/sql/core/src/test/resources/sql-tests/inputs/binary_basic.sql new file mode 100644 index 0000000000000..1a5b64bdf7e05 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/binary_basic.sql @@ -0,0 +1,4 @@ +--IMPORT binary.sql + +--SET spark.sql.binaryOutputStyle=BASIC + diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary_hex.sql b/sql/core/src/test/resources/sql-tests/inputs/binary_hex.sql new file mode 100644 index 0000000000000..7863da737a72f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/binary_hex.sql @@ -0,0 +1,3 @@ +--IMPORT binary.sql + +--SET spark.sql.binaryOutputStyle=HEX diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql b/sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql new file mode 100644 index 0000000000000..282a7634cbc5e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql @@ -0,0 +1,3 @@ +--IMPORT binary.sql + +--SET spark.sql.binaryOutputStyle=HEX_DISCRETE diff --git a/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql b/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql index f9dfd161d0c07..e080fdd32a4aa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql @@ -75,3 +75,19 @@ select getbit(11L, 2 + 1), getbit(11L, 3 - 1), getbit(10L + 1, 1 * 1), getbit(ca select getbit(11L, 63); select getbit(11L, -1); select getbit(11L, 64); + +SELECT 20181117 >> 2; +SELECT 20181117 << 2; +SELECT 20181117 >>> 2; +SELECT 20181117 > > 2; +SELECT 20181117 < < 2; +SELECT 20181117 > >> 2; +SELECT 20181117 <<< 2; +SELECT 20181117 >>>> 2; +select cast(null as array>), 20181117 >> 2; +select cast(null as array>), 20181117 >>> 2; +select cast(null as map>), 20181117 >> 2; + +select 1 << 1 + 2 as plus_over_shift; -- if correct, the result is 8. otherwise, 4 +select 2 >> 1 << 1 as left_to_right; -- if correct, the result is 2. otherwise, 0 +select 1 & 2 >> 1 as shift_over_ampersand; -- if correct, the result is 1. 
otherwise, 0 diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql index 619eb4470e9ad..c0262a0f0ad14 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/collations.sql @@ -1,7 +1,7 @@ -- test cases for collation support -- Create a test table with data -create table t1(utf8_binary string collate utf8_binary, utf8_binary_lcase string collate utf8_binary_lcase) using parquet; +create table t1(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet; insert into t1 values('aaa', 'aaa'); insert into t1 values('AAA', 'AAA'); insert into t1 values('bbb', 'bbb'); @@ -13,67 +13,80 @@ describe table t1; -- group by and count utf8_binary select count(*) from t1 group by utf8_binary; --- group by and count utf8_binary_lcase -select count(*) from t1 group by utf8_binary_lcase; +-- group by and count utf8_lcase +select count(*) from t1 group by utf8_lcase; -- filter equal utf8_binary select * from t1 where utf8_binary = 'aaa'; --- filter equal utf8_binary_lcase -select * from t1 where utf8_binary_lcase = 'aaa' collate utf8_binary_lcase; +-- filter equal utf8_lcase +select * from t1 where utf8_lcase = 'aaa' collate utf8_lcase; -- filter less then utf8_binary select * from t1 where utf8_binary < 'bbb'; --- filter less then utf8_binary_lcase -select * from t1 where utf8_binary_lcase < 'bbb' collate utf8_binary_lcase; +-- filter less then utf8_lcase +select * from t1 where utf8_lcase < 'bbb' collate utf8_lcase; -- inner join -select l.utf8_binary, r.utf8_binary_lcase from t1 l join t1 r on l.utf8_binary_lcase = r.utf8_binary_lcase; +select l.utf8_binary, r.utf8_lcase from t1 l join t1 r on l.utf8_lcase = r.utf8_lcase; -- create second table for anti-join -create table t2(utf8_binary string collate utf8_binary, utf8_binary_lcase string collate utf8_binary_lcase) using parquet; +create table t2(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet; insert into t2 values('aaa', 'aaa'); insert into t2 values('bbb', 'bbb'); -- anti-join on lcase -select * from t1 anti join t2 on t1.utf8_binary_lcase = t2.utf8_binary_lcase; +select * from t1 anti join t2 on t1.utf8_lcase = t2.utf8_lcase; drop table t2; drop table t1; -- set operations -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); -select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select 
col1 collate utf8_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_lcase from values ('aaa'), ('bbb'); -- create table with struct field -create table t1 (c1 struct) USING PARQUET; +create table t1 (c1 struct) USING PARQUET; -insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_binary_lcase', 'aaa')); -insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_binary_lcase', 'AAA')); +insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_lcase', 'aaa')); +insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_lcase', 'AAA')); -- aggregate against nested field utf8_binary select count(*) from t1 group by c1.utf8_binary; --- aggregate against nested field utf8_binary_lcase -select count(*) from t1 group by c1.utf8_binary_lcase; +-- aggregate against nested field utf8_lcase +select count(*) from t1 group by c1.utf8_lcase; drop table t1; -- array function tests -select array_contains(ARRAY('aaa' collate utf8_binary_lcase),'AAA' collate utf8_binary_lcase); -select array_position(ARRAY('aaa' collate utf8_binary_lcase, 'bbb' collate utf8_binary_lcase),'BBB' collate utf8_binary_lcase); +select array_contains(ARRAY('aaa' collate utf8_lcase),'AAA' collate utf8_lcase); +select array_position(ARRAY('aaa' collate utf8_lcase, 'bbb' collate utf8_lcase),'BBB' collate utf8_lcase); -- utility -select nullif('aaa' COLLATE utf8_binary_lcase, 'AAA' COLLATE utf8_binary_lcase); -select least('aaa' COLLATE utf8_binary_lcase, 'AAA' collate utf8_binary_lcase, 'a' collate utf8_binary_lcase); +select nullif('aaa' COLLATE utf8_lcase, 'AAA' COLLATE utf8_lcase); +select least('aaa' COLLATE utf8_lcase, 'AAA' collate utf8_lcase, 'a' collate utf8_lcase); -- array operations -select arrays_overlap(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)); -select array_distinct(array('aaa' collate utf8_binary_lcase, 'AAA' collate utf8_binary_lcase)); -select array_union(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)); -select array_intersect(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)); -select array_except(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)); +select arrays_overlap(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); +select array_distinct(array('aaa' collate utf8_lcase, 'AAA' collate utf8_lcase)); +select array_union(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); +select array_intersect(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); +select array_except(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)); + +-- ICU collations (all statements return true) +select 'a' collate unicode < 'A'; +select 'a' collate unicode_ci = 'A'; +select 'a' collate unicode_ai = 'å'; +select 'a' collate unicode_ci_ai = 'Å'; +select 'a' collate en < 'A'; +select 'a' collate en_ci = 'A'; +select 'a' collate en_ai = 'å'; +select 'a' collate en_ci_ai = 'Å'; +select 'Kypper' collate sv < 'Köpfe'; +select 'Kypper' collate de > 'Köpfe'; +select 'I' collate tr_ci = 'ı'; diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql index e5ef244341751..3b2ba1fcdd66e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql @@ -17,6 +17,18 @@ SELECT ( SELECT * FROM t ); +-- un-referenced CTE in subquery expression: outer reference in CTE relation +SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1); + +-- un-referenced CTE in subquery expression: outer reference in CTE main query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1); + -- Make sure CTE in subquery is scoped to that subquery rather than global -- the 2nd half of the union should fail because the cte is scoped to the first half SELECT * FROM diff --git a/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql index 7925a21de04cd..37081de012e98 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql @@ -11,6 +11,8 @@ create or replace temporary view nested as values -- Only allow lambda's in higher order functions. select upper(x -> x) as v; +-- Also test functions registered with `ExpressionBuilder`. +select ceil(x -> x) as v; -- Identity transform an array select transform(zs, z -> z) as v from nested; diff --git a/sql/core/src/test/resources/sql-tests/inputs/identifier-clause.sql b/sql/core/src/test/resources/sql-tests/inputs/identifier-clause.sql index fd53f44d3c33c..46461dcd048e3 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/identifier-clause.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/identifier-clause.sql @@ -119,8 +119,8 @@ VALUES(IDENTIFIER(1)); VALUES(IDENTIFIER(SUBSTR('HELLO', 1, RAND() + 1))); SELECT `IDENTIFIER`('abs')(c1) FROM VALUES(-1) AS T(c1); -CREATE TABLE IDENTIFIER(1)(c1 INT); -CREATE TABLE IDENTIFIER('a.b.c')(c1 INT); +CREATE TABLE IDENTIFIER(1)(c1 INT) USING csv; +CREATE TABLE IDENTIFIER('a.b.c')(c1 INT) USING csv; CREATE VIEW IDENTIFIER('a.b.c')(c1) AS VALUES(1); DROP TABLE IDENTIFIER('a.b.c'); DROP VIEW IDENTIFIER('a.b.c'); @@ -132,6 +132,15 @@ CREATE TEMPORARY FUNCTION IDENTIFIER('default.my' || 'DoubleAvg') AS 'test.org.a DROP TEMPORARY FUNCTION IDENTIFIER('default.my' || 'DoubleAvg'); CREATE TEMPORARY VIEW IDENTIFIER('default.v')(c1) AS VALUES(1); +-- SPARK-48273: Aggregation operation in statements using identifier clause for table name +create temporary view identifier('v1') as (select my_col from (values (1), (2), (1) as (my_col)) group by 1); +cache table identifier('t1') as (select my_col from (values (1), (2), (1) as (my_col)) group by 1); +create table identifier('t2') using csv as (select my_col from (values (1), (2), (1) as (my_col)) group by 1); +insert into identifier('t2') select my_col from (values (3) as (my_col)) group by 1; +drop view v1; +drop table t1; +drop table t2; + -- Not supported SELECT row_number() OVER IDENTIFIER('x.win') FROM VALUES(1) AS T(c1) WINDOW win AS (ORDER BY c1); SELECT T1.c1 FROM VALUES(1) AS T1(c1) JOIN VALUES(1) AS T2(c1) USING (IDENTIFIER('c1')); diff --git a/sql/core/src/test/resources/sql-tests/inputs/math.sql b/sql/core/src/test/resources/sql-tests/inputs/math.sql index 96fb0eeef7ac3..14a647a610cc3 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/math.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/math.sql @@ 
-77,3 +77,16 @@ SELECT conv('9223372036854775808', 10, 16); SELECT conv('92233720368547758070', 10, 16); SELECT conv('9223372036854775807', 36, 10); SELECT conv('-9223372036854775807', 36, 10); + +SELECT BIN(0); +SELECT BIN(25); +SELECT BIN(25L); +SELECT BIN(25.5); + +SELECT POSITIVE(0Y); +SELECT POSITIVE(25); +SELECT POSITIVE(-25L); +SELECT POSITIVE(25.5); +SELECT POSITIVE("25.5"); +SELECT POSITIVE("invalid"); +SELECT POSITIVE(null); diff --git a/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql index 6f64b0da6502e..195db17a3a1f9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql @@ -1,3 +1,30 @@ +-- NOT +select not true; +select ! true; +select not null::boolean; + +-- AND +select true and true; +select true and false; +select false and true; +select false and false; +select true and null::boolean; +select false and null::boolean; +select null::boolean and true; +select null::boolean and false; +select null::boolean and null::boolean; + +-- OR +select true or true; +select true or false; +select false or true; +select false or false; +select true or null::boolean; +select false or null::boolean; +select null::boolean or true; +select null::boolean or false; +select null::boolean or null::boolean; + -- EqualTo select 1 = 1; select 1 = '1'; @@ -82,3 +109,12 @@ select 2.0 not between '1.0' and '3.0'; select 'b' not between 'a' and 'c'; select to_timestamp('2022-12-26 00:00:01') not between to_date('2022-03-01') and to_date('2022-12-31'); select rand(123) not between 0.1 AND 0.2; + +-- Sanity test for legacy flag equating ! with NOT +set spark.sql.legacy.bangEqualsNot=true; +select 1 ! between 0 and 2; +select 1 ! in (3, 4); +select 'hello' ! like 'world'; +select 1 is ! null; +select false is ! 
true; +set spark.sql.legacy.bangEqualsNot=false; diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 64ea6e655d0b5..c108f7c76f764 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -126,11 +126,23 @@ select encode('hello', 'WINDOWS-1252'); select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); select encode('hello', 'Windows-xxx'); select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=true; +select encode('渭城朝雨浥轻尘', 'US-ASCII'); +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=false; +select encode('客舍青青柳色新', 'US-ASCII'); +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol); +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8'); -- decode select decode(); select decode(encode('abc', 'utf-8')); select decode(encode('abc', 'utf-8'), 'utf-8'); +select decode(encode('大千世界', 'utf-32'), 'utf-32'); select decode(1, 1, 'Southlake'); select decode(2, 1, 'Southlake'); select decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic'); @@ -146,6 +158,12 @@ select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, set spark.sql.legacy.javaCharsets=false; select decode(X'68656c6c6f', 'WINDOWS-1252'); select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=true; +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII'); +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=false; +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII'); +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol); -- contains SELECT CONTAINS(null, 'Spark'); @@ -275,3 +293,16 @@ select luhn_check(6011111111111117); select luhn_check(6011111111111118); select luhn_check(123.456); +--utf8 string validation +select is_valid_utf8(''); +select is_valid_utf8('abc'); +select is_valid_utf8(x'80'); +select make_valid_utf8(''); +select make_valid_utf8('abc'); +select make_valid_utf8(x'80'); +select validate_utf8(''); +select validate_utf8('abc'); +select validate_utf8(x'80'); +select try_validate_utf8(''); +select try_validate_utf8('abc'); +select try_validate_utf8(x'80'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-in-join-condition.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-in-join-condition.sql index ad2e7ad563e08..bc732cc3d320d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-in-join-condition.sql +++ 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-in-join-condition.sql @@ -89,3 +89,7 @@ select * from x inner join y on x1 = y1 and exists (select * from z where z1 = y select * from x inner join y on x1 = y1 and not exists (select * from z where z1 = y1) order by x1, x2, y1, y2; select * from x left join y on x1 = y1 and exists (select * from z where z1 = y1) order by x1, x2, y1, y2; select * from x left join y on x1 = y1 and not exists (select * from z where z1 = y1) order by x1, x2, y1, y2; + +-- Correlated subquery references both left and right children, errors +select * from x join y on x1 = y1 and exists (select * from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2; +select * from x join y on x1 = y1 and not exists (select * from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-subquery-in-join-condition.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-subquery-in-join-condition.sql index d519abdbacc05..c906390c99c32 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-subquery-in-join-condition.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-subquery-in-join-condition.sql @@ -84,3 +84,7 @@ select * from x inner join y on x1 = y1 and y2 IN (select z1 from z where z1 = y select * from x inner join y on x1 = y1 and y2 not IN (select z1 from z where z1 = y1) order by x1, x2, y1, y2; select * from x left join y on x1 = y1 and y2 IN (select z1 from z where z1 = y1) order by x1, x2, y1, y2; select * from x left join y on x1 = y1 and y2 not IN (select z1 from z where z1 = y1) order by x1, x2, y1, y2; + +-- Correlated subquery references both left and right children, errors +select * from x left join y on x1 = y1 and x2 IN (select z1 from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2; +select * from x left join y on x1 = y1 and x2 not IN (select z1 from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-group-by.sql new file mode 100644 index 0000000000000..6787fac75b39a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-group-by.sql @@ -0,0 +1,30 @@ +-- Tests for scalar subquery with a group-by. Only a group-by that guarantees a single row result is allowed. 
See SPARK-48503 + +--ONLY_IF spark + +create temp view x (x1, x2) as values (1, 1), (2, 2); +create temp view y (y1, y2) as values (2, 0), (3, -1); +create temp view z (z1, z2) as values (1, 0), (1, 1); + +-- Legal queries +select * from x where (select count(*) from y where y1 = x1 group by y1) = 1; +select * from x where (select count(*) from y where y1 = x1 group by x1) = 1; +select * from x where (select count(*) from y where y1 > x1 group by x1) = 1; + +-- Group-by column equal to constant - legal +select *, (select count(*) from y where x1 = y1 and y2 = 1 group by y2) from x; +-- Group-by column equal to expression with constants and outer refs - legal +select *, (select count(*) from y where x1 = y1 and y2 = x1 + 1 group by y2) from x; + +-- Illegal queries +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1; +select *, (select count(*) from y where y1 + y2 = x1 group by y1) from x; + +-- Certain other operators like OUTER JOIN or UNION between the correlating filter and the group-by also can cause the scalar subquery to return multiple values and hence make the query illegal. +select *, (select count(*) from (select * from y where y1 = x1 union all select * from y) sub group by y1) from x; +select *, (select count(*) from y left join (select * from z where z1 = x1) sub on y2 = z2 group by z1) from x; -- The correlation below the join is unsupported in Spark anyway, but when we do support it this query should still be disallowed. + +-- Test legacy behavior conf +set spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate = true; +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1; +reset spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate; diff --git a/sql/core/src/test/resources/sql-tests/inputs/timestamp-ltz.sql b/sql/core/src/test/resources/sql-tests/inputs/timestamp-ltz.sql index 377b26c67a3ea..28fe4539855cd 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/timestamp-ltz.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/timestamp-ltz.sql @@ -1,6 +1,6 @@ -- timestamp_ltz literals and constructors --CONFIG_DIM1 spark.sql.timestampType=TIMESTAMP_LTZ ---CONFIG_DIM1 spark.sql.timestampType=TIMESTAMP_NTZ +--CONFIG_DIM2 spark.sql.timestampType=TIMESTAMP_NTZ select timestamp_ltz'2016-12-31 00:12:00', timestamp_ltz'2016-12-31'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/timestamp-ntz.sql b/sql/core/src/test/resources/sql-tests/inputs/timestamp-ntz.sql index d744c0c19b42e..07901093cfba8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/timestamp-ntz.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/timestamp-ntz.sql @@ -1,6 +1,6 @@ -- timestamp_ntz literals and constructors --CONFIG_DIM1 spark.sql.timestampType=TIMESTAMP_LTZ ---CONFIG_DIM1 spark.sql.timestampType=TIMESTAMP_NTZ +--CONFIG_DIM2 spark.sql.timestampType=TIMESTAMP_NTZ select timestamp_ntz'2016-12-31 00:12:00', timestamp_ntz'2016-12-31'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql b/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql new file mode 100644 index 0000000000000..12541ff26e24e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql @@ -0,0 +1,21 @@ +-- Create some temporary test data. 
+create table t as + select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s + from values (1, null), (null, 'a') tab(member0, member1); +declare avro_schema string; +set variable avro_schema = + '{ "type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }] }'; + +-- Exercise invalid SQL syntax when calling the 'from_avro' and 'to_avro' functions. +select from_avro(s, 42, map()) from t; +select from_avro(s, avro_schema, 42) from t; +select to_avro(s, 42) from t; + +-- Avro is not loaded in this testing environment, so queries calling the 'from_avro' or 'to_avro' +-- SQL functions that otherwise pass analysis return appropriate "Avro not loaded" errors here. +select to_avro(s, avro_schema) as result from t; +select from_avro(result, avro_schema, map()).u from (select null as result); + +-- Clean up. +drop temporary variable avro_schema; +drop table t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/transform.sql b/sql/core/src/test/resources/sql-tests/inputs/transform.sql index 922a1d8177780..8570496d439e6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/transform.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/transform.sql @@ -415,4 +415,8 @@ FROM ( ORDER BY a ) map_output SELECT TRANSFORM(a, b) - USING 'cat' AS (a, b); \ No newline at end of file + USING 'cat' AS (a, b); + +SELECT TRANSFORM (a, b) + USING 'cat' AS (a CHAR(10), b VARCHAR(10)) +FROM VALUES('apache', 'spark') t(a, b); diff --git a/sql/core/src/test/resources/sql-tests/inputs/try_arithmetic.sql b/sql/core/src/test/resources/sql-tests/inputs/try_arithmetic.sql index 55907b6701e50..943865b68d39e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/try_arithmetic.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/try_arithmetic.sql @@ -1,6 +1,8 @@ -- Numeric + Numeric SELECT try_add(1, 1); SELECT try_add(2147483647, 1); +SELECT try_add(2147483647, decimal(1)); +SELECT try_add(2147483647, "1"); SELECT try_add(-2147483648, -1); SELECT try_add(9223372036854775807L, 1); SELECT try_add(-9223372036854775808L, -1); @@ -38,6 +40,8 @@ SELECT try_divide(0, 0); SELECT try_divide(1, (2147483647 + 1)); SELECT try_divide(1L, (9223372036854775807L + 1L)); SELECT try_divide(1, 1.0 / 0.0); +SELECT try_divide(1, decimal(0)); +SELECT try_divide(1, "0"); -- Interval / Numeric SELECT try_divide(interval 2 year, 2); @@ -50,6 +54,8 @@ SELECT try_divide(interval 106751991 day, 0.5); -- Numeric - Numeric SELECT try_subtract(1, 1); SELECT try_subtract(2147483647, -1); +SELECT try_subtract(2147483647, decimal(-1)); +SELECT try_subtract(2147483647, "-1"); SELECT try_subtract(-2147483648, 1); SELECT try_subtract(9223372036854775807L, -1); SELECT try_subtract(-9223372036854775808L, 1); @@ -66,6 +72,8 @@ SELECT try_subtract(interval 106751991 day, interval -3 day); -- Numeric * Numeric SELECT try_multiply(2, 3); SELECT try_multiply(2147483647, -2); +SELECT try_multiply(2147483647, decimal(-2)); +SELECT try_multiply(2147483647, "-2"); SELECT try_multiply(-2147483648, 2); SELECT try_multiply(9223372036854775807L, 2); SELECT try_multiply(-9223372036854775808L, -2); diff --git a/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql b/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql index c83481f10dca6..a437b1f93b604 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udtf/udtf.sql @@ -143,6 +143,22 @@ SELECT * FROM UDTFWithSinglePartition(1, invalid_arg_name => 2); SELECT * FROM 
UDTFWithSinglePartition(1, initial_count => 2); SELECT * FROM UDTFWithSinglePartition(initial_count => 1, initial_count => 2); SELECT * FROM UDTFInvalidPartitionByOrderByParseError(TABLE(t2)); +-- Exercise the UDTF partitioning bug. +SELECT * FROM UDTFPartitionByIndexingBug( + TABLE( + SELECT + 5 AS unused_col, + 'hi' AS partition_col, + 1.0 AS double_col + + UNION ALL + + SELECT + 4 AS unused_col, + 'hi' AS partition_col, + 1.0 AS double_col + ) +); -- cleanup DROP VIEW t1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/view-schema-binding-config.sql b/sql/core/src/test/resources/sql-tests/inputs/view-schema-binding-config.sql new file mode 100644 index 0000000000000..e803254ea642a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/view-schema-binding-config.sql @@ -0,0 +1,172 @@ +-- This test suite checks the spark.sql.viewSchemaBindingMode configuration. +-- It can be DISABLED or COMPENSATION + +-- Verify the default binding is true +SET spark.sql.legacy.viewSchemaBindingMode; + +-- 1. Test DISABLED mode. +SET spark.sql.legacy.viewSchemaBindingMode = false; + +-- 1.a Attempts to use the SCHEMA BINDING clause fail with FEATURE_NOT_ENABLED +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT 1; +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT 1; +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT 1; +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT 1; + +-- 1.b Existing SHOW and DESCRIBE should behave as before Spark 4.0.0 +CREATE OR REPLACE VIEW v AS SELECT 1; +DESCRIBE EXTENDED v; +SHOW TABLE EXTENDED LIKE 'v'; +SHOW CREATE TABLE v; +DROP VIEW IF EXISTS v; + +CREATE OR REPLACE TEMPORARY VIEW v AS SELECT 1; +DESCRIBE EXTENDED v; +SHOW TABLE EXTENDED LIKE 'v'; +DROP VIEW IF EXISTS v; + +-- 1.c Views get invalidated if the types change in an unsafe manner +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; +CREATE OR REPLACE VIEW v AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT); +DESCRIBE EXTENDED v; +SHOW CREATE TABLE v; + +-- Widen the column c1 in t +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET; +-- The view should be invalidated, cannot upcast from BIGINT to INT +SELECT * FROM v; + +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- 2. Test true mode. In this mode Spark tolerates any supported CAST, not just up cast +SET spark.sql.legacy.viewSchemaBindingMode = true; +SET spark.sql.legacy.viewSchemaCompensation = false; + +-- To verify ANSI_MODE is enforced even if ANSI_MODE is turned off. 
+SET spark.sql.ansi.enabled = false; + +-- 2.a In BINDING views get invalidated if the type can't cast +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; +CREATE OR REPLACE VIEW v AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT); +DESCRIBE EXTENDED v; +SHOW CREATE TABLE v; + +-- Widen the column c1 in t +DROP TABLE t; +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET; +INSERT INTO t VALUES (1); + +-- This fails +SELECT * FROM v; +-- The view still describes as v(c1 BIGINT) +DESCRIBE EXTENDED v; +SHOW CREATE TABLE v; + +-- 2.b Switch to default COMPENSATION +SET spark.sql.legacy.viewSchemaCompensation = true; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; +CREATE OR REPLACE VIEW v AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT); +DESCRIBE EXTENDED v; +SHOW CREATE TABLE v; + +-- Widen the column c1 in t +DROP TABLE t; +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET; +INSERT INTO t VALUES (1); + +-- This now succeeds +SELECT * FROM v; +-- The view still describes as v(c1 BIGINT) +DESCRIBE EXTENDED v; +SHOW CREATE TABLE v; + +-- 2.c In COMPENSATION views ignore added columns and tolerate type changes +-- Expect the added column to be ignored, but the type will be tolerated, as long as it can cast +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING NOT NULL, c2 INT) USING PARQUET; +INSERT INTO t VALUES ('1', 2); +SELECT * FROM v; +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- Runtime error if the cast fails +INSERT INTO t VALUES ('a', 2); +SELECT * FROM v; + +-- Compile time error if the cast can't be done +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 MAP, c2 INT) USING PARQUET; + +-- The view should be invalidated, we can't cast a MAP to INT +SELECT * FROM v; + +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- 2.d Still can't drop a column, though +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v AS SELECT * FROM t; +SELECT * FROM v; + +-- Describes as v(c1 INT, c2 INT) +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; + +-- The view should be invalidated, it lost a column +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- 2.e Attempt to rename a column +DROP TABLE IF EXISTS t; +CREATE TABLE t(c3 INT NOT NULL, c2 INT) USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- 3. 
Test the behavior of grandfathered views and temp views +SET spark.sql.legacy.viewSchemaBindingMode = false; +SET spark.sql.legacy.viewSchemaCompensation = false; +CREATE OR REPLACE VIEW v AS SELECT 1; +SET spark.sql.legacy.viewSchemaBindingMode = true; +DESCRIBE EXTENDED v; +SHOW TABLE EXTENDED LIKE 'v'; +SHOW CREATE TABLE v; + +SET spark.sql.legacy.viewSchemaCompensation = true; +DESCRIBE EXTENDED v; +SHOW TABLE EXTENDED LIKE 'v'; +SHOW CREATE TABLE v; + +DROP VIEW IF EXISTS v; + +SET spark.sql.legacy.viewSchemaBindingMode = false; +SET spark.sql.legacy.viewSchemaCompensation = false; +CREATE OR REPLACE TEMPORARY VIEW v AS SELECT 1; +SET spark.sql.legacy.viewSchemaBindingMode = true; +DESCRIBE EXTENDED v; +SHOW TABLE EXTENDED LIKE 'v'; + +SET spark.sql.legacy.viewSchemaCompensation = true; +DESCRIBE EXTENDED v; +SHOW TABLE EXTENDED LIKE 'v'; + +DROP VIEW IF EXISTS v; + +-- 99 Cleanup +DROP VIEW IF EXISTS v; +DROP TABLE IF EXISTS t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/view-schema-binding.sql b/sql/core/src/test/resources/sql-tests/inputs/view-schema-binding.sql new file mode 100644 index 0000000000000..413322db10d28 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/view-schema-binding.sql @@ -0,0 +1,64 @@ +-- This test suite checks that the WITH SCHEMA BINDING clause is correctly implemented + +-- New view with schema binding +-- 1.a BINDING is persisted +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT); +DESCRIBE EXTENDED v; + +-- Widen the column c1 in t +DROP TABLE t; +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET; +-- The view should be invalidated, cannot upcast from BIGINT to INT +SELECT * FROM v; + +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- 1.b In BINDING views get invalidated if a column is lost +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET; +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT, c2 INT); +DESCRIBE EXTENDED v; + +-- Drop the column c2 from t +DROP TABLE t; +CREATE TABLE t(c1 INT) USING PARQUET; +-- The view should be invalidated, it lost a column +SELECT * FROM v; + +-- The view still describes as v(c1 INT, c2 INT); +DESCRIBE EXTENDED v; + +-- Test ALTER VIEW ... 
WITH SCHEMA BINDING +SET spark.sql.legacy.viewSchemaCompensation=false; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; +CREATE OR REPLACE VIEW v AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT); +-- There is no binding recorded +DESCRIBE EXTENDED v; + +ALTER VIEW v WITH SCHEMA BINDING; +-- Baseline: v(c1 INT); +-- There is SCHEMA BINDING recorded +DESCRIBE EXTENDED v; + +DROP TABLE t; +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET; +-- The view should be invalidated, cannot upcast from BIGINT to INT +SELECT * FROM v; + +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- 99 Cleanup +DROP VIEW IF EXISTS v; +DROP TABLE IF EXISTS t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/view-schema-compensation.sql b/sql/core/src/test/resources/sql-tests/inputs/view-schema-compensation.sql new file mode 100644 index 0000000000000..21a3ce1e12293 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/view-schema-compensation.sql @@ -0,0 +1,90 @@ +-- This test suite checks the WITH SCHEMA COMPENSATION clause +-- Disable ANSI mode to ensure we are forcing it explicitly in the CASTS +SET spark.sql.ansi.enabled = false; + +-- In COMPENSATION views get invalidated if the type can't cast +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT * FROM t; +SELECT * FROM v; +-- Baseline: v(c1 INT); +DESCRIBE EXTENDED v; + +-- Widen the column c1 in t +DROP TABLE t; +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET; +INSERT INTO t VALUES (1); +SELECT * FROM v; +-- The view still describes as v(c1 BIGINT) +DESCRIBE EXTENDED v; + +-- In COMPENSATION views ignore an added column and tolerate a changed type +-- Expect the added column to be ignored, but the type will be tolerated, as long as it can cast +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING NOT NULL, c2 INT) USING PARQUET; +INSERT INTO t VALUES ('1', 2); +SELECT * FROM v; +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- Runtime error if the cast fails +INSERT INTO t VALUES ('a', 2); +SELECT * FROM v; + +-- Compile time error if the cast can't be done +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 MAP, c2 INT) USING PARQUET; + +-- The view should be invalidated, we can't cast a MAP to INT +SELECT * FROM v; + +-- The view still describes as v(c1 INT); +DESCRIBE EXTENDED v; + +-- Still can't drop a column, though +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v AS SELECT * FROM t; +SELECT * FROM v; + +-- Describes as v(c1 INT, c2 INT) +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET; + +-- The view should be invalidated, it lost a column +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Attempt to rename a column, this fails +DROP TABLE IF EXISTS t; +CREATE TABLE t(c3 INT NOT NULL, c2 INT) USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Test ALTER VIEW ... WITH SCHEMA ... 
+DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT) USING PARQUET; +INSERT INTO t VALUES(1); +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING) USING PARQUET; +INSERT INTO t VALUES('1'); + +-- This fails, because the view uses SCHEMA BINDING +SELECT * FROM v; + +-- Now upgrade the view to schema compensation +ALTER VIEW v WITH SCHEMA COMPENSATION; +DESCRIBE EXTENDED v; + +-- Success +SELECT * FROM v; + +-- Cleanup +DROP VIEW IF EXISTS v; +DROP TABLE IF EXISTS t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/view-schema-evolution.sql b/sql/core/src/test/resources/sql-tests/inputs/view-schema-evolution.sql new file mode 100644 index 0000000000000..5ff153acef25c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/view-schema-evolution.sql @@ -0,0 +1,155 @@ +-- This test suite checks the WITH SCHEMA EVOLUTION clause + +-- In EVOLUTION mode Spark will inherit everything from the query, unless +-- a column list is given. In that case it behaves like TYPE EVOLUTION +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- In EVOLUTION views inherit column type changes and name changes +DROP TABLE IF EXISTS t; +CREATE TABLE t(c4 STRING NOT NULL, c5 DOUBLE) USING PARQUET; +INSERT INTO t VALUES ('1', 2.0); +SELECT * FROM v; +-- The view now describes as v(c4 STRING, c5 DOUBLE) +DESCRIBE EXTENDED v; + +-- In EVOLUTION new columns are inherited +DROP TABLE IF EXISTS t; +CREATE TABLE t(c4 STRING, c5 DOUBLE, c6 DATE) USING PARQUET; +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01'); +SELECT * FROM v; +-- The view describes as v(c4 STRING, c5 DOUBLE, c6 DATE) +DESCRIBE EXTENDED v; + +-- We can even drop columns +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t; +SELECT * FROM v; +-- Describes as v(c1 INT, c2 INT) +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT) USING PARQUET; +SELECT * FROM v; +-- The view describes as v(c1 INT) +DESCRIBE EXTENDED v; + +-- If a column list is given it behaves like TYPE EVOLUTION +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- In EVOLUTION views with explicit column lists still inherit column type changes +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING NOT NULL, c2 DOUBLE) USING PARQUET; +INSERT INTO t VALUES ('1', 2.0); +SELECT * FROM v; +-- The view now describes as v(a1 STRING, a2 DOUBLE) +DESCRIBE EXTENDED v; + +-- In EVOLUTION views with explicit column lists no new columns are inherited +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING, c2 DOUBLE, c3 DATE) USING PARQUET; +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01'); +SELECT * FROM v; +-- The view still describes as v(a1 STRING, a2 DOUBLE) +DESCRIBE EXTENDED v; + +-- In EVOLUTION views with explicit column lists can't drop a column +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t; +SELECT * FROM v; + +-- Describes as v(a1 INT, a2 INT) +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS 
t; +CREATE TABLE t(c1 INT) USING PARQUET; + +-- The view should be invalidated, it lost a column +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Attempt to rename a column +DROP TABLE IF EXISTS t; +CREATE TABLE t(c3 INT, c2 INT) USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Test preservation of comments +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT COMMENT 'c1', c2 INT COMMENT 'c2') USING PARQUET; + +-- EVOLUTION, column list, but no comments +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +-- EVOLUTION, column list, but no comments +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6c', c2 STRING COMMENT 'c2 6c') USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- EVOLUTION, column list with comments +CREATE OR REPLACE VIEW v(a1 COMMENT 'a1', a2 COMMENT 'a2') WITH SCHEMA EVOLUTION AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6d', c2 STRING COMMENT 'c2 6d') USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- EVOLUTION, no column list +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6e', c2 STRING COMMENT 'c2 6e') USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Test error condition where a duplicate column name is produced +DROP TABLE IF EXISTS t1; +CREATE TABLE t1(c1 INT) USING PARQUET; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2(c2 INT) USING PARQUET; +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t1, t2; +SELECT * FROM v; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2(c1 INT) USING PARQUET; +-- This should fail with a duplicate column error +SELECT * FROM v; +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +-- Test ALTER VIEW ... 
WITH SCHEMA EVOLUTION + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT) USING PARQUET; +INSERT INTO t VALUES(1); +CREATE OR REPLACE VIEW v AS SELECT * FROM t; + +ALTER VIEW v WITH SCHEMA EVOLUTION; +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING, c2 INT) USING PARQUET; +-- No error, extra column +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- clean up +DROP VIEW IF EXISTS v; +DROP TABLE IF EXISTS t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/view-schema-type-evolution.sql b/sql/core/src/test/resources/sql-tests/inputs/view-schema-type-evolution.sql new file mode 100644 index 0000000000000..c0278f15b6418 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/view-schema-type-evolution.sql @@ -0,0 +1,90 @@ +-- This test suite checks the WITH SCHEMA TYPE EVOLUTION clause + +-- In TYPE EVOLUTION mode Spark will inherit the view column types from the query +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING NOT NULL, c2 DOUBLE) USING PARQUET; +INSERT INTO t VALUES ('1', 2.0); +SELECT * FROM v; +-- The view now describes as v(c1 STRING, c2 DOUBLE) +DESCRIBE EXTENDED v; + +-- In TYPE EVOLUTION no new columns are inherited +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING, c2 DOUBLE, c3 DATE) USING PARQUET; +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01'); +SELECT * FROM v; +-- The view still describes as v(c1 STRING, c2 DOUBLE) +DESCRIBE EXTENDED v; + +-- Still can't drop a column +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET; +INSERT INTO t VALUES (1, 2); +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t; +SELECT * FROM v; + +-- Describes as v(c1 INT, c2 INT) +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT) USING PARQUET; + +-- The view should be invalid, it lost a column +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Attempt to rename a column +DROP TABLE IF EXISTS t; +CREATE TABLE t(c3 INT, c2 INT) USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Test preservation of comments +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT COMMENT 'c1', c2 INT COMMENT 'c2') USING PARQUET; + +-- Inherit comments from the table, if none are given +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6a', c2 STRING COMMENT 'c2 6a') USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- TYPE EVOLUTION, column list with comments +CREATE OR REPLACE VIEW v(a1 COMMENT 'a1', a2 COMMENT 'a2') WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t; +DESCRIBE EXTENDED v; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6b', c2 STRING COMMENT 'c2 6b') USING PARQUET; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Test ALTER VIEW ... 
WITH SCHEMA TYPE EVOLUTION + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 INT) USING PARQUET; +INSERT INTO t VALUES(1); +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT * FROM t; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(c1 STRING) USING PARQUET; +INSERT INTO t VALUES('1'); +SELECT * FROM v; +DESCRIBE EXTENDED v; + +ALTER VIEW v WITH SCHEMA TYPE EVOLUTION; +SELECT * FROM v; +DESCRIBE EXTENDED v; + +-- Cleanup +DROP VIEW IF EXISTS v; +DROP TABLE IF EXISTS t; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out index ee4525285a9be..7bfc35a61e092 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out @@ -33,6 +33,28 @@ org.apache.spark.sql.AnalysisException } +-- !query +select ceil(x -> x) as v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_LAMBDA_FUNCTION_CALL.NON_HIGHER_ORDER_FUNCTION", + "sqlState" : "42K0D", + "messageParameters" : { + "class" : "org.apache.spark.sql.catalyst.expressions.CeilExpressionBuilder$" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 19, + "fragment" : "ceil(x -> x)" + } ] +} + + -- !query select transform(zs, z -> z) as v from nested -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out index 8b4acd12911b4..cabbfa520d77a 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out @@ -20,9 +20,11 @@ AS true ASC false AT false AUTHORIZATION true +BEGIN false BETWEEN false BIGINT false BINARY false +BINDING false BOOLEAN false BOTH true BUCKET false @@ -30,6 +32,7 @@ BUCKETS false BY false BYTE false CACHE false +CALLED false CASCADE false CASE true CAST true @@ -52,9 +55,11 @@ COMMENT false COMMIT false COMPACT false COMPACTIONS false +COMPENSATION false COMPUTE false CONCATENATE false CONSTRAINT true +CONTAINS false COST false CREATE true CROSS true @@ -81,10 +86,12 @@ DECIMAL false DECLARE false DEFAULT false DEFINED false +DEFINER false DELETE false DELIMITED false DESC false DESCRIBE false +DETERMINISTIC false DFS false DIRECTORIES false DIRECTORY false @@ -97,6 +104,7 @@ ELSE true END true ESCAPE true ESCAPED false +EVOLUTION false EXCEPT true EXCHANGE false EXCLUDE false @@ -143,6 +151,7 @@ INDEX false INDEXES false INNER true INPATH false +INPUT false INPUTFORMAT false INSERT false INT false @@ -150,10 +159,12 @@ INTEGER false INTERSECT true INTERVAL false INTO true +INVOKER false IS true ITEMS false JOIN true KEYS false +LANGUAGE false LAST false LATERAL true LAZY false @@ -181,6 +192,7 @@ MILLISECONDS false MINUS false MINUTE false MINUTES false +MODIFIES false MONTH false MONTHS false MSCK false @@ -214,8 +226,6 @@ PARTITION false PARTITIONED false PARTITIONS false PERCENT false -PERCENTILE_CONT true -PERCENTILE_DISC true PIVOT false PLACING false POSITION false @@ -227,6 +237,7 @@ PURGE false QUARTER false QUERY false RANGE false +READS false REAL false RECORDREADER false RECORDWRITER false @@ -241,6 +252,8 @@ REPLACE false RESET false RESPECT false RESTRICT false +RETURN false +RETURNS false REVOKE false RIGHT true ROLE false @@ -253,6 +266,7 @@ SCHEMA false 
SCHEMAS false SECOND false SECONDS false +SECURITY false SELECT true SEMI false SEPARATED false @@ -270,6 +284,8 @@ SOME true SORT false SORTED false SOURCE false +SPECIFIC false +SQL true START false STATISTICS false STORED false @@ -401,14 +417,13 @@ OR ORDER OUTER OVERLAPS -PERCENTILE_CONT -PERCENTILE_DISC PRIMARY REFERENCES RIGHT SELECT SESSION_USER SOME +SQL TABLE THEN TIME diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/math.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/math.sql.out index 8cd1536d7f726..e2abcb099130a 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/math.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/math.sql.out @@ -797,3 +797,108 @@ org.apache.spark.SparkArithmeticException "fragment" : "conv('-9223372036854775807', 36, 10)" } ] } + + +-- !query +SELECT BIN(0) +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT BIN(25) +-- !query schema +struct +-- !query output +11001 + + +-- !query +SELECT BIN(25L) +-- !query schema +struct +-- !query output +11001 + + +-- !query +SELECT BIN(25.5) +-- !query schema +struct +-- !query output +11001 + + +-- !query +SELECT POSITIVE(0Y) +-- !query schema +struct<(+ 0):tinyint> +-- !query output +0 + + +-- !query +SELECT POSITIVE(25) +-- !query schema +struct<(+ 25):int> +-- !query output +25 + + +-- !query +SELECT POSITIVE(-25L) +-- !query schema +struct<(+ -25):bigint> +-- !query output +-25 + + +-- !query +SELECT POSITIVE(25.5) +-- !query schema +struct<(+ 25.5):decimal(3,1)> +-- !query output +25.5 + + +-- !query +SELECT POSITIVE("25.5") +-- !query schema +struct<(+ 25.5):double> +-- !query output +25.5 + + +-- !query +SELECT POSITIVE("invalid") +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkNumberFormatException +{ + "errorClass" : "CAST_INVALID_INPUT", + "sqlState" : "22018", + "messageParameters" : { + "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "expression" : "'invalid'", + "sourceType" : "\"STRING\"", + "targetType" : "\"DOUBLE\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 26, + "fragment" : "POSITIVE(\"invalid\")" + } ] +} + + +-- !query +SELECT POSITIVE(null) +-- !query schema +struct<(+ NULL):double> +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 8096cef266ec4..da2fa9ca0c18b 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -903,6 +903,110 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query schema +struct +-- !query output +??????? + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +??????? 
+ + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。 + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。 + + -- !query select decode() -- !query schema @@ -961,6 +1065,14 @@ struct abc +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query schema +struct +-- !query output +大千世界 + + -- !query select decode(1, 1, 'Southlake') -- !query schema @@ -1117,6 +1229,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema @@ 
-2058,3 +2234,106 @@ select luhn_check(123.456) struct -- !query output false + + +-- !query +select is_valid_utf8('') +-- !query schema +struct +-- !query output +true + + +-- !query +select is_valid_utf8('abc') +-- !query schema +struct +-- !query output +true + + +-- !query +select is_valid_utf8(x'80') +-- !query schema +struct +-- !query output +false + + +-- !query +select make_valid_utf8('') +-- !query schema +struct +-- !query output + + + +-- !query +select make_valid_utf8('abc') +-- !query schema +struct +-- !query output +abc + + +-- !query +select make_valid_utf8(x'80') +-- !query schema +struct +-- !query output +� + + +-- !query +select validate_utf8('') +-- !query schema +struct +-- !query output + + + +-- !query +select validate_utf8('abc') +-- !query schema +struct +-- !query output +abc + + +-- !query +select validate_utf8(x'80') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_UTF8_STRING", + "sqlState" : "22029", + "messageParameters" : { + "str" : "\\x80" + } +} + + +-- !query +select try_validate_utf8('') +-- !query schema +struct +-- !query output + + + +-- !query +select try_validate_utf8('abc') +-- !query schema +struct +-- !query output +abc + + +-- !query +select try_validate_utf8(x'80') +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/try_arithmetic.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/try_arithmetic.sql.out index adb6550e80830..acf6e70a50dea 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/try_arithmetic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/try_arithmetic.sql.out @@ -15,6 +15,22 @@ struct NULL +-- !query +SELECT try_add(2147483647, decimal(1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_add(2147483647, "1") +-- !query schema +struct +-- !query output +2147483648 + + -- !query SELECT try_add(-2147483648, -1) -- !query schema @@ -341,6 +357,22 @@ org.apache.spark.SparkArithmeticException } +-- !query +SELECT try_divide(1, decimal(0)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, "0") +-- !query schema +struct +-- !query output +NULL + + -- !query SELECT try_divide(interval 2 year, 2) -- !query schema @@ -405,6 +437,22 @@ struct NULL +-- !query +SELECT try_subtract(2147483647, decimal(-1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_subtract(2147483647, "-1") +-- !query schema +struct +-- !query output +2147483648 + + -- !query SELECT try_subtract(-2147483648, 1) -- !query schema @@ -547,6 +595,22 @@ struct NULL +-- !query +SELECT try_multiply(2147483647, decimal(-2)) +-- !query schema +struct +-- !query output +-4294967294 + + +-- !query +SELECT try_multiply(2147483647, "-2") +-- !query schema +struct +-- !query output +-4294967294 + + -- !query SELECT try_multiply(-2147483648, 2) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/binary.sql.out b/sql/core/src/test/resources/sql-tests/results/binary.sql.out new file mode 100644 index 0000000000000..050f05271411a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/binary.sql.out @@ -0,0 +1,39 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query schema +struct +-- !query output + + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query schema +struct +-- !query output +Eason Yao 
2018-11-17:13:33:33 + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query schema +struct +-- !query output +Spark + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query schema +struct> +-- !query output +[,Eason Yao 2018-11-17:13:33:33,Spark] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct +-- !query output +1,Eason Yao 2018-11-17:13:33:33 diff --git a/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out new file mode 100644 index 0000000000000..8724e8620b48f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out @@ -0,0 +1,39 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query schema +struct +-- !query output + + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query schema +struct +-- !query output +RWFzb24gWWFvIDIwMTgtMTEtMTc6MTM6MzM6MzM + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query schema +struct +-- !query output +U3Bhcms + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query schema +struct> +-- !query output +[,RWFzb24gWWFvIDIwMTgtMTEtMTc6MTM6MzM6MzM,U3Bhcms] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct +-- !query output +1,RWFzb24gWWFvIDIwMTgtMTEtMTc6MTM6MzM6MzM diff --git a/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out new file mode 100644 index 0000000000000..0c543a7b45476 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out @@ -0,0 +1,39 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query schema +struct +-- !query output +[] + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query schema +struct +-- !query output +[69, 97, 115, 111, 110, 32, 89, 97, 111, 32, 50, 48, 49, 56, 45, 49, 49, 45, 49, 55, 58, 49, 51, 58, 51, 51, 58, 51, 51] + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query schema +struct +-- !query output +[83, 112, 97, 114, 107] + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query schema +struct> +-- !query output +[[],[69, 97, 115, 111, 110, 32, 89, 97, 111, 32, 50, 48, 49, 56, 45, 49, 49, 45, 49, 55, 58, 49, 51, 58, 51, 51, 58, 51, 51],[83, 112, 97, 114, 107]] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct +-- !query output +1,"[69, 97, 115, 111, 110, 32, 89, 97, 111, 32, 50, 48, 49, 56, 45, 49, 49, 45, 49, 55, 58, 49, 51, 58, 51, 51, 58, 51, 51]" diff --git a/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out new file mode 100644 index 0000000000000..d977301f98e00 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out @@ -0,0 +1,39 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query schema +struct +-- !query output + + + +-- !query +SELECT 
X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query schema +struct +-- !query output +4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query schema +struct +-- !query output +537061726B + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query schema +struct> +-- !query output +[,4561736F6E2059616F20323031382D31312D31373A31333A33333A3333,537061726B] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct +-- !query output +1,4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 diff --git a/sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out new file mode 100644 index 0000000000000..3fc6c0f53cc54 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out @@ -0,0 +1,39 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query schema +struct +-- !query output +[] + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query schema +struct +-- !query output +[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33] + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query schema +struct +-- !query output +[53 70 61 72 6B] + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query schema +struct> +-- !query output +[[],[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33],[53 70 61 72 6B]] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct +-- !query output +1,[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33] diff --git a/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out b/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out index 2c8b733004aac..7233b0d0ae499 100644 --- a/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out @@ -322,3 +322,155 @@ org.apache.spark.SparkIllegalArgumentException "upper" : "64" } } + + +-- !query +SELECT 20181117 >> 2 +-- !query schema +struct<(20181117 >> 2):int> +-- !query output +5045279 + + +-- !query +SELECT 20181117 << 2 +-- !query schema +struct<(20181117 << 2):int> +-- !query output +80724468 + + +-- !query +SELECT 20181117 >>> 2 +-- !query schema +struct<(20181117 >>> 2):int> +-- !query output +5045279 + + +-- !query +SELECT 20181117 > > 2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'>'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 < < 2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'<'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 > >> 2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + 
"messageParameters" : { + "error" : "'>>'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 <<< 2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'<'", + "hint" : "" + } +} + + +-- !query +SELECT 20181117 >>>> 2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'>'", + "hint" : "" + } +} + + +-- !query +select cast(null as array>), 20181117 >> 2 +-- !query schema +struct>,(20181117 >> 2):int> +-- !query output +NULL 5045279 + + +-- !query +select cast(null as array>), 20181117 >>> 2 +-- !query schema +struct>,(20181117 >>> 2):int> +-- !query output +NULL 5045279 + + +-- !query +select cast(null as map>), 20181117 >> 2 +-- !query schema +struct>,(20181117 >> 2):int> +-- !query output +NULL 5045279 + + +-- !query +select 1 << 1 + 2 as plus_over_shift +-- !query schema +struct +-- !query output +8 + + +-- !query +select 2 >> 1 << 1 as left_to_right +-- !query schema +struct +-- !query output +2 + + +-- !query +select 1 & 2 >> 1 as shift_over_ampersand +-- !query schema +struct +-- !query output +1 diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out index 8ff5865168878..568c9f3b29e87 100644 --- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out @@ -193,6 +193,7 @@ Created By [not included in comparison] Type VIEW View Text select * from char_tbl View Original Text select * from char_tbl +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default View Query Output Columns [c, v] @@ -222,6 +223,7 @@ struct CREATE VIEW default.char_view ( c, v) +WITH SCHEMA COMPENSATION AS select * from char_tbl @@ -362,6 +364,7 @@ Created By [not included in comparison] Type VIEW View Text select * from char_tbl2 View Original Text select * from char_tbl2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default View Query Output Columns [c, v] @@ -422,6 +425,7 @@ Created By [not included in comparison] Type VIEW View Text select * from char_tbl2 View Original Text select * from char_tbl2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default View Query Output Columns [c, v] Table Properties [yes=no] @@ -482,6 +486,7 @@ Created By [not included in comparison] Type VIEW View Text select * from char_tbl2 View Original Text select * from char_tbl2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default View Query Output Columns [c, v] diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 4485191ba1f3b..89e6665df9d04 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -1,6 +1,6 @@ -- Automatically generated by SQLQueryTestSuite -- !query -create table t1(utf8_binary string collate utf8_binary, utf8_binary_lcase string collate utf8_binary_lcase) using parquet +create table t1(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet -- !query schema struct<> -- !query output @@ -45,7 +45,7 @@ describe table t1 
struct -- !query output utf8_binary string -utf8_binary_lcase string collate UTF8_BINARY_LCASE +utf8_lcase string collate UTF8_LCASE -- !query @@ -60,7 +60,7 @@ struct -- !query -select count(*) from t1 group by utf8_binary_lcase +select count(*) from t1 group by utf8_lcase -- !query schema struct -- !query output @@ -71,15 +71,15 @@ struct -- !query select * from t1 where utf8_binary = 'aaa' -- !query schema -struct +struct -- !query output aaa aaa -- !query -select * from t1 where utf8_binary_lcase = 'aaa' collate utf8_binary_lcase +select * from t1 where utf8_lcase = 'aaa' collate utf8_lcase -- !query schema -struct +struct -- !query output AAA AAA aaa aaa @@ -88,7 +88,7 @@ aaa aaa -- !query select * from t1 where utf8_binary < 'bbb' -- !query schema -struct +struct -- !query output AAA AAA BBB BBB @@ -96,18 +96,18 @@ aaa aaa -- !query -select * from t1 where utf8_binary_lcase < 'bbb' collate utf8_binary_lcase +select * from t1 where utf8_lcase < 'bbb' collate utf8_lcase -- !query schema -struct +struct -- !query output AAA AAA aaa aaa -- !query -select l.utf8_binary, r.utf8_binary_lcase from t1 l join t1 r on l.utf8_binary_lcase = r.utf8_binary_lcase +select l.utf8_binary, r.utf8_lcase from t1 l join t1 r on l.utf8_lcase = r.utf8_lcase -- !query schema -struct +struct -- !query output AAA AAA AAA aaa @@ -120,7 +120,7 @@ bbb bbb -- !query -create table t2(utf8_binary string collate utf8_binary, utf8_binary_lcase string collate utf8_binary_lcase) using parquet +create table t2(utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase) using parquet -- !query schema struct<> -- !query output @@ -144,9 +144,9 @@ struct<> -- !query -select * from t1 anti join t2 on t1.utf8_binary_lcase = t2.utf8_binary_lcase +select * from t1 anti join t2 on t1.utf8_lcase = t2.utf8_lcase -- !query schema -struct +struct -- !query output @@ -168,17 +168,17 @@ struct<> -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct +struct -- !query output zzz -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct +struct -- !query output aaa bbb @@ -187,9 +187,9 @@ zzz -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct +struct -- !query output aaa bbb @@ -197,9 +197,9 @@ zzz -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query 
schema -struct +struct -- !query output AAA BBB @@ -212,16 +212,16 @@ zzz -- !query -select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct +struct -- !query output aaa bbb -- !query -create table t1 (c1 struct) USING PARQUET +create table t1 (c1 struct) USING PARQUET -- !query schema struct<> -- !query output @@ -229,7 +229,7 @@ struct<> -- !query -insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_binary_lcase', 'aaa')) +insert into t1 values (named_struct('utf8_binary', 'aaa', 'utf8_lcase', 'aaa')) -- !query schema struct<> -- !query output @@ -237,7 +237,7 @@ struct<> -- !query -insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_binary_lcase', 'AAA')) +insert into t1 values (named_struct('utf8_binary', 'AAA', 'utf8_lcase', 'AAA')) -- !query schema struct<> -- !query output @@ -254,7 +254,7 @@ struct -- !query -select count(*) from t1 group by c1.utf8_binary_lcase +select count(*) from t1 group by c1.utf8_lcase -- !query schema struct -- !query output @@ -270,7 +270,7 @@ struct<> -- !query -select array_contains(ARRAY('aaa' collate utf8_binary_lcase),'AAA' collate utf8_binary_lcase) +select array_contains(ARRAY('aaa' collate utf8_lcase),'AAA' collate utf8_lcase) -- !query schema struct -- !query output @@ -278,7 +278,7 @@ true -- !query -select array_position(ARRAY('aaa' collate utf8_binary_lcase, 'bbb' collate utf8_binary_lcase),'BBB' collate utf8_binary_lcase) +select array_position(ARRAY('aaa' collate utf8_lcase, 'bbb' collate utf8_lcase),'BBB' collate utf8_lcase) -- !query schema struct -- !query output @@ -286,23 +286,23 @@ struct -- !query -select nullif('aaa' COLLATE utf8_binary_lcase, 'AAA' COLLATE utf8_binary_lcase) +select nullif('aaa' COLLATE utf8_lcase, 'AAA' COLLATE utf8_lcase) -- !query schema -struct +struct -- !query output NULL -- !query -select least('aaa' COLLATE utf8_binary_lcase, 'AAA' collate utf8_binary_lcase, 'a' collate utf8_binary_lcase) +select least('aaa' COLLATE utf8_lcase, 'AAA' collate utf8_lcase, 'a' collate utf8_lcase) -- !query schema -struct +struct -- !query output a -- !query -select arrays_overlap(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select arrays_overlap(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema struct -- !query output @@ -310,32 +310,120 @@ true -- !query -select array_distinct(array('aaa' collate utf8_binary_lcase, 'AAA' collate utf8_binary_lcase)) +select array_distinct(array('aaa' collate utf8_lcase, 'AAA' collate utf8_lcase)) -- !query schema -struct> +struct> -- !query output ["aaa"] -- !query -select array_union(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select array_union(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct> +struct> -- !query output ["aaa"] -- !query -select array_intersect(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select array_intersect(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct> +struct> -- !query output ["aaa"] -- !query -select array_except(array('aaa' collate utf8_binary_lcase), array('AAA' collate utf8_binary_lcase)) +select array_except(array('aaa' 
collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct> +struct> -- !query output [] + + +-- !query +select 'a' collate unicode < 'A' +-- !query schema +struct<(collate(a) < A):boolean> +-- !query output +true + + +-- !query +select 'a' collate unicode_ci = 'A' +-- !query schema +struct<(collate(a) = A):boolean> +-- !query output +true + + +-- !query +select 'a' collate unicode_ai = 'å' +-- !query schema +struct<(collate(a) = å):boolean> +-- !query output +true + + +-- !query +select 'a' collate unicode_ci_ai = 'Å' +-- !query schema +struct<(collate(a) = Å):boolean> +-- !query output +true + + +-- !query +select 'a' collate en < 'A' +-- !query schema +struct<(collate(a) < A):boolean> +-- !query output +true + + +-- !query +select 'a' collate en_ci = 'A' +-- !query schema +struct<(collate(a) = A):boolean> +-- !query output +true + + +-- !query +select 'a' collate en_ai = 'å' +-- !query schema +struct<(collate(a) = å):boolean> +-- !query output +true + + +-- !query +select 'a' collate en_ci_ai = 'Å' +-- !query schema +struct<(collate(a) = Å):boolean> +-- !query output +true + + +-- !query +select 'Kypper' collate sv < 'Köpfe' +-- !query schema +struct<(collate(Kypper) < Köpfe):boolean> +-- !query output +true + + +-- !query +select 'Kypper' collate de > 'Köpfe' +-- !query schema +struct<(collate(Kypper) > Köpfe):boolean> +-- !query output +true + + +-- !query +select 'I' collate tr_ci = 'ı' +-- !query schema +struct<(collate(I) = ı):boolean> +-- !query output +true diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out index b79d8b1afb0d4..1255e8b51f301 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out @@ -33,6 +33,28 @@ struct 1 +-- !query +SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1) +-- !query schema +struct +-- !query output +0 + + -- !query SELECT * FROM ( diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out index a93bcb7593768..7cf488ce8cad4 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out @@ -33,6 +33,28 @@ struct 1 +-- !query +SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1) +-- !query schema +struct +-- !query output +0 + + -- !query SELECT * FROM ( diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out index ba311c0253ab1..94ef47397eff1 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -33,6 +33,28 @@ struct 1 +-- !query +SELECT ( + WITH unreferenced AS (SELECT id) + SELECT 1 +) FROM range(1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH unreferenced AS (SELECT 1) + SELECT id +) FROM range(1) +-- !query schema +struct +-- !query output +0 + + -- !query SELECT * FROM ( diff --git 
a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 10c27ea0cc794..004802c48d4c0 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -536,6 +536,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t View Original Text SELECT * FROM t +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default View Query Output Columns [a, b, c, d] @@ -560,6 +561,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t View Original Text SELECT * FROM t +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.default View Query Output Columns [a, b, c, d] diff --git a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out index 97a3f6c8f2ee2..3830b47ba8a6d 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out @@ -975,7 +975,7 @@ Execute CreateViewCommand (1) Output: [] (2) CreateViewCommand -Arguments: `spark_catalog`.`default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView, true +Arguments: `spark_catalog`.`default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView, COMPENSATION, true (3) LogicalRelation Arguments: parquet, [key#x, val#x], `spark_catalog`.`default`.`explain_temp1`, false diff --git a/sql/core/src/test/resources/sql-tests/results/explain.sql.out b/sql/core/src/test/resources/sql-tests/results/explain.sql.out index e21d968eb252e..c0dee38e6d07a 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain.sql.out @@ -870,7 +870,7 @@ Execute CreateViewCommand (1) Output: [] (2) CreateViewCommand -Arguments: `spark_catalog`.`default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView, true +Arguments: `spark_catalog`.`default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView, COMPENSATION, true (3) LogicalRelation Arguments: parquet, [key#x, val#x], `spark_catalog`.`default`.`explain_temp1`, false diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index ee4525285a9be..7bfc35a61e092 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -33,6 +33,28 @@ org.apache.spark.sql.AnalysisException } +-- !query +select ceil(x -> x) as v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_LAMBDA_FUNCTION_CALL.NON_HIGHER_ORDER_FUNCTION", + "sqlState" : "42K0D", + "messageParameters" : { + "class" : "org.apache.spark.sql.catalyst.expressions.CeilExpressionBuilder$" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 19, + "fragment" : "ceil(x -> x)" + } ] +} + + -- !query select transform(zs, z -> z) as v from nested -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/identifier-clause.sql.out b/sql/core/src/test/resources/sql-tests/results/identifier-clause.sql.out index 9dfc6a66b0782..2aa809324a763 100644 
--- a/sql/core/src/test/resources/sql-tests/results/identifier-clause.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/identifier-clause.sql.out @@ -843,7 +843,7 @@ org.apache.spark.sql.AnalysisException -- !query -CREATE TABLE IDENTIFIER(1)(c1 INT) +CREATE TABLE IDENTIFIER(1)(c1 INT) USING csv -- !query schema struct<> -- !query output @@ -867,7 +867,7 @@ org.apache.spark.sql.AnalysisException -- !query -CREATE TABLE IDENTIFIER('a.b.c')(c1 INT) +CREATE TABLE IDENTIFIER('a.b.c')(c1 INT) USING csv -- !query schema struct<> -- !query output @@ -1059,6 +1059,62 @@ org.apache.spark.sql.catalyst.parser.ParseException } +-- !query +create temporary view identifier('v1') as (select my_col from (values (1), (2), (1) as (my_col)) group by 1) +-- !query schema +struct<> +-- !query output + + + +-- !query +cache table identifier('t1') as (select my_col from (values (1), (2), (1) as (my_col)) group by 1) +-- !query schema +struct<> +-- !query output + + + +-- !query +create table identifier('t2') using csv as (select my_col from (values (1), (2), (1) as (my_col)) group by 1) +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into identifier('t2') select my_col from (values (3) as (my_col)) group by 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +drop view v1 +-- !query schema +struct<> +-- !query output + + + +-- !query +drop table t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +drop table t2 +-- !query schema +struct<> +-- !query output + + + -- !query SELECT row_number() OVER IDENTIFIER('x.win') FROM VALUES(1) AS T(c1) WINDOW win AS (ORDER BY c1) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index 884f17c23eb00..e304509aa6d75 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -20,9 +20,11 @@ AS false ASC false AT false AUTHORIZATION false +BEGIN false BETWEEN false BIGINT false BINARY false +BINDING false BOOLEAN false BOTH false BUCKET false @@ -30,6 +32,7 @@ BUCKETS false BY false BYTE false CACHE false +CALLED false CASCADE false CASE false CAST false @@ -52,9 +55,11 @@ COMMENT false COMMIT false COMPACT false COMPACTIONS false +COMPENSATION false COMPUTE false CONCATENATE false CONSTRAINT false +CONTAINS false COST false CREATE false CROSS false @@ -81,10 +86,12 @@ DECIMAL false DECLARE false DEFAULT false DEFINED false +DEFINER false DELETE false DELIMITED false DESC false DESCRIBE false +DETERMINISTIC false DFS false DIRECTORIES false DIRECTORY false @@ -97,6 +104,7 @@ ELSE false END false ESCAPE false ESCAPED false +EVOLUTION false EXCEPT false EXCHANGE false EXCLUDE false @@ -143,6 +151,7 @@ INDEX false INDEXES false INNER false INPATH false +INPUT false INPUTFORMAT false INSERT false INT false @@ -150,10 +159,12 @@ INTEGER false INTERSECT false INTERVAL false INTO false +INVOKER false IS false ITEMS false JOIN false KEYS false +LANGUAGE false LAST false LATERAL false LAZY false @@ -181,6 +192,7 @@ MILLISECONDS false MINUS false MINUTE false MINUTES false +MODIFIES false MONTH false MONTHS false MSCK false @@ -214,8 +226,6 @@ PARTITION false PARTITIONED false PARTITIONS false PERCENT false -PERCENTILE_CONT false -PERCENTILE_DISC false PIVOT false PLACING false POSITION false @@ -227,6 +237,7 @@ PURGE false QUARTER false QUERY false RANGE false +READS false REAL false RECORDREADER false RECORDWRITER false @@ -241,6 
+252,8 @@ REPLACE false RESET false RESPECT false RESTRICT false +RETURN false +RETURNS false REVOKE false RIGHT false ROLE false @@ -253,6 +266,7 @@ SCHEMA false SCHEMAS false SECOND false SECONDS false +SECURITY false SELECT false SEMI false SEPARATED false @@ -270,6 +284,8 @@ SOME false SORT false SORTED false SOURCE false +SPECIFIC false +SQL false START false STATISTICS false STORED false diff --git a/sql/core/src/test/resources/sql-tests/results/math.sql.out b/sql/core/src/test/resources/sql-tests/results/math.sql.out index d3df5cb933574..09f4383933288 100644 --- a/sql/core/src/test/resources/sql-tests/results/math.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/math.sql.out @@ -493,3 +493,91 @@ SELECT conv('-9223372036854775807', 36, 10) struct -- !query output 18446744073709551615 + + +-- !query +SELECT BIN(0) +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT BIN(25) +-- !query schema +struct +-- !query output +11001 + + +-- !query +SELECT BIN(25L) +-- !query schema +struct +-- !query output +11001 + + +-- !query +SELECT BIN(25.5) +-- !query schema +struct +-- !query output +11001 + + +-- !query +SELECT POSITIVE(0Y) +-- !query schema +struct<(+ 0):tinyint> +-- !query output +0 + + +-- !query +SELECT POSITIVE(25) +-- !query schema +struct<(+ 25):int> +-- !query output +25 + + +-- !query +SELECT POSITIVE(-25L) +-- !query schema +struct<(+ -25):bigint> +-- !query output +-25 + + +-- !query +SELECT POSITIVE(25.5) +-- !query schema +struct<(+ 25.5):decimal(3,1)> +-- !query output +25.5 + + +-- !query +SELECT POSITIVE("25.5") +-- !query schema +struct<(+ 25.5):double> +-- !query output +25.5 + + +-- !query +SELECT POSITIVE("invalid") +-- !query schema +struct<(+ invalid):double> +-- !query output +NULL + + +-- !query +SELECT POSITIVE(null) +-- !query schema +struct<(+ NULL):double> +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out b/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out index 64a6c4b23722a..299730fdf3787 100644 --- a/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out @@ -188,7 +188,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT", "sqlState" : "0A000", "messageParameters" : { - "treeNode" : "'Generate explode(table-argument#x []), false\n: +- SubqueryAlias v\n: +- View (`v`, [id#xL])\n: +- Project [cast(id#xL as bigint) AS id#xL]\n: +- Project [id#xL]\n: +- Range (0, 8, step=1, splits=None)\n+- OneRowRelation\n" + "treeNode" : "'Generate explode(table-argument#x []), false\n: +- SubqueryAlias v\n: +- View (`v`, [id#xL])\n: +- Project [cast(id#xL as bigint) AS id#xL]\n: +- Project [id#xL]\n: +- Range (0, 8, step=1)\n+- OneRowRelation\n" }, "queryContext" : [ { "objectType" : "", diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out index 5a36cfd2369b4..4c2fd671229a1 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -267,6 +267,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table +View Schema Mode COMPENSATION View Catalog and 
Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] @@ -332,6 +333,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] @@ -387,6 +389,7 @@ View Text SELECT t1.a AS t1_a, t2.a AS t2_a View Original Text SELECT t1.a AS t1_a, t2.a AS t2_a FROM base_table t1, base_table2 t2 WHERE t1.id = t2.id +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [t1_a, t2_a] @@ -459,6 +462,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) View Original Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] @@ -489,6 +493,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 View Original Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [id, a] @@ -519,6 +524,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] @@ -549,6 +555,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) View Original Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] @@ -579,6 +586,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] @@ -790,6 +798,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t1 CROSS JOIN t2 View Original Text SELECT * FROM t1 CROSS JOIN t2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] @@ -840,6 +849,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] @@ -890,6 +900,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] @@ -940,6 +951,7 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' View Original Text SELECT * FROM t1 LEFT JOIN t2 
ON t1.num = t2.num2 AND t2.value = 'xxx' +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] @@ -1060,6 +1072,7 @@ AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) View Original Text SELECT * FROM tbl1 WHERE tbl1.a BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [a, b] @@ -1099,6 +1112,7 @@ View Original Text SELECT * FROM tbl1 WHERE tbl1.a BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) +View Schema Mode COMPENSATION View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [a, b] diff --git a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out index 71c342054ae47..5b97f2a27b8ed 100644 --- a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out @@ -1,4 +1,172 @@ -- Automatically generated by SQLQueryTestSuite +-- !query +select not true +-- !query schema +struct<(NOT true):boolean> +-- !query output +false + + +-- !query +select ! true +-- !query schema +struct<(NOT true):boolean> +-- !query output +false + + +-- !query +select not null::boolean +-- !query schema +struct<(NOT CAST(NULL AS BOOLEAN)):boolean> +-- !query output +NULL + + +-- !query +select true and true +-- !query schema +struct<(true AND true):boolean> +-- !query output +true + + +-- !query +select true and false +-- !query schema +struct<(true AND false):boolean> +-- !query output +false + + +-- !query +select false and true +-- !query schema +struct<(false AND true):boolean> +-- !query output +false + + +-- !query +select false and false +-- !query schema +struct<(false AND false):boolean> +-- !query output +false + + +-- !query +select true and null::boolean +-- !query schema +struct<(true AND CAST(NULL AS BOOLEAN)):boolean> +-- !query output +NULL + + +-- !query +select false and null::boolean +-- !query schema +struct<(false AND CAST(NULL AS BOOLEAN)):boolean> +-- !query output +false + + +-- !query +select null::boolean and true +-- !query schema +struct<(CAST(NULL AS BOOLEAN) AND true):boolean> +-- !query output +NULL + + +-- !query +select null::boolean and false +-- !query schema +struct<(CAST(NULL AS BOOLEAN) AND false):boolean> +-- !query output +false + + +-- !query +select null::boolean and null::boolean +-- !query schema +struct<(CAST(NULL AS BOOLEAN) AND CAST(NULL AS BOOLEAN)):boolean> +-- !query output +NULL + + +-- !query +select true or true +-- !query schema +struct<(true OR true):boolean> +-- !query output +true + + +-- !query +select true or false +-- !query schema +struct<(true OR false):boolean> +-- !query output +true + + +-- !query +select false or true +-- !query schema +struct<(false OR true):boolean> +-- !query output +true + + +-- !query +select false or false +-- !query schema +struct<(false OR false):boolean> +-- !query output +false + + +-- !query +select true or null::boolean +-- !query schema +struct<(true OR CAST(NULL AS BOOLEAN)):boolean> +-- !query output +true + + +-- !query +select false or 
null::boolean +-- !query schema +struct<(false OR CAST(NULL AS BOOLEAN)):boolean> +-- !query output +NULL + + +-- !query +select null::boolean or true +-- !query schema +struct<(CAST(NULL AS BOOLEAN) OR true):boolean> +-- !query output +true + + +-- !query +select null::boolean or false +-- !query schema +struct<(CAST(NULL AS BOOLEAN) OR false):boolean> +-- !query output +NULL + + +-- !query +select null::boolean or null::boolean +-- !query schema +struct<(CAST(NULL AS BOOLEAN) OR CAST(NULL AS BOOLEAN)):boolean> +-- !query output +NULL + + -- !query select 1 = 1 -- !query schema @@ -517,3 +685,59 @@ select rand(123) not between 0.1 AND 0.2 struct<(NOT between(rand(123), 0.1, 0.2)):boolean> -- !query output false + + +-- !query +set spark.sql.legacy.bangEqualsNot=true +-- !query schema +struct +-- !query output +spark.sql.legacy.bangEqualsNot true + + +-- !query +select 1 ! between 0 and 2 +-- !query schema +struct<(NOT between(1, 0, 2)):boolean> +-- !query output +false + + +-- !query +select 1 ! in (3, 4) +-- !query schema +struct<(NOT (1 IN (3, 4))):boolean> +-- !query output +true + + +-- !query +select 'hello' ! like 'world' +-- !query schema +struct<(NOT hello LIKE world):boolean> +-- !query output +true + + +-- !query +select 1 is ! null +-- !query schema +struct<(1 IS NOT NULL):boolean> +-- !query output +true + + +-- !query +select false is ! true +-- !query schema +struct<(NOT (false <=> true)):boolean> +-- !query output +true + + +-- !query +set spark.sql.legacy.bangEqualsNot=false +-- !query schema +struct +-- !query output +spark.sql.legacy.bangEqualsNot false diff --git a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out index dcb96b9d2dce6..ad96e7e106ad9 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out @@ -327,6 +327,7 @@ struct CREATE VIEW default.view_spark_30302 ( aaa, bbb) +WITH SCHEMA COMPENSATION AS SELECT a, b FROM tbl @@ -338,6 +339,7 @@ struct CREATE VIEW default.view_spark_30302 ( aaa, bbb) +WITH SCHEMA COMPENSATION AS SELECT a, b FROM tbl @@ -368,6 +370,7 @@ CREATE VIEW default.view_spark_30302 ( aaa COMMENT 'comment with \'quoted text\' for aaa', bbb) COMMENT 'This is a comment with \'quoted text\' for view' +WITH SCHEMA COMPENSATION AS SELECT a, b FROM tbl @@ -380,6 +383,7 @@ CREATE VIEW default.view_spark_30302 ( aaa COMMENT 'comment with \'quoted text\' for aaa', bbb) COMMENT 'This is a comment with \'quoted text\' for view' +WITH SCHEMA COMPENSATION AS SELECT a, b FROM tbl @@ -412,6 +416,7 @@ CREATE VIEW default.view_spark_30302 ( TBLPROPERTIES ( 'a' = '1', 'b' = '2') +WITH SCHEMA COMPENSATION AS SELECT a, b FROM tbl @@ -426,6 +431,7 @@ CREATE VIEW default.view_spark_30302 ( TBLPROPERTIES ( 'a' = '1', 'b' = '2') +WITH SCHEMA COMPENSATION AS SELECT a, b FROM tbl diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index 442f0fe5d5f27..cc32e2eff2551 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -124,6 +124,7 @@ Created Time [not included in comparison] Last Access [not included in comparison] Created By [not included in comparison] Type: VIEW +View Schema Mode: BINDING Schema: root |-- e: integer (nullable = true) diff --git 
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 91ad830dd3d7a..d42c387c8057f 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -835,6 +835,110 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query schema +struct +-- !query output +??????? + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +??????? + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。 + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。 + + -- !query select decode() -- !query schema @@ -893,6 +997,14 @@ struct abc +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query schema +struct +-- !query output +大千世界 + + -- !query select decode(1, 1, 'Southlake') -- !query schema @@ -1049,6 +1161,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +��������������������� + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output 
+spark.sql.legacy.codingErrorAction false + + +-- !query +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + +-- !query +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema @@ -1990,3 +2166,106 @@ select luhn_check(123.456) struct -- !query output false + + +-- !query +select is_valid_utf8('') +-- !query schema +struct +-- !query output +true + + +-- !query +select is_valid_utf8('abc') +-- !query schema +struct +-- !query output +true + + +-- !query +select is_valid_utf8(x'80') +-- !query schema +struct +-- !query output +false + + +-- !query +select make_valid_utf8('') +-- !query schema +struct +-- !query output + + + +-- !query +select make_valid_utf8('abc') +-- !query schema +struct +-- !query output +abc + + +-- !query +select make_valid_utf8(x'80') +-- !query schema +struct +-- !query output +� + + +-- !query +select validate_utf8('') +-- !query schema +struct +-- !query output + + + +-- !query +select validate_utf8('abc') +-- !query schema +struct +-- !query output +abc + + +-- !query +select validate_utf8(x'80') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_UTF8_STRING", + "sqlState" : "22029", + "messageParameters" : { + "str" : "\\x80" + } +} + + +-- !query +select try_validate_utf8('') +-- !query schema +struct +-- !query output + + + +-- !query +select try_validate_utf8('abc') +-- !query schema +struct +-- !query output +abc + + +-- !query +select try_validate_utf8(x'80') +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-in-join-condition.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-in-join-condition.sql.out index b490704bebc57..c9c68a5f0602b 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-in-join-condition.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-in-join-condition.sql.out @@ -472,3 +472,33 @@ struct 1 1 1 4 2 1 NULL NULL 3 4 NULL NULL + + +-- !query +select * from x join y on x1 = y1 and exists (select * from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_CORRELATED_EXPRESSION_IN_JOIN_CONDITION", + "sqlState" : "0A000", + "messageParameters" : { + "subqueryExpression" : "exists(x.x2, y.y2, (z.z2 = x.x2), (z.z2 = y.y2))" + } +} + + +-- !query +select * from x join y on x1 = y1 and not exists (select * from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_CORRELATED_EXPRESSION_IN_JOIN_CONDITION", + "sqlState" : "0A000", + 
"messageParameters" : { + "subqueryExpression" : "exists(x.x2, y.y2, (z.z2 = x.x2), (z.z2 = y.y2))" + } +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-subquery-in-join-condition.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-subquery-in-join-condition.sql.out index 9f829d522ad25..13af4c81173ae 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-subquery-in-join-condition.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-subquery-in-join-condition.sql.out @@ -434,3 +434,33 @@ struct 1 1 1 4 2 1 NULL NULL 3 4 NULL NULL + + +-- !query +select * from x left join y on x1 = y1 and x2 IN (select z1 from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_CORRELATED_EXPRESSION_IN_JOIN_CONDITION", + "sqlState" : "0A000", + "messageParameters" : { + "subqueryExpression" : "(x.x2 IN (listquery(x.x2, y.y2, (z.z2 = x.x2), (z.z2 = y.y2))))" + } +} + + +-- !query +select * from x left join y on x1 = y1 and x2 not IN (select z1 from z where z2 = x2 AND z2 = y2) order by x1, x2, y1, y2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_CORRELATED_EXPRESSION_IN_JOIN_CONDITION", + "sqlState" : "0A000", + "messageParameters" : { + "subqueryExpression" : "(x.x2 IN (listquery(x.x2, y.y2, (z.z2 = x.x2), (z.z2 = y.y2))))" + } +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-group-by.sql.out new file mode 100644 index 0000000000000..85ebd91c28c9c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-group-by.sql.out @@ -0,0 +1,207 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +create temp view x (x1, x2) as values (1, 1), (2, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temp view y (y1, y2) as values (2, 0), (3, -1) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temp view z (z1, z2) as values (1, 0), (1, 1) +-- !query schema +struct<> +-- !query output + + + +-- !query +select * from x where (select count(*) from y where y1 = x1 group by y1) = 1 +-- !query schema +struct +-- !query output +2 2 + + +-- !query +select * from x where (select count(*) from y where y1 = x1 group by x1) = 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"x1\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 61, + "stopIndex" : 71, + "fragment" : "group by x1" + } ] +} + + +-- !query +select * from x where (select count(*) from y where y1 > x1 group by x1) = 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"x1\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 61, + 
"stopIndex" : 71, + "fragment" : "group by x1" + } ] +} + + +-- !query +select *, (select count(*) from y where x1 = y1 and y2 = 1 group by y2) from x +-- !query schema +struct +-- !query output +1 1 NULL +2 2 NULL + + +-- !query +select *, (select count(*) from y where x1 = y1 and y2 = x1 + 1 group by y2) from x +-- !query schema +struct +-- !query output +1 1 NULL +2 2 NULL + + +-- !query +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "y1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 23, + "stopIndex" : 72, + "fragment" : "(select count(*) from y where y1 > x1 group by y1)" + } ] +} + + +-- !query +select *, (select count(*) from y where y1 + y2 = x1 group by y1) from x +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "y1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 11, + "stopIndex" : 65, + "fragment" : "(select count(*) from y where y1 + y2 = x1 group by y1)" + } ] +} + + +-- !query +select *, (select count(*) from (select * from y where y1 = x1 union all select * from y) sub group by y1) from x +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "y1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 11, + "stopIndex" : 106, + "fragment" : "(select count(*) from (select * from y where y1 = x1 union all select * from y) sub group by y1)" + } ] +} + + +-- !query +select *, (select count(*) from y left join (select * from z where z1 = x1) sub on y2 = z2 group by z1) from x +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_CORRELATED_COLUMNS_IN_GROUP_BY", + "sqlState" : "0A000", + "messageParameters" : { + "value" : "z1" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 11, + "stopIndex" : 103, + "fragment" : "(select count(*) from y left join (select * from z where z1 = x1) sub on y2 = z2 group by z1)" + } ] +} + + +-- !query +set spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate = true +-- !query schema +struct +-- !query output +spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate true + + +-- !query +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1 +-- !query schema +struct +-- !query output +1 1 +1 1 +2 2 + + +-- !query +reset spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out b/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out new file mode 100644 index 0000000000000..f9f491bd70fd1 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out 
@@ -0,0 +1,144 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +create table t as + select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s + from values (1, null), (null, 'a') tab(member0, member1) +-- !query schema +struct<> +-- !query output + + + +-- !query +declare avro_schema string +-- !query schema +struct<> +-- !query output + + + +-- !query +set variable avro_schema = + '{ "type": "record", "name": "struct", "fields": [{ "name": "u", "type": ["int","string"] }] }' +-- !query schema +struct<> +-- !query output + + + +-- !query +select from_avro(s, 42, map()) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + "sqlState" : "42K09", + "messageParameters" : { + "hint" : "", + "msg" : "The second argument of the FROM_AVRO SQL function must be a constant string containing the JSON representation of the schema to use for converting the value from AVRO format", + "sqlExpr" : "\"fromavro(s, 42, map())\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 30, + "fragment" : "from_avro(s, 42, map())" + } ] +} + + +-- !query +select from_avro(s, avro_schema, 42) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + "sqlState" : "42K09", + "messageParameters" : { + "hint" : "", + "msg" : "The third argument of the FROM_AVRO SQL function must be a constant map of strings to strings containing the options to use for converting the value from AVRO format", + "sqlExpr" : "\"fromavro(s, variablereference(system.session.avro_schema='{ \"type\": \"record\", \"name\": \"struct\", \"fields\": [{ \"name\": \"u\", \"type\": [\"int\",\"string\"] }] }'), 42)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 36, + "fragment" : "from_avro(s, avro_schema, 42)" + } ] +} + + +-- !query +select to_avro(s, 42) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + "sqlState" : "42K09", + "messageParameters" : { + "hint" : "", + "msg" : "The second argument of the TO_AVRO SQL function must be a constant string containing the JSON representation of the schema to use for converting the value to AVRO format", + "sqlExpr" : "\"toavro(s, 42)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 21, + "fragment" : "to_avro(s, 42)" + } ] +} + + +-- !query +select to_avro(s, avro_schema) as result from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE", + "sqlState" : "22KD3", + "messageParameters" : { + "functionName" : "TO_AVRO" + } +} + + +-- !query +select from_avro(result, avro_schema, map()).u from (select null as result) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "AVRO_NOT_LOADED_SQL_FUNCTIONS_UNUSABLE", + "sqlState" : "22KD3", + "messageParameters" : { + "functionName" : "FROM_AVRO" + } +} + + +-- !query +drop temporary variable avro_schema +-- !query schema +struct<> +-- !query output + + + +-- !query +drop table t +-- !query schema +struct<> +-- !query output + diff 
--git a/sql/core/src/test/resources/sql-tests/results/transform.sql.out b/sql/core/src/test/resources/sql-tests/results/transform.sql.out index ab726b93c07c8..7975392fd0147 100644 --- a/sql/core/src/test/resources/sql-tests/results/transform.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/transform.sql.out @@ -837,3 +837,13 @@ struct 3 3 3 3 3 3 + + +-- !query +SELECT TRANSFORM (a, b) + USING 'cat' AS (a CHAR(10), b VARCHAR(10)) +FROM VALUES('apache', 'spark') t(a, b) +-- !query schema +struct +-- !query output +apache spark diff --git a/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out b/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out index fa83652da0edc..b12680c2a6751 100644 --- a/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out @@ -15,6 +15,22 @@ struct NULL +-- !query +SELECT try_add(2147483647, decimal(1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_add(2147483647, "1") +-- !query schema +struct +-- !query output +2.147483648E9 + + -- !query SELECT try_add(-2147483648, -1) -- !query schema @@ -249,6 +265,22 @@ struct NULL +-- !query +SELECT try_divide(1, decimal(0)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, "0") +-- !query schema +struct +-- !query output +NULL + + -- !query SELECT try_divide(interval 2 year, 2) -- !query schema @@ -313,6 +345,22 @@ struct NULL +-- !query +SELECT try_subtract(2147483647, decimal(-1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_subtract(2147483647, "-1") +-- !query schema +struct +-- !query output +2.147483648E9 + + -- !query SELECT try_subtract(-2147483648, 1) -- !query schema @@ -409,6 +457,22 @@ struct NULL +-- !query +SELECT try_multiply(2147483647, decimal(-2)) +-- !query schema +struct +-- !query output +-4294967294 + + +-- !query +SELECT try_multiply(2147483647, "-2") +-- !query schema +struct +-- !query output +-4.294967294E9 + + -- !query SELECT try_multiply(-2147483648, 2) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 b/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 index dcdb9d0dcb194..002a0dfcf37ef 100644 --- a/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 +++ b/sql/core/src/test/resources/sql-tests/results/try_arithmetic.sql.out.java21 @@ -15,6 +15,22 @@ struct NULL +-- !query +SELECT try_add(2147483647, decimal(1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_add(2147483647, "1") +-- !query schema +struct +-- !query output +2.147483648E9 + + -- !query SELECT try_add(-2147483648, -1) -- !query schema @@ -249,6 +265,22 @@ struct NULL +-- !query +SELECT try_divide(1, decimal(0)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT try_divide(1, "0") +-- !query schema +struct +-- !query output +NULL + + -- !query SELECT try_divide(interval 2 year, 2) -- !query schema @@ -313,6 +345,22 @@ struct NULL +-- !query +SELECT try_subtract(2147483647, decimal(-1)) +-- !query schema +struct +-- !query output +2147483648 + + +-- !query +SELECT try_subtract(2147483647, "-1") +-- !query schema +struct +-- !query output +2.147483648E9 + + -- !query SELECT try_subtract(-2147483648, 1) -- !query schema @@ -409,6 +457,22 @@ struct NULL +-- !query +SELECT try_multiply(2147483647, decimal(-2)) +-- !query schema +struct +-- 
!query output +-4294967294 + + +-- !query +SELECT try_multiply(2147483647, "-2") +-- !query schema +struct +-- !query output +-4.294967294E9 + + -- !query SELECT try_multiply(-2147483648, 2) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out b/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out index 78ad8b7c02cd5..f99c6c30c07e2 100644 --- a/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udtf/udtf.sql.out @@ -1069,6 +1069,32 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +SELECT * FROM UDTFPartitionByIndexingBug( + TABLE( + SELECT + 5 AS unused_col, + 'hi' AS partition_col, + 1.0 AS double_col + + UNION ALL + + SELECT + 4 AS unused_col, + 'hi' AS partition_col, + 1.0 AS double_col + ) +) +-- !query schema +struct +-- !query output +NULL 1.0 +NULL 1.0 +NULL 1.0 +NULL 1.0 +NULL 1.0 + + -- !query DROP VIEW t1 -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out new file mode 100644 index 0000000000000..b0d497e070477 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-binding-config.sql.out @@ -0,0 +1,1229 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SET spark.sql.legacy.viewSchemaBindingMode +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode true + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode false + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 44, + "fragment" : "WITH SCHEMA BINDING" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 49, + "fragment" : "WITH SCHEMA COMPENSATION" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." 
+ }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 51, + "fragment" : "WITH SCHEMA TYPE EVOLUTION" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "FEATURE_NOT_ENABLED", + "sqlState" : "56038", + "messageParameters" : { + "configKey" : "spark.sql.legacy.viewSchemaBindingMode", + "configValue" : "true", + "featureName" : "VIEW ... WITH SCHEMA ..." + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 46, + "fragment" : "WITH SCHEMA EVOLUTION" + } ] +} + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT 1 +View Original Text SELECT 1 +View Catalog and Namespace spark_catalog.default +View Query Output Columns [1] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query schema +struct +-- !query output +default v false Catalog: spark_catalog +Database: default +Table: v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type: VIEW +View Text: SELECT 1 +View Original Text: SELECT 1 +View Catalog and Namespace: spark_catalog.default +View Query Output Columns: [1] +Schema: root + |-- 1: integer (nullable = false) + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE VIEW default.v ( + `1`) +AS SELECT 1 + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW v AS SELECT 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +1 int + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query schema +struct +-- !query output + v true Table: v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type: VIEW +View Text: SELECT 1 +View Catalog and Namespace: spark_catalog.default +View Query Output Columns: [1] +Schema: root + |-- 1: integer (nullable = false) + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE 
VIEW default.v ( + c1) +AS SELECT * FROM t + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = true +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode true + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation false + + +-- !query +SET spark.sql.ansi.enabled = false +-- !query schema +struct +-- !query output +spark.sql.ansi.enabled false + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE VIEW default.v ( + c1) +WITH SCHEMA BINDING +AS SELECT * FROM t + + +-- !query +DROP TABLE t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"BIGINT\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct 
+-- !query output +CREATE VIEW default.v ( + c1) +WITH SCHEMA BINDING +AS SELECT * FROM t + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = true +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation true + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE VIEW default.v ( + c1) +WITH SCHEMA COMPENSATION +AS SELECT * FROM t + + +-- !query +DROP TABLE t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE VIEW default.v ( + c1) +WITH SCHEMA COMPENSATION +AS SELECT * FROM t + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +INSERT INTO t VALUES ('a', 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkNumberFormatException +{ + "errorClass" : "CAST_INVALID_INPUT", + "sqlState" : "22018", + "messageParameters" : { + "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "expression" : "'a'", + "sourceType" : "\"STRING\"", + 
"targetType" : "\"INT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 15, + "fragment" : "v" + } ] +} + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 MAP, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION", + "sqlState" : "42K09", + "messageParameters" : { + "sqlExpr" : "\"c1\"", + "srcType" : "\"MAP\"", + "targetType" : "\"INT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 15, + "fragment" : "v" + } ] +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c3 INT NOT NULL, c2 INT) USING 
PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode false + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation false + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = true +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode true + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT 1 +View Original Text SELECT 1 +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [1] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query schema +struct +-- !query output +default v false Catalog: spark_catalog +Database: default +Table: v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type: VIEW +View Text: SELECT 1 +View Original Text: SELECT 1 +View Schema Mode: BINDING +View Catalog and Namespace: spark_catalog.default +View Query Output Columns: [1] +Schema: root + |-- 1: integer (nullable = false) + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE VIEW default.v ( + `1`) +WITH SCHEMA BINDING +AS SELECT 1 + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = true +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation true + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT 1 +View Original Text SELECT 1 +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [1] + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query schema +struct +-- !query output +default v false Catalog: spark_catalog +Database: default +Table: v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included 
in comparison] +Type: VIEW +View Text: SELECT 1 +View Original Text: SELECT 1 +View Schema Mode: BINDING +View Catalog and Namespace: spark_catalog.default +View Query Output Columns: [1] +Schema: root + |-- 1: integer (nullable = false) + + +-- !query +SHOW CREATE TABLE v +-- !query schema +struct +-- !query output +CREATE VIEW default.v ( + `1`) +WITH SCHEMA BINDING +AS SELECT 1 + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode false + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation false + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW v AS SELECT 1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SET spark.sql.legacy.viewSchemaBindingMode = true +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaBindingMode true + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +1 int + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query schema +struct +-- !query output + v true Table: v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type: VIEW +View Text: SELECT 1 +View Schema Mode: BINDING +View Catalog and Namespace: spark_catalog.default +View Query Output Columns: [1] +Schema: root + |-- 1: integer (nullable = false) + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation = true +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation true + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +1 int + + +-- !query +SHOW TABLE EXTENDED LIKE 'v' +-- !query schema +struct +-- !query output + v true Table: v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type: VIEW +View Text: SELECT 1 +View Schema Mode: BINDING +View Catalog and Namespace: spark_catalog.default +View Query Output Columns: [1] +Schema: root + |-- 1: integer (nullable = false) + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out new file mode 100644 index 0000000000000..a4e5820cb7ce6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-binding.sql.out @@ -0,0 +1,386 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included 
in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"BIGINT\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +SET spark.sql.legacy.viewSchemaCompensation=false +-- !query schema +struct +-- !query output +spark.sql.legacy.viewSchemaCompensation false + + +-- !query +DROP TABLE IF 
EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +ALTER VIEW v WITH SCHEMA BINDING +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"BIGINT\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out new file mode 100644 index 0000000000000..ffd1fbec47bbb --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-compensation.sql.out @@ -0,0 +1,593 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SET spark.sql.ansi.enabled = false +-- !query schema +struct +-- !query output +spark.sql.ansi.enabled false + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT * FROM t 
+-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +INSERT INTO t VALUES ('a', 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkNumberFormatException +{ + "errorClass" : "CAST_INVALID_INPUT", + "sqlState" : "22018", + "messageParameters" : { + "ansiConfig" : "\"spark.sql.ansi.enabled\"", + "expression" : "'a'", + "sourceType" : "\"STRING\"", + "targetType" : "\"INT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 15, + "fragment" : "v" + } ] +} + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 MAP, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.CAST_WITHOUT_SUGGESTION", + "sqlState" : "42K09", + "messageParameters" : { + "sqlExpr" : "\"c1\"", + "srcType" : "\"MAP\"", + "targetType" : "\"INT\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 15, + "fragment" : "v" + } ] +} + 
+ +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c3 INT NOT NULL, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t 
+View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES(1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA BINDING AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode BINDING +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES('1') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "CANNOT_UP_CAST_DATATYPE", + "sqlState" : "42846", + "messageParameters" : { + "details" : "The type path of the target object is:\n\nYou can either add an explicit cast to the input data or choose a higher precision type of the field in the target object", + "expression" : "spark_catalog.default.t.c1", + "sourceType" : "\"STRING\"", + "targetType" : "\"INT\"" + } +} + + +-- !query +ALTER VIEW v WITH SCHEMA COMPENSATION +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out new file mode 100644 index 0000000000000..46d6acc8d98e2 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-evolution.sql.out @@ -0,0 +1,1113 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + 
+ +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c4 STRING NOT NULL, c5 DOUBLE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2.0) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2.0 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c4 string +c5 double + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c4, c5] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c4 STRING, c5 DOUBLE, c6 DATE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2.0 2022-01-01 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c4 string +c5 double +c6 date + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c4, c5, c6] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- 
!query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int +a2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 DOUBLE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2.0) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2.0 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 string +a2 double + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING, c2 DOUBLE, c3 DATE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2.0 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 string +a2 double + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) 
+-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int +a2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v (a1, a2) AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int +a2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c3 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v (a1, a2) AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int +a2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT COMMENT 'c1', c2 INT COMMENT 'c2') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int c1 +a2 int c2 + +# Detailed Table Information +Catalog spark_catalog +Database 
default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int c1 +a2 int c2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6c', c2 STRING COMMENT 'c2 6c') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 bigint c1 +a2 string c2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +CREATE OR REPLACE VIEW v(a1 COMMENT 'a1', a2 COMMENT 'a2') WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 bigint a1 +a2 string a2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6d', c2 STRING COMMENT 'c2 6d') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 bigint a1 +a2 string a2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 bigint c1 6d +c2 string c2 6d + +# Detailed 
Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6e', c2 STRING COMMENT 'c2 6e') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 bigint c1 6e +c2 string c2 6e + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t1(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t2(c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA EVOLUTION AS SELECT * FROM t1, t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t2(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "COLUMN_ALREADY_EXISTS", + "sqlState" : "42711", + "messageParameters" : { + "columnName" : "`c1`" + } +} + + +-- !query +DROP TABLE IF EXISTS t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES(1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +ALTER VIEW v WITH SCHEMA EVOLUTION +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 string +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text 
SELECT * FROM t +View Schema Mode EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out b/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out new file mode 100644 index 0000000000000..707715120a861 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/view-schema-type-evolution.sql.out @@ -0,0 +1,663 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT NOT NULL, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING NOT NULL, c2 DOUBLE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2.0) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2.0 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 string +c2 double + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING, c2 DOUBLE, c3 DATE) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES ('1', 2.0, DATE'2022-01-01') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2.0 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 string +c2 double + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t 
+-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES (1, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 2 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c2", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c3 INT, c2 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INCOMPATIBLE_VIEW_SCHEMA_CHANGE", + "sqlState" : "51024", + "messageParameters" : { + "actualCols" : "[]", + "colName" : "c1", + "expectedNum" : "1", + "suggestion" : "CREATE OR REPLACE VIEW spark_catalog.default.v AS SELECT * FROM t", + "viewName" : "`spark_catalog`.`default`.`v`" + } +} + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int +c2 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT COMMENT 'c1', c2 INT COMMENT 'c2') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v(a1, a2) WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query 
output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 int c1 +a2 int c2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6a', c2 STRING COMMENT 'c2 6a') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 bigint c1 +a2 string c2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +CREATE OR REPLACE VIEW v(a1 COMMENT 'a1', a2 COMMENT 'a2') WITH SCHEMA TYPE EVOLUTION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 bigint a1 +a2 string a2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 BIGINT COMMENT 'c1 6b', c2 STRING COMMENT 'c2 6b') USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output + + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +a1 bigint a1 +a2 string a2 + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1, c2] + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 INT) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES(1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW v WITH SCHEMA COMPENSATION AS SELECT * FROM t +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t(c1 STRING) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO t VALUES('1') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v 
+-- !query schema +struct +-- !query output +1 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 int + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode COMPENSATION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +ALTER VIEW v WITH SCHEMA TYPE EVOLUTION +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v +-- !query schema +struct +-- !query output +1 + + +-- !query +DESCRIBE EXTENDED v +-- !query schema +struct +-- !query output +c1 string + +# Detailed Table Information +Catalog spark_catalog +Database default +Table v +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t +View Original Text SELECT * FROM t +View Schema Mode TYPE EVOLUTION +View Catalog and Namespace spark_catalog.default +View Query Output Columns [c1] + + +-- !query +DROP VIEW IF EXISTS v +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/.metadata.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/.metadata.crc new file mode 100644 index 0000000000000..6177f01d501b1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/.metadata.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/.0.crc new file mode 100644 index 0000000000000..1aee7033161ec Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/.1.crc new file mode 100644 index 0000000000000..1aee7033161ec Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/.1.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/1 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/metadata new file mode 100644 index 0000000000000..c8acdedc074b7 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/metadata @@ -0,0 +1 @@ +{"id":"a13717f3-7485-421b-b55a-21625123b680"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/.0.crc new file mode 100644 index 0000000000000..121286161cb66 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/.1.crc new file mode 100644 index 0000000000000..89d73c77c55c0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/.1.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/0 new file mode 100644 index 0000000000000..7a7b38628a15d --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/0 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719278977158,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/1 new file mode 100644 index 0000000000000..589d400395e1c --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/offsets/1 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719278978807,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/.1.delta.crc new file mode 100644 index 0000000000000..1992982c58ff2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/.2.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/.2.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/1.delta new file mode 100644 index 0000000000000..fec40e83a5471 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/_metadata/.schema.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/_metadata/.schema.crc new file mode 100644 index 0000000000000..97b2fbbd4cdf9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/_metadata/.schema.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..c35505fa363fb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/0/_metadata/schema differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/.2.delta.crc new file mode 100644 index 0000000000000..d18b77b93aff2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/.2.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/2.delta new file mode 100644 index 0000000000000..fcbf8df80f5f9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/1/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/.2.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/.2.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files 
/dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/2/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/.2.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/.2.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/3/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/.2.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/.2.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/2.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/state/0/4/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/.metadata.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/.metadata.crc new file mode 100644 index 0000000000000..a0afa9cbeabb7 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/.metadata.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/commits/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/commits/.0.crc new file mode 100644 index 0000000000000..1aee7033161ec Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/commits/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/metadata new file mode 100644 index 0000000000000..9f8d6f4d5cf50 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/metadata @@ -0,0 +1 @@ +{"id":"e41911da-47c9-4560-a95d-e2ab97f2bc85"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/offsets/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/offsets/.0.crc new file mode 100644 index 0000000000000..6cd2a4731154f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/offsets/.0.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/offsets/0 new file mode 100644 index 0000000000000..a45ae3899e0b3 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/offsets/0 @@ -0,0 +1,4 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719343083746,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5"}} +0 +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/_metadata/.schema.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/_metadata/.schema.crc new file mode 100644 index 0000000000000..0b5ede7660246 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/_metadata/.schema.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/1.delta new file mode 100644 index 
0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/_metadata/.schema.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/_metadata/.schema.crc new file mode 100644 index 0000000000000..3f303a9e7b035 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/_metadata/.schema.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..42448b3b584ce Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/left-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/_metadata/.schema.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/_metadata/.schema.crc new file mode 100644 index 0000000000000..0b5ede7660246 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/_metadata/.schema.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/.1.delta.crc 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/_metadata/.schema.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/_metadata/.schema.crc new file mode 100644 index 0000000000000..bcc7311689f0a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/_metadata/.schema.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..8fa8f1675bc82 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/0/right-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyWithIndexToValue/.1.delta.crc differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/1/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..d73ee1ba16c2e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyToNumValues/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..8dee4c86270f2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..7e6dce9cc108c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..d73ee1ba16c2e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..d0e2f40c18ada Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..2ec494e6a636f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/2/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyToNumValues/.1.delta.crc differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cf1d68e2acee3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/3/right-keyWithIndexToValue/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..20918a4ffe6ff Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..cb8ce356ad7f3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyWithIndexToValue/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..56b52ab974a3d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyToNumValues/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyToNumValues/.1.delta.crc new file mode 100644 index 0000000000000..20918a4ffe6ff Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyToNumValues/.1.delta.crc differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyWithIndexToValue/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyWithIndexToValue/.1.delta.crc new file mode 100644 index 0000000000000..a874ad31b7403 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyWithIndexToValue/.1.delta.crc differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..0ed4feb1bd9b6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.5.1-streaming-join/state/0/4/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/1 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/2 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/2 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/3 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/commits/3 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/metadata new file mode 100644 index 0000000000000..6fb99c5969bd9 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/metadata @@ -0,0 +1 @@ +{"id":"ab7bcd9a-4146-45d3-933d-a615b381c3be"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/0 new file mode 100644 index 0000000000000..ba13971f3848b --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/0 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719290655102,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/1 new file mode 100644 index 0000000000000..5fd58f6716944 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290659041,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/2 new file mode 100644 index 0000000000000..be839a1efa191 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/2 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290661716,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/3 new file mode 100644 index 0000000000000..c87f4b1b97bcc --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/offsets/3 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719290664278,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +3 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..20b45317e0a22 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/0/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/1.delta new file mode 100644 index 0000000000000..dae5bfc800597 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/2.delta new file mode 100644 index 0000000000000..4405af3420786 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/2.snapshot new file mode 100644 index 0000000000000..4405af3420786 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/3.delta new file mode 100644 index 0000000000000..433925dab114d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/4.delta new file mode 100644 index 0000000000000..064533dcd4f30 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/4.snapshot new file mode 100644 index 0000000000000..b4e23cc0ed4e5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/1/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/2.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/2.snapshot 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/2.snapshot new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/3.delta new file mode 100644 index 0000000000000..acf7619c291f3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/4.delta new file mode 100644 index 0000000000000..27ca4dcede3dc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/4.snapshot new file mode 100644 index 0000000000000..27ca4dcede3dc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/2/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/1.delta new file mode 100644 index 0000000000000..859c2b1315a5e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/2.snapshot new file mode 100644 index 0000000000000..859c2b1315a5e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/4.snapshot new file mode 100644 index 0000000000000..859c2b1315a5e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/3/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/2.delta new file mode 100644 index 0000000000000..3465b025dfa03 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/2.snapshot new file mode 100644 index 0000000000000..3465b025dfa03 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/4.delta new file mode 100644 index 0000000000000..6070683f44e12 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/4.snapshot new file mode 100644 index 0000000000000..bf46f06c500d8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/4/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/_metadata/metadata new file mode 100644 index 0000000000000..5094b71aa3581 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/agg/state/0/_metadata/metadata @@ -0,0 +1,2 @@ +v1 
+{"operatorInfo":{"operatorId":0,"operatorName":"stateStoreSave"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5}]} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/1 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/2 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/2 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/3 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/commits/3 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/metadata new file mode 100644 index 0000000000000..b151b6c27e031 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/metadata @@ -0,0 +1 @@ +{"id":"9253a36d-8c80-4cdc-9d2a-3cd9b5ceff59"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/0 new file mode 100644 index 0000000000000..acbebd05ef160 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/0 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719289915309,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 \ No newline at end of file diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/1 new file mode 100644 index 0000000000000..3485e7b5927c1 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719289918096,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/2 new file mode 100644 index 0000000000000..a5be1f2e758aa --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/2 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719289921002,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/3 new file mode 100644 index 0000000000000..6a238cc687bf7 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/offsets/3 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719289923707,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +3 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..70ebb77e1b781 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/0/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/1.delta new file mode 100644 index 0000000000000..aac301da2c2ab Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/2.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/2.snapshot new file mode 100644 index 0000000000000..aac301da2c2ab Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/3.delta new file mode 100644 index 0000000000000..c88091a5bbc9e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/4.snapshot new file mode 100644 index 0000000000000..8cd4b5dcb1ec2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/1/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/2.delta new file mode 100644 index 0000000000000..52dacd1351c7e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/2.snapshot new file mode 100644 index 0000000000000..52dacd1351c7e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/4.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/4.snapshot new file mode 100644 index 0000000000000..52dacd1351c7e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/2/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/1.delta new file mode 100644 index 0000000000000..d86baf6d41aa2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/2.snapshot new file mode 100644 index 0000000000000..d86baf6d41aa2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/4.snapshot new file mode 100644 index 0000000000000..d86baf6d41aa2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/3/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/1.delta new file mode 100644 index 0000000000000..1a985dfde9d45 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/1.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/2.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/2.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/2.snapshot
new file mode 100644
index 0000000000000..1a985dfde9d45
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/2.snapshot differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/3.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/3.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/4.delta
new file mode 100644
index 0000000000000..9d22a051ebfdc
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/4.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/4.snapshot
new file mode 100644
index 0000000000000..eb35f4815bfe8
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/4/4.snapshot differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/_metadata/metadata
new file mode 100644
index 0000000000000..39ce28c9b4aa5
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/dedup/state/0/_metadata/metadata
@@ -0,0 +1,2 @@
+v1
+{"operatorInfo":{"operatorId":0,"operatorName":"dedupe"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5}]}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/0
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/0
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/1
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/1
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/2
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/2
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/3
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/3
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/4
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/commits/4
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/metadata
new file mode 100644
index 0000000000000..c3656bc51c886
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/metadata
@@ -0,0 +1 @@
+{"id":"fbcd445d-b716-4589-ad15-774afa5a243d"}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/0
new file mode 100644
index 0000000000000..4c4f47f8e9ec5
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/0
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719289926768,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+0
+0
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/1
new file mode 100644
index 0000000000000..13cd6f3002a74
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/1
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719289932001,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+1
+1
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/2
new file mode 100644
index 0000000000000..013735ad0d044
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/2
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":5000,"batchTimestampMs":1719289935688,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+1
+1
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/3
new file mode 100644
index 0000000000000..6802d77f1ecc8
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/3
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":5000,"batchTimestampMs":1719289939702,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+2
+2
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/4
new file mode 100644
index 0000000000000..8621a231056b2
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/offsets/4
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":9000,"batchTimestampMs":1719289942516,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+2
+2
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/1.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/1.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/2.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/2.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/2.snapshot
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/2.snapshot differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/3.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/3.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/4.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/4.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/4.snapshot
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/4.snapshot differ
diff
--git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/4.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..42448b3b584ce Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/left-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/3.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/2.snapshot differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..8fa8f1675bc82 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/0/right-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/2.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/2.snapshot 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/2.snapshot 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/2.snapshot new file mode 
100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/1/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..25d24f4cedf3f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..d295efee0d000 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..4460eadea0c0d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..7e6dce9cc108c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..21e2a706e8c7a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..5bfdddd0e2f14 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/2.snapshot differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..30e88f6cb35c6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..25d24f4cedf3f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..d295efee0d000 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/3.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..4d3cf654ce551 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..25d24f4cedf3f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..2ec494e6a636f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..0877f7564366f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..5bfdddd0e2f14 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/3.delta 
new file mode 100644 index 0000000000000..edcc5dd1f672a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..0877f7564366f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/2/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..1aeeffd771c08 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..379479c0ccc3b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..cafd9d540f8e0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..9899fb58eebe4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/3.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..69437319e872d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..aa76ff9b416a1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/4.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..1aeeffd771c08 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..379479c0ccc3b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..47dff164d42d0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..9899fb58eebe4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..9899fb58eebe4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/4.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..105c7cc4255a9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..aa76ff9b416a1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..d536c6fdcbce4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/3/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6070683f44e12 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..837a1434917ca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/4.delta new file mode 100644 index 
0000000000000..32506a0366066 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..cf43298608153 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..2f2da77129ea4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..56b52ab974a3d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..26d97da9c610f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..2dbdd331b3e97 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..3603b3f81bc77 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..b92431e0a4df6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..71fdc6c434ca3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6070683f44e12 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..837a1434917ca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..2f2da77129ea4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..32506a0366066 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/4.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..3b529f86101ac Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..83a5f723a34ab Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..0ed4feb1bd9b6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..31a686912dc97 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..2dbdd331b3e97 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..71fdc6c434ca3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..649772d35ffd8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/4.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/4.snapshot
new file mode 100644
index 0000000000000..50b057756915b
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/4.snapshot differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/5.delta
new file mode 100644
index 0000000000000..1c4c3974de1d5
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/4/right-keyWithIndexToValue/5.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/_metadata/metadata
new file mode 100644
index 0000000000000..b73f1e3e66ac5
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join1/state/0/_metadata/metadata
@@ -0,0 +1,2 @@
+v1
+{"operatorInfo":{"operatorId":0,"operatorName":"symmetricHashJoin"},"stateStoreInfo":[{"storeName":"left-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"left-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5}]}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/0
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/0
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/1
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/1
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/2
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/2
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/3
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/3
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/4
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/commits/4
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/metadata
new file mode 100644
index 0000000000000..331d37d197a17
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/metadata
@@ -0,0 +1 @@
+{"id":"26806544-709f-4745-a6e4-7641361fe94a"}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/0
new file mode 100644
index 0000000000000..20e0fddaf646b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/0
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719289946613,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+0
+0
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/1
new file mode 100644
index 0000000000000..f71629438b9e2
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/1
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719289951327,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+1
+1
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/2
new file mode 100644
index 0000000000000..789561fd971a4
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/2
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":5000,"batchTimestampMs":1719289954259,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+1
+1
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/3
new file mode 100644
index 0000000000000..0d224bfc5920a
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/3
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":5000,"batchTimestampMs":1719289958068,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+2
+2
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/4
new file mode 100644
index 0000000000000..68158e5f2542e
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/offsets/4
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":9000,"batchTimestampMs":1719289960765,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+2
+2
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/1.delta
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/1.delta differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/2.delta
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/1.delta new file mode 100644 index 
0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..42448b3b584ce Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/left-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/5.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/4.snapshot differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..8fa8f1675bc82 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/0/right-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/4.snapshot 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/4.snapshot 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/4.snapshot new file mode 100644 index 
0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/1/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..25d24f4cedf3f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..d295efee0d000 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..4460eadea0c0d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/4.snapshot differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..7e6dce9cc108c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..21e2a706e8c7a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..5bfdddd0e2f14 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..30e88f6cb35c6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/4.snapshot differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..25d24f4cedf3f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..d295efee0d000 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..4d3cf654ce551 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..25d24f4cedf3f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/5.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..2ec494e6a636f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..0877f7564366f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..5bfdddd0e2f14 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..edcc5dd1f672a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..0877f7564366f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/5.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/2/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..1aeeffd771c08 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..379479c0ccc3b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary 
files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..cafd9d540f8e0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..9899fb58eebe4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..69437319e872d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..aa76ff9b416a1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..74fa1fc58b611 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..1aeeffd771c08 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..379479c0ccc3b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..47dff164d42d0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyToNumValues/5.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..9899fb58eebe4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..9899fb58eebe4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..105c7cc4255a9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..aa76ff9b416a1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..d536c6fdcbce4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/3/right-keyWithIndexToValue/5.delta differ diff --git 
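
The join2 state directories added above follow the <version>.delta plus periodic <version>.snapshot naming used by the HDFS-backed state store provider that these hdfs/ checkpoints configure (see spark.sql.streaming.stateStore.providerClass in the offset logs further below). Conceptually, a given state version is materialized from the newest snapshot at or below that version plus the deltas written after it. The short sketch below only illustrates that file-selection rule over a local copy of one of these store directories; it is not Spark's implementation, and the object and method names are hypothetical.

// Conceptual sketch (not Spark code): pick which of the checkpoint files above
// would be needed to materialize state version `target` for one store directory,
// given the <version>.delta / <version>.snapshot naming visible in the paths.
import java.io.File

object StateFilesForVersion {
  def filesToLoad(storeDir: File, target: Int): Seq[File] = {
    // storeDir is a hypothetical local copy, e.g. .../hdfs/join2/state/0/2/left-keyToNumValues
    val names = Option(storeDir.listFiles()).getOrElse(Array.empty[File]).map(_.getName).toSet

    // Newest snapshot at or below the target version, if one exists.
    val snapshotVersion = (1 to target).filter(v => names.contains(s"$v.snapshot")).lastOption
    val snapshotFile = snapshotVersion.map(v => new File(storeDir, s"$v.snapshot")).toSeq

    // Deltas written after that snapshot, up to and including the target version.
    val start = snapshotVersion.getOrElse(0)
    val deltaFiles = ((start + 1) to target)
      .filter(v => names.contains(s"$v.delta"))
      .map(v => new File(storeDir, s"$v.delta"))

    snapshotFile ++ deltaFiles
  }
}

For example, for the left-keyToNumValues directory of partition 2 above (deltas 1-5 plus 2.snapshot and 4.snapshot), asking for version 5 would select 4.snapshot and 5.delta.
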
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6070683f44e12 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..837a1434917ca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/4.delta new file mode 100644 index 0000000000000..32506a0366066 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..cf43298608153 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/5.delta new file mode 100644 index 0000000000000..2f2da77129ea4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/1.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..56b52ab974a3d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..26d97da9c610f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..2dbdd331b3e97 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..3603b3f81bc77 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..b92431e0a4df6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..71fdc6c434ca3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/left-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/1.delta 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/2.delta new file mode 100644 index 0000000000000..6070683f44e12 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/2.snapshot new file mode 100644 index 0000000000000..837a1434917ca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/3.delta new file mode 100644 index 0000000000000..2f2da77129ea4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/4.delta new file mode 100644 index 0000000000000..32506a0366066 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/4.snapshot new file mode 100644 index 0000000000000..3b529f86101ac Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/5.delta new file mode 100644 index 0000000000000..83a5f723a34ab Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyToNumValues/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/1.delta new file mode 100644 index 
0000000000000..0ed4feb1bd9b6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/2.delta new file mode 100644 index 0000000000000..31a686912dc97 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/2.snapshot new file mode 100644 index 0000000000000..2dbdd331b3e97 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/3.delta new file mode 100644 index 0000000000000..71fdc6c434ca3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/4.delta new file mode 100644 index 0000000000000..649772d35ffd8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/4.snapshot new file mode 100644 index 0000000000000..50b057756915b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/5.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/5.delta new file mode 100644 index 0000000000000..1c4c3974de1d5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/4/right-keyWithIndexToValue/5.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/_metadata/metadata new file mode 100644 index 0000000000000..b73f1e3e66ac5 --- /dev/null +++ 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/join2/state/0/_metadata/metadata @@ -0,0 +1,2 @@ +v1 +{"operatorInfo":{"operatorId":0,"operatorName":"symmetricHashJoin"},"stateStoreInfo":[{"storeName":"left-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"left-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5}]} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/1 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/2 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/2 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/3 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/commits/3 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/metadata new file mode 100644 index 0000000000000..1d512939c8dca --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/metadata @@ -0,0 +1 @@ +{"id":"5ed656f0-84dd-414c-abbc-851e4ce58b93"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/0 new file mode 100644 index 0000000000000..95444ab5cb96a --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/0 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719289904957,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/1 new file mode 100644 index 0000000000000..614f4bb85c018 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719289907608,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/2 new file mode 100644 index 0000000000000..b4f3aebbbe875 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/2 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719289910090,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/3 new file mode 100644 index 0000000000000..7692f0f1abead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/offsets/3 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719289912575,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +3 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/1.delta new file mode 100644 index 0000000000000..701f5bd986b86 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/2.delta new file mode 100644 index 0000000000000..1de87347fb513 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/2.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/2.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/2.snapshot new file mode 100644 index 0000000000000..1de87347fb513 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/2.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/3.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/3.delta new file mode 100644 index 0000000000000..5b1d0ce87f287 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/3.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/4.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/4.delta new file mode 100644 index 0000000000000..cc309967b185e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/4.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/4.snapshot b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/4.snapshot new file mode 100644 index 0000000000000..cc309967b185e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/4.snapshot differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..371b0df09d80b Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/0/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/_metadata/metadata new file mode 100644 index 0000000000000..5792421dd423e --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/hdfs/limit/state/0/_metadata/metadata @@ -0,0 +1,2 @@ +v1 +{"operatorInfo":{"operatorId":0,"operatorName":"globalLimit"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5}]} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/1 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/2 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/2 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/3 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/commits/3 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/metadata new file mode 100644 index 0000000000000..24c81c5d47be5 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/metadata @@ -0,0 +1 @@ +{"id":"0da797f0-6c12-4954-89cf-cea200a87f97"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/0 new file mode 100644 index 0000000000000..add7a6458926f --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/0 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719290931555,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/1 new file mode 100644 index 0000000000000..2857388877a31 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290934440,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/2 new file mode 100644 index 0000000000000..5df970a41a729 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/2 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290937697,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/3 new file mode 100644 index 0000000000000..a8efebc1cf7ec --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/offsets/3 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719290940519,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +3 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/2.zip new file mode 100644 index 0000000000000..b52a3e04cde93 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/4.zip new file mode 100644 index 0000000000000..97e703840ba71 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..20b45317e0a22 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/0/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/1.changelog new file mode 100644 index 0000000000000..497e792645825 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/2.changelog new file mode 100644 index 0000000000000..c197734e2608c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/2.zip new file mode 100644 index 0000000000000..4002b00264aac Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/3.changelog new file mode 100644 index 0000000000000..35985490c08d1 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/4.changelog new file mode 100644 index 0000000000000..bbd5a488fb858 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/4.zip new file mode 100644 index 0000000000000..00e9f17f64f12 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/SSTs/000008-e414df11-1a90-4be8-807c-8d6f970b5f56.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/SSTs/000008-e414df11-1a90-4be8-807c-8d6f970b5f56.sst new file mode 100644 index 0000000000000..c41d3cc2bafb8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/SSTs/000008-e414df11-1a90-4be8-807c-8d6f970b5f56.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/SSTs/000015-97d0e283-eed7-4af0-87d9-eded3afe15ca.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/SSTs/000015-97d0e283-eed7-4af0-87d9-eded3afe15ca.sst new file mode 
100644 index 0000000000000..186fba2d13a5c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/1/SSTs/000015-97d0e283-eed7-4af0-87d9-eded3afe15ca.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/2.changelog new file mode 100644 index 0000000000000..2daeae74a72b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/2.zip new file mode 100644 index 0000000000000..1c1ae3f894804 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/3.changelog new file mode 100644 index 0000000000000..5d9eea4ad997e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/4.changelog new file mode 100644 index 0000000000000..1ddb6499c28c4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/4.zip new file mode 100644 index 0000000000000..e574f5e25473b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/SSTs/000008-d6862a4a-f10f-4641-ba89-0ea25cd816b0.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/SSTs/000008-d6862a4a-f10f-4641-ba89-0ea25cd816b0.sst new file mode 100644 index 0000000000000..9e38658bda2de Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/SSTs/000008-d6862a4a-f10f-4641-ba89-0ea25cd816b0.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/SSTs/000015-1e8225d7-793e-47df-aec6-fb47522e2494.sst 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/SSTs/000015-1e8225d7-793e-47df-aec6-fb47522e2494.sst new file mode 100644 index 0000000000000..8f4391b954d05 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/2/SSTs/000015-1e8225d7-793e-47df-aec6-fb47522e2494.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/1.changelog new file mode 100644 index 0000000000000..46e739559ac8d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/2.zip new file mode 100644 index 0000000000000..847766f4eb425 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/4.zip new file mode 100644 index 0000000000000..a2b18fa0d827e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/SSTs/000008-637e442e-feb6-4022-a0b1-919d388073ae.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/SSTs/000008-637e442e-feb6-4022-a0b1-919d388073ae.sst new file mode 100644 index 0000000000000..66dd9dfb28c19 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/3/SSTs/000008-637e442e-feb6-4022-a0b1-919d388073ae.sst differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/1.changelog new file mode 100644 index 0000000000000..9099712c6134d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/2.changelog new file mode 100644 index 0000000000000..1152f1a527f20 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/2.zip new file mode 100644 index 0000000000000..9a806e422149f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/4.changelog new file mode 100644 index 0000000000000..c5a3af6244a01 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/4.zip new file mode 100644 index 0000000000000..eebcf56f8533c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/SSTs/000008-0f086ebd-c188-4508-97d3-9282313b0a97.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/SSTs/000008-0f086ebd-c188-4508-97d3-9282313b0a97.sst new file mode 100644 index 0000000000000..fcf1c915751ea Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/SSTs/000008-0f086ebd-c188-4508-97d3-9282313b0a97.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/SSTs/000015-074aa977-f8df-4b58-8143-c5c4691cdb9a.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/SSTs/000015-074aa977-f8df-4b58-8143-c5c4691cdb9a.sst new file mode 100644 index 0000000000000..7522d3bd7cadb Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/4/SSTs/000015-074aa977-f8df-4b58-8143-c5c4691cdb9a.sst differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/_metadata/metadata
new file mode 100644
index 0000000000000..5094b71aa3581
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/agg/state/0/_metadata/metadata
@@ -0,0 +1,2 @@
+v1
+{"operatorInfo":{"operatorId":0,"operatorName":"stateStoreSave"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5}]}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/0
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/0
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/1
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/1
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/2
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/2
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/3
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/commits/3
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/metadata
new file mode 100644
index 0000000000000..46c70457ae13f
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/metadata
@@ -0,0 +1 @@
+{"id":"7715e3b8-26dd-416a-ac64-c804bb48a972"}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/0
new file mode 100644
index 0000000000000..ef98649c9fc3a
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/0
@@ -0,0 +1,3 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719290943839,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/1 new file mode 100644 index 0000000000000..46d8177c1ed43 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290946665,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/2 new file mode 100644 index 0000000000000..dc7562efc9fd2 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/2 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290949666,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/3 new file mode 100644 index 0000000000000..52f25fb6acccd --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/offsets/3 @@ -0,0 +1,3 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1719290952214,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +3 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/2.zip new file mode 100644 index 0000000000000..5910abeed315f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/4.zip new file mode 100644 index 0000000000000..ddba4b5772e50 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..70ebb77e1b781 
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/0/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/1.changelog new file mode 100644 index 0000000000000..1132252c8945f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/2.zip new file mode 100644 index 0000000000000..0bf7ee845bc08 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/3.changelog new file mode 100644 index 0000000000000..c5ebbc7bbed26 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/4.zip new file mode 100644 index 0000000000000..33af823d62d1f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/SSTs/000008-3073b336-661b-4d30-984d-427402150d37.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/SSTs/000008-3073b336-661b-4d30-984d-427402150d37.sst new file mode 100644 index 0000000000000..20f8285636f8f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/SSTs/000008-3073b336-661b-4d30-984d-427402150d37.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/SSTs/000009-5669c494-2412-46c4-96ef-1bb9ab6b2710.sst 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/SSTs/000009-5669c494-2412-46c4-96ef-1bb9ab6b2710.sst new file mode 100644 index 0000000000000..cf9e952972a54 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/1/SSTs/000009-5669c494-2412-46c4-96ef-1bb9ab6b2710.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/2.changelog new file mode 100644 index 0000000000000..f96b5ad48b3f8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/2.zip new file mode 100644 index 0000000000000..bc12dd5ba4c2d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/4.zip new file mode 100644 index 0000000000000..9a720d8720d88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/SSTs/000008-90e6688f-83a0-4917-ae32-9e107c002bcc.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/SSTs/000008-90e6688f-83a0-4917-ae32-9e107c002bcc.sst new file mode 100644 index 0000000000000..3bbb3466e6f82 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/2/SSTs/000008-90e6688f-83a0-4917-ae32-9e107c002bcc.sst differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/1.changelog new file mode 100644 index 0000000000000..b3f54eeacdd04 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/2.zip new file mode 100644 index 0000000000000..ae313425f0644 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/4.zip new file mode 100644 index 0000000000000..fa589eca47e36 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/SSTs/000008-e115799d-89cb-4f4d-8e58-fe104d382c80.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/SSTs/000008-e115799d-89cb-4f4d-8e58-fe104d382c80.sst new file mode 100644 index 0000000000000..8b786dd457f96 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/3/SSTs/000008-e115799d-89cb-4f4d-8e58-fe104d382c80.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/1.changelog new file mode 100644 index 0000000000000..9b224728c8bcb Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/2.zip new file mode 100644 index 0000000000000..faf5ea1b8158b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/4.changelog new file mode 100644 index 0000000000000..6fc1bb07e798d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/4.zip new file mode 100644 index 0000000000000..03837a7357d71 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/SSTs/000008-5f249adf-c98e-46a3-8217-e39a4360b624.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/SSTs/000008-5f249adf-c98e-46a3-8217-e39a4360b624.sst new file mode 100644 index 0000000000000..a17404703c22a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/SSTs/000008-5f249adf-c98e-46a3-8217-e39a4360b624.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/SSTs/000009-0d1befcf-847e-43a5-b354-a4d11afeaaba.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/SSTs/000009-0d1befcf-847e-43a5-b354-a4d11afeaaba.sst new file mode 100644 index 0000000000000..5d68995c5c8b0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/4/SSTs/000009-0d1befcf-847e-43a5-b354-a4d11afeaaba.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/_metadata/metadata 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/_metadata/metadata
new file mode 100644
index 0000000000000..39ce28c9b4aa5
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/dedup/state/0/_metadata/metadata
@@ -0,0 +1,2 @@
+v1
+{"operatorInfo":{"operatorId":0,"operatorName":"dedupe"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5}]}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/0
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/0
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/1
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/1
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/2
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/2
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/3
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/3
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/4
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/commits/4
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/metadata
new file mode 100644
index 0000000000000..e0a261d1ce0e2
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/metadata
@@ -0,0 +1 @@
+{"id":"160224de-518b-4191-b9d6-544602476824"}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/0
new file mode 100644
index 0000000000000..67dd9ed17d96c
--- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/0 @@ -0,0 +1,4 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290955223,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/1 new file mode 100644 index 0000000000000..dd1850eb28417 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/1 @@ -0,0 +1,4 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290960126,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/2 new file mode 100644 index 0000000000000..65fe14e65bd8d --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/2 @@ -0,0 +1,4 @@ +v1 +{"batchWatermarkMs":5000,"batchTimestampMs":1719290962748,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/3 new file mode 100644 index 0000000000000..756be72687efd --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/3 @@ -0,0 +1,4 @@ +v1 
+{"batchWatermarkMs":5000,"batchTimestampMs":1719290966185,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/4 new file mode 100644 index 0000000000000..d2cbfe3c08429 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/offsets/4 @@ -0,0 +1,4 @@ +v1 +{"batchWatermarkMs":9000,"batchTimestampMs":1719290968718,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..42448b3b584ce Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/left-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..8fa8f1675bc82 
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/0/right-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..0bedf21fe2da4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..59a6ea16dc12b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files 
/dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/1/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..2daeae74a72b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..abdd6a5fca257 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..3802a7e7b1ee8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..f5d38ea63e7b6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/SSTs/000008-97286fb3-8d19-4e91-aecb-3f9647f6103f.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/SSTs/000008-97286fb3-8d19-4e91-aecb-3f9647f6103f.sst new file mode 100644 index 0000000000000..d7494daaea026 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyToNumValues/SSTs/000008-97286fb3-8d19-4e91-aecb-3f9647f6103f.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..88a2191cae130 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..57d0436d95d79 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..1d20ea2ddd926 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/4.zip 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..171c10c2f069c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/SSTs/000008-44099692-68de-4b22-8238-3a8029f693ad.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/SSTs/000008-44099692-68de-4b22-8238-3a8029f693ad.sst new file mode 100644 index 0000000000000..865f1fef7f1f5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/left-keyWithIndexToValue/SSTs/000008-44099692-68de-4b22-8238-3a8029f693ad.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..2daeae74a72b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..abdd6a5fca257 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..c3a661fc13ade Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..e8da61cb5ddc9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/3.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..f6157326c212d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/SSTs/000008-a9a5b2ba-3bff-402f-b750-b9232ac3d1b1.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/SSTs/000008-a9a5b2ba-3bff-402f-b750-b9232ac3d1b1.sst new file mode 100644 index 0000000000000..a9160d9a66ab6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/SSTs/000008-a9a5b2ba-3bff-402f-b750-b9232ac3d1b1.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/SSTs/000009-c5863121-41a9-4ffc-9c54-5f4c2a1139a2.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/SSTs/000009-c5863121-41a9-4ffc-9c54-5f4c2a1139a2.sst new file mode 100644 index 0000000000000..f4987ae824e82 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyToNumValues/SSTs/000009-c5863121-41a9-4ffc-9c54-5f4c2a1139a2.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..14f1066107657 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/2.changelog new file mode 100644 index 
0000000000000..e1452512bade8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..6411680f0c8fb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6b20acbc9b79b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..982f220c2cbd7 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/SSTs/000008-a55c633d-a594-4cac-b6f2-c63907cb9581.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/SSTs/000008-a55c633d-a594-4cac-b6f2-c63907cb9581.sst new file mode 100644 index 0000000000000..6464d1dbfcc7e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/SSTs/000008-a55c633d-a594-4cac-b6f2-c63907cb9581.sst differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/SSTs/000009-f8390e48-d464-49b8-8a3b-6393448f907d.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/SSTs/000009-f8390e48-d464-49b8-8a3b-6393448f907d.sst new file mode 100644 index 0000000000000..df6034ac2f3db Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/2/right-keyWithIndexToValue/SSTs/000009-f8390e48-d464-49b8-8a3b-6393448f907d.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..99210c3a3c567 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..d02b01bc35f23 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..262abc1cc08a5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..e198b32fff211 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/SSTs/000008-ad09ce4b-4be4-44ce-a620-4244a73f84f5.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/SSTs/000008-ad09ce4b-4be4-44ce-a620-4244a73f84f5.sst new file mode 100644 index 0000000000000..07de5247476c7 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/SSTs/000008-ad09ce4b-4be4-44ce-a620-4244a73f84f5.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/SSTs/000009-5561a93a-1e0b-46aa-98d5-685048b992fa.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/SSTs/000009-5561a93a-1e0b-46aa-98d5-685048b992fa.sst new file mode 100644 index 0000000000000..7619f71cd55ad Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyToNumValues/SSTs/000009-5561a93a-1e0b-46aa-98d5-685048b992fa.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..150d775115a47 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..91cea46a6cd86 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/3.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..3964e93f5a400 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..9319b07246660 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/SSTs/000008-36205897-8ada-4516-b663-122c915754f0.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/SSTs/000008-36205897-8ada-4516-b663-122c915754f0.sst new file mode 100644 index 0000000000000..cad982214ad18 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/SSTs/000008-36205897-8ada-4516-b663-122c915754f0.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/SSTs/000009-bffa46f8-b6c7-431e-a579-d8725aec338b.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/SSTs/000009-bffa46f8-b6c7-431e-a579-d8725aec338b.sst new file mode 100644 index 0000000000000..aa392eaadd1d9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/left-keyWithIndexToValue/SSTs/000009-bffa46f8-b6c7-431e-a579-d8725aec338b.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..99210c3a3c567 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..3ae503e5deff3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..262abc1cc08a5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..4ba14fb5af442 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..5074a90bc3b90 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/SSTs/000008-f6d87143-b4d6-4c30-bd43-8e90c98704e9.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/SSTs/000008-f6d87143-b4d6-4c30-bd43-8e90c98704e9.sst new file mode 100644 index 0000000000000..e3dbd832a2321 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/SSTs/000008-f6d87143-b4d6-4c30-bd43-8e90c98704e9.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/SSTs/000009-64a8ab01-cdfe-4c82-94e3-5bc4c64b6671.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/SSTs/000009-64a8ab01-cdfe-4c82-94e3-5bc4c64b6671.sst new file mode 100644 index 0000000000000..4801a5b8254f7 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyToNumValues/SSTs/000009-64a8ab01-cdfe-4c82-94e3-5bc4c64b6671.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6d01a6e79416f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..6a9f94a8b33bf Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..28785bebe081e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/4.zip 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..5b8f0d2bbf151 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..ff933a6cfd56c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/SSTs/000008-50764781-b9d2-482d-a307-b2d1cab8c639.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/SSTs/000008-50764781-b9d2-482d-a307-b2d1cab8c639.sst new file mode 100644 index 0000000000000..9a627936b57ca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/SSTs/000008-50764781-b9d2-482d-a307-b2d1cab8c639.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/SSTs/000009-38fca8e3-893a-4d8b-8f4a-57366102a54c.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/SSTs/000009-38fca8e3-893a-4d8b-8f4a-57366102a54c.sst new file mode 100644 index 0000000000000..307b166f4e9b3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/3/right-keyWithIndexToValue/SSTs/000009-38fca8e3-893a-4d8b-8f4a-57366102a54c.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..9099712c6134d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..c5a3af6244a01 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..1ac5a3348bbe6 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..4b3cfd1f719bb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..54e0959698acf Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..298ebddcda718 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/SSTs/000008-fd11870f-bea5-4721-aedf-b14399b966bd.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/SSTs/000008-fd11870f-bea5-4721-aedf-b14399b966bd.sst new file mode 100644 index 0000000000000..4ae4e252afb2a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/SSTs/000008-fd11870f-bea5-4721-aedf-b14399b966bd.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/SSTs/000009-1ba9d77a-6c31-42e5-a18e-f66c0ef201b5.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/SSTs/000009-1ba9d77a-6c31-42e5-a18e-f66c0ef201b5.sst new file mode 100644 index 0000000000000..c3dd64fbe2805 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyToNumValues/SSTs/000009-1ba9d77a-6c31-42e5-a18e-f66c0ef201b5.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/1.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..a7735153b54cc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..f4407b5c4f83b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..ee8a442d7eaa0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..3e2ea96d857f0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..2dfce06a4435b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..9ec7dcb0d0515 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/5.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/SSTs/000008-7fbc8ecb-8e90-4de6-a334-ce789dfe3dd5.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/SSTs/000008-7fbc8ecb-8e90-4de6-a334-ce789dfe3dd5.sst new file mode 100644 index 0000000000000..6b5b6c05ae0d2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/SSTs/000008-7fbc8ecb-8e90-4de6-a334-ce789dfe3dd5.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/SSTs/000009-05c7d657-5d52-4d9d-9e5e-d28daf8d6500.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/SSTs/000009-05c7d657-5d52-4d9d-9e5e-d28daf8d6500.sst new file mode 100644 index 0000000000000..72187124c13f2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/left-keyWithIndexToValue/SSTs/000009-05c7d657-5d52-4d9d-9e5e-d28daf8d6500.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..9099712c6134d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..c5a3af6244a01 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..ed87d10aecafa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..298ebddcda718 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/4.changelog new file mode 100644 index 
0000000000000..4b3cfd1f719bb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..b777691d26852 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..d50df0481f970 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/SSTs/000008-fb249fa1-17cf-4f28-99fa-e31932ec1caf.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/SSTs/000008-fb249fa1-17cf-4f28-99fa-e31932ec1caf.sst new file mode 100644 index 0000000000000..c8b94f7b4bc85 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/SSTs/000008-fb249fa1-17cf-4f28-99fa-e31932ec1caf.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/SSTs/000009-6dc46235-1157-4bb7-9af0-58aee4e2edf2.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/SSTs/000009-6dc46235-1157-4bb7-9af0-58aee4e2edf2.sst new file mode 100644 index 0000000000000..564acccff46f8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyToNumValues/SSTs/000009-6dc46235-1157-4bb7-9af0-58aee4e2edf2.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..32458caceea0a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..a3bc5bfd26f8e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/2.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..54fd13c914ae4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..9ec7dcb0d0515 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..e3906cea3366a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..97ba13e356c67 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..ffe4b92ca56dd Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/SSTs/000008-2695fe50-483f-4beb-a12d-770bfe63da9c.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/SSTs/000008-2695fe50-483f-4beb-a12d-770bfe63da9c.sst new file mode 100644 index 0000000000000..fa6f037ecb6ab Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/SSTs/000008-2695fe50-483f-4beb-a12d-770bfe63da9c.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/SSTs/000009-c819e3a7-826b-46d0-8667-1670b9b2d13c.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/SSTs/000009-c819e3a7-826b-46d0-8667-1670b9b2d13c.sst new file mode 
100644
index 0000000000000..9db30172f1477
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/4/right-keyWithIndexToValue/SSTs/000009-c819e3a7-826b-46d0-8667-1670b9b2d13c.sst differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/_metadata/metadata
new file mode 100644
index 0000000000000..b73f1e3e66ac5
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join1/state/0/_metadata/metadata
@@ -0,0 +1,2 @@
+v1
+{"operatorInfo":{"operatorId":0,"operatorName":"symmetricHashJoin"},"stateStoreInfo":[{"storeName":"left-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"left-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5}]}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/0
new file mode 100644
index 0000000000000..9c1e3021c3ead
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/0
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":0}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/1
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/1
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/2
new file mode 100644
index 0000000000000..1715390973f9b
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/2
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":5000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/3
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/3
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/4
new file mode 100644
index 0000000000000..2a2d02b0bb3b3
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/commits/4
@@ -0,0 +1,2 @@
+v1
+{"nextBatchWatermarkMs":9000}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/metadata
new file mode 100644
index 0000000000000..67f735ddd54b5
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/metadata
@@ -0,0 +1 @@
+{"id":"cb2f2800-2129-4356-bf83-e985db7d8556"}
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/0
new file mode 100644
index 0000000000000..fe391df89fb42
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/0
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719290972508,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+0
+0
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/1
new file mode 100644
index 0000000000000..448160d0ae047
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/1
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":0,"batchTimestampMs":1719290977372,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+1
+1
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/2
new file mode 100644
index 0000000000000..740bc6b031054
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/2
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":5000,"batchTimestampMs":1719290979966,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+1
+1
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/3
new file mode 100644
index 0000000000000..1987fabf89c19
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/3
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":5000,"batchTimestampMs":1719290983375,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+2
+2
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/4 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/4
new file mode 100644
index 0000000000000..7eeef49c1f168
--- /dev/null
+++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/offsets/4
@@ -0,0 +1,4 @@
+v1
+{"batchWatermarkMs":9000,"batchTimestampMs":1719290985871,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}}
+2
+2
\ No newline at end of file
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/1.changelog
new file mode 100644
index 0000000000000..6352978051846
Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/1.changelog differ
diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/2.changelog
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..89a73981724aa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..85424f0d5afba Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/1.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..89a73981724aa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..85424f0d5afba Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/_metadata/schema 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..42448b3b584ce Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/left-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..993d32e965db5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..cfcea9ae02e88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/5.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/_metadata/schema new file mode 100644 index 0000000000000..4da637d143496 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyToNumValues/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..993d32e965db5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/4.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..cfcea9ae02e88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/_metadata/schema new file mode 100644 index 0000000000000..8fa8f1675bc82 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/0/right-keyWithIndexToValue/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..89a73981724aa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/3.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..cfcea9ae02e88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..89a73981724aa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/3.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..85424f0d5afba Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..993d32e965db5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/3.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..cfcea9ae02e88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..993d32e965db5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/3.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..cfcea9ae02e88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/1/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..2daeae74a72b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..abdd6a5fca257 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..23e8882b8d9f2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/3.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..218dc89456c8b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/SSTs/000008-bfd9de3b-a339-42ad-8270-343b70b3999e.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/SSTs/000008-bfd9de3b-a339-42ad-8270-343b70b3999e.sst new file mode 100644 index 0000000000000..6dcf6ef780c87 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyToNumValues/SSTs/000008-bfd9de3b-a339-42ad-8270-343b70b3999e.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..88a2191cae130 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..57d0436d95d79 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..46803957b004a Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..2d25b2447e398 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/SSTs/000008-daf5150d-98b0-4042-ac23-f6ffcb9eef2f.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/SSTs/000008-daf5150d-98b0-4042-ac23-f6ffcb9eef2f.sst new file mode 100644 index 0000000000000..10fd7fc691c70 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/left-keyWithIndexToValue/SSTs/000008-daf5150d-98b0-4042-ac23-f6ffcb9eef2f.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..2daeae74a72b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/2.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..abdd6a5fca257 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..ad3d69d2a8789 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..e8da61cb5ddc9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..c89c97f5641fe Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/SSTs/000008-8de57e57-02b1-47fc-8150-eb984771e1ca.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/SSTs/000008-8de57e57-02b1-47fc-8150-eb984771e1ca.sst new file mode 100644 index 0000000000000..4744bbf75039b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/SSTs/000008-8de57e57-02b1-47fc-8150-eb984771e1ca.sst differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/SSTs/000009-a8eb1b10-f5fe-4a44-94ab-545d1e33121c.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/SSTs/000009-a8eb1b10-f5fe-4a44-94ab-545d1e33121c.sst new file mode 100644 index 0000000000000..5706bad6347fe Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyToNumValues/SSTs/000009-a8eb1b10-f5fe-4a44-94ab-545d1e33121c.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..14f1066107657 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..e1452512bade8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..a600351f2674f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6b20acbc9b79b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..4d53649b5608c Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/SSTs/000008-4f4617f6-01ce-48f5-aaa9-c9d7007ac482.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/SSTs/000008-4f4617f6-01ce-48f5-aaa9-c9d7007ac482.sst new file mode 100644 index 0000000000000..650af4d3042de Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/SSTs/000008-4f4617f6-01ce-48f5-aaa9-c9d7007ac482.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/SSTs/000009-840bae7d-ff30-4aef-9672-bf70fa6943c4.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/SSTs/000009-840bae7d-ff30-4aef-9672-bf70fa6943c4.sst new file mode 100644 index 0000000000000..e1ed48d909d9c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/2/right-keyWithIndexToValue/SSTs/000009-840bae7d-ff30-4aef-9672-bf70fa6943c4.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..99210c3a3c567 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..2123cc3a99531 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/3.changelog 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..262abc1cc08a5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..83efaf04ab162 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/SSTs/000008-e1bd2941-d044-4898-a841-7d6d094b7c30.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/SSTs/000008-e1bd2941-d044-4898-a841-7d6d094b7c30.sst new file mode 100644 index 0000000000000..37af8fb0bf6b4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/SSTs/000008-e1bd2941-d044-4898-a841-7d6d094b7c30.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/SSTs/000009-e646f573-4a5b-408d-88da-85810aabe966.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/SSTs/000009-e646f573-4a5b-408d-88da-85810aabe966.sst new file mode 100644 index 0000000000000..149350b5abec4 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyToNumValues/SSTs/000009-e646f573-4a5b-408d-88da-85810aabe966.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..150d775115a47 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..4f0ff11d17441 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..3964e93f5a400 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..c4ac694c0f31b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/SSTs/000008-28cb1ac6-cf15-4f63-8c74-ea655f8ce669.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/SSTs/000008-28cb1ac6-cf15-4f63-8c74-ea655f8ce669.sst new 
file mode 100644 index 0000000000000..cbf2c438d8437 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/SSTs/000008-28cb1ac6-cf15-4f63-8c74-ea655f8ce669.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/SSTs/000009-5a4a38a3-8e5a-4436-b0b7-718fe7ed09a6.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/SSTs/000009-5a4a38a3-8e5a-4436-b0b7-718fe7ed09a6.sst new file mode 100644 index 0000000000000..f3769a666d813 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/left-keyWithIndexToValue/SSTs/000009-5a4a38a3-8e5a-4436-b0b7-718fe7ed09a6.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..99210c3a3c567 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..4a5a9bd672051 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..262abc1cc08a5 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/4.zip 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..52e86b7a99073 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..5074a90bc3b90 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/SSTs/000008-294ce63e-1e67-4bc5-98fa-67b6f40669ff.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/SSTs/000008-294ce63e-1e67-4bc5-98fa-67b6f40669ff.sst new file mode 100644 index 0000000000000..0e93b773ae5ca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/SSTs/000008-294ce63e-1e67-4bc5-98fa-67b6f40669ff.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/SSTs/000009-ad3aeaf9-d804-4237-91e5-e2d61f7c8338.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/SSTs/000009-ad3aeaf9-d804-4237-91e5-e2d61f7c8338.sst new file mode 100644 index 0000000000000..2eb1556d44bac Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyToNumValues/SSTs/000009-ad3aeaf9-d804-4237-91e5-e2d61f7c8338.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..6d01a6e79416f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..e64d4d046de88 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..28785bebe081e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..e5fda4000120f Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..ff933a6cfd56c Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/SSTs/000008-e9c9dd20-a2b3-4eac-94e6-3336a8bf2953.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/SSTs/000008-e9c9dd20-a2b3-4eac-94e6-3336a8bf2953.sst new file mode 100644 index 0000000000000..a737c2036d013 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/SSTs/000008-e9c9dd20-a2b3-4eac-94e6-3336a8bf2953.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/SSTs/000009-7db59f7a-dabc-41f7-8878-87ca30afb09d.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/SSTs/000009-7db59f7a-dabc-41f7-8878-87ca30afb09d.sst new file mode 100644 index 0000000000000..4af5716c9c24e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/3/right-keyWithIndexToValue/SSTs/000009-7db59f7a-dabc-41f7-8878-87ca30afb09d.sst differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..9099712c6134d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..c5a3af6244a01 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/2.zip new file mode 100644 index 0000000000000..a62d4d6504f88 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..4b3cfd1f719bb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/4.zip new file mode 100644 index 0000000000000..3003df2131bf3 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..298ebddcda718 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/5.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/SSTs/000008-a9c68839-18e4-4ef3-a329-432d2810b0c3.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/SSTs/000008-a9c68839-18e4-4ef3-a329-432d2810b0c3.sst new file mode 100644 index 0000000000000..5b9827d63d689 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/SSTs/000008-a9c68839-18e4-4ef3-a329-432d2810b0c3.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/SSTs/000009-e91db969-62ec-45d7-a7da-a4301e129feb.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/SSTs/000009-e91db969-62ec-45d7-a7da-a4301e129feb.sst new file mode 100644 index 0000000000000..ad1f76fb3e750 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyToNumValues/SSTs/000009-e91db969-62ec-45d7-a7da-a4301e129feb.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..a7735153b54cc Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..f4407b5c4f83b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..b251d50f7e4ba Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/4.changelog new file 
mode 100644 index 0000000000000..3e2ea96d857f0 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..1873033aa1c9a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/5.changelog new file mode 100644 index 0000000000000..9ec7dcb0d0515 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/SSTs/000008-5e19d13c-8855-47c9-8d4b-f14964f7fdfa.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/SSTs/000008-5e19d13c-8855-47c9-8d4b-f14964f7fdfa.sst new file mode 100644 index 0000000000000..71dec68b333a2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/SSTs/000008-5e19d13c-8855-47c9-8d4b-f14964f7fdfa.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/SSTs/000009-a97d5e47-e4eb-4878-b62d-30cc77dac3b1.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/SSTs/000009-a97d5e47-e4eb-4878-b62d-30cc77dac3b1.sst new file mode 100644 index 0000000000000..a562e80e27871 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/left-keyWithIndexToValue/SSTs/000009-a97d5e47-e4eb-4878-b62d-30cc77dac3b1.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/1.changelog new file mode 100644 index 0000000000000..9099712c6134d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/2.changelog new file mode 100644 index 0000000000000..c5a3af6244a01 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/2.changelog differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/2.zip new file mode 100644 index 0000000000000..5c32348084ebb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/3.changelog new file mode 100644 index 0000000000000..298ebddcda718 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/4.changelog new file mode 100644 index 0000000000000..4b3cfd1f719bb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/4.zip new file mode 100644 index 0000000000000..954265fe88cfa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/5.changelog new file mode 100644 index 0000000000000..d50df0481f970 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/SSTs/000008-432ddd3b-981c-4a2b-a1e4-e06ecbc4a8a7.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/SSTs/000008-432ddd3b-981c-4a2b-a1e4-e06ecbc4a8a7.sst new file mode 100644 index 0000000000000..e752148099e3d Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/SSTs/000008-432ddd3b-981c-4a2b-a1e4-e06ecbc4a8a7.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/SSTs/000009-2e821d73-a2c6-4261-bcb6-d69e82e4c0d3.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/SSTs/000009-2e821d73-a2c6-4261-bcb6-d69e82e4c0d3.sst new file mode 100644 index 0000000000000..27f61ffe48bf8 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyToNumValues/SSTs/000009-2e821d73-a2c6-4261-bcb6-d69e82e4c0d3.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/1.changelog new file mode 100644 index 0000000000000..32458caceea0a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/2.changelog new file mode 100644 index 0000000000000..a3bc5bfd26f8e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/2.zip new file mode 100644 index 0000000000000..13bd0a0979309 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/3.changelog new file mode 100644 index 0000000000000..9ec7dcb0d0515 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/4.changelog new file mode 100644 index 0000000000000..e3906cea3366a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/4.zip new file mode 100644 index 0000000000000..d403db9b3e121 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/5.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/5.changelog new file mode 100644 index 
0000000000000..ffe4b92ca56dd Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/5.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/SSTs/000008-4e1c24f5-bbcb-4797-95d9-cfa4de1be09d.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/SSTs/000008-4e1c24f5-bbcb-4797-95d9-cfa4de1be09d.sst new file mode 100644 index 0000000000000..2aafcbeadee17 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/SSTs/000008-4e1c24f5-bbcb-4797-95d9-cfa4de1be09d.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/SSTs/000009-366ba158-6a59-4922-9cb0-df3e2b9aa789.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/SSTs/000009-366ba158-6a59-4922-9cb0-df3e2b9aa789.sst new file mode 100644 index 0000000000000..68f1e65fa2885 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/4/right-keyWithIndexToValue/SSTs/000009-366ba158-6a59-4922-9cb0-df3e2b9aa789.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/_metadata/metadata new file mode 100644 index 0000000000000..b73f1e3e66ac5 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/join2/state/0/_metadata/metadata @@ -0,0 +1,2 @@ +v1 +{"operatorInfo":{"operatorId":0,"operatorName":"symmetricHashJoin"},"stateStoreInfo":[{"storeName":"left-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"left-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyToNumValues","numColsPrefixKey":0,"numPartitions":5},{"storeName":"right-keyWithIndexToValue","numColsPrefixKey":0,"numPartitions":5}]} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/1 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/2 new file mode 100644 index 0000000000000..9c1e3021c3ead --- 
/dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/2 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/3 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/commits/3 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/metadata new file mode 100644 index 0000000000000..fa82647084453 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/metadata @@ -0,0 +1 @@ +{"id":"711e4377-ce90-456e-8530-9e0374ce4791"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/0 new file mode 100644 index 0000000000000..50bf84e3a759e --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/0 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290921178,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/1 new file mode 100644 index 0000000000000..1690ffdd2b79b --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290923799,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/2 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/2 new file mode 100644 index 0000000000000..c331cc80a3e1e --- /dev/null +++ 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/2 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290926380,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +2 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/3 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/3 new file mode 100644 index 0000000000000..2d42b90630780 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/offsets/3 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1719290928784,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4"}} +3 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/1.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/1.changelog new file mode 100644 index 0000000000000..1d9f25b472ace Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/1.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/2.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/2.changelog new file mode 100644 index 0000000000000..c335f52e07bca Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/2.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/2.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/2.zip new file mode 100644 index 0000000000000..e707c3f31376a Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/2.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/3.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/3.changelog new file mode 100644 index 0000000000000..41e61d1915140 Binary files /dev/null and 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/3.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/4.changelog b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/4.changelog new file mode 100644 index 0000000000000..d51b4fc081740 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/4.changelog differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/4.zip b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/4.zip new file mode 100644 index 0000000000000..35f294cf782ce Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/4.zip differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/SSTs/000008-8f6e7567-9c70-4f55-9f28-8b6b497c85ae.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/SSTs/000008-8f6e7567-9c70-4f55-9f28-8b6b497c85ae.sst new file mode 100644 index 0000000000000..e8da97359c7fb Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/SSTs/000008-8f6e7567-9c70-4f55-9f28-8b6b497c85ae.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/SSTs/000009-71eca19b-ab68-4608-bcfa-897365067239.sst b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/SSTs/000009-71eca19b-ab68-4608-bcfa-897365067239.sst new file mode 100644 index 0000000000000..fd296636929b8 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/SSTs/000009-71eca19b-ab68-4608-bcfa-897365067239.sst differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/_metadata/schema new file mode 100644 index 0000000000000..371b0df09d80b Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/0/_metadata/schema differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/_metadata/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/_metadata/metadata new file mode 100644 index 0000000000000..5792421dd423e --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-4.0.0/rocksdb/limit/state/0/_metadata/metadata @@ -0,0 +1,2 @@ +v1 +{"operatorInfo":{"operatorId":0,"operatorName":"globalLimit"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5}]} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas b/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas index 8d6ff942610c4..f6eadd776cc6e 100644 --- a/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas +++ 
b/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas @@ -1 +1 @@ -col_0 STRUCT NOT NULL, col_3: FLOAT NOT NULL, col_4: INT NOT NULL>,col_1 STRUCT, col_3: ARRAY NOT NULL, col_4: ARRAY, col_5: TIMESTAMP NOT NULL, col_6: STRUCT, col_1: BIGINT NOT NULL> NOT NULL, col_7: ARRAY NOT NULL, col_8: ARRAY, col_9: BIGINT NOT NULL> NOT NULL,col_2 BIGINT NOT NULL,col_3 STRUCT NOT NULL,col_4 STRUCT NOT NULL> NOT NULL> NOT NULL,col_5 ARRAY NOT NULL +col_0 ARRAY,col_1 STRUCT NOT NULL,col_2 STRING NOT NULL,col_3 STRUCT, col_2: ARRAY NOT NULL> NOT NULL,col_4 BINARY NOT NULL,col_5 ARRAY NOT NULL,col_6 ARRAY,col_7 DOUBLE NOT NULL,col_8 ARRAY NOT NULL,col_9 ARRAY,col_10 FLOAT NOT NULL,col_11 STRUCT NOT NULL>, col_1: STRUCT NOT NULL, col_1: INT, col_2: STRUCT> NOT NULL>, col_2: BINARY NOT NULL, col_3: STRUCT NOT NULL> NOT NULL> NOT NULL,col_12 ARRAY diff --git a/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds b/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds index 3902d6d7d5f61..1b2eda8502e5e 100644 Binary files a/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds and b/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds differ diff --git a/sql/core/src/test/resources/test-data/char.csv b/sql/core/src/test/resources/test-data/char.csv new file mode 100644 index 0000000000000..d2be68a15fc12 --- /dev/null +++ b/sql/core/src/test/resources/test-data/char.csv @@ -0,0 +1,4 @@ +color,name +pink,Bob +blue,Mike +grey,Tom diff --git a/sql/core/src/test/resources/test-data/xml-resources/cdata-ending-eof.xml b/sql/core/src/test/resources/test-data/xml-resources/cdata-ending-eof.xml index 15edf3186a52a..c1ec547db3391 100644 --- a/sql/core/src/test/resources/test-data/xml-resources/cdata-ending-eof.xml +++ b/sql/core/src/test/resources/test-data/xml-resources/cdata-ending-eof.xml @@ -23,10 +23,11 @@ 10 0]]> - 0]]> 11 0]]> - 0 ]]> 0]]> 12 0]]> + + + 0 ]]> 0]]> 13 - 0 diff --git a/sql/core/src/test/resources/test-data/xml-resources/cdata-no-close.xml b/sql/core/src/test/resources/test-data/xml-resources/cdata-no-close.xml index 45ee814001d58..133ef22538ec3 100644 --- a/sql/core/src/test/resources/test-data/xml-resources/cdata-no-close.xml +++ b/sql/core/src/test/resources/test-data/xml-resources/cdata-no-close.xml @@ -23,10 +23,11 @@ 10 0]]> - 0]]> 11 0]]> - 0 ]]> 0]]> 12 0]]> + + + 0 ]]> 0]]> 13 - 0 @@ -44,5 +45,4 @@ 17 + + + ignored row ]]> + 1 ]]> + + 3 ]]> + + 5 ]]> + ignored row ]]> + diff --git a/sql/core/src/test/resources/test-data/xml-resources/ignored-rows.xml b/sql/core/src/test/resources/test-data/xml-resources/ignored-rows.xml index 31822ee3e3e96..3b964c68cfcb2 100644 --- a/sql/core/src/test/resources/test-data/xml-resources/ignored-rows.xml +++ b/sql/core/src/test/resources/test-data/xml-resources/ignored-rows.xml @@ -23,10 +23,11 @@ 10 0]]> - 0]]> 11 0]]> - 0 ]]> 0]]> 12 0]]> + + + 0 ]]> 0]]> 13 - 0 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/BloomFilterAggregateQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/BloomFilterAggregateQuerySuite.scala index 4edb51d271903..9b39a2295e7d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/BloomFilterAggregateQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/BloomFilterAggregateQuerySuite.scala @@ -26,10 +26,12 @@ import org.apache.spark.sql.execution.aggregate.BaseAggregateExec import org.apache.spark.sql.internal.SQLConf import 
org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.LongType +import org.apache.spark.tags.ExtendedSQLTest /** * Query tests for the Bloom filter aggregate and filter function. */ +@ExtendedSQLTest class BloomFilterAggregateQuerySuite extends QueryTest with SharedSparkSession { import testImplicits._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala index a06b50d175f90..7b608b7438c29 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CTEInlineSuite.scala @@ -703,6 +703,17 @@ abstract class CTEInlineSuiteBase checkErrorTableNotFound(e, "`tab_non_exists`", ExpectedContext("tab_non_exists", 83, 96)) } } + + test("SPARK-48307: not-inlined CTE references sibling") { + val df = sql( + """ + |WITH + |v1 AS (SELECT 1 col), + |v2 AS (SELECT col, rand() FROM v1) + |SELECT l.col FROM v2 l JOIN v2 r ON l.col = r.col + |""".stripMargin) + checkAnswer(df, Row(1)) + } } class CTEInlineSuiteAEOff extends CTEInlineSuiteBase with DisableAdaptiveExecutionSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 7865e7f1f864c..71b420bb85eac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -1107,7 +1107,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils assert(queryStats1.map(_._1.name).isEmpty) val cacheManager = spark.sharedState.cacheManager - val cachedData = cacheManager.lookupCachedData(query().logicalPlan) + val cachedData = cacheManager.lookupCachedData(query()) assert(cachedData.isDefined) val queryAttrs = cachedData.get.plan.output assert(queryAttrs.size === 3) @@ -1436,7 +1436,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils withSQLConf(SQLConf.STORE_ANALYZED_PLAN_FOR_VIEW.key -> storeAnalyzed.toString) { withGlobalTempView("view1") { withTempView("view2") { - val db = spark.sharedState.globalTempViewManager.database + val db = spark.sharedState.globalTempDB sql("CREATE GLOBAL TEMPORARY VIEW view1 AS SELECT * FROM testData WHERE key > 1") sql(s"CACHE TABLE view2 AS SELECT * FROM ${db}.view1 WHERE value > 1") assert(spark.catalog.isCached("view2")) @@ -1487,7 +1487,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils withSQLConf(SQLConf.STORE_ANALYZED_PLAN_FOR_VIEW.key -> storeAnalyzed.toString) { withGlobalTempView("view1") { withTempView("view2") { - val db = spark.sharedState.globalTempViewManager.database + val db = spark.sharedState.globalTempDB sql("CREATE GLOBAL TEMPORARY VIEW view1 AS SELECT * FROM testData WHERE key > 1") sql(s"CACHE TABLE view2 AS SELECT * FROM $db.view1 WHERE value > 1") assert(spark.catalog.isCached("view2")) @@ -1517,7 +1517,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils Seq(true, false).foreach { storeAnalyzed => withSQLConf(SQLConf.STORE_ANALYZED_PLAN_FOR_VIEW.key -> storeAnalyzed.toString) { withGlobalTempView("global_tv") { - val db = spark.sharedState.globalTempViewManager.database + val db = spark.sharedState.globalTempDB testAlterTemporaryViewAsWithCache(TableIdentifier("global_tv", Some(db)), storeAnalyzed) } } @@ -1575,7 +1575,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils test("SPARK-34699: CREATE GLOBAL TEMP VIEW USING should uncache correctly") { withGlobalTempView("global_tv") 
{ - val db = spark.sharedState.globalTempViewManager.database + val db = spark.sharedState.globalTempDB testCreateTemporaryViewUsingWithCache(TableIdentifier("global_tv", Some(db))) } } @@ -1770,4 +1770,23 @@ class CachedTableSuite extends QueryTest with SQLTestUtils withSQLConf(SQLConf.DEFAULT_CACHE_STORAGE_LEVEL.key -> "DISK") {} } } + + test("SPARK-47633: Cache hit for lateral join with join condition") { + withTempView("t", "q1") { + sql("create or replace temp view t(c1, c2) as values (0, 1), (1, 2)") + val query = """select * + |from t + |join lateral ( + | select c1 as a, c2 as b + | from t) + |on c1 = a; + |""".stripMargin + sql(s"cache table q1 as $query") + val df = sql(query) + checkAnswer(df, + Row(0, 1, 0, 1) :: Row(1, 2, 1, 2) :: Nil) + assert(getNumInMemoryRelations(df) == 1) + } + + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala index 013177425da78..a93dee3bf2a61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala @@ -942,6 +942,34 @@ class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSpa } } } + + test("SPARK-48498: always do char padding in predicates") { + import testImplicits._ + withSQLConf(SQLConf.READ_SIDE_CHAR_PADDING.key -> "false") { + withTempPath { dir => + withTable("t") { + Seq( + "12" -> "12", + "12" -> "12 ", + "12 " -> "12", + "12 " -> "12 " + ).toDF("c1", "c2").write.format(format).save(dir.toString) + sql(s"CREATE TABLE t (c1 CHAR(3), c2 STRING) USING $format LOCATION '$dir'") + // Comparing CHAR column with STRING column directly compares the stored value. + checkAnswer( + sql("SELECT c1 = c2 FROM t"), + Seq(Row(true), Row(false), Row(false), Row(true)) + ) + // No matter the CHAR type value is padded or not in the storage, we should always pad it + // before comparison with STRING literals. + checkAnswer( + sql("SELECT c1 = '12', c1 = '12 ', c1 = '12 ' FROM t WHERE c2 = '12'"), + Seq(Row(true, true, true), Row(true, true, true)) + ) + } + } + } + } } class DSV2CharVarcharTestSuite extends CharVarcharTestSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationRegexpExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationRegexpExpressionsSuite.scala deleted file mode 100644 index 0876425847bbb..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationRegexpExpressionsSuite.scala +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql - -import scala.collection.immutable.Seq - -import org.apache.spark.SparkConf -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType} - -class CollationRegexpExpressionsSuite - extends QueryTest - with SharedSparkSession - with ExpressionEvalHelper { - - test("Support Like string expression with collation") { - // Supported collations - case class LikeTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - LikeTestCase("ABC", "%B%", "UTF8_BINARY", true) - ) - testCases.foreach(t => { - val query = s"SELECT like(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class LikeTestFail(l: String, r: String, c: String) - val failCases = Seq( - LikeTestFail("ABC", "%b%", "UTF8_BINARY_LCASE"), - LikeTestFail("ABC", "%B%", "UNICODE"), - LikeTestFail("ABC", "%b%", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = s"SELECT like(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support ILike string expression with collation") { - // Supported collations - case class ILikeTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true) - ) - testCases.foreach(t => { - val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class ILikeTestFail(l: String, r: String, c: String) - val failCases = Seq( - ILikeTestFail("ABC", "%b%", "UTF8_BINARY_LCASE"), - ILikeTestFail("ABC", "%b%", "UNICODE"), - ILikeTestFail("ABC", "%b%", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RLike string expression with collation") { - // Supported collations - case class RLikeTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RLikeTestCase("ABC", ".B.", "UTF8_BINARY", true) - ) - testCases.foreach(t => { - val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class RLikeTestFail(l: String, r: String, c: String) - val failCases = Seq( - RLikeTestFail("ABC", ".b.", "UTF8_BINARY_LCASE"), - RLikeTestFail("ABC", ".B.", "UNICODE"), - RLikeTestFail("ABC", ".b.", "UNICODE_CI") - ) - 
failCases.foreach(t => { - val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support StringSplit string expression with collation") { - // Supported collations - case class StringSplitTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")) - ) - testCases.foreach(t => { - val query = s"SELECT split(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(ArrayType(StringType(t.c)))) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class StringSplitTestFail(l: String, r: String, c: String) - val failCases = Seq( - StringSplitTestFail("ABC", "[b]", "UTF8_BINARY_LCASE"), - StringSplitTestFail("ABC", "[B]", "UNICODE"), - StringSplitTestFail("ABC", "[b]", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = s"SELECT split(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RegExpReplace string expression with collation") { - // Supported collations - case class RegExpReplaceTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RegExpReplaceTestCase("ABCDE", ".C.", "UTF8_BINARY", "AFFFE") - ) - testCases.foreach(t => { - val query = - s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 'FFF')" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class RegExpReplaceTestFail(l: String, r: String, c: String) - val failCases = Seq( - RegExpReplaceTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpReplaceTestFail("ABCDE", ".C.", "UNICODE"), - RegExpReplaceTestFail("ABCDE", ".c.", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = - s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 'FFF')" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RegExpExtract string expression with collation") { - // Supported collations - case class RegExpExtractTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RegExpExtractTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD") - ) - testCases.foreach(t => { - val query = - s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class RegExpExtractTestFail(l: String, r: String, c: String) - val failCases = Seq( - RegExpExtractTestFail("ABCDE", 
".c.", "UTF8_BINARY_LCASE"), - RegExpExtractTestFail("ABCDE", ".C.", "UNICODE"), - RegExpExtractTestFail("ABCDE", ".c.", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = - s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RegExpExtractAll string expression with collation") { - // Supported collations - case class RegExpExtractAllTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RegExpExtractAllTestCase("ABCDE", ".C.", "UTF8_BINARY", Seq("BCD")) - ) - testCases.foreach(t => { - val query = - s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(ArrayType(StringType(t.c)))) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class RegExpExtractAllTestFail(l: String, r: String, c: String) - val failCases = Seq( - RegExpExtractAllTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpExtractAllTestFail("ABCDE", ".C.", "UNICODE"), - RegExpExtractAllTestFail("ABCDE", ".c.", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = - s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RegExpCount string expression with collation") { - // Supported collations - case class RegExpCountTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RegExpCountTestCase("ABCDE", ".C.", "UTF8_BINARY", 1) - ) - testCases.foreach(t => { - val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class RegExpCountTestFail(l: String, r: String, c: String) - val failCases = Seq( - RegExpCountTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpCountTestFail("ABCDE", ".C.", "UNICODE"), - RegExpCountTestFail("ABCDE", ".c.", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RegExpSubStr string expression with collation") { - // Supported collations - case class RegExpSubStrTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RegExpSubStrTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD") - ) - testCases.foreach(t => { - val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) - // TODO: Implicit casting (not 
currently supported) - }) - // Unsupported collations - case class RegExpSubStrTestFail(l: String, r: String, c: String) - val failCases = Seq( - RegExpSubStrTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpSubStrTestFail("ABCDE", ".C.", "UNICODE"), - RegExpSubStrTestFail("ABCDE", ".c.", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - - test("Support RegExpInStr string expression with collation") { - // Supported collations - case class RegExpInStrTestCase[R](l: String, r: String, c: String, result: R) - val testCases = Seq( - RegExpInStrTestCase("ABCDE", ".C.", "UTF8_BINARY", 2) - ) - testCases.foreach(t => { - val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - // Result & data type - checkAnswer(sql(query), Row(t.result)) - assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) - // TODO: Implicit casting (not currently supported) - }) - // Unsupported collations - case class RegExpInStrTestFail(l: String, r: String, c: String) - val failCases = Seq( - RegExpInStrTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpInStrTestFail("ABCDE", ".C.", "UNICODE"), - RegExpInStrTestFail("ABCDE", ".c.", "UNICODE_CI") - ) - failCases.foreach(t => { - val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - // TODO: Collation mismatch (not currently supported) - } - -} - -class CollationRegexpExpressionsANSISuite extends CollationRegexpExpressionsSuite { - override protected def sparkConf: SparkConf = - super.sparkConf.set(SQLConf.ANSI_ENABLED, true) - - // TODO: If needed, add more tests for other regexp expressions (with ANSI mode enabled) - -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala new file mode 100644 index 0000000000000..7994c496cb65c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -0,0 +1,2307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import java.sql.{Date, Timestamp} +import java.text.SimpleDateFormat + +import scala.collection.immutable.Seq + +import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException, SparkRuntimeException} +import org.apache.spark.sql.catalyst.ExtendedAnalysisException +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.expressions.aggregate.Mode +import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.collection.OpenHashMap + +// scalastyle:off nonascii +class CollationSQLExpressionsSuite + extends QueryTest + with SharedSparkSession { + + private val testSuppCollations = Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI") + + test("Support Md5 hash expression with collation") { + case class Md5TestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + Md5TestCase("Spark", "UTF8_BINARY", "8cde774d6f7333752ed72cacddb05126"), + Md5TestCase("Spark", "UTF8_LCASE", "8cde774d6f7333752ed72cacddb05126"), + Md5TestCase("SQL", "UNICODE", "9778840a0100cb30c982876741b0b5a2"), + Md5TestCase("SQL", "UNICODE_CI", "9778840a0100cb30c982876741b0b5a2") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select md5('${t.input}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support Sha2 hash expression with collation") { + case class Sha2TestCase( + input: String, + collationName: String, + bitLength: Int, + result: String + ) + + val testCases = Seq( + Sha2TestCase("Spark", "UTF8_BINARY", 256, + "529bc3b07127ecb7e53a4dcf1991d9152c24537d919178022b2c42657f79a26b"), + Sha2TestCase("Spark", "UTF8_LCASE", 256, + "529bc3b07127ecb7e53a4dcf1991d9152c24537d919178022b2c42657f79a26b"), + Sha2TestCase("SQL", "UNICODE", 256, + "a7056a455639d1c7deec82ee787db24a0c1878e2792b4597709f0facf7cc7b35"), + Sha2TestCase("SQL", "UNICODE_CI", 256, + "a7056a455639d1c7deec82ee787db24a0c1878e2792b4597709f0facf7cc7b35") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select sha2('${t.input}', ${t.bitLength}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support Sha1 hash expression with collation") { + case class Sha1TestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + Sha1TestCase("Spark", "UTF8_BINARY", "85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c"), + Sha1TestCase("Spark", "UTF8_LCASE", "85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c"), + Sha1TestCase("SQL", "UNICODE", "2064cb643caa8d9e1de12eea7f3e143ca9f8680d"), + Sha1TestCase("SQL", "UNICODE_CI", "2064cb643caa8d9e1de12eea7f3e143ca9f8680d") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select sha1('${t.input}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> 
t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support Crc32 hash expression with collation") { + case class Crc321TestCase( + input: String, + collationName: String, + result: Int + ) + + val testCases = Seq( + Crc321TestCase("Spark", "UTF8_BINARY", 1557323817), + Crc321TestCase("Spark", "UTF8_LCASE", 1557323817), + Crc321TestCase("SQL", "UNICODE", 1299261525), + Crc321TestCase("SQL", "UNICODE_CI", 1299261525) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select crc32('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + } + }) + } + + test("Support Murmur3Hash hash expression with collation") { + case class Murmur3HashTestCase( + input: String, + collationName: String, + result: Int + ) + + val testCases = Seq( + Murmur3HashTestCase("Spark", "UTF8_BINARY", 228093765), + Murmur3HashTestCase("Spark", "UTF8_LCASE", 228093765), + Murmur3HashTestCase("SQL", "UNICODE", 17468742), + Murmur3HashTestCase("SQL", "UNICODE_CI", 17468742) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select hash('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + } + }) + } + + test("Support XxHash64 hash expression with collation") { + case class XxHash64TestCase( + input: String, + collationName: String, + result: Long + ) + + val testCases = Seq( + XxHash64TestCase("Spark", "UTF8_BINARY", -4294468057691064905L), + XxHash64TestCase("Spark", "UTF8_LCASE", -4294468057691064905L), + XxHash64TestCase("SQL", "UNICODE", -2147923034195946097L), + XxHash64TestCase("SQL", "UNICODE_CI", -2147923034195946097L) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select xxhash64('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + } + }) + } + + test("Support UrlEncode hash expression with collation") { + case class UrlEncodeTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + UrlEncodeTestCase("https://spark.apache.org", "UTF8_BINARY", + "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UTF8_LCASE", + "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UNICODE", + "https%3A%2F%2Fspark.apache.org"), + UrlEncodeTestCase("https://spark.apache.org", "UNICODE_CI", + "https%3A%2F%2Fspark.apache.org") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select url_encode('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support UrlDecode hash expression with collation") { + case class UrlDecodeTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_BINARY", + 
"https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UTF8_LCASE", + "https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UNICODE", + "https://spark.apache.org"), + UrlDecodeTestCase("https%3A%2F%2Fspark.apache.org", "UNICODE_CI", + "https://spark.apache.org") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select url_decode('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support ParseUrl hash expression with collation") { + case class ParseUrlTestCase( + input: String, + collationName: String, + path: String, + result: String + ) + + val testCases = Seq( + ParseUrlTestCase("http://spark.apache.org/path?query=1", "UTF8_BINARY", "HOST", + "spark.apache.org"), + ParseUrlTestCase("http://spark.apache.org/path?query=2", "UTF8_LCASE", "PATH", + "/path"), + ParseUrlTestCase("http://spark.apache.org/path?query=3", "UNICODE", "QUERY", + "query=3"), + ParseUrlTestCase("http://spark.apache.org/path?query=4", "UNICODE_CI", "PROTOCOL", + "http") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select parse_url('${t.input}', '${t.path}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support CsvToStructs csv expression with collation") { + case class CsvToStructsTestCase( + input: String, + collationName: String, + schema: String, + options: String, + result: Row, + structFields: Seq[StructField] + ) + + val testCases = Seq( + CsvToStructsTestCase("1", "UTF8_BINARY", "'a INT'", "", + Row(1), Seq( + StructField("a", IntegerType, nullable = true) + )), + CsvToStructsTestCase("true, 0.8", "UTF8_LCASE", "'A BOOLEAN, B DOUBLE'", "", + Row(true, 0.8), Seq( + StructField("A", BooleanType, nullable = true), + StructField("B", DoubleType, nullable = true) + )), + CsvToStructsTestCase("\"Spark\"", "UNICODE", "'a STRING'", "", + Row("Spark"), Seq( + StructField("a", StringType("UNICODE"), nullable = true) + )), + CsvToStructsTestCase("26/08/2015", "UTF8_BINARY", "'time Timestamp'", + ", map('timestampFormat', 'dd/MM/yyyy')", Row( + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse("2015-08-26 00:00:00.0") + ), Seq( + StructField("time", TimestampType, nullable = true) + )) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select from_csv('${t.input}', ${t.schema} ${t.options}) + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + val queryResult = testQuery.collect().head + checkAnswer(testQuery, Row(t.result)) + val dataType = StructType(t.structFields) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support SchemaOfCsv csv expression with collation") { + case class SchemaOfCsvTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + SchemaOfCsvTestCase("1", "UTF8_BINARY", "STRUCT<_c0: INT>"), + SchemaOfCsvTestCase("true,0.8", "UTF8_LCASE", + "STRUCT<_c0: BOOLEAN, _c1: DOUBLE>"), + 
SchemaOfCsvTestCase("2015-08-26", "UNICODE", "STRUCT<_c0: DATE>"), + SchemaOfCsvTestCase("abc", "UNICODE_CI", + "STRUCT<_c0: STRING>") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select schema_of_csv('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support StructsToCsv csv expression with collation") { + case class StructsToCsvTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + StructsToCsvTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", "1,2"), + StructsToCsvTestCase("named_struct('A', true, 'B', 2.0)", "UTF8_LCASE", "true,2.0"), + StructsToCsvTestCase("named_struct()", "UNICODE", null), + StructsToCsvTestCase("named_struct('time', to_timestamp('2015-08-26'))", "UNICODE_CI", + "2015-08-26T00:00:00.000-07:00") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select to_csv(${t.input}) + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Conv expression with collation") { + // Supported collations + case class ConvTestCase( + num: String, + from_base: String, + to_base: String, + collationName: String, + result: String) + + val testCases = Seq( + ConvTestCase("100", "2", "10", "UTF8_BINARY", "4"), + ConvTestCase("100", "2", "10", "UTF8_LCASE", "4"), + ConvTestCase("100", "2", "10", "UNICODE", "4"), + ConvTestCase("100", "2", "10", "UNICODE_CI", "4") + ) + testCases.foreach(t => { + val query = + s""" + |select conv(collate('${t.num}', '${t.collationName}'), ${t.from_base}, ${t.to_base}) + |""".stripMargin + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collationName))) + }) + } + + test("Bin expression with collation") { + // Supported collations + case class BinTestCase( + num: String, + collationName: String, + result: String) + + val testCases = Seq( + BinTestCase("13", "UTF8_BINARY", "1101"), + BinTestCase("13", "UTF8_LCASE", "1101"), + BinTestCase("13", "UNICODE", "1101"), + BinTestCase("13", "UNICODE_CI", "1101") + ) + testCases.foreach(t => { + val query = + s""" + |select bin(${t.num}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collationName))) + } + }) + } + + test("Hex with non-string input expression with collation") { + case class HexTestCase( + num: String, + collationName: String, + result: String) + + val testCases = Seq( + HexTestCase("13", "UTF8_BINARY", "D"), + HexTestCase("13", "UTF8_LCASE", "D"), + HexTestCase("13", "UNICODE", "D"), + HexTestCase("13", "UNICODE_CI", "D") + ) + testCases.foreach(t => { + val query = + s""" + |select hex(${t.num}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collationName))) + } + }) 
+ } + + test("Hex with string input expression with collation") { + case class HexTestCase( + num: String, + collationName: String, + result: String) + + val testCases = Seq( + HexTestCase("Spark SQL", "UTF8_BINARY", "537061726B2053514C"), + HexTestCase("Spark SQL", "UTF8_LCASE", "537061726B2053514C"), + HexTestCase("Spark SQL", "UNICODE", "537061726B2053514C"), + HexTestCase("Spark SQL", "UNICODE_CI", "537061726B2053514C") + ) + testCases.foreach(t => { + val query = + s""" + |select hex(collate('${t.num}', '${t.collationName}')) + |""".stripMargin + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collationName))) + }) + } + + test("UnHex expression with collation") { + case class UnHexTestCase( + num: String, + collationName: String, + result: String) + + val testCases = Seq( + UnHexTestCase("537061726B2053514C", "UTF8_BINARY", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UTF8_LCASE", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UNICODE", "Spark SQL"), + UnHexTestCase("537061726B2053514C", "UNICODE_CI", "Spark SQL") + ) + testCases.foreach(t => { + val query = + s""" + |select decode(unhex(collate('${t.num}', '${t.collationName}')), 'UTF-8') + |""".stripMargin + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType("UTF8_BINARY"))) + }) + } + + test("Support XPath expressions with collation") { + case class XPathTestCase( + xml: String, + xpath: String, + functionName: String, + collationName: String, + result: Any, + resultType: DataType + ) + + val testCases = Seq( + XPathTestCase("1", "a/b", + "xpath_boolean", "UTF8_BINARY", true, BooleanType), + XPathTestCase("12", "sum(A/B)", + "xpath_short", "UTF8_BINARY", 3, ShortType), + XPathTestCase("34", "sum(a/b)", + "xpath_int", "UTF8_LCASE", 7, IntegerType), + XPathTestCase("56", "sum(A/B)", + "xpath_long", "UTF8_LCASE", 11, LongType), + XPathTestCase("78", "sum(a/b)", + "xpath_float", "UNICODE", 15.0, FloatType), + XPathTestCase("90", "sum(A/B)", + "xpath_double", "UNICODE", 9.0, DoubleType), + XPathTestCase("bcc", "a/c", + "xpath_string", "UNICODE_CI", "cc", StringType("UNICODE_CI")), + XPathTestCase("b1b2b3c1c2", "a/b/text()", + "xpath", "UNICODE_CI", Array("b1", "b2", "b3"), ArrayType(StringType("UNICODE_CI"))) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select ${t.functionName}('${t.xml}', '${t.xpath}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + assert(testQuery.schema.fields.head.dataType.sameType(t.resultType)) + } + }) + } + + test("Support StringSpace expression with collation") { + case class StringSpaceTestCase( + input: Int, + collationName: String, + result: String + ) + + val testCases = Seq( + StringSpaceTestCase(1, "UTF8_BINARY", " "), + StringSpaceTestCase(2, "UTF8_LCASE", " "), + StringSpaceTestCase(3, "UNICODE", " "), + StringSpaceTestCase(4, "UNICODE_CI", " ") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select space(${t.input}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + 
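Each test in the new CollationSQLExpressionsSuite follows the same recipe: set the session default collation through SqlApiConf.DEFAULT_COLLATION, evaluate the expression, and verify both the returned value and that any string output carries the chosen collation in its StringType. A minimal sketch of that recipe, assuming the suite's sql, checkAnswer and withSQLConf helpers are in scope; the upper() expression is only an illustrative stand-in and is not taken from this patch:

// Sketch only: mirrors the assertion pattern used throughout this suite.
Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>
  withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collation) {
    // The string literal picks up the session default collation and upper() preserves it.
    val df = sql("SELECT upper('spark')")
    checkAnswer(df, Row("SPARK"))
    // String results are expected to carry the default collation in their data type.
    assert(df.schema.fields.head.dataType.sameType(StringType(collation)))
  }
}
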
test("Support ToNumber & TryToNumber expressions with collation") { + case class ToNumberTestCase( + input: String, + collationName: String, + format: String, + result: Any, + resultType: DataType + ) + + val testCases = Seq( + ToNumberTestCase("123", "UTF8_BINARY", "999", 123, DecimalType(3, 0)), + ToNumberTestCase("1", "UTF8_LCASE", "0.00", 1.00, DecimalType(3, 2)), + ToNumberTestCase("99,999", "UNICODE", "99,999", 99999, DecimalType(5, 0)), + ToNumberTestCase("$14.99", "UNICODE_CI", "$99.99", 14.99, DecimalType(4, 2)) + ) + + // Supported collations (ToNumber) + testCases.foreach(t => { + val query = + s""" + |select to_number('${t.input}', '${t.format}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + assert(testQuery.schema.fields.head.dataType.sameType(t.resultType)) + } + }) + + // Supported collations (TryToNumber) + testCases.foreach(t => { + val query = + s""" + |select try_to_number('${t.input}', '${t.format}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + assert(testQuery.schema.fields.head.dataType.sameType(t.resultType)) + } + }) + } + + test("Handle invalid number for ToNumber variant expression with collation") { + // to_number should throw an exception if the conversion fails + val number = "xx" + val query = s"SELECT to_number('$number', '999');" + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + val e = intercept[SparkIllegalArgumentException] { + val testQuery = sql(query) + testQuery.collect() + } + assert(e.getErrorClass === "INVALID_FORMAT.MISMATCH_INPUT") + } + } + + test("Handle invalid number for TryToNumber variant expression with collation") { + // try_to_number shouldn't throw an exception if the conversion fails + val number = "xx" + val query = s"SELECT try_to_number('$number', '999');" + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + val testQuery = sql(query) + checkAnswer(testQuery, Row(null)) + } + } + + test("Support ToChar expression with collation") { + case class ToCharTestCase( + input: Int, + collationName: String, + format: String, + result: String + ) + + val testCases = Seq( + ToCharTestCase(12, "UTF8_BINARY", "999", " 12"), + ToCharTestCase(34, "UTF8_LCASE", "000D00", "034.00"), + ToCharTestCase(56, "UNICODE", "$99.99", "$56.00"), + ToCharTestCase(78, "UNICODE_CI", "99D9S", "78.0+") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select to_char(${t.input}, '${t.format}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support GetJsonObject json expression with collation") { + case class GetJsonObjectTestCase( + input: String, + path: String, + collationName: String, + result: String + ) + + val testCases = Seq( + GetJsonObjectTestCase("{\"a\":\"b\"}", "$.a", "UTF8_BINARY", "b"), + GetJsonObjectTestCase("{\"A\":\"1\"}", "$.A", "UTF8_LCASE", "1"), + GetJsonObjectTestCase("{\"x\":true}", "$.x", "UNICODE", "true"), + GetJsonObjectTestCase("{\"X\":1}", "$.X", "UNICODE_CI", "1") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT 
get_json_object('${t.input}', '${t.path}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support JsonTuple json expression with collation") { + case class JsonTupleTestCase( + input: String, + names: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + JsonTupleTestCase("{\"a\":1, \"b\":2}", "'a', 'b'", "UTF8_BINARY", + Row("1", "2")), + JsonTupleTestCase("{\"A\":\"3\", \"B\":\"4\"}", "'A', 'B'", "UTF8_LCASE", + Row("3", "4")), + JsonTupleTestCase("{\"x\":true, \"y\":false}", "'x', 'y'", "UNICODE", + Row("true", "false")), + JsonTupleTestCase("{\"X\":null, \"Y\":null}", "'X', 'Y'", "UNICODE_CI", + Row(null, null)) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT json_tuple('${t.input}', ${t.names}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support JsonToStructs json expression with collation") { + case class JsonToStructsTestCase( + input: String, + schema: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + JsonToStructsTestCase("{\"a\":1, \"b\":2.0}", "a INT, b DOUBLE", + "UTF8_BINARY", Row(Row(1, 2.0))), + JsonToStructsTestCase("{\"A\":\"3\", \"B\":4}", "A STRING COLLATE UTF8_LCASE, B INT", + "UTF8_LCASE", Row(Row("3", 4))), + JsonToStructsTestCase("{\"x\":true, \"y\":null}", "x BOOLEAN, y VOID", + "UNICODE", Row(Row(true, null))), + JsonToStructsTestCase("{\"X\":null, \"Y\":false}", "X VOID, Y BOOLEAN", + "UNICODE_CI", Row(Row(null, false))) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT from_json('${t.input}', '${t.schema}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StructType.fromDDL(t.schema) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support StructsToJson json expression with collation") { + case class StructsToJsonTestCase( + struct: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + StructsToJsonTestCase("named_struct('a', 1, 'b', 2)", + "UTF8_BINARY", Row("{\"a\":1,\"b\":2}")), + StructsToJsonTestCase("array(named_struct('a', 1, 'b', 2))", + "UTF8_LCASE", Row("[{\"a\":1,\"b\":2}]")), + StructsToJsonTestCase("map('a', named_struct('b', 1))", + "UNICODE", Row("{\"a\":{\"b\":1}}")), + StructsToJsonTestCase("array(map('a', 1))", + "UNICODE_CI", Row("[{\"a\":1}]")) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT to_json(${t.struct}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support LengthOfJsonArray json expression with collation") { + case class LengthOfJsonArrayTestCase( + input: String, + collationName: 
String, + result: Row + ) + + val testCases = Seq( + LengthOfJsonArrayTestCase("'[1,2,3,4]'", "UTF8_BINARY", Row(4)), + LengthOfJsonArrayTestCase("'[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]'", "UTF8_LCASE", Row(5)), + LengthOfJsonArrayTestCase("'[1,2'", "UNICODE", Row(null)), + LengthOfJsonArrayTestCase("'['", "UNICODE_CI", Row(null)) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT json_array_length(${t.input}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + assert(testQuery.schema.fields.head.dataType.sameType(IntegerType)) + } + }) + } + + test("Support JsonObjectKeys json expression with collation") { + case class JsonObjectKeysJsonArrayTestCase( + input: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + JsonObjectKeysJsonArrayTestCase("{}", "UTF8_BINARY", + Row(Seq())), + JsonObjectKeysJsonArrayTestCase("{\"k\":", "UTF8_LCASE", + Row(null)), + JsonObjectKeysJsonArrayTestCase("{\"k1\": \"v1\"}", "UNICODE", + Row(Seq("k1"))), + JsonObjectKeysJsonArrayTestCase("{\"k1\":1,\"k2\":{\"k3\":3, \"k4\":4}}", "UNICODE_CI", + Row(Seq("k1", "k2"))) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT json_object_keys('${t.input}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = ArrayType(StringType(t.collationName)) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support SchemaOfJson json expression with collation") { + case class SchemaOfJsonTestCase( + input: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + SchemaOfJsonTestCase("'[{\"col\":0}]'", + "UTF8_BINARY", Row("ARRAY>")), + SchemaOfJsonTestCase("'[{\"col\":01}]', map('allowNumericLeadingZeros', 'true')", + "UTF8_LCASE", Row("ARRAY>")), + SchemaOfJsonTestCase("'[]'", + "UNICODE", Row("ARRAY")), + SchemaOfJsonTestCase("''", + "UNICODE_CI", Row("STRING")) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT schema_of_json(${t.input}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support StringToMap expression with collation") { + // Supported collations + case class StringToMapTestCase[R](t: String, p: String, k: String, c: String, result: R) + val testCases = Seq( + StringToMapTestCase("a:1,b:2,c:3", ",", ":", "UTF8_BINARY", + Map("a" -> "1", "b" -> "2", "c" -> "3")), + StringToMapTestCase("A-1;B-2;C-3", ";", "-", "UTF8_LCASE", + Map("A" -> "1", "B" -> "2", "C" -> "3")), + StringToMapTestCase("1:a,2:b,3:c", ",", ":", "UNICODE", + Map("1" -> "a", "2" -> "b", "3" -> "c")), + StringToMapTestCase("1/A!2/B!3/C", "!", "/", "UNICODE_CI", + Map("1" -> "A", "2" -> "B", "3" -> "C")) + ) + testCases.foreach(t => { + val query = s"SELECT str_to_map(collate('${t.t}', '${t.c}'), '${t.p}', '${t.k}');" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + val dataType = MapType(StringType(t.c), StringType(t.c), true) + assert(sql(query).schema.fields.head.dataType.sameType(dataType)) + }) + } + + test("Support RaiseError 
misc expression with collation") { + // Supported collations + case class RaiseErrorTestCase(errorMessage: String, collationName: String) + val testCases = Seq( + RaiseErrorTestCase("custom error message 1", "UTF8_BINARY"), + RaiseErrorTestCase("custom error message 2", "UTF8_LCASE"), + RaiseErrorTestCase("custom error message 3", "UNICODE"), + RaiseErrorTestCase("custom error message 4", "UNICODE_CI") + ) + testCases.foreach(t => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val query = s"SELECT raise_error('${t.errorMessage}')" + // Result & data type + val userException = intercept[SparkRuntimeException] { + sql(query).collect() + } + assert(userException.getErrorClass === "USER_RAISED_EXCEPTION") + assert(userException.getMessage.contains(t.errorMessage)) + } + }) + } + + test("Support CurrentDatabase/Catalog/User expressions with collation") { + // Supported collations + Seq("UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach(collationName => + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val queryDatabase = sql("SELECT current_schema()") + val queryCatalog = sql("SELECT current_catalog()") + val queryUser = sql("SELECT current_user()") + // Data type + val dataType = StringType(collationName) + assert(queryDatabase.schema.fields.head.dataType.sameType(dataType)) + assert(queryCatalog.schema.fields.head.dataType.sameType(dataType)) + assert(queryUser.schema.fields.head.dataType.sameType(dataType)) + } + ) + } + + test("Support Uuid misc expression with collation") { + // Supported collations + Seq("UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach(collationName => + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val query = s"SELECT uuid()" + // Result & data type + val testQuery = sql(query) + val queryResult = testQuery.collect().head.getString(0) + val uuidFormat = "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + assert(queryResult.matches(uuidFormat)) + val dataType = StringType(collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + ) + } + + test("Support SparkVersion misc expression with collation") { + // Supported collations + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach(collationName => + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val query = s"SELECT version()" + // Result & data type + val testQuery = sql(query) + val queryResult = testQuery.collect().head.getString(0) + val versionFormat = "^[0-9]\\.[0-9]\\.[0-9] [0-9a-f]{40}$" + assert(queryResult.matches(versionFormat)) + val dataType = StringType(collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + ) + } + + test("Support TypeOf misc expression with collation") { + // Supported collations + case class TypeOfTestCase(input: String, collationName: String, result: String) + val testCases = Seq( + TypeOfTestCase("1", "UTF8_BINARY", "int"), + TypeOfTestCase("\"A\"", "UTF8_LCASE", "string collate UTF8_LCASE"), + TypeOfTestCase("array(1)", "UNICODE", "array"), + TypeOfTestCase("null", "UNICODE_CI", "void") + ) + testCases.foreach(t => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val query = s"SELECT typeof(${t.input})" + // Result & data type + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support AesEncrypt misc expression with collation") { + // Supported collations + case 
class AesEncryptTestCase( + input: String, + collationName: String, + params: String, + result: String + ) + val testCases = Seq( + AesEncryptTestCase("Spark", "UTF8_BINARY", "'1234567890abcdef', 'ECB'", + "8DE7DB79A23F3E8ED530994DDEA98913"), + AesEncryptTestCase("Spark", "UTF8_LCASE", "'1234567890abcdef', 'ECB', 'DEFAULT', ''", + "8DE7DB79A23F3E8ED530994DDEA98913"), + AesEncryptTestCase("Spark", "UNICODE", "'1234567890abcdef', 'GCM', 'DEFAULT', " + + "unhex('000000000000000000000000')", + "00000000000000000000000046596B2DE09C729FE48A0F81A00A4E7101DABEB61D"), + AesEncryptTestCase("Spark", "UNICODE_CI", "'1234567890abcdef', 'CBC', 'DEFAULT', " + + "unhex('00000000000000000000000000000000')", + "000000000000000000000000000000008DE7DB79A23F3E8ED530994DDEA98913") + ) + testCases.foreach(t => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val query = s"SELECT hex(aes_encrypt('${t.input}', ${t.params}))" + // Result & data type + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support AesDecrypt misc expression with collation") { + // Supported collations + case class AesDecryptTestCase( + input: String, + collationName: String, + params: String, + result: String + ) + val testCases = Seq( + AesDecryptTestCase("8DE7DB79A23F3E8ED530994DDEA98913", + "UTF8_BINARY", "'1234567890abcdef', 'ECB'", "Spark"), + AesDecryptTestCase("8DE7DB79A23F3E8ED530994DDEA98913", + "UTF8_LCASE", "'1234567890abcdef', 'ECB', 'DEFAULT', ''", "Spark"), + AesDecryptTestCase("00000000000000000000000046596B2DE09C729FE48A0F81A00A4E7101DABEB61D", + "UNICODE", "'1234567890abcdef', 'GCM', 'DEFAULT'", "Spark"), + AesDecryptTestCase("000000000000000000000000000000008DE7DB79A23F3E8ED530994DDEA98913", + "UNICODE_CI", "'1234567890abcdef', 'CBC', 'DEFAULT'", "Spark") + ) + testCases.foreach(t => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val query = s"SELECT aes_decrypt(unhex('${t.input}'), ${t.params})" + // Result & data type + val testQuery = sql(query) + checkAnswer(testQuery, sql(s"SELECT to_binary('${t.result}', 'utf-8')")) + assert(testQuery.schema.fields.head.dataType.sameType(BinaryType)) + } + }) + } + + test("Support Mask expression with collation") { + // Supported collations + case class MaskTestCase[R](i: String, u: String, l: String, d: String, o: String, c: String, + result: R) + val testCases = Seq( + MaskTestCase("ab-CD-12-@$", null, null, null, null, "UTF8_BINARY", "ab-CD-12-@$"), + MaskTestCase("ab-CD-12-@$", "X", null, null, null, "UTF8_LCASE", "ab-XX-12-@$"), + MaskTestCase("ab-CD-12-@$", "X", "x", null, null, "UNICODE", "xx-XX-12-@$"), + MaskTestCase("ab-CD-12-@$", "X", "x", "0", "#", "UNICODE_CI", "xx#XX#00###") + ) + testCases.foreach(t => { + def col(s: String): String = if (s == null) "null" else s"collate('$s', '${t.c}')" + val query = s"SELECT mask(${col(t.i)}, ${col(t.u)}, ${col(t.l)}, ${col(t.d)}, ${col(t.o)})" + // Result & data type + var result = sql(query) + checkAnswer(result, Row(t.result)) + assert(result.schema.fields.head.dataType.sameType(StringType(t.c))) + }) + // Implicit casting + val testCasting = Seq( + MaskTestCase("ab-CD-12-@$", "X", "x", "0", "#", "UNICODE_CI", "xx#XX#00###") + ) + testCasting.foreach(t => { + def col(s: String): String = if (s == null) "null" else s"collate('$s', '${t.c}')" + def str(s: String): String = if (s == null) "null" else s"'$s'" + val query1 = s"SELECT 
mask(${col(t.i)}, ${str(t.u)}, ${str(t.l)}, ${str(t.d)}, ${str(t.o)})" + val query2 = s"SELECT mask(${str(t.i)}, ${col(t.u)}, ${str(t.l)}, ${str(t.d)}, ${str(t.o)})" + val query3 = s"SELECT mask(${str(t.i)}, ${str(t.u)}, ${col(t.l)}, ${str(t.d)}, ${str(t.o)})" + val query4 = s"SELECT mask(${str(t.i)}, ${str(t.u)}, ${str(t.l)}, ${col(t.d)}, ${str(t.o)})" + val query5 = s"SELECT mask(${str(t.i)}, ${str(t.u)}, ${str(t.l)}, ${str(t.d)}, ${col(t.o)})" + for (q <- Seq(query1, query2, query3, query4, query5)) { + val result = sql(q) + checkAnswer(result, Row(t.result)) + assert(result.schema.fields.head.dataType.sameType(StringType(t.c))) + } + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT mask(collate('ab-CD-12-@$','UNICODE'),collate('X','UNICODE_CI'),'x','0','#')") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support XmlToStructs xml expression with collation") { + case class XmlToStructsTestCase( + input: String, + collationName: String, + schema: String, + options: String, + result: Row, + structFields: Seq[StructField] + ) + + val testCases = Seq( + XmlToStructsTestCase("
<p>\n  <a>1</a>\n</p>\n", "UTF8_BINARY", "'a INT'", "", + Row(1), Seq( + StructField("a", IntegerType, nullable = true) + )), + XmlToStructsTestCase("
<p>\n  <A>true</A><B>0.8</B>\n</p>\n", "UTF8_LCASE", + "'A BOOLEAN, B DOUBLE'", "", Row(true, 0.8), Seq( + StructField("A", BooleanType, nullable = true), + StructField("B", DoubleType, nullable = true) + )), + XmlToStructsTestCase("
<p>\n  <s>Spark</s>\n</p>\n", "UNICODE", "'s STRING'", "", + Row("Spark"), Seq( + StructField("s", StringType("UNICODE"), nullable = true) + )), + XmlToStructsTestCase("
<p>\n  <time>26/08/2015</time>\n</p>\n", "UNICODE_CI", "'time Timestamp'", + ", map('timestampFormat', 'dd/MM/yyyy')", Row( + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse("2015-08-26 00:00:00.0") + ), Seq( + StructField("time", TimestampType, nullable = true) + )) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select from_xml('${t.input}', ${t.schema} ${t.options}) + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StructType(t.structFields) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support SchemaOfXml xml expression with collation") { + case class SchemaOfXmlTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + SchemaOfXmlTestCase("
<p>\n  <a>1</a>\n</p>\n", "UTF8_BINARY", "STRUCT<a: BIGINT>"), + SchemaOfXmlTestCase("
<p>\n  <A>true</A><B>0.8</B>\n</p>\n", "UTF8_LCASE", + "STRUCT<A: BOOLEAN, B: DOUBLE>"), + SchemaOfXmlTestCase("
<p></p>", "UNICODE", "STRUCT<>"), + SchemaOfXmlTestCase("
<p>\n  <A>1</A><A>2</A><A>3</A>\n</p>\n
      ", "UNICODE_CI", + "STRUCT>") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select schema_of_xml('${t.input}') + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support StructsToXml xml expression with collation") { + case class StructsToXmlTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + StructsToXmlTestCase("named_struct('a', 1, 'b', 2)", "UTF8_BINARY", + s""" + | 1 + | 2 + |""".stripMargin), + StructsToXmlTestCase("named_struct('A', true, 'B', 2.0)", "UTF8_LCASE", + s""" + | true + | 2.0 + |""".stripMargin), + StructsToXmlTestCase("named_struct('A', 'aa', 'B', 'bb')", "UTF8_LCASE", + s""" + | aa + | bb + |""".stripMargin), + StructsToXmlTestCase("named_struct('A', 'aa', 'B', 'bb')", "UTF8_BINARY", + s""" + | aa + | bb + |""".stripMargin), + StructsToXmlTestCase("named_struct()", "UNICODE", + ""), + StructsToXmlTestCase("named_struct('time', to_timestamp('2015-08-26'))", "UNICODE_CI", + s""" + | + |""".stripMargin) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |select to_xml(${t.input}) + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support ParseJson & TryParseJson variant expressions with collation") { + case class ParseJsonTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + ParseJsonTestCase("{\"a\":1,\"b\":2}", "UTF8_BINARY", "{\"a\":1,\"b\":2}"), + ParseJsonTestCase("{\"A\":3,\"B\":4}", "UTF8_LCASE", "{\"A\":3,\"B\":4}"), + ParseJsonTestCase("{\"c\":5,\"d\":6}", "UNICODE", "{\"c\":5,\"d\":6}"), + ParseJsonTestCase("{\"C\":7,\"D\":8}", "UNICODE_CI", "{\"C\":7,\"D\":8}") + ) + + // Supported collations (ParseJson) + testCases.foreach(t => { + val query = + s""" + |SELECT parse_json('${t.input}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === "[" + t.result + "]") // can't use checkAnswer for Variant + assert(testQuery.schema.fields.head.dataType.sameType(VariantType)) + } + }) + + // Supported collations (TryParseJson) + testCases.foreach(t => { + val query = + s""" + |SELECT try_parse_json('${t.input}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === "[" + t.result + "]") // can't use checkAnswer for Variant + assert(testQuery.schema.fields.head.dataType.sameType(VariantType)) + } + }) + } + + test("Handle invalid JSON for ParseJson variant expression with collation") { + // parse_json should throw an exception when the string is not valid JSON value + val json = "{\"a\":1," + val query = s"SELECT parse_json('$json');" + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + val e = intercept[SparkException] { + val testQuery = sql(query) + 
testQuery.collect() + } + assert(e.getErrorClass === "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION") + } + } + + test("Handle invalid JSON for TryParseJson variant expression with collation") { + // try_parse_json shouldn't throw an exception when the string is not valid JSON value + val json = "{\"a\":1,]" + val query = s"SELECT try_parse_json('$json');" + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === s"[null]") + } + } + + test("Support IsVariantNull variant expressions with collation") { + case class IsVariantNullTestCase( + input: String, + collationName: String, + result: Boolean + ) + + val testCases = Seq( + IsVariantNullTestCase("'null'", "UTF8_BINARY", result = true), + IsVariantNullTestCase("'\"null\"'", "UTF8_LCASE", result = false), + IsVariantNullTestCase("'13'", "UNICODE", result = false), + IsVariantNullTestCase("null", "UNICODE_CI", result = false) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT is_variant_null(parse_json(${t.input})) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + } + }) + } + + test("Support VariantGet & TryVariantGet variant expressions with collation") { + case class VariantGetTestCase( + input: String, + path: String, + variantType: String, + collationName: String, + result: Any, + resultType: DataType + ) + + val testCases = Seq( + VariantGetTestCase("{\"a\": 1}", "$.a", "int", "UTF8_BINARY", 1, IntegerType), + VariantGetTestCase("{\"a\": 1}", "$.b", "int", "UTF8_LCASE", null, IntegerType), + VariantGetTestCase("[1, \"2\"]", "$[1]", "string", "UNICODE", "2", StringType("UNICODE")), + VariantGetTestCase("[1, \"2\"]", "$[2]", "string", "UNICODE_CI", null, + StringType("UNICODE_CI")) + ) + + // Supported collations (VariantGet) + testCases.foreach(t => { + val query = + s""" + |SELECT variant_get(parse_json('${t.input}'), '${t.path}', '${t.variantType}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === "[" + t.result + "]") // can't use checkAnswer for Variant + assert(testQuery.schema.fields.head.dataType.sameType(t.resultType)) + } + }) + + // Supported collations (TryVariantGet) + testCases.foreach(t => { + val query = + s""" + |SELECT try_variant_get(parse_json('${t.input}'), '${t.path}', '${t.variantType}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === "[" + t.result + "]") // can't use checkAnswer for Variant + assert(testQuery.schema.fields.head.dataType.sameType(t.resultType)) + } + }) + } + + test("Handle invalid JSON for VariantGet variant expression with collation") { + // variant_get should throw an exception if the cast fails + val json = "[1, \"Spark\"]" + val query = s"SELECT variant_get(parse_json('$json'), '$$[1]', 'int');" + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + val e = intercept[SparkRuntimeException] { + val testQuery = sql(query) + testQuery.collect() + } + assert(e.getErrorClass === "INVALID_VARIANT_CAST") + } + } + + test("Handle invalid JSON for 
TryVariantGet variant expression with collation") { + // try_variant_get shouldn't throw an exception if the cast fails + val json = "[1, \"Spark\"]" + val query = s"SELECT try_variant_get(parse_json('$json'), '$$[1]', 'int');" + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === s"[null]") + } + } + + test("Support VariantExplode variant expressions with collation") { + case class VariantExplodeTestCase( + input: String, + collationName: String, + result: String, + resultType: Seq[StructField] + ) + + val testCases = Seq( + VariantExplodeTestCase("[\"hello\", \"world\"]", "UTF8_BINARY", + Row(0, "null", "\"hello\"").toString() + Row(1, "null", "\"world\"").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UTF8_BINARY")), + StructField("value", VariantType, nullable = false) + ) + ), + VariantExplodeTestCase("[\"Spark\", \"SQL\"]", "UTF8_LCASE", + Row(0, "null", "\"Spark\"").toString() + Row(1, "null", "\"SQL\"").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UTF8_LCASE")), + StructField("value", VariantType, nullable = false) + ) + ), + VariantExplodeTestCase("{\"a\": true, \"b\": 3.14}", "UNICODE", + Row(0, "a", "true").toString() + Row(1, "b", "3.14").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UNICODE")), + StructField("value", VariantType, nullable = false) + ) + ), + VariantExplodeTestCase("{\"A\": 9.99, \"B\": false}", "UNICODE_CI", + Row(0, "A", "9.99").toString() + Row(1, "B", "false").toString(), + Seq[StructField]( + StructField("pos", IntegerType, nullable = false), + StructField("key", StringType("UNICODE_CI")), + StructField("value", VariantType, nullable = false) + ) + ) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT * from variant_explode(parse_json('${t.input}')) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + val testResult = testQuery.collect().map(_.toString()).mkString("") + assert(testResult === t.result) // can't use checkAnswer for Variant + assert(testQuery.schema.fields.sameElements(t.resultType)) + } + }) + } + + test("Support SchemaOfVariant variant expressions with collation") { + case class SchemaOfVariantTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + SchemaOfVariantTestCase("null", "UTF8_BINARY", "VOID"), + SchemaOfVariantTestCase("[]", "UTF8_LCASE", "ARRAY"), + SchemaOfVariantTestCase("[{\"a\":true,\"b\":0}]", "UNICODE", + "ARRAY>"), + SchemaOfVariantTestCase("[{\"A\":\"x\",\"B\":-1.00}]", "UNICODE_CI", + "ARRAY>") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT schema_of_variant(parse_json('${t.input}')) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + assert(testQuery.schema.fields.head.dataType.sameType(StringType(t.collationName))) + } + }) + } + + test("Support SchemaOfVariantAgg variant expressions with collation") { + case class SchemaOfVariantAggTestCase( + input: String, + collationName: String, + result: String + ) + + val testCases = Seq( + 
SchemaOfVariantAggTestCase("('1'), ('2'), ('3')", "UTF8_BINARY", "BIGINT"), + SchemaOfVariantAggTestCase("('true'), ('false'), ('true')", "UTF8_LCASE", "BOOLEAN"), + SchemaOfVariantAggTestCase("('{\"a\": 1}'), ('{\"b\": true}'), ('{\"c\": 1.23}')", + "UNICODE", "STRUCT"), + SchemaOfVariantAggTestCase("('{\"A\": \"x\"}'), ('{\"B\": 9.99}'), ('{\"C\": 0}')", + "UNICODE_CI", "STRUCT") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT schema_of_variant_agg(parse_json(j)) FROM VALUES ${t.input} AS tab(j) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + assert(testQuery.schema.fields.head.dataType.sameType(StringType(t.collationName))) + } + }) + } + + test("Support InputFileName expression with collation") { + // Supported collations + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach(collationName => { + val query = + s""" + |select input_file_name() + |""".stripMargin + // Result + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row("")) + val dataType = StringType(collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("DateFormat expression with collation") { + case class DateFormatTestCase[R](date: String, format: String, collation: String, result: R) + val testCases = Seq( + DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UTF8_BINARY", "2021-01-01"), + DateFormatTestCase("2021-01-01", "yyyy-dd", "UTF8_LCASE", "2021-01"), + DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UNICODE", "2021-01-01"), + DateFormatTestCase("2021-01-01", "yyyy", "UNICODE_CI", "2021") + ) + + for { + collateDate <- Seq(true, false) + collateFormat <- Seq(true, false) + } { + testCases.foreach(t => { + val dateArg = if (collateDate) s"collate('${t.date}', '${t.collation}')" else s"'${t.date}'" + val formatArg = + if (collateFormat) { + s"collate('${t.format}', '${t.collation}')" + } else { + s"'${t.format}'" + } + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collation) { + val query = s"SELECT date_format(${dateArg}, ${formatArg})" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collation))) + } + }) + } + } + + test("Support mode for string expression with collation - Basic Test") { + Seq("utf8_binary", "UTF8_LCASE", "unicode_ci", "unicode").foreach { collationId => + val query = s"SELECT mode(collate('abc', '${collationId}'))" + checkAnswer(sql(query), Row("abc")) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(collationId))) + } + } + + test("Support mode for string expression with collation - Advanced Test") { + case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) + val testCases = Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a") + ) + testCases.foreach(t => { + val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => + (0L to numRepeats).map(_ => s"('$elt')").mkString(",") + }.mkString(",") + + val tableName = s"t_${t.collationId}_mode" + withTable(s"${tableName}") { + sql(s"CREATE TABLE ${tableName}(i STRING) USING 
parquet") + sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) + val query = s"SELECT mode(collate(i, '${t.collationId}')) FROM ${tableName}" + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collationId))) + + } + }) + } + + test("Support Mode.eval(buffer)") { + case class UTF8StringModeTestCase[R]( + collationId: String, + bufferValues: Map[UTF8String, Long], + result: R) + + val bufferValuesUTF8String = Map( + UTF8String.fromString("a") -> 5L, + UTF8String.fromString("b") -> 4L, + UTF8String.fromString("B") -> 3L, + UTF8String.fromString("d") -> 2L, + UTF8String.fromString("e") -> 1L) + + val testCasesUTF8String = Seq( + UTF8StringModeTestCase("utf8_binary", bufferValuesUTF8String, "a"), + UTF8StringModeTestCase("UTF8_LCASE", bufferValuesUTF8String, "b"), + UTF8StringModeTestCase("unicode_ci", bufferValuesUTF8String, "b"), + UTF8StringModeTestCase("unicode", bufferValuesUTF8String, "a")) + + testCasesUTF8String.foreach(t => { + val buffer = new OpenHashMap[AnyRef, Long](5) + val myMode = Mode(child = Literal.create("some_column_name", StringType(t.collationId))) + t.bufferValues.foreach { case (k, v) => buffer.update(k, v) } + assert(myMode.eval(buffer).toString.toLowerCase() == t.result.toLowerCase()) + }) + } + + test("Support mode for string expression with collated strings in struct") { + case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) + val testCases = Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ) + testCases.foreach(t => { + val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => + (0L to numRepeats).map(_ => s"named_struct('f1'," + + s" collate('$elt', '${t.collationId}'), 'f2', 1)").mkString(",") + }.mkString(",") + + val tableName = s"t_${t.collationId}_mode_struct" + withTable(tableName) { + sql(s"CREATE TABLE ${tableName}(i STRUCT) USING parquet") + sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) + val query = s"SELECT lower(mode(i).f1) FROM ${tableName}" + if(t.collationId == "UTF8_LCASE" || + t.collationId == "unicode_ci" || + t.collationId == "unicode") { + // Cannot resolve "mode(i)" due to data type mismatch: + // Input to function mode was a complex type with strings collated on non-binary + // collations, which is not yet supported.. 
SQLSTATE: 42K09; line 1 pos 13; + val params = Seq(("sqlExpr", "\"mode(i)\""), + ("msg", "The input to the function 'mode'" + + " was a type of binary-unstable type that is not currently supported by mode."), + ("hint", "")).toMap + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + parameters = params, + queryContext = Array( + ExpectedContext(objectType = "", + objectName = "", + startIndex = 13, + stopIndex = 19, + fragment = "mode(i)") + ) + ) + } else { + checkAnswer(sql(query), Row(t.result)) + } + } + }) + } + + test("Support mode for string expression with collated strings in recursively nested struct") { + case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) + val testCases = Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ) + testCases.foreach(t => { + val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => + (0L to numRepeats).map(_ => s"named_struct('f1', " + + s"named_struct('f2', collate('$elt', '${t.collationId}')), 'f3', 1)").mkString(",") + }.mkString(",") + + val tableName = s"t_${t.collationId}_mode_nested_struct" + withTable(tableName) { + sql(s"CREATE TABLE ${tableName}(i STRUCT, f3: INT>) USING parquet") + sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) + val query = s"SELECT lower(mode(i).f1.f2) FROM ${tableName}" + if(t.collationId == "UTF8_LCASE" || + t.collationId == "unicode_ci" || + t.collationId == "unicode") { + // Cannot resolve "mode(i)" due to data type mismatch: + // Input to function mode was a complex type with strings collated on non-binary + // collations, which is not yet supported.. 
SQLSTATE: 42K09; line 1 pos 13; + val params = Seq(("sqlExpr", "\"mode(i)\""), + ("msg", "The input to the function 'mode' " + + "was a type of binary-unstable type that is not currently supported by mode."), + ("hint", "")).toMap + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + parameters = params, + queryContext = Array( + ExpectedContext(objectType = "", + objectName = "", + startIndex = 13, + stopIndex = 19, + fragment = "mode(i)") + ) + ) + } else { + checkAnswer(sql(query), Row(t.result)) + } + } + }) + } + + test("Support mode for string expression with collated strings in array complex type") { + case class ModeTestCase[R](collationId: String, bufferValues: Map[String, Long], result: R) + val testCases = Seq( + ModeTestCase("utf8_binary", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("UTF8_LCASE", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b"), + ModeTestCase("unicode", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "a"), + ModeTestCase("unicode_ci", Map("a" -> 3L, "b" -> 2L, "B" -> 2L), "b") + ) + testCases.foreach(t => { + val valuesToAdd = t.bufferValues.map { case (elt, numRepeats) => + (0L to numRepeats).map(_ => s"array(named_struct('s1', named_struct('a2', " + + s"array(collate('$elt', '${t.collationId}'))), 'f3', 1))").mkString(",") + }.mkString(",") + + val tableName = s"t_${t.collationId}_mode_nested_struct" + withTable(tableName) { + sql(s"CREATE TABLE ${tableName}(" + + s"i ARRAY>, f3: INT>>)" + + s" USING parquet") + sql(s"INSERT INTO ${tableName} VALUES " + valuesToAdd) + val query = s"SELECT lower(element_at(element_at(mode(i), 1).s1.a2, 1)) FROM ${tableName}" + if(t.collationId == "UTF8_LCASE" || + t.collationId == "unicode_ci" || t.collationId == "unicode") { + val params = Seq(("sqlExpr", "\"mode(i)\""), + ("msg", "The input to the function 'mode' was a type" + + " of binary-unstable type that is not currently supported by mode."), + ("hint", "")).toMap + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.TYPE_CHECK_FAILURE_WITH_HINT", + parameters = params, + queryContext = Array( + ExpectedContext(objectType = "", + objectName = "", + startIndex = 35, + stopIndex = 41, + fragment = "mode(i)") + ) + ) + } else { + checkAnswer(sql(query), Row(t.result)) + } + } + }) + } + + test("SPARK-48430: Map value extraction with collations") { + for { + collateKey <- Seq(true, false) + collateVal <- Seq(true, false) + defaultCollation <- Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE") + } { + val mapKey = if (collateKey) "'a' collate utf8_lcase" else "'a'" + val mapVal = if (collateVal) "'b' collate utf8_lcase" else "'b'" + val collation = if (collateVal) "UTF8_LCASE" else "UTF8_BINARY" + val queryExtractor = s"select collation(map($mapKey, $mapVal)[$mapKey])" + val queryElementAt = s"select collation(element_at(map($mapKey, $mapVal), $mapKey))" + + checkAnswer(sql(queryExtractor), Row(collation)) + checkAnswer(sql(queryElementAt), Row(collation)) + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> defaultCollation) { + val res = if (collateVal) "UTF8_LCASE" else defaultCollation + checkAnswer(sql(queryExtractor), Row(res)) + checkAnswer(sql(queryElementAt), Row(res)) + } + } + } + + test("CurrentTimeZone expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = "select current_timezone()" + // Data type check + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + 
val testQuery = sql(query) + val dataType = StringType(collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("DayName expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = "select dayname(current_date())" + // Data type check + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val testQuery = sql(query) + val dataType = StringType(collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("ToUnixTimestamp expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select to_unix_timestamp(collate('2021-01-01 00:00:00', '${collationName}'), + |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = LongType + val expectedResult = 1609488000L + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(expectedResult)) + }) + } + + test("FromUnixTime expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select from_unixtime(1609488000, collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin + // Result & data type check + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val testQuery = sql(query) + val dataType = StringType(collationName) + val expectedResult = "2021-01-01 00:00:00" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(expectedResult)) + } + }) + } + + test("NextDay expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select next_day('2015-01-14', collate('TU', '${collationName}')) + |""".stripMargin + // Result & data type check + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val testQuery = sql(query) + val dataType = DateType + val expectedResult = "2015-01-20" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Date.valueOf(expectedResult))) + } + }) + } + + test("FromUTCTimestamp expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select from_utc_timestamp(collate('2016-08-31', '${collationName}'), + |collate('Asia/Seoul', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = TimestampType + val expectedResult = "2016-08-31 09:00:00.0" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Timestamp.valueOf(expectedResult))) + }) + } + + test("ToUTCTimestamp expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select to_utc_timestamp(collate('2016-08-31 09:00:00', '${collationName}'), + |collate('Asia/Seoul', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = TimestampType + val expectedResult = "2016-08-31 00:00:00.0" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Timestamp.valueOf(expectedResult))) + }) + } + + test("ParseToDate expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select 
to_date(collate('2016-12-31', '${collationName}'), + |collate('yyyy-MM-dd', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = DateType + val expectedResult = "2016-12-31" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Date.valueOf(expectedResult))) + }) + } + + test("ParseToTimestamp expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select to_timestamp(collate('2016-12-31 23:59:59', '${collationName}'), + |collate('yyyy-MM-dd HH:mm:ss', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = TimestampType + val expectedResult = "2016-12-31 23:59:59.0" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Timestamp.valueOf(expectedResult))) + }) + } + + test("TruncDate expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select trunc(collate('2016-12-31 23:59:59', '${collationName}'), 'MM') + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = DateType + val expectedResult = "2016-12-01" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Date.valueOf(expectedResult))) + }) + } + + test("TruncTimestamp expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select date_trunc(collate('HOUR', '${collationName}'), + |collate('2015-03-05T09:32:05.359', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = TimestampType + val expectedResult = "2015-03-05 09:00:00.0" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Timestamp.valueOf(expectedResult))) + }) + } + + test("MakeTimestamp expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select make_timestamp(2014, 12, 28, 6, 30, 45.887, collate('CET', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = TimestampType + val expectedResult = "2014-12-27 21:30:45.887" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Timestamp.valueOf(expectedResult))) + }) + } + + test("ExtractValue expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val query = + s""" + |select col['Field1'] + |from values (named_struct('Field1', 'Spark', 'Field2', 5)) as tab(col); + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = StringType(collationName) + val expectedResult = "Spark" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(expectedResult)) + } + }) + } + + test("Lag expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |SELECT lag(a, -1, 'default' collate $collationName) OVER (PARTITION BY b ORDER BY a) + |FROM VALUES ('A1', 2), ('A2', 1), ('A2', 3), ('A1', 1) tab(a, b); + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = StringType(collationName) 
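+ // lag(a, -1, default) effectively looks one row ahead within each partition ordered by a;
+ // rows with no following row fall back to the collated 'default' literal, hence the expected values below.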
+ val expectedResult = Seq("A2", "default", "default", "default") + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, expectedResult.map(Row(_))) + }) + } + + test("Lead expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |SELECT lead(a, -1, 'default' collate $collationName) OVER (PARTITION BY b ORDER BY a) + |FROM VALUES ('A1', 2), ('A2', 1), ('A2', 3), ('A1', 1) tab(a, b); + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = StringType(collationName) + val expectedResult = Seq("A1", "default", "default", "default") + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, expectedResult.map(Row(_))) + }) + } + + test("DatePart expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select date_part(collate('Week', '${collationName}'), + |collate('2019-08-12 01:00:00.123456', '${collationName}')) + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = IntegerType + val expectedResult = 33 + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(expectedResult)) + }) + } + + test("DateAdd expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = s"""select date_add(collate('2016-07-30', '${collationName}'), 1)""" + // Result & data type check + val testQuery = sql(query) + val dataType = DateType + val expectedResult = "2016-07-31" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Date.valueOf(expectedResult))) + }) + } + + test("DateSub expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = s"""select date_sub(collate('2016-07-30', '${collationName}'), 1)""" + // Result & data type check + val testQuery = sql(query) + val dataType = DateType + val expectedResult = "2016-07-29" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(Date.valueOf(expectedResult))) + }) + } + + test("WindowTime and TimeWindow expressions with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val query = + s"""SELECT window_time(window) + | FROM (SELECT a, window, count(*) as cnt FROM VALUES + |('A1', '2021-01-01 00:00:00'), + |('A1', '2021-01-01 00:04:30'), + |('A1', '2021-01-01 00:06:00'), + |('A2', '2021-01-01 00:01:00') AS tab(a, b) + |GROUP by a, window(b, '5 minutes') ORDER BY a, window.start); + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = TimestampType + val expectedResults = + Seq("2021-01-01 00:04:59.999999", + "2021-01-01 00:09:59.999999", + "2021-01-01 00:04:59.999999") + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, expectedResults.map(ts => Row(Timestamp.valueOf(ts)))) + } + }) + } + + test("SessionWindow expressions with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collationName) { + val query = + s"""SELECT count(*) as cnt + | FROM VALUES + |('A1', '2021-01-01 00:00:00'), + |('A1', '2021-01-01 00:04:30'), + |('A1', '2021-01-01 00:10:00'), + |('A2', '2021-01-01 00:01:00'), + 
|('A2', '2021-01-01 00:04:30') AS tab(a, b) + |GROUP BY a, + |session_window(b, CASE WHEN a = 'A1' THEN '5 minutes' ELSE '1 minutes' END) + |ORDER BY a, session_window.start; + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = LongType + val expectedResults = Seq(2, 1, 1, 1) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, expectedResults.map(Row(_))) + } + }) + } + + test("ConvertTimezone expression with collation") { + // Supported collations + testSuppCollations.foreach(collationName => { + val query = + s""" + |select date_format(convert_timezone(collate('America/Los_Angeles', '${collationName}'), + |collate('UTC', '${collationName}'), collate('2021-12-06 00:00:00', '${collationName}')), + |'yyyy-MM-dd HH:mm:ss.S') + |""".stripMargin + // Result & data type check + val testQuery = sql(query) + val dataType = StringType + val expectedResult = "2021-12-06 08:00:00.0" + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + checkAnswer(testQuery, Row(expectedResult)) + }) + } + + test("Reflect expressions with collated strings") { + // be aware that output of java.util.UUID.fromString is always lowercase + + case class ReflectExpressions( + left: String, + leftCollation: String, + right: String, + rightCollation: String, + result: Boolean + ) + + val testCases = Seq( + ReflectExpressions("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", + "a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", true), + ReflectExpressions("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", + "A5Cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_binary", false), + + ReflectExpressions("A5cf6C42-0C85-418f-af6c-3E4E5b1328f2", "utf8_binary", + "a5cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_lcase", true), + ReflectExpressions("A5cf6C42-0C85-418f-af6c-3E4E5b1328f2", "utf8_binary", + "A5Cf6c42-0c85-418f-af6c-3e4e5b1328f2", "utf8_lcase", true) + ) + testCases.foreach(testCase => { + val query = + s""" + |SELECT REFLECT('java.util.UUID', 'fromString', + |collate('${testCase.left}', '${testCase.leftCollation}'))= + |collate('${testCase.right}', '${testCase.rightCollation}'); + |""".stripMargin + val testQuery = sql(query) + checkAnswer(testQuery, Row(testCase.result)) + }) + + val queryPass = + s""" + |SELECT REFLECT('java.lang.Integer', 'toHexString',2); + |""".stripMargin + val testQueryPass = sql(queryPass) + checkAnswer(testQueryPass, Row("2")) + + val queryFail = + s""" + |SELECT REFLECT('java.lang.Integer', 'toHexString',"2"); + |""".stripMargin + val typeException = intercept[ExtendedAnalysisException] { + sql(queryFail).collect() + } + assert(typeException.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_STATIC_METHOD") + } + + // TODO: Add more tests for other SQL expressions + +} +// scalastyle:on nonascii + +class CollationSQLExpressionsANSIOffSuite extends CollationSQLExpressionsSuite { + override protected def sparkConf: SparkConf = + super.sparkConf.set(SQLConf.ANSI_ENABLED, false) + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala new file mode 100644 index 0000000000000..885ed37098680 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -0,0 +1,496 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.internal.SqlApiConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType} + +// scalastyle:off nonascii +class CollationSQLRegexpSuite + extends QueryTest + with SharedSparkSession + with ExpressionEvalHelper { + + test("Support Like string expression with collation") { + // Supported collations + case class LikeTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + LikeTestCase("ABC", "%B%", "UTF8_BINARY", true), + LikeTestCase("AḂC", "%ḃ%", "UTF8_LCASE", true), + LikeTestCase("ABC", "%b%", "UTF8_BINARY", false) + ) + testCases.foreach(t => { + val query = s"SELECT like(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class LikeTestFail(l: String, r: String, c: String) + val failCases = Seq( + LikeTestFail("ABC", "%b%", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT like(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Like simplification should work with collated strings") { + case class SimplifyLikeTestCase[R](collation: String, str: String, cls: Class[_], result: R) + val testCases = Seq( + SimplifyLikeTestCase("UTF8_BINARY", "ab%", classOf[StartsWith], false), + SimplifyLikeTestCase("UTF8_BINARY", "%bc", classOf[EndsWith], false), + SimplifyLikeTestCase("UTF8_BINARY", "a%c", classOf[And], false), + SimplifyLikeTestCase("UTF8_BINARY", "%b%", classOf[Contains], false), + SimplifyLikeTestCase("UTF8_BINARY", "abc", classOf[EqualTo], false), + SimplifyLikeTestCase("UTF8_LCASE", "ab%", classOf[StartsWith], true), + SimplifyLikeTestCase("UTF8_LCASE", "%bc", classOf[EndsWith], true), + SimplifyLikeTestCase("UTF8_LCASE", "a%c", classOf[And], true), + SimplifyLikeTestCase("UTF8_LCASE", "%b%", classOf[Contains], true), + SimplifyLikeTestCase("UTF8_LCASE", "abc", classOf[EqualTo], true) + ) + val tableName = "T" + withTable(tableName) { + sql(s"CREATE TABLE IF NOT EXISTS $tableName(c STRING) using PARQUET") + sql(s"INSERT INTO $tableName(c) VALUES('ABC')") + testCases.foreach { t => + val query = sql(s"select c collate ${t.collation} like '${t.str}' FROM t") + checkAnswer(query, Row(t.result)) + val optimizedPlan = query.queryExecution.optimizedPlan.asInstanceOf[Project] + assert(optimizedPlan.projectList.head.asInstanceOf[Alias].child.getClass == t.cls) + } + } + } + + 
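+  // Note on the cases above: LikeSimplification is expected to rewrite anchored LIKE patterns
+  // into simpler string predicates, roughly (a sketch based on the classes asserted above, not
+  // an exhaustive description of the optimizer rule):
+  //   c LIKE 'ab%'  ->  StartsWith(c, 'ab')
+  //   c LIKE '%bc'  ->  EndsWith(c, 'bc')
+  //   c LIKE 'a%c'  ->  And(StartsWith(c, 'a'), EndsWith(c, 'c'))
+  //   c LIKE '%b%'  ->  Contains(c, 'b')
+  //   c LIKE 'abc'  ->  EqualTo(c, 'abc')
+  // For collated strings these rewrites must keep collation-aware comparison semantics, which
+  // the UTF8_LCASE rows above (and the default-collation test below) exercise.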
test("Like simplification should work with collated strings (for default collation)") { + val tableNameBinary = "T_BINARY" + withTable(tableNameBinary) { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_BINARY") { + sql(s"CREATE TABLE IF NOT EXISTS $tableNameBinary(c STRING) using PARQUET") + sql(s"INSERT INTO $tableNameBinary(c) VALUES('ABC')") + checkAnswer(sql(s"select c like 'ab%' FROM $tableNameBinary"), Row(false)) + checkAnswer(sql(s"select c like '%bc' FROM $tableNameBinary"), Row(false)) + checkAnswer(sql(s"select c like 'a%c' FROM $tableNameBinary"), Row(false)) + checkAnswer(sql(s"select c like '%b%' FROM $tableNameBinary"), Row(false)) + checkAnswer(sql(s"select c like 'abc' FROM $tableNameBinary"), Row(false)) + } + } + val tableNameLcase = "T_LCASE" + withTable(tableNameLcase) { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_LCASE") { + sql(s"CREATE TABLE IF NOT EXISTS $tableNameLcase(c STRING) using PARQUET") + sql(s"INSERT INTO $tableNameLcase(c) VALUES('ABC')") + checkAnswer(sql(s"select c like 'ab%' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like '%bc' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like 'a%c' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like '%b%' FROM $tableNameLcase"), Row(true)) + checkAnswer(sql(s"select c like 'abc' FROM $tableNameLcase"), Row(true)) + } + } + } + + test("Support ILike string expression with collation") { + // Supported collations + case class ILikeTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true), + ILikeTestCase("AḂC", "%ḃ%", "UTF8_LCASE", true), + ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true) + ) + testCases.foreach(t => { + val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class ILikeTestFail(l: String, r: String, c: String) + val failCases = Seq( + ILikeTestFail("ABC", "%b%", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support LikeAll string expression with collation") { + // Supported collations + case class LikeAllTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + LikeAllTestCase("foo", Seq("%foo%", "%oo"), "UTF8_BINARY", true), + LikeAllTestCase("Foo", Seq("%foo%", "%oo"), "UTF8_LCASE", true), + LikeAllTestCase("foo", Seq("%foo%", "%bar%"), "UTF8_BINARY", false) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ALL ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class LikeAllTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + LikeAllTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ALL ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support 
NotLikeAll string expression with collation") { + // Supported collations + case class NotLikeAllTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + NotLikeAllTestCase("foo", Seq("%foo%", "%oo"), "UTF8_BINARY", false), + NotLikeAllTestCase("Foo", Seq("%foo%", "%oo"), "UTF8_LCASE", false), + NotLikeAllTestCase("foo", Seq("%goo%", "%bar%"), "UTF8_BINARY", true) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ALL ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class NotLikeAllTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + NotLikeAllTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ALL ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support LikeAny string expression with collation") { + // Supported collations + case class LikeAnyTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + LikeAnyTestCase("foo", Seq("%foo%", "%bar"), "UTF8_BINARY", true), + LikeAnyTestCase("Foo", Seq("%foo%", "%bar"), "UTF8_LCASE", true), + LikeAnyTestCase("foo", Seq("%goo%", "%hoo%"), "UTF8_BINARY", false) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ANY ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class LikeAnyTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + LikeAnyTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ANY ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support NotLikeAny string expression with collation") { + // Supported collations + case class NotLikeAnyTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + NotLikeAnyTestCase("foo", Seq("%foo%", "%hoo"), "UTF8_BINARY", true), + NotLikeAnyTestCase("Foo", Seq("%foo%", "%hoo"), "UTF8_LCASE", true), + NotLikeAnyTestCase("foo", Seq("%foo%", "%oo%"), "UTF8_BINARY", false) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ANY ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class NotLikeAnyTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + NotLikeAnyTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ANY ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RLike string expression with collation") { + // Supported collations + case class 
RLikeTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RLikeTestCase("ABC", ".B.", "UTF8_BINARY", true), + RLikeTestCase("AḂC", ".ḃ.", "UTF8_LCASE", true), + RLikeTestCase("ABC", ".b.", "UTF8_BINARY", false) + ) + testCases.foreach(t => { + val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class RLikeTestFail(l: String, r: String, c: String) + val failCases = Seq( + RLikeTestFail("ABC", ".b.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support StringSplit string expression with collation") { + // Supported collations + case class StringSplitTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")), + StringSplitTestCase("AḂC", "[ḃ]", "UTF8_LCASE", Seq("A", "C")), + StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")) + ) + testCases.foreach(t => { + val query = s"SELECT split(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(ArrayType(StringType(t.c)))) + }) + // Unsupported collations + case class StringSplitTestFail(l: String, r: String, c: String) + val failCases = Seq( + StringSplitTestFail("ABC", "[b]", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT split(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RegExpReplace string expression with collation") { + // Supported collations + case class RegExpReplaceTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RegExpReplaceTestCase("ABCDE", ".C.", "UTF8_BINARY", "AFFFE"), + RegExpReplaceTestCase("ABĆDE", ".ć.", "UTF8_LCASE", "AFFFE"), + RegExpReplaceTestCase("ABCDE", ".c.", "UTF8_BINARY", "ABCDE") + ) + testCases.foreach(t => { + val query = + s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', collate('FFF', '${t.c}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + // Implicit casting + checkAnswer(sql(s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', 'FFF')"), + Row(t.result)) + checkAnswer(sql(s"SELECT regexp_replace('${t.l}', '${t.r}', collate('FFF', '${t.c}'))"), + Row(t.result)) + }) + // Collation mismatch + val (c1, c2) = ("UTF8_BINARY", "UTF8_LCASE") + val collationMismatch = intercept[AnalysisException] { + sql(s"SELECT regexp_replace(collate('ABCDE','$c1'), '.c.', collate('FFF','$c2'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + // Unsupported collations + case class RegExpReplaceTestFail(l: String, r: String, c: String) + val failCases = Seq( + RegExpReplaceTestFail("ABCDE", ".c.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = + s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', 'FFF')" + val unsupportedCollation = 
intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RegExpExtract string expression with collation") { + // Supported collations + case class RegExpExtractTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RegExpExtractTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD"), + RegExpExtractTestCase("ABĆDE", ".ć.", "UTF8_LCASE", "BĆD"), + RegExpExtractTestCase("ABCDE", ".c.", "UTF8_BINARY", "") + ) + testCases.foreach(t => { + val query = + s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), '${t.r}', 0)" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + }) + // Unsupported collations + case class RegExpExtractTestFail(l: String, r: String, c: String) + val failCases = Seq( + RegExpExtractTestFail("ABCDE", ".c.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = + s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), '${t.r}', 0)" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RegExpExtractAll string expression with collation") { + // Supported collations + case class RegExpExtractAllTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RegExpExtractAllTestCase("ABCDE", ".C.", "UTF8_BINARY", Seq("BCD")), + RegExpExtractAllTestCase("ABĆDE", ".ć.", "UTF8_LCASE", Seq("BĆD")), + RegExpExtractAllTestCase("ABCDE", ".c.", "UTF8_BINARY", Seq()) + ) + testCases.foreach(t => { + val query = + s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), '${t.r}', 0)" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(ArrayType(StringType(t.c)))) + }) + // Unsupported collations + case class RegExpExtractAllTestFail(l: String, r: String, c: String) + val failCases = Seq( + RegExpExtractAllTestFail("ABCDE", ".c.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = + s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), '${t.r}', 0)" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RegExpCount string expression with collation") { + // Supported collations + case class RegExpCountTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RegExpCountTestCase("ABCDE", ".C.", "UTF8_BINARY", 1), + RegExpCountTestCase("ABĆDE", ".ć.", "UTF8_LCASE", 1), + RegExpCountTestCase("ABCDE", ".c.", "UTF8_BINARY", 0) + ) + testCases.foreach(t => { + val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + }) + // Unsupported collations + case class RegExpCountTestFail(l: String, r: String, c: String) + val failCases = Seq( + RegExpCountTestFail("ABCDE", ".c.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RegExpSubStr string expression with collation") { + // 
Supported collations + case class RegExpSubStrTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RegExpSubStrTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD"), + RegExpSubStrTestCase("ABĆDE", ".ć.", "UTF8_LCASE", "BĆD"), + RegExpSubStrTestCase("ABCDE", ".c.", "UTF8_BINARY", null) + ) + testCases.foreach(t => { + val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + }) + // Unsupported collations + case class RegExpSubStrTestFail(l: String, r: String, c: String) + val failCases = Seq( + RegExpSubStrTestFail("ABCDE", ".c.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support RegExpInStr string expression with collation") { + // Supported collations + case class RegExpInStrTestCase[R](l: String, r: String, c: String, result: R) + val testCases = Seq( + RegExpInStrTestCase("ABCDE", ".C.", "UTF8_BINARY", 2), + RegExpInStrTestCase("ABĆDE", ".ć.", "UTF8_LCASE", 2), + RegExpInStrTestCase("ABCDE", ".c.", "UTF8_BINARY", 0) + ) + testCases.foreach(t => { + val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + }) + // Unsupported collations + case class RegExpInStrTestFail(l: String, r: String, c: String) + val failCases = Seq( + RegExpInStrTestFail("ABCDE", ".c.", "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + +} +// scalastyle:on nonascii diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index e815a5051ed20..78aee5b80e549 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -17,23 +17,27 @@ package org.apache.spark.sql -import scala.collection.immutable.Seq +import scala.jdk.CollectionConverters.MapHasAsScala -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkIllegalArgumentException} +import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper, Literal, StringTrim, StringTrimLeft, StringTrimRight} +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataType, IntegerType, StringType} +// scalastyle:off nonascii class CollationStringExpressionsSuite extends QueryTest - with SharedSparkSession { + with SharedSparkSession + with ExpressionEvalHelper { test("Support ConcatWs string expression with collation") { // Supported collations case class ConcatWsTestCase[R](s: String, a: Array[String], c: String, result: R) val testCases = Seq( ConcatWsTestCase(" ", Array("Spark", "SQL"), 
"UTF8_BINARY", "Spark SQL"), - ConcatWsTestCase(" ", Array("Spark", "SQL"), "UTF8_BINARY_LCASE", "Spark SQL"), + ConcatWsTestCase(" ", Array("Spark", "SQL"), "UTF8_LCASE", "Spark SQL"), ConcatWsTestCase(" ", Array("Spark", "SQL"), "UNICODE", "Spark SQL"), ConcatWsTestCase(" ", Array("Spark", "SQL"), "UNICODE_CI", "Spark SQL") ) @@ -54,7 +58,7 @@ class CollationStringExpressionsSuite }) // Collation mismatch val collationMismatch = intercept[AnalysisException] { - sql("SELECT concat_ws(' ',collate('Spark', 'UTF8_BINARY_LCASE'),collate('SQL', 'UNICODE'))") + sql("SELECT concat_ws(' ',collate('Spark', 'UTF8_LCASE'),collate('SQL', 'UNICODE'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } @@ -64,7 +68,7 @@ class CollationStringExpressionsSuite case class EltTestCase[R](index: Int, inputs: Array[String], c: String, result: R) val testCases = Seq( EltTestCase(1, Array("Spark", "SQL"), "UTF8_BINARY", "Spark"), - EltTestCase(1, Array("Spark", "SQL"), "UTF8_BINARY_LCASE", "Spark"), + EltTestCase(1, Array("Spark", "SQL"), "UTF8_LCASE", "Spark"), EltTestCase(2, Array("Spark", "SQL"), "UNICODE", "SQL"), EltTestCase(2, Array("Spark", "SQL"), "UNICODE_CI", "SQL") ) @@ -84,18 +88,35 @@ class CollationStringExpressionsSuite }) // Collation mismatch val collationMismatch = intercept[AnalysisException] { - sql("SELECT elt(0 ,collate('Spark', 'UTF8_BINARY_LCASE'), collate('SQL', 'UNICODE'))") + sql("SELECT elt(0 ,collate('Spark', 'UTF8_LCASE'), collate('SQL', 'UNICODE'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("Support SplitPart string expression with collation") { + // Supported collations + case class SplitPartTestCase[R](s: String, d: String, p: Int, c: String, result: R) + val testCases = Seq( + SplitPartTestCase("1a2", "a", 2, "UTF8_BINARY", "2"), + SplitPartTestCase("1a2", "a", 2, "UNICODE", "2"), + SplitPartTestCase("1a2", "A", 2, "UTF8_LCASE", "2"), + SplitPartTestCase("1a2", "A", 2, "UNICODE_CI", "2") + ) + testCases.foreach(t => { + val query = s"SELECT split_part(collate('${t.s}','${t.c}'),collate('${t.d}','${t.c}'),${t.p})" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + }) + } + test("Support Contains string expression with collation") { // Supported collations case class ContainsTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( ContainsTestCase("", "", "UTF8_BINARY", true), ContainsTestCase("abcde", "C", "UNICODE", false), - ContainsTestCase("abcde", "FGH", "UTF8_BINARY_LCASE", false), + ContainsTestCase("abcde", "FGH", "UTF8_LCASE", false), ContainsTestCase("abcde", "BCD", "UNICODE_CI", true) ) testCases.foreach(t => { @@ -109,7 +130,100 @@ class CollationStringExpressionsSuite }) // Collation mismatch val collationMismatch = intercept[AnalysisException] { - sql("SELECT contains(collate('abcde','UTF8_BINARY_LCASE'),collate('C','UNICODE_CI'))") + sql("SELECT contains(collate('abcde','UTF8_LCASE'),collate('C','UNICODE_CI'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support SubstringIndex expression with collation") { + case class SubstringIndexTestCase[R](string: String, delimiter: String, count: Integer, + c: String, result: R) + val testCases = Seq( + SubstringIndexTestCase("wwwgapachegorg", "g", -3, "UTF8_BINARY", "apachegorg"), + SubstringIndexTestCase("www||apache||org", "||", 2, "UTF8_BINARY", "www||apache"), + 
SubstringIndexTestCase("wwwXapacheXorg", "x", 2, "UTF8_LCASE", "wwwXapache"), + SubstringIndexTestCase("aaaaaaaaaa", "aa", 2, "UNICODE", "a"), + SubstringIndexTestCase("wwwmapacheMorg", "M", -2, "UNICODE_CI", "apacheMorg") + ) + testCases.foreach(t => { + val query = s"SELECT substring_index(collate('${t.string}','${t.c}')," + + s"collate('${t.delimiter}','${t.c}'),${t.count})" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType( + StringType(CollationFactory.collationNameToId(t.c)))) + // Implicit casting + checkAnswer(sql(s"SELECT substring_index(collate('${t.string}','${t.c}')," + + s"'${t.delimiter}',${t.count})"), Row(t.result)) + checkAnswer(sql(s"SELECT substring_index('${t.string}',collate('${t.delimiter}','${t.c}')," + + s"${t.count})"), Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT substring_index(collate('abcde','UTF8_LCASE')," + + "collate('C','UNICODE_CI'),1)") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support StringInStr string expression with collation") { + case class StringInStrTestCase[R](string: String, substring: String, c: String, result: R) + val testCases = Seq( + // scalastyle:off + StringInStrTestCase("test大千世界X大千世界", "大千", "UTF8_BINARY", 5), + StringInStrTestCase("test大千世界X大千世界", "界x", "UTF8_LCASE", 8), + StringInStrTestCase("test大千世界X大千世界", "界x", "UNICODE", 0), + StringInStrTestCase("test大千世界X大千世界", "界y", "UNICODE_CI", 0), + StringInStrTestCase("test大千世界X大千世界", "界x", "UNICODE_CI", 8), + StringInStrTestCase("abİo12", "i̇o", "UNICODE_CI", 3) + // scalastyle:on + ) + testCases.foreach(t => { + val query = s"SELECT instr(collate('${t.string}','${t.c}')," + + s"collate('${t.substring}','${t.c}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + // Implicit casting + checkAnswer(sql(s"SELECT instr(collate('${t.string}','${t.c}')," + + s"'${t.substring}')"), Row(t.result)) + checkAnswer(sql(s"SELECT instr('${t.string}'," + + s"collate('${t.substring}','${t.c}'))"), Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql(s"SELECT instr(collate('aaads','UTF8_BINARY'), collate('Aa','UTF8_LCASE'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support FindInSet string expression with collation") { + case class FindInSetTestCase[R](word: String, set: String, c: String, result: R) + val testCases = Seq( + FindInSetTestCase("AB", "abc,b,ab,c,def", "UTF8_BINARY", 0), + FindInSetTestCase("C", "abc,b,ab,c,def", "UTF8_LCASE", 4), + FindInSetTestCase("d,ef", "abc,b,ab,c,def", "UNICODE", 0), + // scalastyle:off + FindInSetTestCase("i̇o", "ab,İo,12", "UNICODE_CI", 2), + FindInSetTestCase("İo", "ab,i̇o,12", "UNICODE_CI", 2) + // scalastyle:on + ) + testCases.foreach(t => { + val query = s"SELECT find_in_set(collate('${t.word}', '${t.c}')," + + s"collate('${t.set}', '${t.c}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + // Implicit casting + checkAnswer(sql(s"SELECT find_in_set(collate('${t.word}', '${t.c}')," + + s"'${t.set}')"), Row(t.result)) + checkAnswer(sql(s"SELECT find_in_set('${t.word}'," + + s"collate('${t.set}', '${t.c}'))"), Row(t.result)) + }) + // Collation mismatch + val collationMismatch = 
intercept[AnalysisException] { + sql(s"SELECT find_in_set(collate('AB','UTF8_BINARY')," + + s"collate('ab,xyz,fgh','UTF8_LCASE'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } @@ -120,7 +234,7 @@ class CollationStringExpressionsSuite val testCases = Seq( StartsWithTestCase("", "", "UTF8_BINARY", true), StartsWithTestCase("abcde", "A", "UNICODE", false), - StartsWithTestCase("abcde", "FGH", "UTF8_BINARY_LCASE", false), + StartsWithTestCase("abcde", "FGH", "UTF8_LCASE", false), StartsWithTestCase("abcde", "ABC", "UNICODE_CI", true) ) testCases.foreach(t => { @@ -134,7 +248,116 @@ class CollationStringExpressionsSuite }) // Collation mismatch val collationMismatch = intercept[AnalysisException] { - sql("SELECT startswith(collate('abcde', 'UTF8_BINARY_LCASE'),collate('C', 'UNICODE_CI'))") + sql("SELECT startswith(collate('abcde', 'UTF8_LCASE'),collate('C', 'UNICODE_CI'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + test("TRANSLATE check result on explicitly collated string") { + // Supported collations + case class TranslateTestCase[R](input: String, matchExpression: String, + replaceExpression: String, collation: String, result: R) + val testCases = Seq( + TranslateTestCase("Translate", "Rnlt", "1234", "UTF8_LCASE", "41a2s3a4e"), + TranslateTestCase("Translate", "Rnlt", "1234", "UTF8_LCASE", "41a2s3a4e"), + TranslateTestCase("TRanslate", "rnlt", "XxXx", "UTF8_LCASE", "xXaxsXaxe"), + TranslateTestCase("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE", "xxaxsXaxex"), + TranslateTestCase("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE", "xXaxsXaxeX"), + // scalastyle:off + TranslateTestCase("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE", "test大千世AB大千世A"), + TranslateTestCase("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE", "大千世界abca大千世界"), + TranslateTestCase("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE", "oeso大千世界大千世界"), + TranslateTestCase("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE", "大千世界大千世界OesO"), + TranslateTestCase("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE", "世世世界世世世界tesT"), + // scalastyle:on + TranslateTestCase("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e"), + TranslateTestCase("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe"), + TranslateTestCase("TRanslater", "Rrnlt", "xXxXx", "UNICODE", "TxaxsXaxeX"), + TranslateTestCase("TRanslater", "Rrnlt", "XxxXx", "UNICODE", "TXaxsXaxex"), + // scalastyle:off + TranslateTestCase("test大千世界X大千世界", "界x", "AB", "UNICODE", "test大千世AX大千世A"), + TranslateTestCase("Test大千世界大千世界", "tT", "oO", "UNICODE", "Oeso大千世界大千世界"), + TranslateTestCase("大千世界大千世界tesT", "Tt", "Oo", "UNICODE", "大千世界大千世界oesO"), + // scalastyle:on + TranslateTestCase("Translate", "Rnlt", "1234", "UNICODE_CI", "41a2s3a4e"), + TranslateTestCase("TRanslate", "rnlt", "XxXx", "UNICODE_CI", "xXaxsXaxe"), + TranslateTestCase("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI", "xxaxsXaxex"), + TranslateTestCase("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI", "xXaxsXaxeX"), + // scalastyle:off + TranslateTestCase("test大千世界X大千世界", "界x", "AB", "UNICODE_CI", "test大千世AB大千世A"), + TranslateTestCase("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI", "大千世界abca大千世界"), + TranslateTestCase("Test大千世界大千世界", "tT", "oO", "UNICODE_CI", "oeso大千世界大千世界"), + TranslateTestCase("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI", "大千世界大千世界OesO"), + TranslateTestCase("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI", "世世世界世世世界tesT"), + // scalastyle:on + TranslateTestCase("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE", "14234e"), + TranslateTestCase("Translate", "Rnlasdfjhgadt", "1234", 
"UNICODE_CI", "14234e"), + TranslateTestCase("Translate", "Rnlasdfjhgadt", "1234", "UNICODE", "Tr4234e"), + TranslateTestCase("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY", "Tr4234e"), + TranslateTestCase("Translate", "Rnlt", "123495834634", "UTF8_LCASE", "41a2s3a4e"), + TranslateTestCase("Translate", "Rnlt", "123495834634", "UNICODE", "Tra2s3a4e"), + TranslateTestCase("Translate", "Rnlt", "123495834634", "UNICODE_CI", "41a2s3a4e"), + TranslateTestCase("Translate", "Rnlt", "123495834634", "UTF8_BINARY", "Tra2s3a4e"), + TranslateTestCase("abcdef", "abcde", "123", "UTF8_BINARY", "123f"), + TranslateTestCase("abcdef", "abcde", "123", "UTF8_LCASE", "123f"), + TranslateTestCase("abcdef", "abcde", "123", "UNICODE", "123f"), + TranslateTestCase("abcdef", "abcde", "123", "UNICODE_CI", "123f") + ) + + testCases.foreach(t => { + val query = s"SELECT translate(collate('${t.input}', '${t.collation}')," + + s"collate('${t.matchExpression}', '${t.collation}')," + + s"collate('${t.replaceExpression}', '${t.collation}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType( + StringType(CollationFactory.collationNameToId(t.collation)))) + // Implicit casting + checkAnswer(sql(s"SELECT translate(collate('${t.input}', '${t.collation}')," + + s"'${t.matchExpression}', '${t.replaceExpression}')"), Row(t.result)) + checkAnswer(sql(s"SELECT translate('${t.input}', collate('${t.matchExpression}'," + + s"'${t.collation}'), '${t.replaceExpression}')"), Row(t.result)) + checkAnswer(sql(s"SELECT translate('${t.input}', '${t.matchExpression}'," + + s"collate('${t.replaceExpression}', '${t.collation}'))"), Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql(s"SELECT translate(collate('Translate', 'UTF8_LCASE')," + + s"collate('Rnlt', 'UNICODE'), '1234')") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support Replace string expression with collation") { + case class ReplaceTestCase[R](source: String, search: String, replace: String, + c: String, result: R) + val testCases = Seq( + // scalastyle:off + ReplaceTestCase("r世eplace", "pl", "123", "UTF8_BINARY", "r世e123ace"), + ReplaceTestCase("repl世ace", "PL", "AB", "UTF8_LCASE", "reAB世ace"), + ReplaceTestCase("abcdabcd", "bc", "", "UNICODE", "adad"), + ReplaceTestCase("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c"), + ReplaceTestCase("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"), + ReplaceTestCase("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx") + // scalastyle:on + ) + testCases.foreach(t => { + val query = s"SELECT replace(collate('${t.source}','${t.c}'),collate('${t.search}'," + + s"'${t.c}'),collate('${t.replace}','${t.c}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType( + StringType(CollationFactory.collationNameToId(t.c)))) + // Implicit casting + checkAnswer(sql(s"SELECT replace(collate('${t.source}','${t.c}'),'${t.search}'," + + s"'${t.replace}')"), Row(t.result)) + checkAnswer(sql(s"SELECT replace('${t.source}',collate('${t.search}','${t.c}')," + + s"'${t.replace}')"), Row(t.result)) + checkAnswer(sql(s"SELECT replace('${t.source}','${t.search}'," + + s"collate('${t.replace}','${t.c}'))"), Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT startswith(collate('abcde', 'UTF8_LCASE'),collate('C', 'UNICODE_CI'))") } 
assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } @@ -145,7 +368,7 @@ class CollationStringExpressionsSuite val testCases = Seq( EndsWithTestCase("", "", "UTF8_BINARY", true), EndsWithTestCase("abcde", "E", "UNICODE", false), - EndsWithTestCase("abcde", "FGH", "UTF8_BINARY_LCASE", false), + EndsWithTestCase("abcde", "FGH", "UTF8_LCASE", false), EndsWithTestCase("abcde", "CDE", "UNICODE_CI", true) ) testCases.foreach(t => { @@ -159,7 +382,7 @@ class CollationStringExpressionsSuite }) // Collation mismatch val collationMismatch = intercept[AnalysisException] { - sql("SELECT endswith(collate('abcde', 'UTF8_BINARY_LCASE'),collate('C', 'UNICODE_CI'))") + sql("SELECT endswith(collate('abcde', 'UTF8_LCASE'),collate('C', 'UNICODE_CI'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } @@ -170,7 +393,7 @@ class CollationStringExpressionsSuite val testCases = Seq( StringRepeatTestCase("", 1, "UTF8_BINARY", ""), StringRepeatTestCase("a", 0, "UNICODE", ""), - StringRepeatTestCase("XY", 3, "UTF8_BINARY_LCASE", "XYXYXY"), + StringRepeatTestCase("XY", 3, "UTF8_LCASE", "XYXYXY"), StringRepeatTestCase("123", 2, "UNICODE_CI", "123123") ) testCases.foreach(t => { @@ -185,16 +408,16 @@ class CollationStringExpressionsSuite case class AsciiUnBase64TestCase[R](q: String, dt: DataType, r: R) val testCases = Seq( AsciiUnBase64TestCase("select ascii('a' collate utf8_binary)", IntegerType, 97), - AsciiUnBase64TestCase("select ascii('B' collate utf8_binary_lcase)", IntegerType, 66), + AsciiUnBase64TestCase("select ascii('B' collate utf8_lcase)", IntegerType, 66), AsciiUnBase64TestCase("select ascii('#' collate unicode)", IntegerType, 35), AsciiUnBase64TestCase("select ascii('!' collate unicode_ci)", IntegerType, 33), AsciiUnBase64TestCase("select unbase64('QUJD' collate utf8_binary)", BinaryType, Seq(65, 66, 67)), - AsciiUnBase64TestCase("select unbase64('eHl6' collate utf8_binary_lcase)", BinaryType, + AsciiUnBase64TestCase("select unbase64('eHl6' collate utf8_lcase)", BinaryType, Seq(120, 121, 122)), AsciiUnBase64TestCase("select unbase64('IyMj' collate utf8_binary)", BinaryType, Seq(35, 35, 35)), - AsciiUnBase64TestCase("select unbase64('IQ==' collate utf8_binary_lcase)", BinaryType, + AsciiUnBase64TestCase("select unbase64('IQ==' collate utf8_lcase)", BinaryType, Seq(33)) ) testCases.foreach(t => { @@ -208,11 +431,11 @@ class CollationStringExpressionsSuite case class DefaultCollationTestCase[R](q: String, c: String, r: R) val testCases = Seq( DefaultCollationTestCase("select chr(97)", "UTF8_BINARY", "a"), - DefaultCollationTestCase("select chr(66)", "UTF8_BINARY_LCASE", "B"), + DefaultCollationTestCase("select chr(66)", "UTF8_LCASE", "B"), DefaultCollationTestCase("select base64('xyz')", "UNICODE", "eHl6"), DefaultCollationTestCase("select base64('!')", "UNICODE_CI", "IQ=="), DefaultCollationTestCase("select decode(encode('$', 'utf-8'), 'utf-8')", "UTF8_BINARY", "$"), - DefaultCollationTestCase("select decode(encode('X', 'utf-8'), 'utf-8')", "UTF8_BINARY_LCASE", + DefaultCollationTestCase("select decode(encode('X', 'utf-8'), 'utf-8')", "UTF8_LCASE", "X"), DefaultCollationTestCase("select format_number(123.123, '###.###')", "UNICODE", "123.123"), DefaultCollationTestCase("select format_number(99.99, '##.##')", "UNICODE_CI", "99.99") @@ -231,7 +454,7 @@ class CollationStringExpressionsSuite val testCases = Seq( EncodeToBinarySentencesTestCase("select encode('a' collate utf8_binary, 'utf-8')", BinaryType, Seq(97)), - EncodeToBinarySentencesTestCase("select 
encode('$' collate utf8_binary_lcase, 'utf-8')", + EncodeToBinarySentencesTestCase("select encode('$' collate utf8_lcase, 'utf-8')", BinaryType, Seq(36)), EncodeToBinarySentencesTestCase("select to_binary('B' collate unicode, 'utf-8')", BinaryType, Seq(66)), @@ -244,9 +467,9 @@ class CollationStringExpressionsSuite ArrayType(ArrayType(StringType)), Seq(Seq("Hello", "world"), Seq("Nice", "day"))), EncodeToBinarySentencesTestCase( """ - |select sentences('Something else. Nothing here.' collate utf8_binary_lcase) + |select sentences('Something else. Nothing here.' collate utf8_lcase) |""".stripMargin, - ArrayType(ArrayType(StringType("UTF8_BINARY_LCASE"))), + ArrayType(ArrayType(StringType("UTF8_LCASE"))), Seq(Seq("Something", "else"), Seq("Nothing", "here"))) ) testCases.foreach(t => { @@ -261,7 +484,7 @@ class CollationStringExpressionsSuite case class UpperTestCase[R](s: String, c: String, result: R) val testCases = Seq( UpperTestCase("aBc", "UTF8_BINARY", "ABC"), - UpperTestCase("aBc", "UTF8_BINARY_LCASE", "ABC"), + UpperTestCase("aBc", "UTF8_LCASE", "ABC"), UpperTestCase("aBc", "UNICODE", "ABC"), UpperTestCase("aBc", "UNICODE_CI", "ABC") ) @@ -278,7 +501,7 @@ class CollationStringExpressionsSuite case class LowerTestCase[R](s: String, c: String, result: R) val testCases = Seq( LowerTestCase("aBc", "UTF8_BINARY", "abc"), - LowerTestCase("aBc", "UTF8_BINARY_LCASE", "abc"), + LowerTestCase("aBc", "UTF8_LCASE", "abc"), LowerTestCase("aBc", "UNICODE", "abc"), LowerTestCase("aBc", "UNICODE_CI", "abc") ) @@ -295,7 +518,7 @@ class CollationStringExpressionsSuite case class InitCapTestCase[R](s: String, c: String, result: R) val testCases = Seq( InitCapTestCase("aBc ABc", "UTF8_BINARY", "Abc Abc"), - InitCapTestCase("aBc ABc", "UTF8_BINARY_LCASE", "Abc Abc"), + InitCapTestCase("aBc ABc", "UTF8_LCASE", "Abc Abc"), InitCapTestCase("aBc ABc", "UNICODE", "Abc Abc"), InitCapTestCase("aBc ABc", "UNICODE_CI", "Abc Abc") ) @@ -307,9 +530,552 @@ class CollationStringExpressionsSuite }) } + test("Overlay string expression with collation") { + // Supported collations + case class OverlayTestCase(l: String, r: String, pos: Int, c: String, result: String) + val testCases = Seq( + OverlayTestCase("hello", " world", 6, "UTF8_BINARY", "hello world"), + OverlayTestCase("nice", " day", 5, "UTF8_LCASE", "nice day"), + OverlayTestCase("A", "B", 1, "UNICODE", "B"), + OverlayTestCase("!", "!!!", 1, "UNICODE_CI", "!!!") + ) + testCases.foreach(t => { + val query = + s""" + |select overlay(collate('${t.l}', '${t.c}') placing + |collate('${t.r}', '${t.c}') from ${t.pos}) + |""".stripMargin + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + // Implicit casting + checkAnswer(sql( + s""" + |select overlay(collate('${t.l}', '${t.c}') placing '${t.r}' from ${t.pos}) + |""".stripMargin), Row(t.result)) + checkAnswer(sql( + s""" + |select overlay('${t.l}' placing collate('${t.r}', '${t.c}') from ${t.pos}) + |""".stripMargin), Row(t.result)) + checkAnswer(sql( + s""" + |select overlay(collate('${t.l}', '${t.c}') + |placing '${t.r}' from collate('${t.pos}', '${t.c}')) + |""".stripMargin), Row(t.result)) + }) + // Collation mismatch + assert( + intercept[AnalysisException] { + sql("SELECT overlay('a' collate UNICODE PLACING 'b' collate UNICODE_CI FROM 1)") + }.getErrorClass == "COLLATION_MISMATCH.EXPLICIT" + ) + } + + test("FormatString string expression with collation") { + // Supported collations + case class FormatStringTestCase(f: 
String, a: Seq[Any], c: String, r: String) + val testCases = Seq( + FormatStringTestCase("%s%s", Seq("'a'", "'b'"), "UTF8_BINARY", "ab"), + FormatStringTestCase("%d", Seq(123), "UTF8_LCASE", "123"), + FormatStringTestCase("%s%d", Seq("'A'", 0), "UNICODE", "A0"), + FormatStringTestCase("%s%s", Seq("'Hello'", "'!!!'"), "UNICODE_CI", "Hello!!!") + ) + testCases.foreach(t => { + val query = + s""" + |select format_string(collate('${t.f}', '${t.c}'), ${t.a.mkString(", ")}) + |""".stripMargin + // Result & data type + checkAnswer(sql(query), Row(t.r)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + }) + } + + test("SoundEx string expression with collation") { + // Supported collations + case class SoundExTestCase(q: String, c: String, r: String) + val testCases = Seq( + SoundExTestCase("select soundex('A' collate utf8_binary)", "UTF8_BINARY", "A000"), + SoundExTestCase("select soundex('!' collate utf8_lcase)", "UTF8_LCASE", "!"), + SoundExTestCase("select soundex('$' collate unicode)", "UNICODE", "$"), + SoundExTestCase("select soundex('X' collate unicode_ci)", "UNICODE_CI", "X000") + ) + testCases.foreach(t => { + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> t.c) { + // Result & data type + checkAnswer(sql(t.q), Row(t.r)) + assert(sql(t.q).schema.fields.head.dataType.sameType(StringType(t.c))) + } + }) + } + + test("Length, BitLength & OctetLength string expressions with collations") { + // Supported collations + case class LenTestCase(q: String, r: Int) + val testCases = Seq( + LenTestCase("select length('hello' collate utf8_binary)", 5), + LenTestCase("select length('world' collate utf8_lcase)", 5), + LenTestCase("select length('ff' collate unicode)", 1), + LenTestCase("select bit_length('hello' collate unicode_ci)", 40), + LenTestCase("select bit_length('world' collate utf8_binary)", 40), + LenTestCase("select bit_length('ff' collate utf8_lcase)", 24), + LenTestCase("select octet_length('hello' collate unicode)", 5), + LenTestCase("select octet_length('world' collate unicode_ci)", 5), + LenTestCase("select octet_length('ff' collate utf8_binary)", 3) + ) + testCases.foreach(t => { + // Result & data type + checkAnswer(sql(t.q), Row(t.r)) + assert(sql(t.q).schema.fields.head.dataType.sameType(IntegerType)) + }) + } + + test("Luhncheck string expression with collation") { + // Supported collations + case class LuhncheckTestCase(q: String, c: String, r: Boolean) + val testCases = Seq( + LuhncheckTestCase("123", "UTF8_BINARY", r = false), + LuhncheckTestCase("000", "UTF8_LCASE", r = true), + LuhncheckTestCase("111", "UNICODE", r = false), + LuhncheckTestCase("222", "UNICODE_CI", r = false) + ) + testCases.foreach(t => { + val query = s"select luhn_check(${t.q})" + // Result & data type + checkAnswer(sql(query), Row(t.r)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + } + + test("Levenshtein string expression with collation") { + // Supported collations + case class LevenshteinTestCase( + left: String, right: String, collationName: String, threshold: Option[Int], result: Int + ) + val testCases = Seq( + LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY", None, result = 4), + LevenshteinTestCase("kitten", "sitTing", "UTF8_LCASE", None, result = 4), + LevenshteinTestCase("kitten", "sitTing", "UNICODE", Some(3), result = -1), + LevenshteinTestCase("kitten", "sitTing", "UNICODE_CI", Some(3), result = -1) + ) + testCases.foreach(t => { + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> t.collationName) { + val th = if (t.threshold.isDefined) 
s", ${t.threshold.get}" else "" + val query = s"select levenshtein('${t.left}', '${t.right}'$th)" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + } + }) + } + + test("Support IsValidUTF8 string expression with collation") { + // Supported collations + case class IsValidUTF8TestCase(input: String, collationName: String, result: Any) + val testCases = Seq( + IsValidUTF8TestCase("null", "UTF8_BINARY", result = null), + IsValidUTF8TestCase("''", "UTF8_LCASE", result = true), + IsValidUTF8TestCase("'abc'", "UNICODE", result = true), + IsValidUTF8TestCase("x'FF'", "UNICODE_CI", result = false) + ) + testCases.foreach { testCase => + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> testCase.collationName) { + val query = s"SELECT is_valid_utf8(${testCase.input})" + // Result & data type + checkAnswer(sql(query), Row(testCase.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + } + } + } + + test("Support MakeValidUTF8 string expression with collation") { + // Supported collations + case class MakeValidUTF8TestCase(input: String, collationName: String, result: Any) + val testCases = Seq( + MakeValidUTF8TestCase("null", "UTF8_BINARY", result = null), + MakeValidUTF8TestCase("''", "UTF8_LCASE", result = ""), + MakeValidUTF8TestCase("'abc'", "UNICODE", result = "abc"), + MakeValidUTF8TestCase("x'FF'", "UNICODE_CI", result = "\uFFFD") + ) + testCases.foreach { testCase => + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> testCase.collationName) { + val query = s"SELECT make_valid_utf8(${testCase.input})" + // Result & data type + checkAnswer(sql(query), Row(testCase.result)) + val dataType = StringType(testCase.collationName) + assert(sql(query).schema.fields.head.dataType.sameType(dataType)) + } + } + } + + test("Support ValidateUTF8 string expression with collation") { + // Supported collations + case class ValidateUTF8TestCase(input: String, collationName: String, result: Any) + val testCases = Seq( + ValidateUTF8TestCase("null", "UTF8_BINARY", result = null), + ValidateUTF8TestCase("''", "UTF8_LCASE", result = ""), + ValidateUTF8TestCase("'abc'", "UNICODE", result = "abc"), + ValidateUTF8TestCase("x'FF'", "UNICODE_CI", result = None) + ) + testCases.foreach { testCase => + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> testCase.collationName) { + val query = s"SELECT validate_utf8(${testCase.input})" + if (testCase.result == None) { + // Exception thrown + val e = intercept[SparkIllegalArgumentException] { + sql(query).collect() + } + assert(e.getErrorClass == "INVALID_UTF8_STRING") + assert(e.getMessageParameters.asScala == Map("str" -> "\\xFF")) + } else { + // Result & data type + checkAnswer(sql(query), Row(testCase.result)) + val dataType = StringType(testCase.collationName) + assert(sql(query).schema.fields.head.dataType.sameType(dataType)) + } + } + } + } + + test("Support TryValidateUTF8 string expression with collation") { + // Supported collations + case class ValidateUTF8TestCase(input: String, collationName: String, result: Any) + val testCases = Seq( + ValidateUTF8TestCase("null", "UTF8_BINARY", result = null), + ValidateUTF8TestCase("''", "UTF8_LCASE", result = ""), + ValidateUTF8TestCase("'abc'", "UNICODE", result = "abc"), + ValidateUTF8TestCase("x'FF'", "UNICODE_CI", result = null) + ) + testCases.foreach { testCase => + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> testCase.collationName) { + val query = s"SELECT try_validate_utf8(${testCase.input})" + // Result & data 
type + checkAnswer(sql(query), Row(testCase.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(testCase.collationName))) + } + } + } + + test("Support Left/Right/Substr with collation") { + case class SubstringTestCase( + method: String, + str: String, + len: String, + pad: Option[String], + collation: String, + result: Row) { + val strString = if (str == "null") "null" else s"'$str'" + val query = + s"SELECT $method(collate($strString, '$collation')," + + s" $len${pad.map(p => s", '$p'").getOrElse("")})" + } + + val checks = Seq( + SubstringTestCase("substr", "example", "1", Some("100"), "utf8_lcase", Row("example")), + SubstringTestCase("substr", "example", "2", Some("2"), "utf8_binary", Row("xa")), + SubstringTestCase("right", "", "1", None, "utf8_lcase", Row("")), + SubstringTestCase("substr", "example", "0", Some("0"), "unicode", Row("")), + SubstringTestCase("substr", "example", "-3", Some("2"), "unicode_ci", Row("pl")), + SubstringTestCase("substr", " a世a ", "2", Some("3"), "utf8_lcase", Row("a世a")), + SubstringTestCase("left", " a世a ", "3", None, "utf8_binary", Row(" a世")), + SubstringTestCase("right", " a世a ", "3", None, "unicode", Row("世a ")), + SubstringTestCase("left", "ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ", "3", None, "unicode_ci", Row("ÀÃÂ")), + SubstringTestCase("right", "ÀÃÂĀĂȦÄäâãȻȻȻȻȻǢǼÆ", "3", None, "utf8_lcase", Row("ǢǼÆ")), + SubstringTestCase("substr", "", "1", Some("1"), "utf8_lcase", Row("")), + SubstringTestCase("substr", "", "1", Some("1"), "unicode", Row("")), + SubstringTestCase("left", "", "1", None, "utf8_binary", Row("")), + SubstringTestCase("left", "null", "1", None, "utf8_lcase", Row(null)), + SubstringTestCase("right", "null", "1", None, "unicode", Row(null)), + SubstringTestCase("substr", "null", "1", None, "utf8_binary", Row(null)), + SubstringTestCase("substr", "null", "1", Some("1"), "unicode_ci", Row(null)), + SubstringTestCase("left", "null", "null", None, "utf8_lcase", Row(null)), + SubstringTestCase("right", "null", "null", None, "unicode", Row(null)), + SubstringTestCase("substr", "null", "null", Some("null"), "utf8_binary", Row(null)), + SubstringTestCase("substr", "null", "null", None, "unicode_ci", Row(null)), + SubstringTestCase("left", "ÀÃÂȦÄäåäáâãȻȻȻǢǼÆ", "null", None, "utf8_lcase", Row(null)), + SubstringTestCase("right", "ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ", "null", None, "unicode", Row(null)), + SubstringTestCase("substr", "ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ", "null", None, "utf8_binary", Row(null)), + SubstringTestCase("substr", "", "null", None, "unicode_ci", Row(null)) + ) + + checks.foreach { check => + // Result & data type + checkAnswer(sql(check.query), check.result) + assert(sql(check.query).schema.fields.head.dataType.sameType(StringType(check.collation))) + } + } + + test("Support StringRPad string expressions with collation") { + // Supported collations + case class StringRPadTestCase[R](s: String, len: Int, pad: String, c: String, result: R) + val testCases = Seq( + StringRPadTestCase("", 5, " ", "UTF8_BINARY", " "), + StringRPadTestCase("abc", 5, " ", "UNICODE", "abc "), + StringRPadTestCase("Hello", 7, "Wörld", "UTF8_LCASE", "HelloWö"), + StringRPadTestCase("1234567890", 5, "aaaAAa", "UNICODE_CI", "12345"), + StringRPadTestCase("aaAA", 2, " ", "UTF8_BINARY", "aa"), + StringRPadTestCase("ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ℀℃", 2, "1", "UTF8_LCASE", "ÀÃ"), + StringRPadTestCase("ĂȦÄäåäá", 20, "ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ", "UNICODE", "ĂȦÄäåäáÀÃÂĀĂȦÄäåäáâã"), + StringRPadTestCase("aȦÄä", 8, "a1", "UNICODE_CI", "aȦÄäa1a1") + ) + testCases.foreach(t => { + val 
query = s"SELECT rpad(collate('${t.s}', '${t.c}')," + + s" ${t.len}, collate('${t.pad}', '${t.c}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + // Implicit casting + checkAnswer( + sql(s"SELECT rpad(collate('${t.s}', '${t.c}'), ${t.len}, '${t.pad}')"), + Row(t.result)) + checkAnswer( + sql(s"SELECT rpad('${t.s}', ${t.len}, collate('${t.pad}', '${t.c}'))"), + Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT rpad(collate('abcde', 'UNICODE_CI'),1,collate('C', 'UTF8_LCASE'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support StringLPad string expressions with collation") { + // Supported collations + case class StringLPadTestCase[R](s: String, len: Int, pad: String, c: String, result: R) + val testCases = Seq( + StringLPadTestCase("", 5, " ", "UTF8_BINARY", " "), + StringLPadTestCase("abc", 5, " ", "UNICODE", " abc"), + StringLPadTestCase("Hello", 7, "Wörld", "UTF8_LCASE", "WöHello"), + StringLPadTestCase("1234567890", 5, "aaaAAa", "UNICODE_CI", "12345"), + StringLPadTestCase("aaAA", 2, " ", "UTF8_BINARY", "aa"), + StringLPadTestCase("ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ℀℃", 2, "1", "UTF8_LCASE", "ÀÃ"), + StringLPadTestCase("ĂȦÄäåäá", 20, "ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ", "UNICODE", "ÀÃÂĀĂȦÄäåäáâãĂȦÄäåäá"), + StringLPadTestCase("aȦÄä", 8, "a1", "UNICODE_CI", "a1a1aȦÄä") + ) + testCases.foreach(t => { + val query = s"SELECT lpad(collate('${t.s}', '${t.c}')," + + s" ${t.len}, collate('${t.pad}', '${t.c}'))" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) + // Implicit casting + checkAnswer( + sql(s"SELECT lpad(collate('${t.s}', '${t.c}'), ${t.len}, '${t.pad}')"), + Row(t.result)) + checkAnswer( + sql(s"SELECT lpad('${t.s}', ${t.len}, collate('${t.pad}', '${t.c}'))"), + Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT lpad(collate('abcde', 'UNICODE_CI'),1,collate('C', 'UTF8_LCASE'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("Support StringLPad string expressions with explicit collation on second parameter") { + val query = "SELECT lpad('abc', collate('5', 'unicode_ci'), ' ')" + checkAnswer(sql(query), Row(" abc")) + assert(sql(query).schema.fields.head.dataType.sameType(StringType(0))) + } + + test("Support Locate string expression with collation") { + case class StringLocateTestCase[R](substring: String, string: String, start: Integer, + c: String, result: R) + val testCases = Seq( + // scalastyle:off + StringLocateTestCase("aa", "aaads", 0, "UTF8_BINARY", 0), + StringLocateTestCase("aa", "Aaads", 0, "UTF8_LCASE", 0), + StringLocateTestCase("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8), + StringLocateTestCase("aBc", "abcabc", 4, "UTF8_LCASE", 4), + StringLocateTestCase("aa", "Aaads", 0, "UNICODE", 0), + StringLocateTestCase("abC", "abCabC", 2, "UNICODE", 4), + StringLocateTestCase("aa", "Aaads", 0, "UNICODE_CI", 0), + StringLocateTestCase("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8) + // scalastyle:on + ) + testCases.foreach(t => { + val query = s"SELECT locate(collate('${t.substring}','${t.c}')," + + s"collate('${t.string}','${t.c}'),${t.start})" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + // Implicit casting 
+ checkAnswer(sql(s"SELECT locate(collate('${t.substring}','${t.c}')," + + s"'${t.string}',${t.start})"), Row(t.result)) + checkAnswer(sql(s"SELECT locate('${t.substring}',collate('${t.string}'," + + s"'${t.c}'),${t.start})"), Row(t.result)) + }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT locate(collate('aBc', 'UTF8_BINARY'),collate('abcabc', 'UTF8_LCASE'),4)") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("StringTrim* functions - unit tests for both paths (codegen and eval)") { + // Without trimString param. + checkEvaluation( + StringTrim(Literal.create( " asd ", StringType("UTF8_BINARY"))), "asd") + checkEvaluation( + StringTrimLeft(Literal.create(" asd ", StringType("UTF8_LCASE"))), "asd ") + checkEvaluation(StringTrimRight( + Literal.create(" asd ", StringType("UTF8_BINARY"))), " asd") + + // With trimString param. + checkEvaluation( + StringTrim( + Literal.create(" asd ", StringType("UTF8_BINARY")), + Literal.create(" ", StringType("UTF8_BINARY"))), + "asd") + checkEvaluation( + StringTrimLeft( + Literal.create(" asd ", StringType("UTF8_LCASE")), + Literal.create(" ", StringType("UTF8_LCASE"))), + "asd ") + checkEvaluation( + StringTrimRight( + Literal.create(" asd ", StringType("UTF8_BINARY")), + Literal.create(" ", StringType("UTF8_BINARY"))), + " asd") + + checkEvaluation( + StringTrim( + Literal.create("xxasdxx", StringType("UTF8_BINARY")), + Literal.create("x", StringType("UTF8_BINARY"))), + "asd") + checkEvaluation( + StringTrimLeft( + Literal.create("xxasdxx", StringType("UTF8_LCASE")), + Literal.create("x", StringType("UTF8_LCASE"))), + "asdxx") + checkEvaluation( + StringTrimRight( + Literal.create("xxasdxx", StringType("UTF8_BINARY")), + Literal.create("x", StringType("UTF8_BINARY"))), + "xxasd") + } + + test("StringTrim* functions - E2E tests") { + case class StringTrimTestCase( + collation: String, + trimFunc: String, + sourceString: String, + hasTrimString: Boolean, + trimString: String, + expectedResultString: String) + + val testCases = Seq( + StringTrimTestCase("UTF8_BINARY", "TRIM", " asd ", false, null, "asd"), + StringTrimTestCase("UTF8_BINARY", "BTRIM", " asd ", true, null, null), + StringTrimTestCase("UTF8_BINARY", "LTRIM", "xxasdxx", true, "x", "asdxx"), + StringTrimTestCase("UTF8_BINARY", "RTRIM", "xxasdxx", true, "x", "xxasd"), + + StringTrimTestCase("UTF8_LCASE", "TRIM", " asd ", true, null, null), + StringTrimTestCase("UTF8_LCASE", "BTRIM", "xxasdxx", true, "x", "asd"), + StringTrimTestCase("UTF8_LCASE", "LTRIM", "xxasdxx", true, "x", "asdxx"), + StringTrimTestCase("UTF8_LCASE", "RTRIM", " asd ", false, null, " asd"), + + StringTrimTestCase("UTF8_BINARY", "TRIM", "xxasdxx", true, "x", "asd"), + StringTrimTestCase("UTF8_BINARY", "BTRIM", "xxasdxx", true, "x", "asd"), + StringTrimTestCase("UTF8_BINARY", "LTRIM", " asd ", false, null, "asd "), + StringTrimTestCase("UTF8_BINARY", "RTRIM", " asd ", true, null, null) + + // Other more complex cases can be found in unit tests in CollationSupportSuite.java. 
+ ) + + testCases.foreach(testCase => { + var df: DataFrame = null + + if (testCase.trimFunc.equalsIgnoreCase("BTRIM")) { + // BTRIM has arguments in (srcStr, trimStr) order + df = sql(s"SELECT ${testCase.trimFunc}(" + + s"COLLATE('${testCase.sourceString}', '${testCase.collation}')" + + (if (!testCase.hasTrimString) "" + else if (testCase.trimString == null) ", null" + else s", '${testCase.trimString}'") + + ")") + } + else { + // While other functions have arguments in (trimStr, srcStr) order + df = sql(s"SELECT ${testCase.trimFunc}(" + + (if (!testCase.hasTrimString) "" + else if (testCase.trimString == null) "null, " + else s"'${testCase.trimString}', ") + + s"COLLATE('${testCase.sourceString}', '${testCase.collation}')" + + ")") + } + + checkAnswer(df = df, expectedAnswer = Row(testCase.expectedResultString)) + }) + } + + test("StringTrim* functions - implicit collations") { + checkAnswer( + df = sql("SELECT TRIM(COLLATE('x', 'UTF8_BINARY'), COLLATE('xax', 'UTF8_BINARY'))"), + expectedAnswer = Row("a")) + checkAnswer( + df = sql("SELECT BTRIM(COLLATE('xax', 'UTF8_LCASE'), " + + "COLLATE('x', 'UTF8_LCASE'))"), + expectedAnswer = Row("a")) + checkAnswer( + df = sql("SELECT LTRIM(COLLATE('x', 'UTF8_BINARY'), COLLATE('xax', 'UTF8_BINARY'))"), + expectedAnswer = Row("ax")) + + checkAnswer( + df = sql("SELECT RTRIM('x', COLLATE('xax', 'UTF8_BINARY'))"), + expectedAnswer = Row("xa")) + checkAnswer( + df = sql("SELECT TRIM('x', COLLATE('xax', 'UTF8_LCASE'))"), + expectedAnswer = Row("a")) + checkAnswer( + df = sql("SELECT BTRIM('xax', COLLATE('x', 'UTF8_BINARY'))"), + expectedAnswer = Row("a")) + + checkAnswer( + df = sql("SELECT LTRIM(COLLATE('x', 'UTF8_BINARY'), 'xax')"), + expectedAnswer = Row("ax")) + checkAnswer( + df = sql("SELECT RTRIM(COLLATE('x', 'UTF8_LCASE'), 'xax')"), + expectedAnswer = Row("xa")) + checkAnswer( + df = sql("SELECT TRIM(COLLATE('x', 'UTF8_BINARY'), 'xax')"), + expectedAnswer = Row("a")) + } + + test("StringTrim* functions - collation type mismatch") { + List("TRIM", "LTRIM", "RTRIM").foreach(func => { + val collationMismatch = intercept[AnalysisException] { + sql("SELECT " + func + "(COLLATE('x', 'UTF8_LCASE'), " + + "COLLATE('xxaaaxx', 'UTF8_BINARY'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + }) + + val collationMismatch = intercept[AnalysisException] { + sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UTF8_BINARY'), COLLATE('x', 'UTF8_LCASE'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + } + + test("StringTrim* functions - unsupported collation types") { + List("TRIM", "LTRIM", "RTRIM").foreach(func => { + val collationMismatch = intercept[AnalysisException] { + sql("SELECT " + func + "(COLLATE('x', 'UNICODE_CI'), COLLATE('xxaaaxx', 'UNICODE_CI'))") + } + assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + + val collationMismatch = intercept[AnalysisException] { + sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UNICODE_CI'), COLLATE('x', 'UNICODE_CI'))") + } + assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + } + // TODO: Add more tests for other string expressions } +// scalastyle:on nonascii class CollationStringExpressionsANSISuite extends CollationStringExpressionsSuite { override protected def sparkConf: SparkConf = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index c4ddd25c99b6c..f662b86eaf815 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -17,11 +17,12 @@ package org.apache.spark.sql -import scala.collection.immutable.Seq import scala.jdk.CollectionConverters.MapHasAsJava import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.ExtendedAnalysisException +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.connector.{DatasourceV2SQLBase, FakeV2ProviderWithCustomSchema} import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryTable} @@ -30,8 +31,9 @@ import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership import org.apache.spark.sql.errors.DataTypeErrors.toSQLType import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec} -import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} -import org.apache.spark.sql.internal.SqlApiConf +import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.joins._ +import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType} class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { @@ -42,7 +44,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { private val allFileBasedDataSources = collationPreservingSources ++ collationNonPreservingSources test("collate returns proper type") { - Seq("utf8_binary", "utf8_binary_lcase", "unicode", "unicode_ci").foreach { collationName => + Seq("utf8_binary", "utf8_lcase", "unicode", "unicode_ci").foreach { collationName => checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) val collationId = CollationFactory.collationNameToId(collationName) assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType @@ -51,7 +53,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } test("collation name is case insensitive") { - Seq("uTf8_BiNaRy", "uTf8_BiNaRy_Lcase", "uNicOde", "UNICODE_ci").foreach { collationName => + Seq("uTf8_BiNaRy", "utf8_lcase", "uNicOde", "UNICODE_ci").foreach { collationName => checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) val collationId = CollationFactory.collationNameToId(collationName) assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType @@ -60,15 +62,25 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } test("collation expression returns name of collation") { - Seq("utf8_binary", "utf8_binary_lcase", "unicode", "unicode_ci").foreach { collationName => + Seq("utf8_binary", "utf8_lcase", "unicode", "unicode_ci").foreach { collationName => checkAnswer( sql(s"select collation('aaa' collate $collationName)"), Row(collationName.toUpperCase())) } } test("collate function syntax") { - assert(sql(s"select collate('aaa', 'utf8_binary')").schema(0).dataType == StringType(0)) - assert(sql(s"select collate('aaa', 'utf8_binary_lcase')").schema(0).dataType == StringType(1)) + assert(sql(s"select collate('aaa', 'utf8_binary')").schema(0).dataType == + StringType("UTF8_BINARY")) + assert(sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == + StringType("UTF8_LCASE")) + } + 
+ test("collate function syntax with default collation set") { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_LCASE") { + assert(sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == + StringType("UTF8_LCASE")) + assert(sql(s"select collate('aaa', 'UNICODE')").schema(0).dataType == StringType("UNICODE")) + } } test("collate function syntax invalid arg count") { @@ -141,7 +153,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { exception = intercept[SparkException] { sql("select 'aaa' collate UTF8_BS") }, errorClass = "COLLATION_INVALID_NAME", sqlState = "42704", - parameters = Map("proposal" -> "UTF8_BINARY", "collationName" -> "UTF8_BS")) + parameters = Map("collationName" -> "UTF8_BS", "proposals" -> "UTF8_LCASE")) } test("disable bucketing on collated string column") { @@ -178,9 +190,9 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { Seq( ("utf8_binary", "aaa", "AAA", false), ("utf8_binary", "aaa", "aaa", true), - ("utf8_binary_lcase", "aaa", "aaa", true), - ("utf8_binary_lcase", "aaa", "AAA", true), - ("utf8_binary_lcase", "aaa", "bbb", false), + ("utf8_lcase", "aaa", "aaa", true), + ("utf8_lcase", "aaa", "AAA", true), + ("utf8_lcase", "aaa", "bbb", false), ("unicode", "aaa", "aaa", true), ("unicode", "aaa", "AAA", false), ("unicode_CI", "aaa", "aaa", true), @@ -202,9 +214,9 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { ("utf8_binary", "AAA", "aaa", true), ("utf8_binary", "aaa", "aaa", false), ("utf8_binary", "aaa", "BBB", false), - ("utf8_binary_lcase", "aaa", "aaa", false), - ("utf8_binary_lcase", "AAA", "aaa", false), - ("utf8_binary_lcase", "aaa", "bbb", true), + ("utf8_lcase", "aaa", "aaa", false), + ("utf8_lcase", "AAA", "aaa", false), + ("utf8_lcase", "aaa", "bbb", true), ("unicode", "aaa", "aaa", false), ("unicode", "aaa", "AAA", true), ("unicode", "aaa", "BBB", true), @@ -276,9 +288,9 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { ("utf8_binary", Seq("AAA", "aaa"), Seq(Row(1, "AAA"), Row(1, "aaa"))), ("utf8_binary", Seq("aaa", "aaa"), Seq(Row(2, "aaa"))), ("utf8_binary", Seq("aaa", "bbb"), Seq(Row(1, "aaa"), Row(1, "bbb"))), - ("utf8_binary_lcase", Seq("aaa", "aaa"), Seq(Row(2, "aaa"))), - ("utf8_binary_lcase", Seq("AAA", "aaa"), Seq(Row(2, "AAA"))), - ("utf8_binary_lcase", Seq("aaa", "bbb"), Seq(Row(1, "aaa"), Row(1, "bbb"))), + ("utf8_lcase", Seq("aaa", "aaa"), Seq(Row(2, "aaa"))), + ("utf8_lcase", Seq("AAA", "aaa"), Seq(Row(2, "AAA"))), + ("utf8_lcase", Seq("aaa", "bbb"), Seq(Row(1, "aaa"), Row(1, "bbb"))), ("unicode", Seq("AAA", "aaa"), Seq(Row(1, "AAA"), Row(1, "aaa"))), ("unicode", Seq("aaa", "aaa"), Seq(Row(2, "aaa"))), ("unicode", Seq("aaa", "bbb"), Seq(Row(1, "aaa"), Row(1, "bbb"))), @@ -304,7 +316,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val tableNameBinary = "T_BINARY" withTable(tableNameNonBinary) { withTable(tableNameBinary) { - sql(s"CREATE TABLE $tableNameNonBinary (c STRING COLLATE UTF8_BINARY_LCASE) USING PARQUET") + sql(s"CREATE TABLE $tableNameNonBinary (c STRING COLLATE UTF8_LCASE) USING PARQUET") sql(s"INSERT INTO $tableNameNonBinary VALUES ('aaa')") sql(s"CREATE TABLE $tableNameBinary (c STRING COLLATE UTF8_BINARY) USING PARQUET") sql(s"INSERT INTO $tableNameBinary VALUES ('aaa')") @@ -334,7 +346,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("create table with collation") { val tableName = "dummy_tbl" - val 
collationName = "UTF8_BINARY_LCASE" + val collationName = "UTF8_LCASE" val collationId = CollationFactory.collationNameToId(collationName) allFileBasedDataSources.foreach { format => @@ -382,7 +394,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("add collated column with alter table") { val tableName = "alter_column_tbl" val defaultCollation = "UTF8_BINARY" - val collationName = "UTF8_BINARY_LCASE" + val collationName = "UTF8_LCASE" val collationId = CollationFactory.collationNameToId(collationName) withTable(tableName) { @@ -413,12 +425,12 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } } - test("implicit casting of collated strings") { + test("SPARK-47210: Implicit casting of collated strings") { val tableName = "parquet_dummy_implicit_cast_t22" withTable(tableName) { spark.sql( s""" - | CREATE TABLE $tableName(c1 STRING COLLATE UTF8_BINARY_LCASE, + | CREATE TABLE $tableName(c1 STRING COLLATE UTF8_LCASE, | c2 STRING COLLATE UNICODE, c3 STRING COLLATE UNICODE_CI, c4 STRING) | USING PARQUET |""".stripMargin) @@ -471,7 +483,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { // concat + in checkAnswer(sql(s"SELECT c1 FROM $tableName where c1 || 'a' " + - s"IN (COLLATE('aa', 'UTF8_BINARY_LCASE'))"), Seq(Row("a"), Row("A"))) + s"IN (COLLATE('aa', 'UTF8_LCASE'))"), Seq(Row("a"), Row("A"))) checkAnswer(sql(s"SELECT c1 FROM $tableName where (c1 || 'a') " + s"IN (COLLATE('aa', 'UTF8_BINARY'))"), Seq(Row("a"))) @@ -567,13 +579,72 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } } - test("cast of default collated strings in IN expression") { + test("SPARK-47692: Parameter marker with EXECUTE IMMEDIATE implicit casting") { + sql(s"DECLARE stmtStr1 = 'SELECT collation(:var1 || :var2)';") + sql(s"DECLARE stmtStr2 = 'SELECT collation(:var1 || (\\\'a\\\' COLLATE UNICODE))';") + + checkAnswer( + sql( + """EXECUTE IMMEDIATE stmtStr1 USING + | 'a' AS var1, + | 'b' AS var2;""".stripMargin), + Seq(Row("UTF8_BINARY")) + ) + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + checkAnswer( + sql( + """EXECUTE IMMEDIATE stmtStr1 USING + | 'a' AS var1, + | 'b' AS var2;""".stripMargin), + Seq(Row("UNICODE")) + ) + } + + checkAnswer( + sql( + """EXECUTE IMMEDIATE stmtStr2 USING + | 'a' AS var1;""".stripMargin), + Seq(Row("UNICODE")) + ) + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + checkAnswer( + sql( + """EXECUTE IMMEDIATE stmtStr2 USING + | 'a' AS var1;""".stripMargin), + Seq(Row("UNICODE")) + ) + } + } + + test("SPARK-47692: Parameter markers with variable mapping") { + checkAnswer( + spark.sql( + "SELECT collation(:var1 || :var2)", + Map("var1" -> Literal.create('a', StringType("UTF8_BINARY")), + "var2" -> Literal.create('b', StringType("UNICODE")))), + Seq(Row("UTF8_BINARY")) + ) + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + checkAnswer( + spark.sql( + "SELECT collation(:var1 || :var2)", + Map("var1" -> Literal.create('a', StringType("UTF8_BINARY")), + "var2" -> Literal.create('b', StringType("UNICODE")))), + Seq(Row("UNICODE")) + ) + } + } + + test("SPARK-47210: Cast of default collated strings in IN expression") { val tableName = "t1" withTable(tableName) { spark.sql( s""" | CREATE TABLE $tableName(utf8_binary STRING COLLATE UTF8_BINARY, - | utf8_binary_lcase STRING COLLATE UTF8_BINARY_LCASE) + | utf8_lcase STRING COLLATE UTF8_LCASE) | USING PARQUET |""".stripMargin) sql(s"INSERT INTO $tableName VALUES ('aaa', 'aaa')") 
@@ -582,24 +653,24 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"INSERT INTO $tableName VALUES ('BBB', 'BBB')") checkAnswer(sql(s"SELECT * FROM $tableName " + - s"WHERE utf8_binary_lcase IN " + - s"('aaa' COLLATE UTF8_BINARY_LCASE, 'bbb' COLLATE UTF8_BINARY_LCASE)"), + s"WHERE utf8_lcase IN " + + s"('aaa' COLLATE UTF8_LCASE, 'bbb' COLLATE UTF8_LCASE)"), Seq(Row("aaa", "aaa"), Row("AAA", "AAA"), Row("bbb", "bbb"), Row("BBB", "BBB"))) checkAnswer(sql(s"SELECT * FROM $tableName " + - s"WHERE utf8_binary_lcase IN ('aaa' COLLATE UTF8_BINARY_LCASE, 'bbb')"), + s"WHERE utf8_lcase IN ('aaa' COLLATE UTF8_LCASE, 'bbb')"), Seq(Row("aaa", "aaa"), Row("AAA", "AAA"), Row("bbb", "bbb"), Row("BBB", "BBB"))) } } // TODO(SPARK-47210): Add indeterminate support - test("indeterminate collation checks") { + test("SPARK-47210: Indeterminate collation checks") { val tableName = "t1" val newTableName = "t2" withTable(tableName) { spark.sql( s""" | CREATE TABLE $tableName(c1 STRING COLLATE UNICODE, - | c2 STRING COLLATE UTF8_BINARY_LCASE) + | c2 STRING COLLATE UTF8_LCASE) | USING PARQUET |""".stripMargin) sql(s"INSERT INTO $tableName VALUES ('aaa', 'aaa')") @@ -607,21 +678,21 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql(s"INSERT INTO $tableName VALUES ('bbb', 'bbb')") sql(s"INSERT INTO $tableName VALUES ('BBB', 'BBB')") - sql(s"SET spark.sql.legacy.createHiveTableByDefault=false") - - withTable(newTableName) { - checkError( - exception = intercept[AnalysisException] { - sql(s"CREATE TABLE $newTableName AS SELECT c1 || c2 FROM $tableName") - }, - errorClass = "COLLATION_MISMATCH.IMPLICIT") + withSQLConf("spark.sql.legacy.createHiveTableByDefault" -> "false") { + withTable(newTableName) { + checkError( + exception = intercept[AnalysisException] { + sql(s"CREATE TABLE $newTableName AS SELECT c1 || c2 FROM $tableName") + }, + errorClass = "COLLATION_MISMATCH.IMPLICIT") + } } } } test("create v2 table with collation column") { val tableName = "testcat.table_name" - val collationName = "UTF8_BINARY_LCASE" + val collationName = "UTF8_LCASE" val collationId = CollationFactory.collationNameToId(collationName) withTable(tableName) { @@ -685,7 +756,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val schema = StructType(StructField( "col", - StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE"))) :: Nil) + StringType(CollationFactory.collationNameToId("UTF8_LCASE"))) :: Nil) val df = spark.createDataFrame(sparkContext.parallelize(in), schema) df.repartition(10, df.col("col")).foreachPartition( @@ -699,37 +770,6 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { }) } - test("hash based joins not allowed for non-binary collated strings") { - val in = (('a' to 'z') ++ ('A' to 'Z')).map(_.toString * 3).map(e => Row.apply(e, e)) - - val schema = StructType(StructField( - "col_non_binary", - StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE"))) :: - StructField("col_binary", StringType) :: Nil) - val df1 = spark.createDataFrame(sparkContext.parallelize(in), schema) - - // Binary collations are allowed to use hash join. - assert(collectFirst( - df1.hint("broadcast").join(df1, df1("col_binary") === df1("col_binary")) - .queryExecution.executedPlan) { - case _: BroadcastHashJoinExec => () - }.nonEmpty) - - // Even with hint broadcast, hash join is not used for non-binary collated strings. 
- assert(collectFirst( - df1.hint("broadcast").join(df1, df1("col_non_binary") === df1("col_non_binary")) - .queryExecution.executedPlan) { - case _: BroadcastHashJoinExec => () - }.isEmpty) - - // Instead they will default to sort merge join. - assert(collectFirst( - df1.hint("broadcast").join(df1, df1("col_non_binary") === df1("col_non_binary")) - .queryExecution.executedPlan) { - case _: SortMergeJoinExec => () - }.nonEmpty) - } - test("Generated column expressions using collations - errors out") { checkError( exception = intercept[AnalysisException] { @@ -747,7 +787,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "fieldName" -> "c2", "expressionStr" -> "SUBSTRING(c1, 0, 1)", "reason" -> - "generation expression cannot contain non-binary orderable collated string type")) + "generation expression cannot contain non utf8 binary collated string type")) checkError( exception = intercept[AnalysisException] { @@ -765,7 +805,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "fieldName" -> "c2", "expressionStr" -> "LOWER(c1)", "reason" -> - "generation expression cannot contain non-binary orderable collated string type")) + "generation expression cannot contain non utf8 binary collated string type")) checkError( exception = intercept[AnalysisException] { @@ -783,7 +823,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "fieldName" -> "c2", "expressionStr" -> "UCASE(struct1.a)", "reason" -> - "generation expression cannot contain non-binary orderable collated string type")) + "generation expression cannot contain non utf8 binary collated string type")) } test("SPARK-47431: Default collation set to UNICODE, literal test") { @@ -792,6 +832,45 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } } + test("SPARK-47972: Cast expression limitation for collations") { + checkError( + exception = intercept[ParseException] + (sql("SELECT cast(1 as string collate unicode)")), + errorClass = "UNSUPPORTED_DATATYPE", + parameters = Map( + "typeName" -> toSQLType(StringType("UNICODE"))), + context = + ExpectedContext(fragment = s"cast(1 as string collate unicode)", start = 7, stop = 39) + ) + + checkError( + exception = intercept[ParseException] + (sql("SELECT 'A' :: string collate unicode")), + errorClass = "UNSUPPORTED_DATATYPE", + parameters = Map( + "typeName" -> toSQLType(StringType("UNICODE"))), + context = ExpectedContext(fragment = s"'A' :: string collate unicode", start = 7, stop = 35) + ) + + checkAnswer(sql(s"SELECT cast(1 as string)"), Seq(Row("1"))) + checkAnswer(sql(s"SELECT cast('A' as string)"), Seq(Row("A"))) + + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { + checkError( + exception = intercept[ParseException] + (sql("SELECT cast(1 as string collate unicode)")), + errorClass = "UNSUPPORTED_DATATYPE", + parameters = Map( + "typeName" -> toSQLType(StringType("UNICODE"))), + context = + ExpectedContext(fragment = s"cast(1 as string collate unicode)", start = 7, stop = 39) + ) + + checkAnswer(sql(s"SELECT cast(1 as string)"), Seq(Row("1"))) + checkAnswer(sql(s"SELECT collation(cast(1 as string))"), Seq(Row("UNICODE"))) + } + } + test("SPARK-47431: Default collation set to UNICODE, column type test") { withTable("t") { withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE") { @@ -830,13 +909,13 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val table = "table_agg" // array withTable(table) { - sql(s"create table 
$table (a array) using parquet") + sql(s"create table $table (a array) using parquet") sql(s"insert into $table values (array('aaa')), (array('AAA'))") checkAnswer(sql(s"select distinct a from $table"), Seq(Row(Seq("aaa")))) } // map doesn't support aggregation withTable(table) { - sql(s"create table $table (m map) using parquet") + sql(s"create table $table (m map) using parquet") val query = s"select distinct m from $table" checkError( exception = intercept[ExtendedAnalysisException](sql(query)), @@ -844,14 +923,14 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { parameters = Map( "colName" -> "`m`", "dataType" -> toSQLType(MapType( - StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE")), + StringType(CollationFactory.collationNameToId("UTF8_LCASE")), StringType))), context = ExpectedContext(query, 0, query.length - 1) ) } // struct withTable(table) { - sql(s"create table $table (s struct) using parquet") + sql(s"create table $table (s struct) using parquet") sql(s"insert into $table values (named_struct('fld', 'aaa')), (named_struct('fld', 'AAA'))") checkAnswer(sql(s"select s.fld from $table group by s"), Seq(Row("aaa"))) } @@ -863,7 +942,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { // array withTable(tableLeft, tableRight) { Seq(tableLeft, tableRight).map(tab => - sql(s"create table $tab (a array) using parquet")) + sql(s"create table $tab (a array) using parquet")) Seq((tableLeft, "array('aaa')"), (tableRight, "array('AAA')")).map{ case (tab, data) => sql(s"insert into $tab values ($data)") } @@ -876,7 +955,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { // map doesn't support joins withTable(tableLeft, tableRight) { Seq(tableLeft, tableRight).map(tab => - sql(s"create table $tab (m map) using parquet")) + sql(s"create table $tab (m map) using parquet")) val query = s"select $tableLeft.m from $tableLeft join $tableRight on $tableLeft.m = $tableRight.m" val ctx = s"$tableLeft.m = $tableRight.m" @@ -886,7 +965,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { parameters = Map( "functionName" -> "`=`", "dataType" -> toSQLType(MapType( - StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE")), + StringType(CollationFactory.collationNameToId("UTF8_LCASE")), StringType )), "sqlExpr" -> "\"(m = m)\""), @@ -895,7 +974,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { // struct withTable(tableLeft, tableRight) { Seq(tableLeft, tableRight).map(tab => - sql(s"create table $tab (s struct) using parquet")) + sql(s"create table $tab (s struct) using parquet")) Seq( (tableLeft, "named_struct('fld', 'aaa')"), (tableRight, "named_struct('fld', 'AAA')") @@ -911,37 +990,37 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } test("Support operations on complex types containing collated strings") { - checkAnswer(sql("select reverse('abc' collate utf8_binary_lcase)"), Seq(Row("cba"))) + checkAnswer(sql("select reverse('abc' collate utf8_lcase)"), Seq(Row("cba"))) checkAnswer(sql( """ - |select reverse(array('a' collate utf8_binary_lcase, - |'b' collate utf8_binary_lcase)) + |select reverse(array('a' collate utf8_lcase, + |'b' collate utf8_lcase)) |""".stripMargin), Seq(Row(Seq("b", "a")))) checkAnswer(sql( """ - |select array_join(array('a' collate utf8_binary_lcase, - |'b' collate utf8_binary_lcase), ', ' collate utf8_binary_lcase) + |select array_join(array('a' collate 
utf8_lcase, + |'b' collate utf8_lcase), ', ' collate utf8_lcase) |""".stripMargin), Seq(Row("a, b"))) checkAnswer(sql( """ - |select array_join(array('a' collate utf8_binary_lcase, - |'b' collate utf8_binary_lcase, null), ', ' collate utf8_binary_lcase, - |'c' collate utf8_binary_lcase) + |select array_join(array('a' collate utf8_lcase, + |'b' collate utf8_lcase, null), ', ' collate utf8_lcase, + |'c' collate utf8_lcase) |""".stripMargin), Seq(Row("a, b, c"))) checkAnswer(sql( """ - |select concat('a' collate utf8_binary_lcase, 'b' collate utf8_binary_lcase) + |select concat('a' collate utf8_lcase, 'b' collate utf8_lcase) |""".stripMargin), Seq(Row("ab"))) checkAnswer(sql( """ - |select concat(array('a' collate utf8_binary_lcase, 'b' collate utf8_binary_lcase)) + |select concat(array('a' collate utf8_lcase, 'b' collate utf8_lcase)) |""".stripMargin), Seq(Row(Seq("a", "b")))) checkAnswer(sql( """ - |select map('a' collate utf8_binary_lcase, 1, 'b' collate utf8_binary_lcase, 2) - |['A' collate utf8_binary_lcase] + |select map('a' collate utf8_lcase, 1, 'b' collate utf8_lcase, 2) + |['A' collate utf8_lcase] |""".stripMargin), Seq(Row(1))) - val ctx = "map('aaa' collate utf8_binary_lcase, 1, 'AAA' collate utf8_binary_lcase, 2)['AaA']" + val ctx = "map('aaa' collate utf8_lcase, 1, 'AAA' collate utf8_lcase, 2)['AaA']" val query = s"select $ctx" checkError( exception = intercept[AnalysisException](sql(query)), @@ -952,7 +1031,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "inputSql" -> "\"AaA\"", "inputType" -> toSQLType(StringType), "requiredType" -> toSQLType(StringType( - CollationFactory.collationNameToId("UTF8_BINARY_LCASE"))) + CollationFactory.collationNameToId("UTF8_LCASE"))) ), context = ExpectedContext( fragment = ctx, @@ -967,7 +1046,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val t2 = "T_BINARY" withTable(t1, t2) { - sql(s"CREATE TABLE $t1 (c STRING COLLATE UTF8_BINARY_LCASE, i int) USING PARQUET") + sql(s"CREATE TABLE $t1 (c STRING COLLATE UTF8_LCASE, i int) USING PARQUET") sql(s"INSERT INTO $t1 VALUES ('aA', 2), ('Aa', 1), ('ab', 3), ('aa', 1)") sql(s"CREATE TABLE $t2 (c STRING, i int) USING PARQUET") @@ -981,4 +1060,409 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { checkAnswer(dfNonBinary, dfBinary) } } + + test("hash join should be used for collated strings") { + val t1 = "T_1" + val t2 = "T_2" + + case class HashJoinTestCase[R](collation: String, result: R) + val testCases = Seq( + HashJoinTestCase("UTF8_BINARY", Seq(Row("aa", 1, "aa", 2))), + HashJoinTestCase("UTF8_LCASE", Seq(Row("aa", 1, "AA", 2), Row("aa", 1, "aa", 2))), + HashJoinTestCase("UNICODE", Seq(Row("aa", 1, "aa", 2))), + HashJoinTestCase("UNICODE_CI", Seq(Row("aa", 1, "AA", 2), Row("aa", 1, "aa", 2))) + ) + + testCases.foreach(t => { + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (x STRING COLLATE ${t.collation}, i int) USING PARQUET") + sql(s"INSERT INTO $t1 VALUES ('aa', 1)") + + sql(s"CREATE TABLE $t2 (y STRING COLLATE ${t.collation}, j int) USING PARQUET") + sql(s"INSERT INTO $t2 VALUES ('AA', 2), ('aa', 2)") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y") + checkAnswer(df, t.result) + + val queryPlan = df.queryExecution.executedPlan + + // confirm that hash join is used instead of sort merge join + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.nonEmpty + ) + assert( + collectFirst(queryPlan) { + case _: SortMergeJoinExec => () + }.isEmpty + ) + + // Only if 
collation doesn't support binary equality, collation key should be injected. + if (!CollationFactory.fetchCollation(t.collation).supportsBinaryEquality) { + assert(collectFirst(queryPlan) { + case b: HashJoin => b.leftKeys.head + }.head.isInstanceOf[CollationKey]) + } else { + assert(!collectFirst(queryPlan) { + case b: HashJoin => b.leftKeys.head + }.head.isInstanceOf[CollationKey]) + } + } + }) + } + + test("hash join should be used for arrays of collated strings") { + val t1 = "T_1" + val t2 = "T_2" + + case class HashJoinTestCase[R](collation: String, result: R) + val testCases = Seq( + HashJoinTestCase("UTF8_BINARY", + Seq(Row(Seq("aa"), 1, Seq("aa"), 2))), + HashJoinTestCase("UTF8_LCASE", + Seq(Row(Seq("aa"), 1, Seq("AA"), 2), Row(Seq("aa"), 1, Seq("aa"), 2))), + HashJoinTestCase("UNICODE", + Seq(Row(Seq("aa"), 1, Seq("aa"), 2))), + HashJoinTestCase("UNICODE_CI", + Seq(Row(Seq("aa"), 1, Seq("AA"), 2), Row(Seq("aa"), 1, Seq("aa"), 2))) + ) + + testCases.foreach(t => { + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (x ARRAY, i int) USING PARQUET") + sql(s"INSERT INTO $t1 VALUES (array('aa'), 1)") + + sql(s"CREATE TABLE $t2 (y ARRAY, j int) USING PARQUET") + sql(s"INSERT INTO $t2 VALUES (array('AA'), 2), (array('aa'), 2)") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y") + checkAnswer(df, t.result) + + val queryPlan = df.queryExecution.executedPlan + + // confirm that hash join is used instead of sort merge join + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.nonEmpty + ) + assert( + collectFirst(queryPlan) { + case _: ShuffledJoin => () + }.isEmpty + ) + + // Only if collation doesn't support binary equality, collation key should be injected. + if (!CollationFactory.fetchCollation(t.collation).supportsBinaryEquality) { + assert(collectFirst(queryPlan) { + case b: BroadcastHashJoinExec => b.leftKeys.head + }.head.asInstanceOf[ArrayTransform].function.asInstanceOf[LambdaFunction]. 
+ function.isInstanceOf[CollationKey]) + } else { + assert(!collectFirst(queryPlan) { + case b: BroadcastHashJoinExec => b.leftKeys.head + }.head.isInstanceOf[ArrayTransform]) + } + } + }) + } + + test("hash join should be used for arrays of arrays of collated strings") { + val t1 = "T_1" + val t2 = "T_2" + + case class HashJoinTestCase[R](collation: String, result: R) + val testCases = Seq( + HashJoinTestCase("UTF8_BINARY", + Seq(Row(Seq(Seq("aa")), 1, Seq(Seq("aa")), 2))), + HashJoinTestCase("UTF8_LCASE", + Seq(Row(Seq(Seq("aa")), 1, Seq(Seq("AA")), 2), Row(Seq(Seq("aa")), 1, Seq(Seq("aa")), 2))), + HashJoinTestCase("UNICODE", + Seq(Row(Seq(Seq("aa")), 1, Seq(Seq("aa")), 2))), + HashJoinTestCase("UNICODE_CI", + Seq(Row(Seq(Seq("aa")), 1, Seq(Seq("AA")), 2), Row(Seq(Seq("aa")), 1, Seq(Seq("aa")), 2))) + ) + + testCases.foreach(t => { + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (x ARRAY>, i int) USING " + + s"PARQUET") + sql(s"INSERT INTO $t1 VALUES (array(array('aa')), 1)") + + sql(s"CREATE TABLE $t2 (y ARRAY>, j int) USING " + + s"PARQUET") + sql(s"INSERT INTO $t2 VALUES (array(array('AA')), 2), (array(array('aa')), 2)") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y") + checkAnswer(df, t.result) + + val queryPlan = df.queryExecution.executedPlan + + // confirm that hash join is used instead of sort merge join + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.nonEmpty + ) + assert( + collectFirst(queryPlan) { + case _: ShuffledJoin => () + }.isEmpty + ) + + // Only if collation doesn't support binary equality, collation key should be injected. + if (!CollationFactory.fetchCollation(t.collation).supportsBinaryEquality) { + assert(collectFirst(queryPlan) { + case b: BroadcastHashJoinExec => b.leftKeys.head + }.head.asInstanceOf[ArrayTransform].function. + asInstanceOf[LambdaFunction].function.asInstanceOf[ArrayTransform].function. + asInstanceOf[LambdaFunction].function.isInstanceOf[CollationKey]) + } else { + assert(!collectFirst(queryPlan) { + case b: BroadcastHashJoinExec => b.leftKeys.head + }.head.isInstanceOf[ArrayTransform]) + } + } + }) + } + + test("hash join should respect collation for struct of strings") { + val t1 = "T_1" + val t2 = "T_2" + + case class HashJoinTestCase[R](collation: String, result: R) + val testCases = Seq( + HashJoinTestCase("UTF8_BINARY", + Seq(Row(Row("aa"), 1, Row("aa"), 2))), + HashJoinTestCase("UTF8_LCASE", + Seq(Row(Row("aa"), 1, Row("AA"), 2), Row(Row("aa"), 1, Row("aa"), 2))), + HashJoinTestCase("UNICODE", + Seq(Row(Row("aa"), 1, Row("aa"), 2))), + HashJoinTestCase("UNICODE_CI", + Seq(Row(Row("aa"), 1, Row("AA"), 2), Row(Row("aa"), 1, Row("aa"), 2))) + ) + testCases.foreach(t => { + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (x STRUCT, i int) USING PARQUET") + sql(s"INSERT INTO $t1 VALUES (named_struct('f', 'aa'), 1)") + + sql(s"CREATE TABLE $t2 (y STRUCT, j int) USING PARQUET") + sql(s"INSERT INTO $t2 VALUES (named_struct('f', 'AA'), 2), (named_struct('f', 'aa'), 2)") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y") + checkAnswer(df, t.result) + + val queryPlan = df.queryExecution.executedPlan + + // Confirm that hash join is used instead of sort merge join. + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.nonEmpty + ) + assert( + collectFirst(queryPlan) { + case _: ShuffledJoin => () + }.isEmpty + ) + + // Only if collation doesn't support binary equality, collation key should be injected. 
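// (Hedged aside using only APIs already exercised in this suite: fetchCollation exposes the
// flag that drives this branch. Collations that can compare strings byte-for-byte are hashed
// directly, while the remaining ones get a CollationKey injected, as the assertions below check.)
//   Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { name =>
//     val collation = CollationFactory.fetchCollation(name)
//     println(s"$name supportsBinaryEquality=${collation.supportsBinaryEquality}")
//   }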
+ if (!CollationFactory.fetchCollation(t.collation).supportsBinaryEquality) { + assert(queryPlan.toString().contains("collationkey")) + } else { + assert(!queryPlan.toString().contains("collationkey")) + } + } + }) + } + + test("hash join should respect collation for struct of array of struct of strings") { + val t1 = "T_1" + val t2 = "T_2" + + case class HashJoinTestCase[R](collation: String, result: R) + val testCases = Seq( + HashJoinTestCase("UTF8_BINARY", + Seq(Row(Row(Seq(Row("aa"))), 1, Row(Seq(Row("aa"))), 2))), + HashJoinTestCase("UTF8_LCASE", + Seq(Row(Row(Seq(Row("aa"))), 1, Row(Seq(Row("AA"))), 2), + Row(Row(Seq(Row("aa"))), 1, Row(Seq(Row("aa"))), 2))), + HashJoinTestCase("UNICODE", + Seq(Row(Row(Seq(Row("aa"))), 1, Row(Seq(Row("aa"))), 2))), + HashJoinTestCase("UNICODE_CI", + Seq(Row(Row(Seq(Row("aa"))), 1, Row(Seq(Row("AA"))), 2), + Row(Row(Seq(Row("aa"))), 1, Row(Seq(Row("aa"))), 2))) + ) + testCases.foreach(t => { + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (x STRUCT>>, " + + s"i int) USING PARQUET") + sql(s"INSERT INTO $t1 VALUES (named_struct('f', array(named_struct('f', 'aa'))), 1)") + + sql(s"CREATE TABLE $t2 (y STRUCT>>, " + + s"j int) USING PARQUET") + sql(s"INSERT INTO $t2 VALUES (named_struct('f', array(named_struct('f', 'AA'))), 2), " + + s"(named_struct('f', array(named_struct('f', 'aa'))), 2)") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y") + checkAnswer(df, t.result) + + val queryPlan = df.queryExecution.executedPlan + + // confirm that hash join is used instead of sort merge join + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.nonEmpty + ) + assert( + collectFirst(queryPlan) { + case _: ShuffledJoin => () + }.isEmpty + ) + + // Only if collation doesn't support binary equality, collation key should be injected. 
+ if (!CollationFactory.fetchCollation(t.collation).supportsBinaryEquality) { + assert(queryPlan.toString().contains("collationkey")) + } else { + assert(!queryPlan.toString().contains("collationkey")) + } + } + }) + } + + test("rewrite with collationkey should be an excludable rule") { + val t1 = "T_1" + val t2 = "T_2" + val collation = "UTF8_LCASE" + val collationRewriteJoinRule = "org.apache.spark.sql.catalyst.analysis.RewriteCollationJoin" + withTable(t1, t2) { + withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> collationRewriteJoinRule) { + sql(s"CREATE TABLE $t1 (x STRING COLLATE $collation, i int) USING PARQUET") + sql(s"INSERT INTO $t1 VALUES ('aa', 1)") + + sql(s"CREATE TABLE $t2 (y STRING COLLATE $collation, j int) USING PARQUET") + sql(s"INSERT INTO $t2 VALUES ('AA', 2), ('aa', 2)") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y") + checkAnswer(df, Seq(Row("aa", 1, "AA", 2), Row("aa", 1, "aa", 2))) + + val queryPlan = df.queryExecution.executedPlan + + // confirm that sort merge join is used instead of hash join + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.isEmpty + ) + assert( + collectFirst(queryPlan) { + case _: SortMergeJoinExec => () + }.nonEmpty + ) + } + } + } + + test("rewrite with collationkey shouldn't disrupt multiple join conditions") { + val t1 = "T_1" + val t2 = "T_2" + + case class HashMultiJoinTestCase[R]( + type1: String, + type2: String, + data1: String, + data2: String, + result: R + ) + val testCases = Seq( + HashMultiJoinTestCase("STRING COLLATE UTF8_BINARY", "INT", + "'a', 0, 1", "'a', 0, 1", Row("a", 0, 1, "a", 0, 1)), + HashMultiJoinTestCase("STRING COLLATE UTF8_BINARY", "STRING COLLATE UTF8_BINARY", + "'a', 'a', 1", "'a', 'a', 1", Row("a", "a", 1, "a", "a", 1)), + HashMultiJoinTestCase("STRING COLLATE UTF8_BINARY", "STRING COLLATE UTF8_LCASE", + "'a', 'a', 1", "'a', 'A', 1", Row("a", "a", 1, "a", "A", 1)), + HashMultiJoinTestCase("STRING COLLATE UTF8_LCASE", "STRING COLLATE UNICODE_CI", + "'a', 'a', 1", "'A', 'A', 1", Row("a", "a", 1, "A", "A", 1)) + ) + + testCases.foreach(t => { + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (x ${t.type1}, y ${t.type2}, i int) USING PARQUET") + sql(s"INSERT INTO $t1 VALUES (${t.data1})") + sql(s"CREATE TABLE $t2 (x ${t.type1}, y ${t.type2}, i int) USING PARQUET") + sql(s"INSERT INTO $t2 VALUES (${t.data2})") + + val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.x AND $t1.y = $t2.y") + checkAnswer(df, t.result) + + val queryPlan = df.queryExecution.executedPlan + + // confirm that hash join is used instead of sort merge join + assert( + collectFirst(queryPlan) { + case _: HashJoin => () + }.nonEmpty + ) + assert( + collectFirst(queryPlan) { + case _: SortMergeJoinExec => () + }.isEmpty + ) + } + }) + } + + test("hll sketch aggregate should respect collation") { + case class HllSketchAggTestCase[R](c: String, result: R) + val testCases = Seq( + HllSketchAggTestCase("UTF8_BINARY", 4), + HllSketchAggTestCase("UTF8_LCASE", 3), + HllSketchAggTestCase("UNICODE", 4), + HllSketchAggTestCase("UNICODE_CI", 3) + ) + testCases.foreach(t => { + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.c) { + val q = "SELECT hll_sketch_estimate(hll_sketch_agg(col)) FROM " + + "VALUES ('a'), ('A'), ('b'), ('b'), ('c') tab(col)" + val df = sql(q) + checkAnswer(df, Seq(Row(t.result))) + } + }) + } + + test("cache table with collated columns") { + val collations = Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI") + val lazyOptions = Seq(false, true) + + for ( + collation <- collations; + 
lazyTable <- lazyOptions + ) { + val lazyStr = if (lazyTable) "LAZY" else "" + + def checkCacheTable(values: String): Unit = { + sql(s"CACHE $lazyStr TABLE tbl AS SELECT col FROM VALUES ($values) AS (col)") + // Checks in-memory fetching code path. + val all = sql("SELECT col FROM tbl") + assert(all.queryExecution.executedPlan.collectFirst { + case _: InMemoryTableScanExec => true + }.nonEmpty) + checkAnswer(all, Row("a")) + // Checks column stats code path. + checkAnswer(sql("SELECT col FROM tbl WHERE col = 'a'"), Row("a")) + checkAnswer(sql("SELECT col FROM tbl WHERE col = 'b'"), Seq.empty) + } + + withTable("tbl") { + checkCacheTable(s"'a' COLLATE $collation") + } + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> collation) { + withTable("tbl") { + checkCacheTable("'a'") + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAsOfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAsOfJoinSuite.scala index 280eb095dc753..a03f083123558 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAsOfJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAsOfJoinSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import scala.jdk.CollectionConverters._ +import org.apache.spark.sql.catalyst.plans.AsOfJoinDirection import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession @@ -123,6 +124,24 @@ class DataFrameAsOfJoinSuite extends QueryTest parameters = Map.empty) } + test("as-of join - unsupported direction") { + val (df1, df2) = prepareForAsOfJoin() + val direction = "unknown" + checkError( + exception = intercept[AnalysisException] { + df1.joinAsOf(df2, df1.col("a"), df2.col("a"), usingColumns = Seq.empty, + joinType = "inner", tolerance = lit(-1), allowExactMatches = true, + direction = direction) + }, + errorClass = "AS_OF_JOIN.UNSUPPORTED_DIRECTION", + sqlState = "42604", + parameters = Map( + "direction" -> direction, + "supported" -> AsOfJoinDirection.supported.mkString("'", "', '", "'") + ) + ) + } + test("as-of join - allowExactMatches = false") { val (df1, df2) = prepareForAsOfJoin() checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index e42f397cbfc29..df1bb39f18744 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -82,7 +82,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { "bucket", "days", "hours", "months", "years", // Datasource v2 partition transformations "product", // Discussed in https://github.com/apache/spark/pull/30745 "unwrap_udt", - "collect_top_k" + "collect_top_k", + "timestamp_add", + "timestamp_diff" ) // We only consider functions matching this pattern, this excludes symbolic and other diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 01905e2c05fd7..f6fd6b501d790 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -621,4 +621,14 @@ class DataFrameJoinSuite extends QueryTest checkAnswer(joined, Row("x", null, null)) checkAnswer(joined.filter($"new".isNull), Row("x", null, null)) } + + 
test("SPARK-47810: replace equivalent expression to <=> in join condition") { + val joinTypes = Seq("inner", "outer", "left", "right", "semi", "anti", "cross") + joinTypes.foreach(joinType => { + val df1 = testData3.as("x").join(testData3.as("y"), + ($"x.a" <=> $"y.b").or($"x.a".isNull.and($"y.b".isNull)), joinType) + val df2 = testData3.as("x").join(testData3.as("y"), $"x.a" <=> $"y.b", joinType) + checkAnswer(df1, df2) + }) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameShowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameShowSuite.scala index e889fe2545afa..d728cc5810a21 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameShowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameShowSuite.scala @@ -190,7 +190,9 @@ class DataFrameShowSuite extends QueryTest with SharedSparkSession { ||[33 34]|[31 32 33 34 36]| |+-------+----------------+ |""".stripMargin - assert(df.showString(10) === expectedAnswer) + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "HEX_DISCRETE") { + assert(df.showString(10) === expectedAnswer) + } } test("showString: binary, vertical = true") { @@ -204,7 +206,9 @@ class DataFrameShowSuite extends QueryTest with SharedSparkSession { "-RECORD 1---------------\n" + " _1 | [33 34] \n" + " _2 | [31 32 33 34 36] \n" - assert(df.showString(10, vertical = true) === expectedAnswer) + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "HEX_DISCRETE") { + assert(df.showString(10, vertical = true) === expectedAnswer) + } } test("showString: minimum column width") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6b34a6412cc0f..760ee80260808 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -32,7 +32,7 @@ import org.apache.spark.api.python.PythonEvalType import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Cast, EqualTo, ExpressionSet, GreaterThan, Literal, PythonUDF, ScalarSubquery, Uuid} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Cast, CreateMap, EqualTo, ExpressionSet, GreaterThan, Literal, PythonUDF, ScalarSubquery, Uuid} import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{Filter, LeafNode, LocalRelation, LogicalPlan, OneRowRelation} @@ -1043,11 +1043,46 @@ class DataFrameSuite extends QueryTest ("12".getBytes(StandardCharsets.UTF_8), "ABC.".getBytes(StandardCharsets.UTF_8)), ("34".getBytes(StandardCharsets.UTF_8), "12346".getBytes(StandardCharsets.UTF_8)) ).toDF() - val expectedAnswer = Seq( - Seq("_1", "_2"), - Seq("[31 32]", "[41 42 43 2E]"), - Seq("[33 34]", "[31 32 33 34 36]")) - assert(df.getRows(10, 20) === expectedAnswer) + + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "HEX_DISCRETE") { + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("[31 32]", "[41 42 43 2E]"), + Seq("[33 34]", "[31 32 33 34 36]")) + assert(df.getRows(10, 20) === expectedAnswer) + } + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "HEX") { + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("3132", "4142432E"), + Seq("3334", 
"3132333436") + ) + assert(df.getRows(10, 20) === expectedAnswer) + } + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "BASE64") { + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("MTI", "QUJDLg"), + Seq("MzQ", "MTIzNDY") + ) + assert(df.getRows(10, 20) === expectedAnswer) + } + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "UTF8") { + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("12", "ABC."), + Seq("34", "12346") + ) + assert(df.getRows(10, 20) === expectedAnswer) + } + withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> "BASIC") { + val expectedAnswer = Seq( + Seq("_1", "_2"), + Seq("[49, 50]", "[65, 66, 67, 46]"), + Seq("[51, 52]", "[49, 50, 51, 52, 54]") + ) + assert(df.getRows(10, 20) === expectedAnswer) + } } test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") { @@ -2469,6 +2504,14 @@ class DataFrameSuite extends QueryTest assert(row.getInt(0).toString == row.getString(2)) assert(row.getInt(0).toString == row.getString(3)) } + + val v3 = Column(CreateMap(Seq(Literal("key"), Literal("value")))) + val v4 = to_csv(struct(v3.as("a"))) // to_csv is CodegenFallback + df.select(v3, v3, v4, v4).collect().foreach { row => + assert(row.getMap(0).toString() == row.getMap(1).toString()) + assert(row.getString(2) == s"{key -> ${row.getMap(0).get("key").get}}") + assert(row.getString(3) == s"{key -> ${row.getMap(0).get("key").get}}") + } } test("SPARK-45216: Non-deterministic functions with seed") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala index fe1393af81749..95f4cc78d1564 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala @@ -32,6 +32,28 @@ import org.apache.spark.sql.types.CalendarIntervalType class DataFrameWindowFramesSuite extends QueryTest with SharedSparkSession { import testImplicits._ + test("reuse window partitionBy") { + val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") + val w = Window.partitionBy("key").orderBy("value") + + checkAnswer( + df.select( + lead("key", 1).over(w), + lead("value", 1).over(w)), + Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil) + } + + test("reuse window orderBy") { + val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") + val w = Window.orderBy("value").partitionBy("key") + + checkAnswer( + df.select( + lead("key", 1).over(w), + lead("value", 1).over(w)), + Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil) + } + test("lead/lag with empty data frame") { val df = Seq.empty[(Int, String)].toDF("key", "value") val window = Window.partitionBy($"key").orderBy($"value") @@ -570,4 +592,30 @@ class DataFrameWindowFramesSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-34227: WindowFunctionFrame should clear its states during preparation") { + // This creates a single partition dataframe with 3 records: + // "a", 0, null + // "a", 1, "x" + // "b", 0, null + val df = spark.range(0, 3, 1, 1).select( + when($"id" < 2, lit("a")).otherwise(lit("b")).as("key"), + ($"id" % 2).cast("int").as("order"), + when($"id" % 2 === 0, lit(null)).otherwise(lit("x")).as("value")) + + val window1 = Window.partitionBy($"key").orderBy($"order") + .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) + val window2 = Window.partitionBy($"key").orderBy($"order") + 
.rowsBetween(Window.unboundedPreceding, Window.currentRow) + checkAnswer( + df.select( + $"key", + $"order", + nth_value($"value", 1, ignoreNulls = true).over(window1), + nth_value($"value", 1, ignoreNulls = true).over(window2)), + Seq( + Row("a", 0, "x", null), + Row("a", 1, "x", "x"), + Row("b", 0, null, null))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 4c852711451c7..e3aff9b36aece 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -44,28 +44,6 @@ class DataFrameWindowFunctionsSuite extends QueryTest import testImplicits._ - test("reuse window partitionBy") { - val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - val w = Window.partitionBy("key").orderBy("value") - - checkAnswer( - df.select( - lead("key", 1).over(w), - lead("value", 1).over(w)), - Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil) - } - - test("reuse window orderBy") { - val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - val w = Window.orderBy("value").partitionBy("key") - - checkAnswer( - df.select( - lead("key", 1).over(w), - lead("value", 1).over(w)), - Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil) - } - test("rank functions in unspecific window") { withTempView("window_table") { val df = Seq((1, "1"), (2, "2"), (1, "2"), (2, "2")).toDF("key", "value") @@ -1156,32 +1134,6 @@ class DataFrameWindowFunctionsSuite extends QueryTest Row(Seq(0.0f, -0.0f), Row(0.0d, Double.NaN), Seq(Row(0.0d, 0.0/0.0)), 2))) } - test("SPARK-34227: WindowFunctionFrame should clear its states during preparation") { - // This creates a single partition dataframe with 3 records: - // "a", 0, null - // "a", 1, "x" - // "b", 0, null - val df = spark.range(0, 3, 1, 1).select( - when($"id" < 2, lit("a")).otherwise(lit("b")).as("key"), - ($"id" % 2).cast("int").as("order"), - when($"id" % 2 === 0, lit(null)).otherwise(lit("x")).as("value")) - - val window1 = Window.partitionBy($"key").orderBy($"order") - .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) - val window2 = Window.partitionBy($"key").orderBy($"order") - .rowsBetween(Window.unboundedPreceding, Window.currentRow) - checkAnswer( - df.select( - $"key", - $"order", - nth_value($"value", 1, ignoreNulls = true).over(window1), - nth_value($"value", 1, ignoreNulls = true).over(window2)), - Seq( - Row("a", 0, "x", null), - Row("a", 1, "x", "x"), - Row("b", 0, null, null))) - } - test("SPARK-38237: require all cluster keys for child required distribution for window query") { def partitionExpressionsColumns(expressions: Seq[Expression]): Seq[String] = { expressions.flatMap { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 16a493b52909e..fdb2ec30fdd2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -21,6 +21,7 @@ import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.sql.{Date, Timestamp} import scala.collection.immutable.HashSet +import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag import scala.util.Random @@ -29,7 +30,7 @@ import org.scalatest.Assertions._ import 
org.scalatest.exceptions.TestFailedException import org.scalatest.prop.TableDrivenPropertyChecks._ -import org.apache.spark.{SparkConf, SparkException, SparkRuntimeException, SparkUnsupportedOperationException, TaskContext} +import org.apache.spark.{SparkConf, SparkRuntimeException, SparkUnsupportedOperationException, TaskContext} import org.apache.spark.TestUtils.withListener import org.apache.spark.internal.config.MAX_RESULT_SIZE import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} @@ -37,7 +38,7 @@ import org.apache.spark.sql.catalyst.{FooClassWithEnum, FooEnum, ScroogeLikeExam import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoders, ExpressionEncoder, OuterScopes} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.BoxedIntEncoder import org.apache.spark.sql.catalyst.expressions.{CodegenObjectFactoryMode, GenericRowWithSchema} -import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi} +import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.catalyst.trees.DataFrameQueryContext import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.{LogicalRDD, RDDScanExec, SQLExecution} @@ -542,25 +543,20 @@ class DatasetSuite extends QueryTest val ds1 = Seq(1, 2, 3).toDS().as("a") val ds2 = Seq(1, 2).toDS().as("b") - val e1 = intercept[AnalysisException] { - ds1.joinWith(ds2, $"a.value" === $"b.value", "left_semi") - }.getMessage - assert(e1.contains("Invalid join type in joinWith: " + LeftSemi.sql)) - - val e2 = intercept[AnalysisException] { - ds1.joinWith(ds2, $"a.value" === $"b.value", "semi") - }.getMessage - assert(e2.contains("Invalid join type in joinWith: " + LeftSemi.sql)) - - val e3 = intercept[AnalysisException] { - ds1.joinWith(ds2, $"a.value" === $"b.value", "left_anti") - }.getMessage - assert(e3.contains("Invalid join type in joinWith: " + LeftAnti.sql)) + def checkJoinWithJoinType(joinType: String): Unit = { + val semiErrorParameters = Map("joinType" -> JoinType(joinType).sql) + checkError( + exception = intercept[AnalysisException]( + ds1.joinWith(ds2, $"a.value" === $"b.value", joinType) + ), + errorClass = "INVALID_JOIN_TYPE_FOR_JOINWITH", + sqlState = "42613", + parameters = semiErrorParameters + ) + } - val e4 = intercept[AnalysisException] { - ds1.joinWith(ds2, $"a.value" === $"b.value", "anti") - }.getMessage - assert(e4.contains("Invalid join type in joinWith: " + LeftAnti.sql)) + Seq("leftsemi", "left_semi", "semi", "leftanti", "left_anti", "anti") + .foreach(checkJoinWithJoinType(_)) } test("groupBy function, keys") { @@ -957,6 +953,25 @@ class DatasetSuite extends QueryTest assert(result2.length == 3) } + test("SPARK-48718: cogroup deserializer expr is resolved before dedup relation") { + val lhs = spark.createDataFrame( + List(Row(123L)).asJava, + StructType(Seq(StructField("GROUPING_KEY", LongType))) + ) + val rhs = spark.createDataFrame( + List(Row(0L, 123L)).asJava, + StructType(Seq(StructField("ID", LongType), StructField("GROUPING_KEY", LongType))) + ) + + val lhsKV = lhs.groupByKey((r: Row) => r.getAs[Long]("GROUPING_KEY")) + val rhsKV = rhs.groupByKey((r: Row) => r.getAs[Long]("GROUPING_KEY")) + val cogrouped = lhsKV.cogroup(rhsKV)( + (a: Long, b: Iterator[Row], c: Iterator[Row]) => Iterator(0L) + ) + val joined = rhs.join(cogrouped, col("ID") === col("value"), "left") + checkAnswer(joined, Row(0L, 123L, 0L) :: Nil) + } + test("SPARK-34806: observation on datasets") { val namedObservation = Observation("named") val unnamedObservation = Observation() @@ -1251,11 
+1266,10 @@ class DatasetSuite extends QueryTest // Shouldn't throw runtime exception when parent object (`ClassData`) is null assert(buildDataset(Row(null)).collect() === Array(NestedStruct(null))) - val message = intercept[RuntimeException] { + // Just check the error class here to avoid flakiness due to different parameters. + assert(intercept[SparkRuntimeException] { buildDataset(Row(Row("hello", null))).collect() - }.getCause.getMessage - - assert(message.contains("Null value appeared in non-nullable field")) + }.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } test("SPARK-12478: top level null field") { @@ -1593,9 +1607,8 @@ class DatasetSuite extends QueryTest } test("Dataset should throw RuntimeException if top-level product input object is null") { - val e = intercept[RuntimeException](Seq(ClassData("a", 1), null).toDS()) - assert(e.getCause.getMessage.contains("Null value appeared in non-nullable field")) - assert(e.getCause.getMessage.contains("top level Product or row object")) + val e = intercept[SparkRuntimeException](Seq(ClassData("a", 1), null).toDS()) + assert(e.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } test("dropDuplicates") { @@ -2038,19 +2051,34 @@ class DatasetSuite extends QueryTest test("SPARK-22472: add null check for top-level primitive values") { // If the primitive values are from Option, we need to do runtime null check. val ds = Seq(Some(1), None).toDS().as[Int] - val e1 = intercept[RuntimeException](ds.collect()) - assert(e1.getCause.isInstanceOf[NullPointerException]) - val e2 = intercept[SparkException](ds.map(_ * 2).collect()) - assert(e2.getCause.isInstanceOf[NullPointerException]) + val errorClass = "NOT_NULL_ASSERT_VIOLATION" + val sqlState = "42000" + val parameters = Map("walkedTypePath" -> "\n- root class: \"int\"\n") + checkError( + exception = intercept[SparkRuntimeException](ds.collect()), + errorClass = errorClass, + sqlState = sqlState, + parameters = parameters) + checkError( + exception = intercept[SparkRuntimeException](ds.map(_ * 2).collect()), + errorClass = errorClass, + sqlState = sqlState, + parameters = parameters) withTempPath { path => Seq(Integer.valueOf(1), null).toDF("i").write.parquet(path.getCanonicalPath) // If the primitive values are from files, we need to do runtime null check. 
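// (Hedged condensed form of the migrated assertions below: the runtime null check now surfaces
// as a SparkRuntimeException carrying an error class, rather than a RuntimeException wrapping
// a NullPointerException.)
//   val e = intercept[SparkRuntimeException] {
//     spark.read.parquet(path.getCanonicalPath).as[Int].collect()
//   }
//   assert(e.getErrorClass == "NOT_NULL_ASSERT_VIOLATION")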
val ds = spark.read.parquet(path.getCanonicalPath).as[Int] - val e1 = intercept[RuntimeException](ds.collect()) - assert(e1.getCause.isInstanceOf[NullPointerException]) - val e2 = intercept[SparkException](ds.map(_ * 2).collect()) - assert(e2.getCause.isInstanceOf[NullPointerException]) + checkError( + exception = intercept[SparkRuntimeException](ds.collect()), + errorClass = errorClass, + sqlState = sqlState, + parameters = parameters) + checkError( + exception = intercept[SparkRuntimeException](ds.map(_ * 2).collect()), + errorClass = errorClass, + sqlState = sqlState, + parameters = parameters) } } @@ -2068,8 +2096,8 @@ class DatasetSuite extends QueryTest test("SPARK-23835: null primitive data type should throw NullPointerException") { val ds = Seq[(Option[Int], Option[Int])]((Some(1), None)).toDS() - val e = intercept[RuntimeException](ds.as[(Int, Int)].collect()) - assert(e.getCause.isInstanceOf[NullPointerException]) + val exception = intercept[SparkRuntimeException](ds.as[(Int, Int)].collect()) + assert(exception.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } test("SPARK-24569: Option of primitive types are mistakenly mapped to struct type") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index da04674b99205..22fdd96ce6bad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -151,11 +151,11 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite } test("explain table valued functions") { - checkKeywordsExistsInExplain(sql("select * from RaNgE(2)"), "Range (0, 2, step=1, splits=None)") + checkKeywordsExistsInExplain(sql("select * from RaNgE(2)"), "Range (0, 2, step=1)") checkKeywordsExistsInExplain(sql("SELECT * FROM range(3) CROSS JOIN range(3)"), "Join Cross", - ":- Range (0, 3, step=1, splits=None)", - "+- Range (0, 3, step=1, splits=None)") + ":- Range (0, 3, step=1)", + "+- Range (0, 3, step=1)") } test("explain lateral joins") { @@ -192,9 +192,11 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite |) """.stripMargin) checkKeywordsExistsInExplain(df2, - "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") + "Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x]", + "Project [cast(id#xL as string) AS col1#x, " + + "cast((id#xL + cast(1 as bigint)) as string) AS col2#x, " + + "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, " + + "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]") val df3 = sql( """ @@ -208,9 +210,10 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite |) """.stripMargin) checkKeywordsExistsInExplain(df3, - "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") + "Project [concat(col1#x, cast(concat(col3#x, col4#x) as string)) AS col#x]", + "Project [cast(id#xL as string) AS col1#x, " + + "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, " + + "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]") } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala index 73b2eba7060d0..443597f10056b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala @@ -117,6 +117,10 @@ class ExpressionsSchemaSuite extends QueryTest with SharedSparkSession { // Note: We need to filter out the commands that set the parameters, such as: // SET spark.sql.parser.escapedStringLiterals=true example.split(" > ").tail.filterNot(_.trim.startsWith("SET")).take(1).foreach { + case _ if funcName == "from_avro" || funcName == "to_avro" => + // Skip running the example queries for the from_avro and to_avro functions because + // these functions dynamically load the AvroDataToCatalyst or CatalystDataToAvro classes + // which are not available in this test. case exampleRe(sql, _) => val df = spark.sql(sql) val escapedSql = sql.replaceAll("\\|", "|") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 8a092ab69cf17..229677d208136 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GreaterTha import org.apache.spark.sql.catalyst.expressions.IntegralLiteralTestUtils.{negativeInt, positiveInt} import org.apache.spark.sql.catalyst.plans.logical.Filter import org.apache.spark.sql.catalyst.types.DataTypeUtils -import org.apache.spark.sql.execution.{ExplainMode, FileSourceScanLike, SimpleMode} +import org.apache.spark.sql.execution.{FileSourceScanLike, SimpleMode} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.FilePartition import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, FileScan} @@ -474,7 +474,7 @@ class FileBasedDataSourceSuite extends QueryTest errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`vectors`", - "columnType" -> "\"ARRAY\"", + "columnType" -> "UDT(\"ARRAY\")", "format" -> "CSV") ) @@ -487,7 +487,7 @@ class FileBasedDataSourceSuite extends QueryTest errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`a`", - "columnType" -> "\"ARRAY\"", + "columnType" -> "UDT(\"ARRAY\")", "format" -> "CSV") ) } @@ -545,7 +545,7 @@ class FileBasedDataSourceSuite extends QueryTest errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`a`", - "columnType" -> "\"INTERVAL\"", + "columnType" -> "UDT(\"INTERVAL\")", "format" -> formatParameter ) ) @@ -595,7 +595,7 @@ class FileBasedDataSourceSuite extends QueryTest errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`testType()`", - "columnType" -> "\"VOID\"", + "columnType" -> "UDT(\"VOID\")", "format" -> formatParameter ) ) @@ -624,7 +624,7 @@ class FileBasedDataSourceSuite extends QueryTest errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`a`", - "columnType" -> "\"VOID\"", + "columnType" -> "UDT(\"VOID\")", "format" -> formatParameter ) ) @@ -1242,59 +1242,6 @@ class FileBasedDataSourceSuite extends QueryTest } } } - - test("disable filter pushdown for collated strings") { - 
Seq("parquet").foreach { format => - Seq(format, "").foreach { conf => - withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> conf) { - withTempPath { path => - val collation = "'UTF8_BINARY_LCASE'" - val df = sql( - s"""SELECT - | COLLATE(c, $collation) as c1, - | struct(COLLATE(c, $collation)) as str, - | named_struct('f1', named_struct('f2', - | COLLATE(c, $collation), 'f3', 1)) as namedstr, - | array(COLLATE(c, $collation)) as arr, - | map(COLLATE(c, $collation), 1) as map1, - | map(1, COLLATE(c, $collation)) as map2 - |FROM VALUES ('aaa'), ('AAA'), ('bbb') - |as data(c) - |""".stripMargin) - - df.write.format(format).save(path.getAbsolutePath) - - // filter and expected result - val filters = Seq( - ("==", Seq(Row("aaa"), Row("AAA"))), - ("!=", Seq(Row("bbb"))), - ("<", Seq()), - ("<=", Seq(Row("aaa"), Row("AAA"))), - (">", Seq(Row("bbb"))), - (">=", Seq(Row("aaa"), Row("AAA"), Row("bbb")))) - - filters.foreach { filter => - val readback = spark.read - .format(format) - .load(path.getAbsolutePath) - .where(s"c1 ${filter._1} collate('aaa', $collation)") - .where(s"str ${filter._1} struct(collate('aaa', $collation))") - .where(s"namedstr.f1.f2 ${filter._1} collate('aaa', $collation)") - .where(s"arr ${filter._1} array(collate('aaa', $collation))") - .where(s"map_keys(map1) ${filter._1} array(collate('aaa', $collation))") - .where(s"map_values(map2) ${filter._1} array(collate('aaa', $collation))") - .select("c1") - - val explain = readback.queryExecution.explainString( - ExplainMode.fromString("extended")) - assert(explain.contains("PushedFilters: []")) - checkAnswer(readback, filter._2) - } - } - } - } - } - } } object TestingUDT { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ICUCollationsMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ICUCollationsMapSuite.scala new file mode 100644 index 0000000000000..42d486bd75454 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ICUCollationsMapSuite.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile, CollationFactory} + +// scalastyle:off line.size.limit +/** + * Guard against breaking changes in ICU locale names and codes supported by Collator class and provider by CollationFactory. 
+ * Map is in form of rows of pairs (locale name, locale id); locale name consists of three parts: + * - 2-letter lowercase language code + * - 4-letter script code (optional) + * - 3-letter uppercase country code + * + * To re-generate collations map golden file, run: + * {{{ + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly org.apache.spark.sql.ICUCollationsMapSuite" + * }}} + */ +// scalastyle:on line.size.limit +class ICUCollationsMapSuite extends SparkFunSuite { + + private val collationsMapFile = { + getWorkspaceFilePath("sql", "core", "src", "test", "resources", + "collations", "ICU-collations-map.md").toFile + } + + if (regenerateGoldenFiles) { + val map = CollationFactory.getICULocaleNames + val mapOutput = map.zipWithIndex.map { + case (localeName, idx) => s"| $idx | $localeName |" }.mkString("\n") + val goldenOutput = { + s"\n" + + "## ICU locale ids to name map\n" + + "| Locale id | Locale name |\n" + + "| --------- | ----------- |\n" + + mapOutput + "\n" + } + val parent = collationsMapFile.getParentFile + if (!parent.exists()) { + assert(parent.mkdirs(), "Could not create directory: " + parent) + } + stringToFile(collationsMapFile, goldenOutput) + } + + test("ICU locales map breaking change") { + val goldenLines = fileToString(collationsMapFile).split('\n') + val goldenRelevantLines = goldenLines.slice(4, goldenLines.length) // skip header + val input = goldenRelevantLines.map( + s => (s.split('|')(2).strip(), s.split('|')(1).strip().toInt)) + assert(input sameElements CollationFactory.getICULocaleNames.zipWithIndex) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala index fc1524be13179..bc16a69475106 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala @@ -356,8 +356,8 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp "(bf1.c1 = bf2.c2 and bf2.a2 = 5)) as a join bf3 on bf3.c3 = a.c1", 2) // left anti join unsupported. // bf2 as creation side and inject runtime filter for bf3(by passing key). - assertRewroteWithBloomFilter("select * from (select * from bf1 left anti join bf2 on " + - "(bf1.c1 = bf2.c2 and bf2.a2 = 5)) as a join bf3 on bf3.c3 = a.c1") + assertDidNotRewriteWithBloomFilter("select * from (select * from bf1 left anti join bf2 " + + "on (bf1.c1 = bf2.c2 and bf2.a2 = 5)) as a join bf3 on bf3.c3 = a.c1") // left anti join unsupported and hasn't selective filter. assertRewroteWithBloomFilter("select * from (select * from bf1 left anti join bf2 on " + "(bf1.c1 = bf2.c2 and bf1.a1 = 5)) as a join bf3 on bf3.c3 = a.c1", 0) @@ -487,7 +487,7 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "32", SQLConf.RUNTIME_BLOOM_FILTER_CREATION_SIDE_THRESHOLD.key -> "4000") { // Test that the max scan size rather than an individual scan size on the filter - // application side matters. `bf5filtered` has 14168 bytes and `bf2` has 3409 bytes. + // application side matters. `bf5filtered` has 15049 bytes and `bf2` has 3409 bytes. 
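// (Hedged sketch of the size check exercised below, using only the byte counts quoted above;
//  the exact boundary comparison of the real rule is not implied here.)
//   val maxApplicationSideScanBytes = math.max(15049L, 3409L) // bf5filtered vs bf2
//   def bloomFilterEligible(thresholdBytes: Long): Boolean =
//     maxApplicationSideScanBytes >= thresholdBytes
//   assert(bloomFilterEligible(5000L))   // threshold 5000  -> rewritten with a bloom filter
//   assert(!bloomFilterEligible(16000L)) // threshold 16000 -> rewrite skipped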
withSQLConf( SQLConf.RUNTIME_BLOOM_FILTER_APPLICATION_SIDE_SCAN_SIZE_THRESHOLD.key -> "5000") { assertRewroteWithBloomFilter("select * from " + @@ -495,7 +495,7 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp "join bf3 on t.c5 = bf3.c3 where bf3.a3 = 5", 2) } withSQLConf( - SQLConf.RUNTIME_BLOOM_FILTER_APPLICATION_SIDE_SCAN_SIZE_THRESHOLD.key -> "15000") { + SQLConf.RUNTIME_BLOOM_FILTER_APPLICATION_SIDE_SCAN_SIZE_THRESHOLD.key -> "16000") { assertDidNotRewriteWithBloomFilter("select * from " + "(select * from bf5filtered union all select * from bf2) t " + "join bf3 on t.c5 = bf3.c3 where bf3.a3 = 5") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index c1ca48162d207..957be07607b66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -660,6 +660,49 @@ object IntegratedUDFTestUtils extends SQLHelper { orderBy = "OrderingColumn(\"input\")", select = "SelectedColumn(\"partition_col\")") + object UDTFPartitionByIndexingBug extends TestUDTF { + val pythonScript: String = + s""" + |from pyspark.sql.functions import ( + | AnalyzeArgument, + | AnalyzeResult, + | PartitioningColumn, + | SelectedColumn, + | udtf + |) + |from pyspark.sql.types import ( + | DoubleType, + | StringType, + | StructType, + |) + |class $name: + | @staticmethod + | def analyze(observed: AnalyzeArgument) -> AnalyzeResult: + | out_schema = StructType() + | out_schema.add("partition_col", StringType()) + | out_schema.add("double_col", DoubleType()) + | + | return AnalyzeResult( + | schema=out_schema, + | partitionBy=[PartitioningColumn("partition_col")], + | select=[ + | SelectedColumn("partition_col"), + | SelectedColumn("double_col"), + | ], + | ) + | + | def eval(self, *args, **kwargs): + | pass + | + | def terminate(self): + | for _ in range(5): + | yield { + | "partition_col": None, + | "double_col": 1.0, + | } + |""".stripMargin + } + object UDTFInvalidPartitionByOrderByParseError extends TestPythonUDTFPartitionByOrderByBase( partitionBy = "PartitioningColumn(\"unparsable\")", @@ -1216,6 +1259,7 @@ object IntegratedUDFTestUtils extends SQLHelper { UDTFPartitionByOrderBySelectExpr, UDTFPartitionByOrderBySelectComplexExpr, UDTFPartitionByOrderBySelectExprOnlyPartitionColumn, + UDTFPartitionByIndexingBug, InvalidAnalyzeMethodReturnsNonStructTypeSchema, InvalidAnalyzeMethodWithSinglePartitionNoInputTable, InvalidAnalyzeMethodWithPartitionByNoInputTable, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index be6862f5b96b7..fcb937d82ba42 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -27,7 +27,7 @@ import org.apache.spark.internal.config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_T import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.expressions.{Ascending, GenericRow, SortOrder} -import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight} +import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, JoinSelectionHelper} import org.apache.spark.sql.catalyst.plans.logical.{Filter, HintInfo, Join, JoinHint, NO_BROADCAST_AND_REPLICATION} import 
org.apache.spark.sql.execution.{BinaryExecNode, FilterExec, ProjectExec, SortExec, SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper @@ -40,7 +40,8 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.tags.SlowSQLTest @SlowSQLTest -class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper { +class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper + with JoinSelectionHelper { import testImplicits._ setupTestData() @@ -61,6 +62,7 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan val sqlString = pair._1 val c = pair._2 val df = sql(sqlString) + val optimized = df.queryExecution.optimizedPlan val physical = df.queryExecution.sparkPlan val operators = physical.collect { case j: BroadcastHashJoinExec => j @@ -74,6 +76,10 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan if (operators.head.getClass != c) { fail(s"$sqlString expected operator: $c, but got ${operators.head}\n physical: \n$physical") } + assert( + canPlanAsBroadcastHashJoin(optimized.asInstanceOf[Join], conf) === + operators.head.isInstanceOf[BroadcastHashJoinExec], + "canPlanAsBroadcastHashJoin not in sync with join selection codepath!") operators.head } @@ -89,11 +95,13 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan val planned = spark.sessionState.planner.JoinSelection(join) assert(planned.size == 1) assert(planned.head.isInstanceOf[CartesianProductExec]) + assert(!canPlanAsBroadcastHashJoin(join, conf)) val plannedWithHint = spark.sessionState.planner.JoinSelection(joinWithHint) assert(plannedWithHint.size == 1) assert(plannedWithHint.head.isInstanceOf[BroadcastNestedLoopJoinExec]) assert(plannedWithHint.head.asInstanceOf[BroadcastNestedLoopJoinExec].buildSide == BuildLeft) + assert(!canPlanAsBroadcastHashJoin(joinWithHint, conf)) } } @@ -112,10 +120,12 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan val planned = spark.sessionState.planner.JoinSelection(join) assert(planned.size == 1) assert(planned.head.isInstanceOf[BroadcastHashJoinExec]) + assert(canPlanAsBroadcastHashJoin(join, conf)) val plannedWithHint = spark.sessionState.planner.JoinSelection(joinWithHint) assert(plannedWithHint.size == 1) assert(plannedWithHint.head.isInstanceOf[SortMergeJoinExec]) + assert(!canPlanAsBroadcastHashJoin(joinWithHint, conf)) } test("NO_BROADCAST_AND_REPLICATION controls build side in BNLJ") { @@ -131,11 +141,13 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan assert(planned.size == 1) assert(planned.head.isInstanceOf[BroadcastNestedLoopJoinExec]) assert(planned.head.asInstanceOf[BroadcastNestedLoopJoinExec].buildSide == BuildRight) + assert(!canPlanAsBroadcastHashJoin(join, conf)) val plannedWithHint = spark.sessionState.planner.JoinSelection(joinWithHint) assert(plannedWithHint.size == 1) assert(plannedWithHint.head.isInstanceOf[BroadcastNestedLoopJoinExec]) assert(plannedWithHint.head.asInstanceOf[BroadcastNestedLoopJoinExec].buildSide == BuildLeft) + assert(!canPlanAsBroadcastHashJoin(joinWithHint, conf)) } test("join operator selection") { @@ -191,6 +203,16 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan // ).foreach { case (query, joinClass) => assertJoin(query, joinClass) } // } + test("broadcastable join with shuffle join hint") { + spark.sharedState.cacheManager.clearCache() + sql("CACHE 
TABLE testData") + // Make sure it's planned as broadcast join without the hint. + assertJoin("SELECT * FROM testData JOIN testData2 ON key = a", + classOf[BroadcastHashJoinExec]) + assertJoin("SELECT /*+ SHUFFLE_HASH(testData) */ * FROM testData JOIN testData2 ON key = a", + classOf[ShuffledHashJoinExec]) + } + test("broadcasted hash join operator selection") { spark.sharedState.cacheManager.clearCache() sql("CACHE TABLE testData") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 9946815bdf516..ea00e02e232c6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -1164,8 +1164,8 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { exception = intercept[SparkIllegalArgumentException] { df.select(from_json($"json", invalidJsonSchema, Map.empty[String, String])).collect() }, - errorClass = "_LEGACY_ERROR_TEMP_3250", - parameters = Map("other" -> """{"a":123}""")) + errorClass = "INVALID_JSON_DATA_TYPE", + parameters = Map("invalidType" -> """{"a":123}""")) val invalidDataType = "MAP" val invalidDataTypeReason = "Unrecognized token 'MAP': " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala new file mode 100644 index 0000000000000..df0fbf15a98ee --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/LogQuerySuite.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.io.File + +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.LogUtils.LOG_SCHEMA + +/** + * Test suite for querying Spark logs using SQL. + */ +class LogQuerySuite extends QueryTest with SharedSparkSession with Logging { + + val logFile: File = { + val pwd = new File(".").getCanonicalPath + new File(pwd + "/target/LogQuerySuite.log") + } + + override def afterAll(): Unit = { + super.afterAll() + // Clear the log file + if (logFile.exists()) { + logFile.delete() + } + } + + private def createTempView(viewName: String): Unit = { + spark.read.schema(LOG_SCHEMA).json(logFile.getCanonicalPath).createOrReplaceTempView(viewName) + } + + test("Query Spark logs using SQL") { + val msg = log"Lost executor ${MDC(LogKeys.EXECUTOR_ID, "1")}." 
+ logError(msg) + + withTempView("logs") { + createTempView("logs") + checkAnswer( + spark.sql(s"SELECT level, msg, context, exception FROM logs WHERE msg = '${msg.message}'"), + Row("ERROR", msg.message, Map(LogKeys.EXECUTOR_ID.name -> "1"), null) :: Nil) + } + } + + test("Query Spark logs with exception using SQL") { + val msg = log"Task ${MDC(LogKeys.TASK_ID, "2")} failed." + val exception = new RuntimeException("OOM") + logError(msg, exception) + + withTempView("logs") { + createTempView("logs") + val expectedMDC = Map(LogKeys.TASK_ID.name -> "2") + checkAnswer( + spark.sql("SELECT level, msg, context, exception.class, exception.msg FROM logs " + + s"WHERE msg = '${msg.message}'"), + Row("ERROR", msg.message, expectedMDC, "java.lang.RuntimeException", "OOM") :: Nil) + + val stacktrace = + spark.sql(s"SELECT exception.stacktrace FROM logs WHERE msg = '${msg.message}'").collect() + assert(stacktrace.length == 1) + val topStacktraceArray = stacktrace.head.getSeq[Row](0).head + assert(topStacktraceArray.getString(0) == this.getClass.getName) + assert(topStacktraceArray.getString(1) != "") + assert(topStacktraceArray.getString(2) == this.getClass.getSimpleName + ".scala") + assert(topStacktraceArray.getString(3) != "") + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala index ba04e3b691a1b..ac14b345a7621 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala @@ -707,6 +707,17 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession { df1.select(try_divide(make_interval(col("year"), col("month")), lit(0)))) } + test("try_remainder") { + val df = Seq((10, 3), (5, 5), (5, 0)).toDF("birth", "age") + checkAnswer(df.selectExpr("try_remainder(birth, age)"), Seq(Row(1), Row(0), Row(null))) + + val dfDecimal = Seq( + (BigDecimal(10), BigDecimal(3)), + (BigDecimal(5), BigDecimal(5)), + (BigDecimal(5), BigDecimal(0))).toDF("birth", "age") + checkAnswer(dfDecimal.selectExpr("try_remainder(birth, age)"), Seq(Row(1), Row(0), Row(null))) + } + test("try_element_at") { val df = Seq((Array(1, 2, 3), 2)).toDF("a", "b") checkAnswer(df.selectExpr("try_element_at(a, b)"), Seq(Row(2))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala index b5b3492269415..ad424b3a7cc76 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala @@ -59,13 +59,13 @@ import org.apache.spark.tags.ExtendedSQLTest * To re-generate golden files for entire suite, run: * {{{ * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *PlanStability*Suite" - * SPARK_GENERATE_GOLDEN_FILES=1 SPARK_ANSI_SQL_MODE=true build/sbt "sql/testOnly *PlanStability*Suite" + * SPARK_GENERATE_GOLDEN_FILES=1 SPARK_ANSI_SQL_MODE=false build/sbt "sql/testOnly *PlanStability*Suite" * }}} * * To re-generate golden file for a single test, run: * {{{ * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *PlanStability*Suite -- -z (tpcds-v1.4/q49)" - * SPARK_GENERATE_GOLDEN_FILES=1 SPARK_ANSI_SQL_MODE=true build/sbt "sql/testOnly *PlanStability*Suite -- -z (tpcds-v1.4/q49)" + * SPARK_GENERATE_GOLDEN_FILES=1 SPARK_ANSI_SQL_MODE=false build/sbt "sql/testOnly *PlanStability*Suite -- -z (tpcds-v1.4/q49)" * }}} */ // scalastyle:on 
line.size.limit @@ -256,9 +256,11 @@ trait PlanStabilitySuite extends DisableAdaptiveExecutionSuite { protected def testQuery(tpcdsGroup: String, query: String, suffix: String = ""): Unit = { val queryString = resourceToString(s"$tpcdsGroup/$query.sql", classLoader = Thread.currentThread().getContextClassLoader) - // Disable char/varchar read-side handling for better performance. - withSQLConf(SQLConf.READ_SIDE_CHAR_PADDING.key -> "false", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "10MB") { + withSQLConf( + // Disable char/varchar read-side handling for better performance. + SQLConf.READ_SIDE_CHAR_PADDING.key -> "false", + SQLConf.LEGACY_NO_CHAR_PADDING_IN_PREDICATE.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "10MB") { val qe = sql(queryString).queryExecution val plan = qe.executedPlan val explain = normalizeLocation(normalizeIds(qe.explainString(FormattedMode))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala index 48a9564ab8f95..bca1472799939 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ResolveDefaultColumnsSuite.scala @@ -279,4 +279,12 @@ class ResolveDefaultColumnsSuite extends QueryTest with SharedSparkSession { checkAnswer(sql("select CAST(c as STRING) from t"), Row("2018-11-17 13:33:33")) } } + + test("SPARK-48033: default columns using runtime replaceable expression works") { + withTable("t") { + sql("CREATE TABLE t(v VARIANT DEFAULT parse_json('1')) USING PARQUET") + sql("INSERT INTO t VALUES(DEFAULT)") + checkAnswer(sql("select v from t"), sql("select parse_json('1')").collect()) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RuntimeNullChecksV2Writes.scala b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeNullChecksV2Writes.scala index fbdd1428ba9b8..754c46cc5cd3e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/RuntimeNullChecksV2Writes.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/RuntimeNullChecksV2Writes.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.util.Collections -import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.{SparkConf, SparkRuntimeException} import org.apache.spark.sql.connector.catalog.{Column => ColumnV2, Identifier, InMemoryTableCatalog} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.internal.SQLConf @@ -56,7 +56,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS withTable("t") { sql(s"CREATE TABLE t (s STRING, i INT NOT NULL) USING $FORMAT") - val e = intercept[SparkException] { + val e = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql("SELECT 'txt' AS s, null AS i") inputDF.writeTo("t").append() @@ -64,7 +64,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS sql("INSERT INTO t VALUES ('txt', null)") } } - assertNotNullException(e, Seq("i")) + assert(e.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } } @@ -88,7 +88,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS |USING $FORMAT """.stripMargin) - val e1 = intercept[SparkException] { + val e1 = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -106,7 +106,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } assertNotNullException(e1, Seq("s", "ns")) - val e2 
= intercept[SparkException] { + val e2 = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -124,7 +124,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } assertNotNullException(e2, Seq("s", "arr")) - val e3 = intercept[SparkException] { + val e3 = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -177,7 +177,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } checkAnswer(spark.table("t"), Row(1, Row(1, null))) - val e = intercept[SparkException] { + val e = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -224,7 +224,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } checkAnswer(spark.table("t"), Row(1, null)) - val e = intercept[SparkException] { + val e = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -279,7 +279,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } checkAnswer(spark.table("t"), Row(1, List(null, Row(1, 1)))) - val e = intercept[SparkException] { + val e = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -325,7 +325,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } checkAnswer(spark.table("t"), Row(1, null)) - val e = intercept[SparkException] { + val e = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql("SELECT 1 AS i, map(1, null) AS m") inputDF.writeTo("t").append() @@ -364,7 +364,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } checkAnswer(spark.table("t"), Row(1, Map(Row(1, 1) -> null))) - val e1 = intercept[SparkException] { + val e1 = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -382,7 +382,7 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } assertNotNullException(e1, Seq("m", "key", "x")) - val e2 = intercept[SparkException] { + val e2 = intercept[SparkRuntimeException] { if (byName) { val inputDF = sql( s"""SELECT @@ -402,11 +402,9 @@ class RuntimeNullChecksV2Writes extends QueryTest with SQLTestUtils with SharedS } } - private def assertNotNullException(e: SparkException, colPath: Seq[String]): Unit = { + private def assertNotNullException(e: SparkRuntimeException, colPath: Seq[String]): Unit = { e.getCause match { - case npe: NullPointerException => - assert(npe.getMessage.contains("Null value appeared in non-nullable field")) - assert(npe.getMessage.contains(colPath.mkString("\n", "\n", "\n"))) + case _ if e.getErrorClass == "NOT_NULL_ASSERT_VIOLATION" => case other => fail(s"Unexpected exception cause: $other") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index f81369bbad367..56c364e20846a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -3748,22 +3748,21 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark test("SPARK-33084: Add jar support Ivy URI in SQL") { val sc = spark.sparkContext - val hiveVersion = "2.3.9" // transitive=false, only download specified jar - sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion?transitive=false") - assert(sc.listJars() - 
.exists(_.contains(s"org.apache.hive.hcatalog_hive-hcatalog-core-$hiveVersion.jar"))) + sql(s"ADD JAR ivy://org.springframework:spring-core:6.1.6?transitive=false") + assert(sc.listJars().exists(_.contains("org.springframework_spring-core-6.1.6.jar"))) + assert(!sc.listJars().exists(_.contains("org.springframework_spring-jcl-6.1.6.jar"))) // default transitive=true, test download ivy URL jar return multiple jars - sql("ADD JAR ivy://org.scala-js:scalajs-test-interface_2.12:1.2.0") - assert(sc.listJars().exists(_.contains("scalajs-library_2.12"))) - assert(sc.listJars().exists(_.contains("scalajs-test-interface_2.12"))) + sql("ADD JAR ivy://org.awaitility:awaitility:4.2.1") + assert(sc.listJars().exists(_.contains("org.awaitility_awaitility-4.2.1.jar"))) + assert(sc.listJars().exists(_.contains("org.hamcrest_hamcrest-2.1.jar"))) - sql(s"ADD JAR ivy://org.apache.hive:hive-contrib:$hiveVersion" + - "?exclude=org.pentaho:pentaho-aggdesigner-algorithm&transitive=true") - assert(sc.listJars().exists(_.contains(s"org.apache.hive_hive-contrib-$hiveVersion.jar"))) - assert(sc.listJars().exists(_.contains(s"org.apache.hive_hive-exec-$hiveVersion.jar"))) - assert(!sc.listJars().exists(_.contains("org.pentaho.pentaho_aggdesigner-algorithm"))) + sql("ADD JAR ivy://org.junit.jupiter:junit-jupiter:5.10.2" + + "?exclude=org.junit.jupiter:junit-jupiter-engine&transitive=true") + assert(sc.listJars().exists(_.contains("org.junit.jupiter_junit-jupiter-api-5.10.2.jar"))) + assert(sc.listJars().exists(_.contains("org.junit.jupiter_junit-jupiter-params-5.10.2.jar"))) + assert(!sc.listJars().exists(_.contains("org.junit.jupiter_junit-jupiter-engine-5.10.2.jar"))) } test("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar") { @@ -4400,8 +4399,8 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark checkAnswer(df, Row(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) :: Row(2, 4, 6, 8, 10, 12, 14, 16, 18, 20) :: Nil) - assert(df.schema.names.sameElements( - Array("max(t)", "max(t", "=", "\n", ";", "a b", "{", ".", "a.b", "a"))) + assert(df.schema.names === + Array("max(t)", "max(t", "=", "\n", ";", "a b", "{", ".", "a.b", "a")) checkAnswer(df.select("`max(t)`", "`a b`", "`{`", "`.`", "`a.b`"), Row(1, 6, 7, 8, 9) :: Row(2, 12, 14, 16, 18) :: Nil) checkAnswer(df.where("`a.b` > 10"), @@ -4419,8 +4418,8 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark checkAnswer(df, Row(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) :: Row(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22) :: Nil) - assert(df.schema.names.sameElements( - Array("max(t)", "max(t", "=", "\n", ";", "a b", "{", ".", "a.b", "a", ","))) + assert(df.schema.names === + Array("max(t)", "max(t", "=", "\n", ";", "a b", "{", ".", "a.b", "a", ",")) checkAnswer(df.select("`max(t)`", "`a b`", "`{`", "`.`", "`a.b`"), Row(1, 6, 7, 8, 9) :: Row(2, 12, 14, 16, 18) :: Nil) checkAnswer(df.where("`a.b` > 10"), @@ -4716,6 +4715,147 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark val df6 = df3.join(df2, col("df3.zaak_id") === col("df2.customer_id"), "outer") df5.crossJoin(df6) } + + test("SPARK-47939: Describe should work with parameterized queries") { + checkAnswer( + spark.sql("describe select ?", Array(1)), + Array( + Row("1", "int", null) + ) + ) + checkAnswer( + spark.sql("describe select :first", Map("first" -> 1)), + Array( + Row("1", "int", null) + ) + ) + + checkAnswer( + spark.sql("describe select * from values (?, ?) 
t(x, y)", Array(1, "a")), + Array( + Row("x", "int", null), + Row("y", "string", null) + ) + ) + checkAnswer( + spark.sql( + "describe select * from values (:first, :second) t(x, y)", + Map("first" -> 1, "second" -> "a") + ), + Array( + Row("x", "int", null), + Row("y", "string", null) + ) + ) + } + + test("SPARK-47939: Explain should work with parameterized queries") { + def checkQueryPlan(df: DataFrame, plan: String): Unit = assert( + df.collect() + .map(_.getString(0)) + .map(_.replaceAll("#[0-9]+", "#N")) + === Array(plan.stripMargin) + ) + + checkQueryPlan( + spark.sql("explain select ?", Array(1)), + """== Physical Plan == + |*(1) Project [1 AS 1#N] + |+- *(1) Scan OneRowRelation[] + + |""" + ) + checkQueryPlan( + spark.sql("explain select :first", Map("first" -> 1)), + """== Physical Plan == + |*(1) Project [1 AS 1#N] + |+- *(1) Scan OneRowRelation[] + + |""" + ) + + checkQueryPlan( + spark.sql("explain explain explain select ?", Array(1)), + """== Physical Plan == + |Execute ExplainCommand + | +- ExplainCommand ExplainCommand 'PosParameterizedQuery [1], SimpleMode, SimpleMode + + |""" + ) + checkQueryPlan( + spark.sql("explain explain explain select :first", Map("first" -> 1)), + // scalastyle:off + """== Physical Plan == + |Execute ExplainCommand + | +- ExplainCommand ExplainCommand 'NameParameterizedQuery [first], [1], SimpleMode, SimpleMode + + |""" + // scalastyle:on + ) + + checkQueryPlan( + spark.sql("explain describe select ?", Array(1)), + """== Physical Plan == + |Execute DescribeQueryCommand + | +- DescribeQueryCommand select ? + + |""" + ) + checkQueryPlan( + spark.sql("explain describe select :first", Map("first" -> 1)), + """== Physical Plan == + |Execute DescribeQueryCommand + | +- DescribeQueryCommand select :first + + |""" + ) + + checkQueryPlan( + spark.sql("explain extended select * from values (?, ?) 
t(x, y)", Array(1, "a")), + """== Parsed Logical Plan == + |'PosParameterizedQuery [1, a] + |+- 'Project [*] + | +- 'SubqueryAlias t + | +- 'UnresolvedInlineTable [x, y], [[posparameter(39), posparameter(42)]] + + |== Analyzed Logical Plan == + |x: int, y: string + |Project [x#N, y#N] + |+- SubqueryAlias t + | +- LocalRelation [x#N, y#N] + + |== Optimized Logical Plan == + |LocalRelation [x#N, y#N] + + |== Physical Plan == + |LocalTableScan [x#N, y#N] + |""" + ) + checkQueryPlan( + spark.sql( + "explain extended select * from values (:first, :second) t(x, y)", + Map("first" -> 1, "second" -> "a") + ), + """== Parsed Logical Plan == + |'NameParameterizedQuery [first, second], [1, a] + |+- 'Project [*] + | +- 'SubqueryAlias t + | +- 'UnresolvedInlineTable [x, y], [[namedparameter(first), namedparameter(second)]] + + |== Analyzed Logical Plan == + |x: int, y: string + |Project [x#N, y#N] + |+- SubqueryAlias t + | +- LocalRelation [x#N, y#N] + + |== Optimized Logical Plan == + |LocalRelation [x#N, y#N] + + |== Physical Plan == + |LocalTableScan [x#N, y#N] + |""" + ) + } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index c9cb459878cfb..b031f45ddbf34 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -703,9 +703,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper } test("Test logic for determining whether a query is semantically sorted") { - withTable("t1", "t2") { - spark.sql("CREATE TABLE t1(a int, b int) USING parquet") - spark.sql("CREATE TABLE t2(a int, b int) USING parquet") + withTempView("t1", "t2") { + spark.sql("CREATE TEMP VIEW t1 AS SELECT * FROM VALUES (1, 1) AS t1(a, b)") + spark.sql("CREATE TEMP VIEW t2 AS SELECT * FROM VALUES (1, 2) AS t2(a, b)") val unsortedSelectQuery = "select * from t1" val sortedSelectQuery = "select * from t1 order by a, b" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SetCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SetCommandSuite.scala index 2c803ceffe950..a8b359f308a2b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SetCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SetCommandSuite.scala @@ -83,18 +83,6 @@ class SetCommandSuite extends QueryTest with SharedSparkSession with ResetSystem spark.sessionState.conf.clear() } - test("SPARK-19218 `SET -v` should not fail with null value configuration") { - import SQLConf._ - val confEntry = buildConf("spark.test").doc("doc").stringConf.createWithDefault(null) - - try { - val result = sql("SET -v").collect() - assert(result === result.sortBy(_.getString(0))) - } finally { - SQLConf.unregister(confEntry) - } - } - test("SET commands with illegal or inappropriate argument") { spark.sessionState.conf.clear() // Set negative mapred.reduce.tasks for automatically determining diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index 18c1f4dcc4e00..1f0033a0efcdc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.catalog.BucketSpec import 
org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial} -import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, CompoundBody, ParserInterface} import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Limit, LocalRelation, LogicalPlan, Statistics, UnresolvedHint} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} @@ -40,7 +40,7 @@ import org.apache.spark.sql.connector.write.WriterCommitMessage import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, AQEShuffleReadExec, QueryStageExec, ShuffleQueryStageExec} import org.apache.spark.sql.execution.aggregate.HashAggregateExec -import org.apache.spark.sql.execution.datasources.{FileFormat, WriteFilesExec, WriteFilesSpec} +import org.apache.spark.sql.execution.datasources.{FileFormat, WriteFilesExec, WriteFilesExecBase, WriteFilesSpec} import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, BroadcastExchangeLike, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin} import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.internal.SQLConf @@ -581,6 +581,9 @@ case class MyParser(spark: SparkSession, delegate: ParserInterface) extends Pars override def parseQuery(sqlText: String): LogicalPlan = delegate.parseQuery(sqlText) + + override def parseScript(sqlScriptText: String): CompoundBody = + delegate.parseScript(sqlScriptText) } object MyExtensions { @@ -842,14 +845,13 @@ class ColumnarProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) new ColumnarProjectExec(projectList, newChild) } -class ColumnarWriteExec( +case class ColumnarWriteExec( child: SparkPlan, fileFormat: FileFormat, partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], options: Map[String, String], - staticPartitions: TablePartitionSpec) extends WriteFilesExec( - child, fileFormat, partitionColumns, bucketSpec, options, staticPartitions) { + staticPartitions: TablePartitionSpec) extends WriteFilesExecBase { override def supportsColumnar: Boolean = true @@ -858,8 +860,8 @@ class ColumnarWriteExec( throw new Exception("columnar write") } - override protected def withNewChildInternal(newChild: SparkPlan): WriteFilesExec = - new ColumnarWriteExec( + override protected def withNewChildInternal(newChild: SparkPlan): ColumnarWriteExec = + ColumnarWriteExec( newChild, fileFormat, partitionColumns, bucketSpec, options, staticPartitions) } @@ -971,7 +973,7 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] { replaceWithColumnarExpression(exp).asInstanceOf[NamedExpression]), replaceWithColumnarPlan(plan.child)) case write: WriteFilesExec => - new ColumnarWriteExec( + ColumnarWriteExec( replaceWithColumnarPlan(write.child), write.fileFormat, write.partitionColumns, @@ -1014,6 +1016,7 @@ case class MyShuffleExchangeExec(delegate: ShuffleExchangeExec) extends ShuffleE val attributeStats = AttributeMap(Seq((child.output.head, columnStats))) Statistics(stats.sizeInBytes, stats.rowCount, attributeStats) } + override def shuffleId: Int = delegate.shuffleId override def child: SparkPlan = delegate.child override protected def doExecute(): RDD[InternalRow] = delegate.execute() override def 
outputPartitioning: Partitioning = delegate.outputPartitioning diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index 393ecc95b66b2..5f1fa2904e341 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -644,7 +644,7 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared test("analyzes column statistics in cached global temporary view") { withGlobalTempView("gTempView") { - val globalTempDB = spark.sharedState.globalTempViewManager.database + val globalTempDB = spark.sharedState.globalTempDB val e1 = intercept[AnalysisException] { sql(s"ANALYZE TABLE $globalTempDB.gTempView COMPUTE STATISTICS FOR COLUMNS id") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 75b4415db6b54..31c1cac9fb718 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -332,6 +332,11 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { // scalastyle:on } + test("string substring function using columns") { + val df = Seq(("Spark", 2, 3)).toDF("a", "b", "c") + checkAnswer(df.select(substring($"a", $"b", $"c")), Row("par")) + } + test("string encode/decode function") { val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116) // scalastyle:off @@ -525,6 +530,33 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq("aa", "bb", "cc", ""))) } + test("SPARK-47845: string split function with column types") { + val df = Seq( + ("aa2bb3cc4", "[1-9]+", 0), + ("aa2bb3cc4", "[1-9]+", 2), + ("aa2bb3cc4", "[1-9]+", -2)).toDF("a", "b", "c") + + // without limit + val expectedNoLimit = Seq( + Row(Seq("aa", "bb", "cc", "")), + Row(Seq("aa", "bb", "cc", "")), + Row(Seq("aa", "bb", "cc", ""))) + + checkAnswer(df.select(split($"a", $"b")), expectedNoLimit) + + checkAnswer(df.selectExpr("split(a, b)"), expectedNoLimit) + + // with limit + val expectedWithLimit = Seq( + Row(Seq("aa", "bb", "cc", "")), + Row(Seq("aa", "bb3cc4")), + Row(Seq("aa", "bb", "cc", ""))) + + checkAnswer(df.select(split($"a", $"b", $"c")), expectedWithLimit) + + checkAnswer(df.selectExpr("split(a, b, c)"), expectedWithLimit) + } + test("string / binary length function") { val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015)) .toDF("a", "b", "c", "d", "e") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala index a84dd9645bcc4..46a24acb475c4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSCollationQueryTestSuite.scala @@ -123,7 +123,7 @@ class TPCDSCollationQueryTestSuite extends QueryTest with TPCDSBase with SQLQuer val checks: Seq[Seq[CollationCheck]] = Seq( Seq( CaseSensitiveCollationCheck("tpcds_utf8", "UTF8_BINARY", "lower"), - CaseInsensitiveCollationCheck("tpcds_utf8_random", "UTF8_BINARY_LCASE", randomizeCase) + CaseInsensitiveCollationCheck("tpcds_utf8_random", "UTF8_LCASE", randomizeCase) ), Seq( CaseSensitiveCollationCheck("tpcds_unicode", "UNICODE", "lower"), 
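
The StringFunctionsSuite hunks above pin down new Column-typed overloads of substring and split (SPARK-47845): position, length, regex pattern, and limit can now be passed as Columns rather than literals. The following is a minimal standalone sketch of that behavior, not part of the patch itself; it assumes a Spark build that already contains these overloads, and the object/session names are illustrative only.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{split, substring}

    // Hypothetical driver, not part of the patch; shown only to illustrate the
    // Column-typed overloads exercised by the tests above.
    object ColumnOverloadSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[*]")
          .appName("column-overload-sketch")
          .getOrCreate()
        import spark.implicits._

        // substring with Column position and length (position is 1-based).
        val sub = Seq(("Spark", 2, 3)).toDF("a", "b", "c")
        sub.select(substring($"a", $"b", $"c")).show()   // "par"

        // split with Column regex, with and without a Column limit.
        val sp = Seq(("aa2bb3cc4", "[1-9]+", 2)).toDF("a", "b", "c")
        sp.select(split($"a", $"b", $"c")).show()        // ["aa", "bb3cc4"]
        sp.select(split($"a", $"b")).show()              // ["aa", "bb", "cc", ""]

        spark.stop()
      }
    }

The expected values in the comments mirror the assertions in the suite: a limit of 2 stops splitting after the first match, while omitting the limit splits on every match.
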
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 87ca3a07c4d56..7e940252430f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -1183,4 +1183,39 @@ class UDFSuite extends QueryTest with SharedSparkSession { df10.select(zip_with(col("array1"), col("array2"), (b1, b2) => reverseThenConcat2(b1, b2))) checkAnswer(test10, Row(Array(Row("cbaihg"), Row("fedlkj"))) :: Nil) } + + test("SPARK-47927: Correctly pass null values derived from join to UDF") { + val f = udf[Tuple1[Option[Int]], Tuple1[Option[Int]]](identity) + val ds1 = Seq(1).toDS() + val ds2 = Seq[Int]().toDS() + + checkAnswer( + ds1.join(ds2, ds1("value") === ds2("value"), "left_outer") + .select(f(struct(ds2("value").as("_1")))), + Row(Row(null))) + } + + test("char/varchar as UDF return type") { + Seq(CharType(5), VarcharType(5)).foreach { dt => + val f = udf( + new UDF0[String] { + override def call(): String = "a" + }, + dt + ) + checkError( + intercept[AnalysisException](spark.range(1).select(f())), + errorClass = "UNSUPPORTED_DATA_TYPE_FOR_ENCODER", + sqlState = "0A000", + parameters = Map("dataType" -> s"\"${dt.sql}\"") + ) + } + } + + test("SPARK-47927: ScalaUDF null handling") { + val f = udf[Int, Int](_ + 1) + val df = Seq(Some(1), None).toDF("c") + .select(f($"c").as("f"), f($"f")) + checkAnswer(df, Seq(Row(2, 3), Row(null, null))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index 9daa69ce9f155..18a6c538e0a80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.io.ByteArrayOutputStream -import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite, SparkIllegalArgumentException} import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} @@ -188,4 +188,40 @@ class UnsafeRowSuite extends SparkFunSuite { unsafeRow.setDecimal(0, d2, 38) assert(unsafeRow.getDecimal(0, 38, 18) === null) } + + test("SPARK-48713: throw SparkIllegalArgumentException for illegal UnsafeRow.pointTo") { + val emptyRow = UnsafeRow.createFromByteArray(64, 2) + val byteArray = new Array[Byte](64) + + // Out of bounds + var errorMsg = intercept[SparkIllegalArgumentException] { + emptyRow.pointTo(byteArray, Platform.BYTE_ARRAY_OFFSET + 50, 32) + }.getMessage + assert( + errorMsg.contains( + "Invalid byte array backed UnsafeRow: byte array length=64, offset=50, byte size=32" + ) + ) + + // Negative size + errorMsg = intercept[SparkIllegalArgumentException] { + emptyRow.pointTo(byteArray, Platform.BYTE_ARRAY_OFFSET + 50, -32) + }.getMessage + assert( + errorMsg.contains( + "Invalid byte array backed UnsafeRow: byte array length=64, offset=50, byte size=-32" + ) + ) + + // Negative offset + errorMsg = intercept[SparkIllegalArgumentException] { + emptyRow.pointTo(byteArray, -5, 32) + }.getMessage + assert( + errorMsg.contains( + s"Invalid byte array backed UnsafeRow: byte array length=64, " + + s"offset=${-5 - Platform.BYTE_ARRAY_OFFSET}, byte size=32" + ) + ) + } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala index 58528b9186736..c4dba850cf777 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala @@ -14,13 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.spark.sql -import org.apache.spark.sql.catalyst.expressions.{CreateArray, CreateNamedStruct, Literal, StructsToJson} +import org.apache.spark.sql.catalyst.expressions.{Cast, CreateArray, CreateNamedStruct, JsonToStructs, Literal, StructsToJson} import org.apache.spark.sql.catalyst.expressions.variant.ParseJson import org.apache.spark.sql.execution.WholeStageCodegenExec +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarArray import org.apache.spark.types.variant.VariantBuilder import org.apache.spark.unsafe.types.VariantVal @@ -57,6 +59,72 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { check("[0.0, 1.00, 1.10, 1.23]", "[0,1,1.1,1.23]") } + test("from_json/to_json round-trip") { + def check(input: String, output: String = null): Unit = { + val df = Seq(input).toDF("v") + val variantDF = df.select(Column(StructsToJson(Map.empty, + JsonToStructs(VariantType, Map.empty, Column("v").expr)))) + val expected = if (output != null) output else input + checkAnswer(variantDF, Seq(Row(expected))) + } + + check("null") + check("true") + check("false") + check("-1") + check("1.0E10") + check("\"\"") + check("\"" + ("a" * 63) + "\"") + check("\"" + ("b" * 64) + "\"") + // scalastyle:off nonascii + check("\"" + ("你好,世界" * 20) + "\"") + // scalastyle:on nonascii + check("[]") + check("{}") + // scalastyle:off nonascii + check( + "[null, true, false,-1, 1e10, \"\\uD83D\\uDE05\", [ ], { } ]", + "[null,true,false,-1,1.0E10,\"😅\",[],{}]" + ) + // scalastyle:on nonascii + check("[0.0, 1.00, 1.10, 1.23]", "[0,1,1.1,1.23]") + } + + test("try_parse_json/to_json round-trip") { + def check(input: String, output: String = "INPUT IS OUTPUT"): Unit = { + val df = Seq(input).toDF("v") + val variantDF = df.selectExpr("to_json(try_parse_json(v)) as v").select(Column("v")) + val expected = if (output != "INPUT IS OUTPUT") output else input + checkAnswer(variantDF, Seq(Row(expected))) + } + + check("null") + check("true") + check("false") + check("-1") + check("1.0E10") + check("\"\"") + check("\"" + ("a" * 63) + "\"") + check("\"" + ("b" * 64) + "\"") + // scalastyle:off nonascii + check("\"" + ("你好,世界" * 20) + "\"") + // scalastyle:on nonascii + check("[]") + check("{}") + // scalastyle:off nonascii + check( + "[null, true, false,-1, 1e10, \"\\uD83D\\uDE05\", [ ], { } ]", + "[null,true,false,-1,1.0E10,\"😅\",[],{}]" + ) + // scalastyle:on nonascii + check("[0.0, 1.00, 1.10, 1.23]", "[0,1,1.1,1.23]") + // Places where parse_json should fail and therefore, try_parse_json should return null + check("{1:2}", null) + check("{\"a\":1", null) + check("{\"a\":[a,b,c]}", null) + check("\"" + "a" * (16 * 1024 * 1024) + "\"", null) + } + test("to_json with nested variant") { val df = Seq(1).toDF("v") val variantDF1 = df.select( @@ -91,6 +159,17 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { check("null", "VOID") check("1", "BIGINT") check("1.0", 
"DECIMAL(1,0)") + check("0.01", "DECIMAL(2,2)") + check("1.00", "DECIMAL(1,0)") + check("10.00", "DECIMAL(2,0)") + check("10.10", "DECIMAL(3,1)") + check("0.0", "DECIMAL(1,0)") + check("-0.0", "DECIMAL(1,0)") + check("2147483647.999", "DECIMAL(13,3)") + check("9223372036854775808", "DECIMAL(19,0)") + check("-9223372036854775808.0", "DECIMAL(19,0)") + check("9999999999999999999.9999999999999999999", "DECIMAL(38,19)") + check("9999999999999999999.99999999999999999999", "DOUBLE") check("1E0", "DOUBLE") check("true", "BOOLEAN") check("\"2000-01-01\"", "STRING") @@ -113,6 +192,35 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { ) } + test("from_json variant data type parsing") { + def check(variantTypeString: String): Unit = { + val df = Seq("{\"a\": 1, \"b\": [2, 3.1]}").toDF("j").selectExpr("variant_get(from_json(j,\"" + + variantTypeString + "\"),\"$.b[0]\")::int") + checkAnswer(df, Seq(Row(2))) + } + + check("variant") + check(" \t variant ") + check(" \n VaRiaNt ") + } + + test("is_variant_null with parse_json and variant_get") { + def check(json: String, path: String, expected: Boolean): Unit = { + val df = Seq(json).toDF("j").selectExpr(s"is_variant_null(variant_get(parse_json(j)," + + s"\"${path}\"))") + checkAnswer(df, Seq(Row(expected))) + } + + check("{ \"a\": null }", "$.a", expected = true) + check("{ \"a\": null }", "$.b", expected = false) + check("{ \"a\": null, \"b\": \"null\" }", "$.b", expected = false) + check("{ \"a\": null, \"b\": {\"c\": null} }", "$.b.c", expected = true) + check("{ \"a\": null, \"b\": {\"c\": null, \"d\": [13, null]} }", "$.b.d", expected = false) + check("{ \"a\": null, \"b\": {\"c\": null, \"d\": [13, null]} }", "$.b.d[0]", expected = false) + check("{ \"a\": null, \"b\": {\"c\": null, \"d\": [13, null]} }", "$.b.d[1]", expected = true) + check("{ \"a\": null, \"b\": {\"c\": null, \"d\": [13, null]} }", "$.b.d[2]", expected = false) + } + test("schema_of_variant_agg") { // Literal input. 
checkAnswer( @@ -154,4 +262,31 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { Seq.fill(3)(Row("STRUCT>")) ++ Seq(Row("STRUCT>"))) } } + + test("cast to variant with ColumnarArray input") { + val dataVector = new OnHeapColumnVector(4, LongType) + dataVector.appendNull() + dataVector.appendLong(123) + dataVector.appendNull() + dataVector.appendLong(456) + val array = new ColumnarArray(dataVector, 0, 4) + val variant = Cast(Literal(array, ArrayType(LongType)), VariantType).eval() + assert(variant.toString == "[null,123,null,456]") + dataVector.close() + } + + test("cast to variant with scan input") { + withTempPath { dir => + val path = dir.getAbsolutePath + val input = Seq(Row(Array(1, null), Map("k1" -> null, "k2" -> false), Row(null, "str"))) + val schema = StructType.fromDDL( + "a array, m map, s struct") + spark.createDataFrame(spark.sparkContext.parallelize(input), schema).write.parquet(path) + val df = spark.read.parquet(path).selectExpr( + s"cast(cast(a as variant) as ${schema(0).dataType.sql})", + s"cast(cast(m as variant) as ${schema(1).dataType.sql})", + s"cast(cast(s as variant) as ${schema(2).dataType.sql})") + checkAnswer(df, input) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala index d276ec4428b9f..0c00676607dd4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala @@ -18,20 +18,25 @@ package org.apache.spark.sql import java.io.File +import java.nio.charset.StandardCharsets +import java.nio.file.Files import scala.collection.mutable import scala.jdk.CollectionConverters._ import scala.util.Random -import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.SparkRuntimeException +import org.apache.spark.sql.catalyst.expressions.{CodegenObjectFactoryMode, ExpressionEvalHelper, Literal} +import org.apache.spark.sql.catalyst.expressions.variant.{VariantExpressionEvalUtils, VariantGet} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{StringType, StructField, StructType, VariantType} -import org.apache.spark.unsafe.types.VariantVal +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.{UTF8String, VariantVal} import org.apache.spark.util.ArrayImplicits._ -class VariantSuite extends QueryTest with SharedSparkSession { +class VariantSuite extends QueryTest with SharedSparkSession with ExpressionEvalHelper { import testImplicits._ test("basic tests") { @@ -55,6 +60,15 @@ class VariantSuite extends QueryTest with SharedSparkSession { } } + test("basic try_parse_json alias") { + val df = spark.createDataFrame(Seq(Row("""{ "a" : 1 }"""), Row("""{ a : 1 }""")).asJava, + new StructType().add("json", StringType)) + val actual = df.select(to_json(try_parse_json(col("json")))).collect() + + assert(actual(0)(0) == """{"a":1}""") + assert(actual(1)(0) == null) + } + test("basic parse_json alias") { val df = spark.createDataFrame(Seq(Row("""{ "a" : 1 }""")).asJava, new StructType().add("json", StringType)) @@ -66,6 +80,33 @@ class VariantSuite extends QueryTest with SharedSparkSession { assert(actual.getString(1) == """{"b":[{"c":"str2"}]}""") } + test("expression alias") { + val df = Seq("""{ "a" : 1 
}""", """{ "b" : 2 }""").toDF("json") + val v = parse_json(col("json")) + + def rows(results: Any*): Seq[Row] = results.map(Row(_)) + + checkAnswer(df.select(is_variant_null(v)), rows(false, false)) + checkAnswer(df.select(schema_of_variant(v)), rows("STRUCT", "STRUCT")) + checkAnswer(df.select(schema_of_variant_agg(v)), rows("STRUCT")) + + checkAnswer(df.select(variant_get(v, "$.a", "int")), rows(1, null)) + checkAnswer(df.select(variant_get(v, "$.b", "int")), rows(null, 2)) + checkAnswer(df.select(variant_get(v, "$.a", "double")), rows(1.0, null)) + checkError( + exception = intercept[SparkRuntimeException] { + df.select(variant_get(v, "$.a", "binary")).collect() + }, + errorClass = "INVALID_VARIANT_CAST", + parameters = Map("value" -> "1", "dataType" -> "\"BINARY\"") + ) + + checkAnswer(df.select(try_variant_get(v, "$.a", "int")), rows(1, null)) + checkAnswer(df.select(try_variant_get(v, "$.b", "int")), rows(null, 2)) + checkAnswer(df.select(try_variant_get(v, "$.a", "double")), rows(1.0, null)) + checkAnswer(df.select(try_variant_get(v, "$.a", "binary")), rows(null, null)) + } + test("round trip tests") { val rand = new Random(42) val input = Seq.fill(50) { @@ -272,6 +313,88 @@ class VariantSuite extends QueryTest with SharedSparkSession { } } + test("json option constraints") { + withTempDir { dir => + val file = new File(dir, "file.json") + Files.write(file.toPath, "0".getBytes(StandardCharsets.UTF_8)) + + // Ensure that we get an error when setting the singleVariantColumn JSON option while also + // specifying a schema. + checkError( + exception = intercept[AnalysisException] { + spark.read.format("json").option("singleVariantColumn", "var").schema("var variant") + }, + errorClass = "INVALID_SINGLE_VARIANT_COLUMN", + parameters = Map.empty + ) + checkError( + exception = intercept[AnalysisException] { + spark.read.format("json").option("singleVariantColumn", "another_name") + .schema("var variant").json(file.getAbsolutePath).collect() + }, + errorClass = "INVALID_SINGLE_VARIANT_COLUMN", + parameters = Map.empty + ) + } + } + + test("json scan") { + val content = Seq( + "true", + """{"a": [], "b": null}""", + """{"a": 1}""", + "[1, 2, 3]" + ).mkString("\n").getBytes(StandardCharsets.UTF_8) + + withTempDir { dir => + val file = new File(dir, "file.json") + Files.write(file.toPath, content) + + checkAnswer( + spark.read.format("json").option("singleVariantColumn", "var") + .load(file.getAbsolutePath) + .selectExpr("to_json(var)"), + Seq(Row("true"), Row("""{"a":[],"b":null}"""), Row("""{"a":1}"""), Row("[1,2,3]")) + ) + + checkAnswer( + spark.read.format("json").schema("a variant, b variant") + .load(file.getAbsolutePath).selectExpr("to_json(a)", "to_json(b)"), + Seq(Row(null, null), Row("[]", "null"), Row("1", null), Row(null, null)) + ) + } + + // Test scan with partitions. 
+ withTempDir { dir => + new File(dir, "a=1/b=2/").mkdirs() + Files.write(new File(dir, "a=1/b=2/file.json").toPath, content) + checkAnswer( + spark.read.format("json").option("singleVariantColumn", "var") + .load(dir.getAbsolutePath).selectExpr("a", "b", "to_json(var)"), + Seq(Row(1, 2, "true"), Row(1, 2, """{"a":[],"b":null}"""), Row(1, 2, """{"a":1}"""), + Row(1, 2, "[1,2,3]")) + ) + } + } + + test("json scan with map schema") { + withTempDir { dir => + val file = new File(dir, "file.json") + val content = Seq( + "true", + """{"v": null}""", + """{"v": {"a": 1, "b": null}}""" + ).mkString("\n").getBytes(StandardCharsets.UTF_8) + Files.write(file.toPath, content) + checkAnswer( + spark.read.format("json").schema("v map") + .load(file.getAbsolutePath) + .selectExpr("to_json(v)"), + Seq(Row(null), Row(null), Row("""{"a":1,"b":null}""")) + ) + } + } + test("group/order/join variant are disabled") { var ex = intercept[AnalysisException] { spark.sql("select parse_json('') group by 1") @@ -324,4 +447,184 @@ class VariantSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-48067: default variant columns works") { + withTable("t") { + sql("""create table t( + v1 variant default null, + v2 variant default parse_json(null), + v3 variant default cast(null as variant), + v4 variant default parse_json('1'), + v5 variant default parse_json('1'), + v6 variant default parse_json('{\"k\": \"v\"}'), + v7 variant default cast(5 as int), + v8 variant default cast('hello' as string), + v9 variant default parse_json(to_json(parse_json('{\"k\": \"v\"}'))) + ) using parquet""") + sql("""insert into t values(DEFAULT, DEFAULT, DEFAULT, DEFAULT, DEFAULT, DEFAULT, DEFAULT, + DEFAULT, DEFAULT)""") + + val expected = sql("""select + cast(null as variant) as v1, + parse_json(null) as v2, + cast(null as variant) as v3, + parse_json('1') as v4, + parse_json('1') as v5, + parse_json('{\"k\": \"v\"}') as v6, + cast(cast(5 as int) as variant) as v7, + cast('hello' as variant) as v8, + parse_json(to_json(parse_json('{\"k\": \"v\"}'))) as v9 + """) + val actual = sql("select * from t") + checkAnswer(actual, expected.collect()) + } + } + + Seq( + ( + "basic int parse json", + VariantExpressionEvalUtils.parseJson(UTF8String.fromString("1")), + VariantType + ), + ( + "basic json parse json", + VariantExpressionEvalUtils.parseJson(UTF8String.fromString("{\"k\": \"v\"}")), + VariantType + ), + ( + "basic null parse json", + VariantExpressionEvalUtils.parseJson(UTF8String.fromString("null")), + VariantType + ), + ( + "basic null", + null, + VariantType + ), + ( + "basic array", + new GenericArrayData(Array[Int](1, 2, 3, 4, 5)), + new ArrayType(IntegerType, false) + ), + ( + "basic string", + UTF8String.fromString("literal string"), + StringType + ), + ( + "basic timestamp", + 0L, + TimestampType + ), + ( + "basic int", + 0, + IntegerType + ), + ( + "basic struct", + Literal.default(new StructType().add("col0", StringType)).eval(), + new StructType().add("col0", StringType) + ), + ( + "complex struct with child variant", + Literal.default(new StructType() + .add("col0", StringType) + .add("col1", new StructType().add("col0", VariantType)) + .add("col2", VariantType) + .add("col3", new ArrayType(VariantType, false)) + ).eval(), + new StructType() + .add("col0", StringType) + .add("col1", new StructType().add("col0", VariantType)) + .add("col2", VariantType) + .add("col3", new ArrayType(VariantType, false)) + ), + ( + "basic array with null", + new GenericArrayData(Array[Any](1, 2, null)), + new 
ArrayType(IntegerType, true) + ), + ( + "basic map with null", + new ArrayBasedMapData( + new GenericArrayData(Array[Any](UTF8String.fromString("k1"), UTF8String.fromString("k2"))), + new GenericArrayData(Array[Any](1, null)) + ), + new MapType(StringType, IntegerType, true) + ) + ).foreach { case (testName, value, dt) => + test(s"SPARK-48067: Variant literal `sql` correctly recreates the variant - $testName") { + val l = Literal.create( + VariantExpressionEvalUtils.castToVariant(value, dt.asInstanceOf[DataType]), VariantType) + val jsonString = l.eval().asInstanceOf[VariantVal] + .toJson(DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)) + val expectedSql = s"PARSE_JSON('$jsonString')" + assert(l.sql == expectedSql) + val valueFromLiteralSql = + spark.sql(s"select ${l.sql}").collect()(0).getAs[VariantVal](0) + + // Cast the variants to their specified type to compare for logical equality. + // Currently, variant equality naively compares its value and metadata binaries. However, + // variant equality is more complex than this. + val castVariantExpr = VariantGet( + l, + Literal.create(UTF8String.fromString("$"), StringType), + dt, + true, + Some(DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone).toString()) + ) + val sqlVariantExpr = VariantGet( + Literal.create(valueFromLiteralSql, VariantType), + Literal.create(UTF8String.fromString("$"), StringType), + dt, + true, + Some(DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone).toString()) + ) + checkEvaluation(castVariantExpr, sqlVariantExpr.eval()) + } + } + + test("variant_get size") { + val largeKey = "x" * 1000 + val df = Seq(s"""{ "$largeKey": {"a" : 1 }, + "b" : 2, + "c": [1,2,3,{"$largeKey": 4}] }""").toDF("json") + .selectExpr("parse_json(json) as v") + + // Check Variant with approximate bounds to avoid flakiness if we make minor format changes. + def checkSize(v: VariantVal, minMetadata: Long, maxMetadata: Long, + minValue: Long, maxValue: Long): Unit = { + val mSize = v.getMetadata.length + assert(mSize >= minMetadata) + assert(mSize <= maxMetadata) + val vSize = v.getValue.length + assert(vSize >= minValue) + assert(vSize <= maxValue) + } + + // The full Variant has large metadata (but only one copy of `largeKey`). + checkSize(df.selectExpr("variant_get(v, '$', 'variant')").collect()(0) + .getAs[VariantVal](0), 1000, 1050, 20, 40) + // Extracting Variant or a nested type containing Variant should strip out the large metadata. + checkSize(df.selectExpr("variant_get(v, '$.b', 'variant')").collect()(0) + .getAs[VariantVal](0), 2, 4, 2, 4) + // Behavior is the same without an explicit cast to Variant. + checkSize(df.selectExpr("variant_get(v, '$.b', 'variant')").collect()(0) + .getAs[VariantVal](0), 2, 4, 2, 4) + checkSize(df.selectExpr(s"variant_get(v, '$$.$largeKey', 'variant')").collect()(0) + .getAs[VariantVal](0), 5, 10, 5, 10) + checkSize(df.selectExpr(s"variant_get(v, '$$.$largeKey', 'struct')") + .collect()(0).getStruct(0).getAs[VariantVal](0), 2, 4, 2, 4) + // Only the array element that contains `largeKey` should be large. + checkSize(df.selectExpr("variant_get(v, '$.c', 'array')").collect()(0) + .getSeq[VariantVal](0)(0), 2, 4, 2, 4) + checkSize(df.selectExpr("variant_get(v, '$.c', 'array')").collect()(0) + .getSeq[VariantVal](0)(3), 1000, 1020, 5, 10) + // Cast to a nested type containing Variant should also remove metadata. 
+ val structResult = df.selectExpr(s"cast(v as struct<$largeKey:variant,b:variant>)").collect()(0) + .getStruct(0) + checkSize(structResult.getAs[VariantVal](0), 5, 10, 5, 10) + checkSize(structResult.getAs[VariantVal](1), 2, 4, 2, 4) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/XmlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/XmlFunctionsSuite.scala index 90a26af917aa9..1364fab3138e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/XmlFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/XmlFunctionsSuite.scala @@ -22,6 +22,7 @@ import java.util.Locale import scala.jdk.CollectionConverters._ +import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -39,6 +40,16 @@ class XmlFunctionsSuite extends QueryTest with SharedSparkSession { Row(Row(1)) :: Nil) } + test("SPARK-48300: from_xml - Codegen Support") { + withTempView("XmlToStructsTable") { + val dataDF = Seq("""1""").toDF("value") + dataDF.createOrReplaceTempView("XmlToStructsTable") + val df = sql("SELECT from_xml(value, 'a INT') FROM XmlToStructsTable") + assert(df.queryExecution.executedPlan.isInstanceOf[WholeStageCodegenExec]) + checkAnswer(df, Row(Row(1)) :: Nil) + } + } + test("from_xml with option (timestampFormat)") { val df = Seq("""""").toDS() val schema = new StructType().add("time", TimestampType) @@ -55,7 +66,7 @@ class XmlFunctionsSuite extends QueryTest with SharedSparkSession { val options = Map("rowTag" -> "foo").asJava checkAnswer( - df.select(from_xml($"value", schema)), + df.select(from_xml($"value", schema, options)), Row(Row(1)) :: Nil) } @@ -110,6 +121,36 @@ class XmlFunctionsSuite extends QueryTest with SharedSparkSession { Row(Row(1, "haa")) :: Nil) } + test("SPARK-48363: from_xml with non struct schema") { + checkError( + exception = intercept[AnalysisException] { + Seq("1").toDS().select(from_xml($"value", lit("ARRAY"), Map[String, String]().asJava)) + }, + errorClass = "INVALID_SCHEMA.NON_STRUCT_TYPE", + parameters = Map( + "inputSchema" -> "\"ARRAY\"", + "dataType" -> "\"ARRAY\"" + ), + context = ExpectedContext(fragment = "from_xml", getCurrentClassCallSitePattern) + ) + + checkError( + exception = intercept[AnalysisException] { + Seq("1").toDF("xml").selectExpr(s"from_xml(xml, 'ARRAY')") + }, + errorClass = "INVALID_SCHEMA.NON_STRUCT_TYPE", + parameters = Map( + "inputSchema" -> "\"ARRAY\"", + "dataType" -> "\"ARRAY\"" + ), + context = ExpectedContext( + fragment = "from_xml(xml, 'ARRAY')", + start = 0, + stop = 26 + ) + ) + } + test("to_xml - struct") { val schema = StructType(StructField("a", IntegerType, nullable = false) :: Nil) val data = Seq(Row(1)) @@ -383,6 +424,22 @@ class XmlFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("SPARK-48296: to_xml - Codegen Support") { + withTempView("StructsToXmlTable") { + val schema = StructType(StructField("a", IntegerType, nullable = false) :: Nil) + val dataDF = spark.createDataFrame(Seq(Row(1)).asJava, schema).withColumn("a", struct($"a")) + dataDF.createOrReplaceTempView("StructsToXmlTable") + val df = sql("SELECT to_xml(a) FROM StructsToXmlTable") + val plan = df.queryExecution.executedPlan + assert(plan.isInstanceOf[WholeStageCodegenExec]) + val expected = + s"""| + | 1 + |""".stripMargin + checkAnswer(df, Seq(Row(expected))) + } + } + test("corrupt record column in the middle") { val schema = new StructType() .add("a", 
IntegerType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala new file mode 100644 index 0000000000000..ab8e82162ce10 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollatedFilterPushDownToParquetSuite.scala @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.collation + +import org.apache.parquet.schema.MessageType + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{DataFrame, QueryTest} +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFilters, SparkToParquetSchemaConverter} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan +import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf} +import org.apache.spark.sql.sources.{EqualTo, Filter, IsNotNull} +import org.apache.spark.sql.test.SharedSparkSession + +abstract class CollatedFilterPushDownToParquetSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { + + val dataSource = "parquet" + val nonCollatedCol = "c0" + val collatedCol = "c1" + val collatedStructCol = "c2" + val collatedStructNestedCol = "f1" + val collatedStructFieldAccess = s"$collatedStructCol.$collatedStructNestedCol" + val collatedArrayCol = "c3" + val collatedMapCol = "c4" + + val lcaseCollation = "'UTF8_LCASE'" + + def getPushedDownFilters(query: DataFrame): Seq[Filter] + + protected def createParquetFilters(schema: MessageType): ParquetFilters = + new ParquetFilters(schema, conf.parquetFilterPushDownDate, conf.parquetFilterPushDownTimestamp, + conf.parquetFilterPushDownDecimal, conf.parquetFilterPushDownStringPredicate, + conf.parquetFilterPushDownInFilterThreshold, + conf.caseSensitiveAnalysis, + RebaseSpec(LegacyBehaviorPolicy.CORRECTED)) + + def testPushDown( + filterString: String, + expectedPushedFilters: Seq[Filter], + expectedRowCount: Int): Unit = { + withTempPath { path => + val df = sql( + s""" + |SELECT + | c as $nonCollatedCol, + | COLLATE(c, $lcaseCollation) as $collatedCol, + | named_struct('$collatedStructNestedCol', + | COLLATE(c, $lcaseCollation)) as $collatedStructCol, + | array(COLLATE(c, $lcaseCollation)) as $collatedArrayCol, + | map(COLLATE(c, $lcaseCollation), 1) as $collatedMapCol + |FROM 
VALUES ('aaa'), ('AAA'), ('bbb') + |as data(c) + |""".stripMargin) + + df.write.format(dataSource).save(path.getAbsolutePath) + + val query = spark.read.format(dataSource).load(path.getAbsolutePath) + .filter(filterString) + + val actualPushedFilters = getPushedDownFilters(query) + assert(actualPushedFilters.toSet === expectedPushedFilters.toSet) + assert(query.count() === expectedRowCount) + } + } + + test("do not push down anything for literal comparison") { + testPushDown( + filterString = s"'aaa' COLLATE UNICODE = 'bbb' COLLATE UNICODE", + expectedPushedFilters = Seq.empty, + expectedRowCount = 0) + } + + test("push down null check for collated column") { + testPushDown( + filterString = s"$collatedCol = 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 2) + } + + test("push down null check for non-equality check") { + testPushDown( + filterString = s"$collatedCol != 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 1) + } + + test("push down null check for greater than check") { + testPushDown( + filterString = s"$collatedCol > 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 1) + } + + test("push down null check for gte check") { + testPushDown( + filterString = s"$collatedCol >= 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 3) + } + + test("push down null check for less than check") { + testPushDown( + filterString = s"$collatedCol < 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 0) + } + + test("push down null check for lte check") { + testPushDown( + filterString = s"$collatedCol <= 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 2) + } + + test("push down null check for STARTSWITH") { + testPushDown( + filterString = s"STARTSWITH($collatedCol, 'a')", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 2) + } + + test("push down null check for ENDSWITH") { + testPushDown( + filterString = s"ENDSWITH($collatedCol, 'a')", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 2) + } + + test("push down null check for CONTAINS") { + testPushDown( + filterString = s"CONTAINS($collatedCol, 'a')", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 2) + } + + test("no push down for IN") { + testPushDown( + filterString = s"$collatedCol IN ('aaa', 'bbb')", + expectedPushedFilters = Seq.empty, + expectedRowCount = 3) + } + + test("push down null check for equality for non-collated column in AND") { + testPushDown( + filterString = s"$collatedCol = 'aaa' AND $nonCollatedCol = 'aaa'", + expectedPushedFilters = + Seq(IsNotNull(collatedCol), IsNotNull(nonCollatedCol), EqualTo(nonCollatedCol, "aaa")), + expectedRowCount = 1) + } + + test("for OR do not push down anything") { + testPushDown( + filterString = s"$collatedCol = 'aaa' OR $nonCollatedCol = 'aaa'", + expectedPushedFilters = Seq.empty, + expectedRowCount = 2) + } + + test("mix OR and AND") { + testPushDown( + filterString = s"$collatedCol = 'aaa' AND ($nonCollatedCol = 'aaa' OR $collatedCol = 'aaa')", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 2) + } + + test("negate check on collated column") { + testPushDown( + filterString = s"NOT($collatedCol == 'aaa')", + expectedPushedFilters = Seq(IsNotNull(collatedCol)), + expectedRowCount = 1) + } + + test("compare entire struct - parquet does not support null check on complex 
types") { + testPushDown( + filterString = s"$collatedStructCol = " + + s"named_struct('$collatedStructNestedCol', collate('aaa', $lcaseCollation))", + expectedPushedFilters = Seq.empty, + expectedRowCount = 2) + } + + test("inner struct field access") { + testPushDown( + filterString = s"$collatedStructFieldAccess = 'aaa'", + expectedPushedFilters = Seq(IsNotNull(collatedStructFieldAccess)), + expectedRowCount = 2) + } + + test("array - parquet does not support null check on complex types") { + testPushDown( + filterString = s"$collatedArrayCol = array(collate('aaa', $lcaseCollation))", + expectedPushedFilters = Seq.empty, + expectedRowCount = 2) + } + + test("map - parquet does not support null check on complex types") { + testPushDown( + filterString = s"map_keys($collatedMapCol) != array(collate('aaa', $lcaseCollation))", + expectedPushedFilters = Seq.empty, + expectedRowCount = 1) + } +} + +class CollatedFilterPushDownToParquetV1Suite extends CollatedFilterPushDownToParquetSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, dataSource) + + override def getPushedDownFilters(query: DataFrame): Seq[Filter] = { + var maybeRelation: Option[HadoopFsRelation] = None + val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { + case PhysicalOperation(_, filters, + LogicalRelation(relation: HadoopFsRelation, _, _, _)) => + maybeRelation = Some(relation) + filters + }.flatten + + if (maybeAnalyzedPredicate.isEmpty) { + return Seq.empty + } + + val (_, selectedFilters, _) = + DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate) + + val schema = new SparkToParquetSchemaConverter(conf).convert(query.schema) + val parquetFilters = createParquetFilters(schema) + parquetFilters.convertibleFilters(selectedFilters) + } +} + +class CollatedFilterPushDownToParquetV2Suite extends CollatedFilterPushDownToParquetSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") + + override def getPushedDownFilters(query: DataFrame): Seq[Filter] = { + query.queryExecution.optimizedPlan.collectFirst { + case PhysicalOperation(_, _, + DataSourceV2ScanRelation(_, scan: ParquetScan, _, _, _)) => + scan.pushedFilters.toSeq + }.getOrElse(Seq.empty) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala index 996d7acb1148d..28605958c71da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala @@ -466,7 +466,7 @@ trait AlterTableTests extends SharedSparkSession with QueryErrorsBase { exception = intercept[AnalysisException] { sql(s"ALTER TABLE $t ADD COLUMNS $field double") }, - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", parameters = expectedParameters, context = ExpectedContext( fragment = s"ALTER TABLE $t ADD COLUMNS $field double", @@ -1116,7 +1116,7 @@ trait AlterTableTests extends SharedSparkSession with QueryErrorsBase { exception = intercept[AnalysisException] { sql(s"ALTER TABLE $t RENAME COLUMN $field TO $newName") }, - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", parameters = Map( "op" -> "rename", "fieldNames" -> s"${toSQLId(expectedName)}", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala index 5d5ea6499c49d..7bbb6485c273f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala @@ -110,7 +110,14 @@ class InMemoryTableSessionCatalog extends TestV2SessionCatalogBase[InMemoryTable Option(tables.get(ident)) match { case Some(table) => val properties = CatalogV2Util.applyPropertiesChanges(table.properties, changes) - val schema = CatalogV2Util.applySchemaChanges(table.schema, changes, None, "ALTER TABLE") + val provider = Option(properties.get("provider")) + + val schema = CatalogV2Util.applySchemaChanges( + table.schema, + changes, + provider, + "ALTER TABLE" + ) // fail if the last column in the schema was dropped if (schema.fields.isEmpty) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index d89c0a2525fd9..0382efaf9d7e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -24,7 +24,7 @@ import java.util.Locale import scala.concurrent.duration.MICROSECONDS import scala.jdk.CollectionConverters._ -import org.apache.spark.{SparkException, SparkUnsupportedOperationException} +import org.apache.spark.{SparkException, SparkRuntimeException, SparkUnsupportedOperationException} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.CurrentUserContext.CURRENT_USER import org.apache.spark.sql.catalyst.InternalRow @@ -814,14 +814,10 @@ class DataSourceV2SQLSuiteV1Filter if (nullable) { insertNullValueAndCheck() } else { - // TODO assign a error-classes name - checkError( - exception = intercept[SparkException] { - insertNullValueAndCheck() - }, - errorClass = null, - parameters = Map.empty - ) + val exception = intercept[SparkRuntimeException] { + insertNullValueAndCheck() + } + assert(exception.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } } } @@ -1743,6 +1739,16 @@ class DataSourceV2SQLSuiteV1Filter } } + test("SPARK-48709: varchar resolution mismatch for DataSourceV2 CTAS") { + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> SQLConf.StoreAssignmentPolicy.LEGACY.toString) { + withTable("testcat.ns.t1", "testcat.ns.t2") { + sql("CREATE TABLE testcat.ns.t1 (d1 string, d2 varchar(200)) USING parquet") + sql("CREATE TABLE testcat.ns.t2 USING foo as select * from testcat.ns.t1") + } + } + } + test("ShowCurrentNamespace: basic tests") { def testShowCurrentNamespace(expectedCatalogName: String, expectedNamespace: String): Unit = { val schema = new StructType() @@ -3488,6 +3494,30 @@ class DataSourceV2SQLSuiteV1Filter } } + test("SPARK-48286: Add new column with default value which is not foldable") { + val foldableExpressions = Seq("1", "2 + 1") + withSQLConf(SQLConf.DEFAULT_COLUMN_ALLOWED_PROVIDERS.key -> v2Source) { + withTable("tab") { + spark.sql(s"CREATE TABLE tab (col1 INT DEFAULT 100) USING $v2Source") + val exception = intercept[AnalysisException] { + // Rand function is not foldable + spark.sql(s"ALTER TABLE tab ADD COLUMN col2 DOUBLE DEFAULT rand()") + } + assert(exception.getSqlState == "42623") + assert(exception.errorClass.get == "INVALID_DEFAULT_VALUE.NOT_CONSTANT") + assert(exception.messageParameters("colName") == "`col2`") + 
assert(exception.messageParameters("defaultValue") == "rand()") + assert(exception.messageParameters("statement") == "ALTER TABLE") + } + foldableExpressions.foreach(expr => { + withTable("tab") { + spark.sql(s"CREATE TABLE tab (col1 INT DEFAULT 100) USING $v2Source") + spark.sql(s"ALTER TABLE tab ADD COLUMN col2 DOUBLE DEFAULT $expr") + } + }) + } + } + private def testNotSupportedV2Command( sqlCommand: String, sqlParams: String, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala index ec275fe101fd6..d77a6e8b8ac16 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala @@ -298,6 +298,12 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase { Row("bbb", 20, 250.0), Row("bbb", 20, 350.0), Row("ccc", 30, 400.50))) } + private def collectAllShuffles(plan: SparkPlan): Seq[ShuffleExchangeExec] = { + collect(plan) { + case s: ShuffleExchangeExec => s + } + } + private def collectShuffles(plan: SparkPlan): Seq[ShuffleExchangeExec] = { // here we skip collecting shuffle operators that are not associated with SMJ collect(plan) { @@ -346,6 +352,23 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase { Column.create("price", FloatType), Column.create("time", TimestampType)) + test("SPARK-48655: group by on partition keys should not introduce additional shuffle") { + val items_partitions = Array(identity("id")) + createTable(items, itemsColumns, items_partitions) + sql(s"INSERT INTO testcat.ns.$items VALUES " + + s"(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + s"(1, 'aa', 41.0, cast('2020-01-02' as timestamp)), " + + s"(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " + + s"(3, 'cc', 15.5, cast('2020-02-01' as timestamp))") + + val df = sql(s"SELECT MAX(price) AS res FROM testcat.ns.$items GROUP BY id") + val shuffles = collectAllShuffles(df.queryExecution.executedPlan) + assert(shuffles.isEmpty, + "should contain shuffle when not grouping by partition values") + + checkAnswer(df.sort("res"), Seq(Row(10.0), Row(15.5), Row(41.0))) + } + test("partitioned join: join with two partition keys and matching & sorted partitions") { val items_partitions = Array(bucket(8, "id"), days("arrive_time")) createTable(items, itemsColumns, items_partitions) @@ -1136,7 +1159,7 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase { val df = createJoinTestDF(Seq("arrive_time" -> "time")) val shuffles = collectShuffles(df.queryExecution.executedPlan) if (shuffle) { - assert(shuffles.size == 2, "partitioning with transform not work now") + assert(shuffles.size == 1, "partitioning with transform should trigger SPJ") } else { assert(shuffles.size == 2, "should add two side shuffle when bucketing shuffle one side" + " is not enabled") @@ -1227,6 +1250,66 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase { } } + test("SPARK-48065: SPJ: allowJoinKeysSubsetOfPartitionKeys is too strict") { + val table1 = "tab1e1" + val table2 = "table2" + val partition = Array(identity("id")) + createTable(table1, columns, partition) + sql(s"INSERT INTO testcat.ns.$table1 VALUES " + + "(1, 'aa', cast('2020-01-01' as timestamp)), " + + "(2, 'bb', cast('2020-01-01' as timestamp)), " + + "(2, 'cc', cast('2020-01-01' as timestamp)), " + + "(3, 'dd', 
cast('2020-01-01' as timestamp)), " + + "(3, 'dd', cast('2020-01-01' as timestamp)), " + + "(3, 'ee', cast('2020-01-01' as timestamp)), " + + "(3, 'ee', cast('2020-01-01' as timestamp))") + + createTable(table2, columns, partition) + sql(s"INSERT INTO testcat.ns.$table2 VALUES " + + "(4, 'zz', cast('2020-01-01' as timestamp)), " + + "(4, 'zz', cast('2020-01-01' as timestamp)), " + + "(3, 'dd', cast('2020-01-01' as timestamp)), " + + "(3, 'dd', cast('2020-01-01' as timestamp)), " + + "(3, 'xx', cast('2020-01-01' as timestamp)), " + + "(3, 'xx', cast('2020-01-01' as timestamp)), " + + "(2, 'ww', cast('2020-01-01' as timestamp))") + + Seq(true, false).foreach { pushDownValues => + Seq(true, false).foreach { partiallyClustered => + withSQLConf( + SQLConf.REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION.key -> "false", + SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> pushDownValues.toString, + SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key -> + partiallyClustered.toString, + SQLConf.V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS.key -> "true") { + val df = sql( + s""" + |${selectWithMergeJoinHint("t1", "t2")} + |t1.id AS id, t1.data AS t1data, t2.data AS t2data + |FROM testcat.ns.$table1 t1 JOIN testcat.ns.$table2 t2 + |ON t1.id = t2.id AND t1.data = t2.data ORDER BY t1.id, t1data, t2data + |""".stripMargin) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.isEmpty, "SPJ should be triggered") + + val scans = collectScans(df.queryExecution.executedPlan) + .map(_.inputRDD.partitions.length) + if (partiallyClustered) { + assert(scans == Seq(8, 8)) + } else { + assert(scans == Seq(4, 4)) + } + checkAnswer(df, Seq( + Row(3, "dd", "dd"), + Row(3, "dd", "dd"), + Row(3, "dd", "dd"), + Row(3, "dd", "dd") + )) + } + } + } + } + test("SPARK-44647: test join key is subset of cluster key " + "with push values and partially-clustered") { val table1 = "tab1e1" @@ -1931,22 +2014,19 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase { "(6, 50.0, cast('2023-02-01' as timestamp))") Seq(true, false).foreach { pushdownValues => - Seq(true, false).foreach { partiallyClustered => - withSQLConf( - SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true", - SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> pushdownValues.toString, - SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key - -> partiallyClustered.toString, - SQLConf.V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS.key -> "true") { - val df = createJoinTestDF(Seq("id" -> "item_id")) - val shuffles = collectShuffles(df.queryExecution.executedPlan) - assert(shuffles.size == 1, "SPJ should be triggered") - checkAnswer(df, Seq(Row(1, "aa", 30.0, 42.0), - Row(1, "aa", 30.0, 89.0), - Row(1, "aa", 40.0, 42.0), - Row(1, "aa", 40.0, 89.0), - Row(3, "bb", 10.0, 19.5))) - } + withSQLConf( + SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true", + SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> pushdownValues.toString, + SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key -> "false", + SQLConf.V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS.key -> "true") { + val df = createJoinTestDF(Seq("id" -> "item_id")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "SPJ should be triggered") + checkAnswer(df, Seq(Row(1, "aa", 30.0, 42.0), + Row(1, "aa", 30.0, 89.0), + Row(1, "aa", 40.0, 42.0), + Row(1, "aa", 40.0, 89.0), + Row(3, "bb", 10.0, 19.5))) } } } @@ -1992,4 +2072,109 @@ class KeyGroupedPartitioningSuite 
extends DistributionAndOrderingSuiteBase { } } } + + test("SPARK-48012: one-side shuffle with partition transforms") { + val items_partitions = Array(bucket(2, "id"), identity("arrive_time")) + val items_partitions2 = Array(identity("arrive_time"), bucket(2, "id")) + + Seq(items_partitions, items_partitions2).foreach { partition => + catalog.clearTables() + + createTable(items, itemsColumns, partition) + sql(s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(1, 'bb', 30.0, cast('2020-01-01' as timestamp)), " + + "(1, 'cc', 30.0, cast('2020-01-02' as timestamp)), " + + "(3, 'dd', 10.0, cast('2020-01-01' as timestamp)), " + + "(4, 'ee', 15.5, cast('2020-02-01' as timestamp)), " + + "(5, 'ff', 32.1, cast('2020-03-01' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql(s"INSERT INTO testcat.ns.$purchases VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(2, 10.7, cast('2020-01-01' as timestamp))," + + "(3, 19.5, cast('2020-02-01' as timestamp))," + + "(4, 56.5, cast('2020-02-01' as timestamp))") + + withSQLConf( + SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") { + val df = createJoinTestDF(Seq("id" -> "item_id", "arrive_time" -> "time")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "only shuffle side that does not report partitioning") + + checkAnswer(df, Seq( + Row(1, "bb", 30.0, 42.0), + Row(1, "aa", 40.0, 42.0), + Row(4, "ee", 15.5, 56.5))) + } + } + } + + test("SPARK-48012: one-side shuffle with partition transforms and pushdown values") { + val items_partitions = Array(bucket(2, "id"), identity("arrive_time")) + createTable(items, itemsColumns, items_partitions) + + sql(s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(1, 'bb', 30.0, cast('2020-01-01' as timestamp)), " + + "(1, 'cc', 30.0, cast('2020-01-02' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql(s"INSERT INTO testcat.ns.$purchases VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(2, 10.7, cast('2020-01-01' as timestamp))") + + Seq(true, false).foreach { pushDown => { + withSQLConf( + SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true", + SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> + pushDown.toString) { + val df = createJoinTestDF(Seq("id" -> "item_id", "arrive_time" -> "time")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "only shuffle side that does not report partitioning") + + checkAnswer(df, Seq( + Row(1, "bb", 30.0, 42.0), + Row(1, "aa", 40.0, 42.0))) + } + } + } + } + + test("SPARK-48012: one-side shuffle with partition transforms " + + "with fewer join keys than partition keys") { + val items_partitions = Array(bucket(2, "id"), identity("name")) + createTable(items, itemsColumns, items_partitions) + + sql(s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(1, 'aa', 30.0, cast('2020-01-02' as timestamp)), " + + "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " + + "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql(s"INSERT INTO testcat.ns.$purchases VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(1, 89.0, cast('2020-01-03' as timestamp)), " + + "(3, 19.5, cast('2020-02-01' as timestamp)), " + + "(5, 26.0, cast('2023-01-01' as timestamp)), " + + "(6, 50.0, cast('2023-02-01' as 
timestamp))") + + withSQLConf( + SQLConf.REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION.key -> "false", + SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true", + SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> "true", + SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key -> "false", + SQLConf.V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS.key -> "true") { + val df = createJoinTestDF(Seq("id" -> "item_id")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 2, "SPJ should not be triggered for transform expression with" + + "less join keys than partition keys for now.") + checkAnswer(df, Seq(Row(1, "aa", 30.0, 42.0), + Row(1, "aa", 30.0, 89.0), + Row(1, "aa", 40.0, 42.0), + Row(1, "aa", 40.0, 89.0), + Row(3, "bb", 10.0, 19.5))) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoDataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoDataFrameSuite.scala index ed44111c81d2a..c080a66bce257 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoDataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoDataFrameSuite.scala @@ -943,4 +943,32 @@ class MergeIntoDataFrameSuite extends RowLevelOperationSuiteBase { Row(3, Row("y1 ", "y2"), "hr"))) // update (not matched by source) } } + + test("withSchemaEvolution carries over existing when clauses") { + withTempView("source") { + Seq(1, 2, 4).toDF("pk").createOrReplaceTempView("source") + + // an arbitrary merge + val writer1 = spark.table("source") + .mergeInto("dummy", $"col" === $"col") + .whenMatched(col("col") === 1) + .updateAll() + .whenMatched() + .delete() + .whenNotMatched(col("col") === 1) + .insertAll() + .whenNotMatchedBySource(col("col") === 1) + .delete() + val writer2 = writer1.withSchemaEvolution() + + assert(writer1.matchedActions.length === 2) + assert(writer1.notMatchedActions.length === 1) + assert(writer1.notMatchedBySourceActions.length === 1) + + assert(writer1.matchedActions === writer2.matchedActions) + assert(writer1.notMatchedActions === writer2.notMatchedActions) + assert(writer1.notMatchedBySourceActions === writer2.notMatchedBySourceActions) + assert(writer2.schemaEvolutionEnabled) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala index 0b643ca534e39..9d4e4fc016722 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.{SparkException, SparkRuntimeException} +import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, In, Not} import org.apache.spark.sql.catalyst.optimizer.BuildLeft @@ -1317,7 +1317,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase { Seq(1, 4).toDF("pk").createOrReplaceTempView("source") - val e1 = intercept[SparkException] { + val e1 = intercept[SparkRuntimeException] { sql( s"""MERGE INTO $tableNameAsString t |USING source s @@ -1326,9 +1326,9 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase { | UPDATE SET s = named_struct('n_i', null, 'n_l', -1L) |""".stripMargin) } - 
assert(e1.getCause.getMessage.contains("Null value appeared in non-nullable field")) + assert(e1.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") - val e2 = intercept[SparkException] { + val e2 = intercept[SparkRuntimeException] { sql( s"""MERGE INTO $tableNameAsString t |USING source s @@ -1337,9 +1337,9 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase { | UPDATE SET s = named_struct('n_i', null, 'n_l', -1L) |""".stripMargin) } - assert(e2.getCause.getMessage.contains("Null value appeared in non-nullable field")) + assert(e2.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") - val e3 = intercept[SparkException] { + val e3 = intercept[SparkRuntimeException] { sql( s"""MERGE INTO $tableNameAsString t |USING source s @@ -1348,7 +1348,7 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase { | INSERT (pk, s, dep) VALUES (s.pk, named_struct('n_i', null, 'n_l', -1L), 'invalid') |""".stripMargin) } - assert(e3.getCause.getMessage.contains("Null value appeared in non-nullable field")) + assert(e3.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/UpdateTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/UpdateTableSuiteBase.scala index b43101c2e0255..c2ae5f40cfaf6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/UpdateTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/UpdateTableSuiteBase.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.SparkException +import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.Row import org.apache.spark.sql.connector.catalog.{Column, ColumnDefaultValue} import org.apache.spark.sql.connector.expressions.LiteralValue @@ -575,9 +575,12 @@ abstract class UpdateTableSuiteBase extends RowLevelOperationSuiteBase { |{ "pk": 3, "s": { "n_i": 3, "n_l": 33 }, "dep": "hr" } |""".stripMargin) - val e = intercept[SparkException] { - sql(s"UPDATE $tableNameAsString SET s = named_struct('n_i', null, 'n_l', -1L) WHERE pk = 1") - } - assert(e.getCause.getMessage.contains("Null value appeared in non-nullable field")) + checkError( + exception = intercept[SparkRuntimeException] { + sql(s"UPDATE $tableNameAsString SET s = named_struct('n_i', null, 'n_l', -1L) WHERE pk = 1") + }, + errorClass = "NOT_NULL_ASSERT_VIOLATION", + sqlState = "42000", + parameters = Map("walkedTypePath" -> "\ns\nn_i\n")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala index ee71bd3af1e02..3ab7edb78439c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala @@ -306,7 +306,7 @@ class V2CommandsCaseSensitivitySuite None, Some(UnresolvedFieldPosition(ColumnPosition.after("id"))), None))), - "FIELDS_ALREADY_EXISTS", + "FIELD_ALREADY_EXISTS", Map( "op" -> "add", "fieldNames" -> "`ID`", @@ -317,7 +317,7 @@ class V2CommandsCaseSensitivitySuite test("SPARK-36381: Check column name exist case sensitive and insensitive when rename column") { alterTableErrorClass( RenameColumn(table, UnresolvedFieldName(Array("id").toImmutableArraySeq), "DATA"), - "FIELDS_ALREADY_EXISTS", + "FIELD_ALREADY_EXISTS", Map( "op" -> "rename", "fieldNames" -> "`DATA`", diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/connector/catalog/functions/transformFunctions.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/catalog/functions/transformFunctions.scala index 5cdb900901056..5364fc5d62423 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/catalog/functions/transformFunctions.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/catalog/functions/transformFunctions.scala @@ -16,9 +16,11 @@ */ package org.apache.spark.sql.connector.catalog.functions -import java.sql.Timestamp +import java.time.{Instant, LocalDate, ZoneId} +import java.time.temporal.ChronoUnit import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -44,7 +46,13 @@ object YearsFunction extends ScalarFunction[Long] { override def name(): String = "years" override def canonicalName(): String = name() - def invoke(ts: Long): Long = new Timestamp(ts).getYear + 1900 + val UTC: ZoneId = ZoneId.of("UTC") + val EPOCH_LOCAL_DATE: LocalDate = Instant.EPOCH.atZone(UTC).toLocalDate + + def invoke(ts: Long): Long = { + val localDate = DateTimeUtils.microsToInstant(ts).atZone(UTC).toLocalDate + ChronoUnit.YEARS.between(EPOCH_LOCAL_DATE, localDate) + } } object DaysFunction extends BoundFunction { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 4574d3328d48a..958d2b0130d8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -926,6 +926,37 @@ class QueryCompilationErrorsSuite parameters = Map("message" -> "Cannot convert Spark data type \"DUMMY\" to any Parquet type.") ) } + + test("SPARK-48556: Ensure UNRESOLVED_COLUMN is thrown when query has grouping expressions " + + "with invalid column name") { + case class UnresolvedDummyColumnTest(query: String, pos: Int) + + withTable("t1") { + sql("create table t1(a int, b int) using parquet") + val tests = Seq( + UnresolvedDummyColumnTest("select grouping(a), dummy from t1 group by a with rollup", 20), + UnresolvedDummyColumnTest("select dummy, grouping(a) from t1 group by a with rollup", 7), + UnresolvedDummyColumnTest( + "select a, case when grouping(a) = 1 then 0 else b end, count(dummy) from t1 " + + "group by 1 with rollup", + 61), + UnresolvedDummyColumnTest( + "select a, max(dummy), case when grouping(a) = 1 then 0 else b end " + + "from t1 group by 1 with rollup", + 14) + ) + tests.foreach(test => { + checkError( + exception = intercept[AnalysisException] { + sql(test.query) + }, + errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", + parameters = Map("objectName" -> "`dummy`", "proposal" -> "`a`, `b`"), + context = ExpectedContext(fragment = "dummy", start = test.pos, stop = test.pos + 4) + ) + }) + } + } } class MyCastToString extends SparkUserDefinedFunction( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala index d381dae6ea293..b7fb65091ef73 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala @@ -32,6 +32,22 @@ class QueryParsingErrorsSuite 
extends QueryTest with SharedSparkSession with SQL intercept[ParseException](sql(sqlText).collect()) } + test("PARSE_STACK_OVERFLOW_ERROR: Stack overflow hit") { + val query = (1 to 20000).map(x => "SELECT 1 as a").mkString(" UNION ALL ") + val e = intercept[ParseException] { + spark.sql(query) + } + checkError( + exception = parseException(query), + errorClass = "FAILED_TO_PARSE_TOO_COMPLEX", + parameters = Map(), + context = ExpectedContext( + query, + start = 0, + stop = query.length - 1) + ) + } + test("EXEC_IMMEDIATE_DUPLICATE_ARGUMENT_ALIASES: duplicate aliases provided in using statement") { val query = "EXECUTE IMMEDIATE 'SELECT 1707 WHERE ? = 1' USING 1 as first" + ", 2 as first, 3 as second, 4 as second, 5 as third" @@ -272,7 +288,7 @@ class QueryParsingErrorsSuite extends QueryTest with SharedSparkSession with SQL stop = 27)) } - test("INVALID_SQL_SYNTAX.CREATE_FUNC_WITH_IF_NOT_EXISTS_AND_REPLACE: " + + test("INVALID_SQL_SYNTAX.CREATE_ROUTINE_WITH_IF_NOT_EXISTS_AND_REPLACE: " + "Create function with both if not exists and replace") { val sqlText = """CREATE OR REPLACE FUNCTION IF NOT EXISTS func1 as @@ -281,7 +297,7 @@ class QueryParsingErrorsSuite extends QueryTest with SharedSparkSession with SQL checkError( exception = parseException(sqlText), - errorClass = "INVALID_SQL_SYNTAX.CREATE_FUNC_WITH_IF_NOT_EXISTS_AND_REPLACE", + errorClass = "INVALID_SQL_SYNTAX.CREATE_ROUTINE_WITH_IF_NOT_EXISTS_AND_REPLACE", sqlState = "42000", context = ExpectedContext( fragment = sqlText, @@ -631,6 +647,13 @@ class QueryParsingErrorsSuite extends QueryTest with SharedSparkSession with SQL sqlState = "42K01", parameters = Map("elementType" -> ""), context = ExpectedContext(fragment = "ARRAY", start = 30, stop = 34)) + // Create column of array type without specifying element type in lowercase + checkError( + exception = parseException("CREATE TABLE tbl_120691 (col1 array)"), + errorClass = "INCOMPLETE_TYPE_DEFINITION.ARRAY", + sqlState = "42K01", + parameters = Map("elementType" -> ""), + context = ExpectedContext(fragment = "array", start = 30, stop = 34)) } test("INCOMPLETE_TYPE_DEFINITION: struct type definition is incomplete") { @@ -658,6 +681,12 @@ class QueryParsingErrorsSuite extends QueryTest with SharedSparkSession with SQL errorClass = "PARSE_SYNTAX_ERROR", sqlState = "42601", parameters = Map("error" -> "'<'", "hint" -> ": missing ')'")) + // Create column of struct type without specifying field type in lowercase + checkError( + exception = parseException("CREATE TABLE tbl_120691 (col1 struct)"), + errorClass = "INCOMPLETE_TYPE_DEFINITION.STRUCT", + sqlState = "42K01", + context = ExpectedContext(fragment = "struct", start = 30, stop = 35)) } test("INCOMPLETE_TYPE_DEFINITION: map type definition is incomplete") { @@ -679,6 +708,12 @@ class QueryParsingErrorsSuite extends QueryTest with SharedSparkSession with SQL errorClass = "PARSE_SYNTAX_ERROR", sqlState = "42601", parameters = Map("error" -> "'<'", "hint" -> ": missing ')'")) + // Create column of map type without specifying key/value types in lowercase + checkError( + exception = parseException("SELECT CAST(map('1',2) AS map)"), + errorClass = "INCOMPLETE_TYPE_DEFINITION.MAP", + sqlState = "42K01", + context = ExpectedContext(fragment = "map", start = 26, stop = 28)) } test("INVALID_ESC: Escape string must contain only one character") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala 
index e87b90dfdd84a..dc72b4a092aef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.config.IO_ENCRYPTION_ENABLED import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.execution.adaptive._ import org.apache.spark.sql.execution.adaptive.AQEShuffleReadExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec @@ -28,7 +29,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ArrayImplicits._ -class CoalesceShufflePartitionsSuite extends SparkFunSuite { +class CoalesceShufflePartitionsSuite extends SparkFunSuite with SQLConfHelper { private var originalActiveSparkSession: Option[SparkSession] = _ private var originalInstantiatedSparkSession: Option[SparkSession] = _ @@ -374,72 +375,73 @@ class CoalesceShufflePartitionsSuite extends SparkFunSuite { test("SPARK-24705 adaptive query execution works correctly when exchange reuse enabled") { val test: SparkSession => Unit = { spark: SparkSession => - spark.sql("SET spark.sql.exchange.reuse=true") - val df = spark.range(0, 6, 1).selectExpr("id AS key", "id AS value") - - // test case 1: a query stage has 3 child stages but they are the same stage. - // Final Stage 1 - // ShuffleQueryStage 0 - // ReusedQueryStage 0 - // ReusedQueryStage 0 - val resultDf = df.join(df, "key").join(df, "key") - QueryTest.checkAnswer(resultDf, (0 to 5).map(i => Row(i, i, i, i))) - val finalPlan = resultDf.queryExecution.executedPlan - .asInstanceOf[AdaptiveSparkPlanExec].executedPlan - assert(finalPlan.collect { - case ShuffleQueryStageExec(_, r: ReusedExchangeExec, _) => r - }.length == 2) - assert( - finalPlan.collect { - case r @ CoalescedShuffleRead() => r - }.length == 3) - - - // test case 2: a query stage has 2 parent stages. - // Final Stage 3 - // ShuffleQueryStage 1 - // ShuffleQueryStage 0 - // ShuffleQueryStage 2 - // ReusedQueryStage 0 - val grouped = df.groupBy((col("key") + 1).as("key")).agg(max("value").as("value")) - val resultDf2 = grouped.groupBy(col("key") + 1).max("value") - .union(grouped.groupBy(col("key") + 2).max("value")) - QueryTest.checkAnswer(resultDf2, Row(2, 0) :: Row(3, 0) :: Row(3, 1) :: Row(4, 1) :: - Row(4, 2) :: Row(5, 2) :: Row(5, 3) :: Row(6, 3) :: Row(6, 4) :: Row(7, 4) :: Row(7, 5) :: - Row(8, 5) :: Nil) - - val finalPlan2 = resultDf2.queryExecution.executedPlan - .asInstanceOf[AdaptiveSparkPlanExec].executedPlan - - // The result stage has 2 children - val level1Stages = finalPlan2.collect { case q: QueryStageExec => q } - assert(level1Stages.length == 2) - - assert( - finalPlan2.collect { - case r @ CoalescedShuffleRead() => r - }.length == 2, "finalPlan2") + withSQLConf("spark.sql.exchange.reuse" -> "true") { + val df = spark.range(0, 6, 1).selectExpr("id AS key", "id AS value") + + // test case 1: a query stage has 3 child stages but they are the same stage. 
+ // Final Stage 1 + // ShuffleQueryStage 0 + // ReusedQueryStage 0 + // ReusedQueryStage 0 + val resultDf = df.join(df, "key").join(df, "key") + QueryTest.checkAnswer(resultDf, (0 to 5).map(i => Row(i, i, i, i))) + val finalPlan = resultDf.queryExecution.executedPlan + .asInstanceOf[AdaptiveSparkPlanExec].executedPlan + assert(finalPlan.collect { + case ShuffleQueryStageExec(_, r: ReusedExchangeExec, _) => r + }.length == 2) + assert( + finalPlan.collect { + case r@CoalescedShuffleRead() => r + }.length == 3) + + + // test case 2: a query stage has 2 parent stages. + // Final Stage 3 + // ShuffleQueryStage 1 + // ShuffleQueryStage 0 + // ShuffleQueryStage 2 + // ReusedQueryStage 0 + val grouped = df.groupBy((col("key") + 1).as("key")).agg(max("value").as("value")) + val resultDf2 = grouped.groupBy(col("key") + 1).max("value") + .union(grouped.groupBy(col("key") + 2).max("value")) + QueryTest.checkAnswer(resultDf2, Row(2, 0) :: Row(3, 0) :: Row(3, 1) :: Row(4, 1) :: + Row(4, 2) :: Row(5, 2) :: Row(5, 3) :: Row(6, 3) :: Row(6, 4) :: Row(7, 4) :: Row(7, 5) :: + Row(8, 5) :: Nil) + + val finalPlan2 = resultDf2.queryExecution.executedPlan + .asInstanceOf[AdaptiveSparkPlanExec].executedPlan - level1Stages.foreach(qs => - assert(qs.plan.collect { - case r @ CoalescedShuffleRead() => r - }.length == 1, - "Wrong CoalescedShuffleRead below " + qs.simpleString(3))) - - val leafStages = level1Stages.flatMap { stage => - // All of the child stages of result stage have only one child stage. - val children = stage.plan.collect { case q: QueryStageExec => q } - assert(children.length == 1) - children - } - assert(leafStages.length == 2) + // The result stage has 2 children + val level1Stages = finalPlan2.collect { case q: QueryStageExec => q } + assert(level1Stages.length == 2) + + assert( + finalPlan2.collect { + case r@CoalescedShuffleRead() => r + }.length == 2, "finalPlan2") + + level1Stages.foreach(qs => + assert(qs.plan.collect { + case r@CoalescedShuffleRead() => r + }.length == 1, + "Wrong CoalescedShuffleRead below " + qs.simpleString(3))) + + val leafStages = level1Stages.flatMap { stage => + // All of the child stages of result stage have only one child stage. 
+ val children = stage.plan.collect { case q: QueryStageExec => q } + assert(children.length == 1) + children + } + assert(leafStages.length == 2) - val reusedStages = level1Stages.flatMap { stage => - stage.plan.collect { - case ShuffleQueryStageExec(_, r: ReusedExchangeExec, _) => r + val reusedStages = level1Stages.flatMap { stage => + stage.plan.collect { + case ShuffleQueryStageExec(_, r: ReusedExchangeExec, _) => r + } } + assert(reusedStages.length == 1) } - assert(reusedStages.length == 1) } withSparkSession(test, 400, None) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExecuteImmediateEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExecuteImmediateEndToEndSuite.scala index 41ddcef89b7d4..6b0f0b5582dc5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExecuteImmediateEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExecuteImmediateEndToEndSuite.scala @@ -16,7 +16,8 @@ */ package org.apache.spark.sql.execution -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{QueryTest} +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.test.SharedSparkSession class ExecuteImmediateEndToEndSuite extends QueryTest with SharedSparkSession { @@ -36,4 +37,30 @@ class ExecuteImmediateEndToEndSuite extends QueryTest with SharedSparkSession { spark.sql("DROP TEMPORARY VARIABLE IF EXISTS parm;") } } + + test("EXEC IMMEDIATE STACK OVERFLOW") { + try { + spark.sql("DECLARE parm = 1;") + val query = (1 to 20000).map(x => "SELECT 1 as a").mkString(" UNION ALL ") + Seq( + s"EXECUTE IMMEDIATE '$query'", + s"EXECUTE IMMEDIATE '$query' INTO parm").foreach { q => + val e = intercept[ParseException] { + spark.sql(q) + } + + checkError( + exception = intercept[ParseException](sql(query).collect()), + errorClass = "FAILED_TO_PARSE_TOO_COMPLEX", + parameters = Map(), + context = ExpectedContext( + query, + start = 0, + stop = query.length - 1) + ) + } + } finally { + spark.sql("DROP TEMPORARY VARIABLE IF EXISTS parm;") + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala index 6e2200380d6cc..31d8dd0740e14 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala @@ -30,7 +30,7 @@ class GlobalTempViewSuite extends QueryTest with SharedSparkSession { override protected def beforeAll(): Unit = { super.beforeAll() - globalTempDB = spark.sharedState.globalTempViewManager.database + globalTempDB = spark.sharedState.globalTempDB } private var globalTempDB: String = _ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala index f8366b3f7c5fa..936aaba51935a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala @@ -61,7 +61,8 @@ class HiveResultSuite extends SharedSparkSession { test("toHiveString correctly handles UDTs") { val point = new ExamplePoint(50.0, 50.0) val tpe = new ExamplePointUDT() - assert(toHiveString((point, tpe), false, getTimeFormatters) === "(50.0, 50.0)") + assert(toHiveString((point, tpe), false, getTimeFormatters, getBinaryFormatter) === + "(50.0, 50.0)") } test("decimal 
formatting in hive result") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 15de4c5cc5b2d..1400ee25f4319 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -744,6 +744,14 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("SPARK-24500: create union with stream of children") { + @scala.annotation.nowarn("cat=deprecation") + val df = Union(Stream( + Range(1, 1, 1, 1), + Range(1, 2, 1, 1))) + df.queryExecution.executedPlan.execute() + } + + test("SPARK-45685: create union with LazyList of children") { val df = Union(LazyList( Range(1, 1, 1, 1), Range(1, 2, 1, 1))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala index 73e5165829327..3608e7c920767 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryExecutionSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.execution.datasources.v2.ShowTablesExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.storage.ShuffleIndexBlockId import org.apache.spark.util.Utils case class QueryExecutionTestRecord( @@ -314,6 +315,48 @@ class QueryExecutionSuite extends SharedSparkSession { mockCallback.assertExecutedPlanPrepared() } + private def cleanupShuffles(): Unit = { + val blockManager = spark.sparkContext.env.blockManager + blockManager.diskBlockManager.getAllBlocks().foreach { + case ShuffleIndexBlockId(shuffleId, _, _) => + spark.sparkContext.env.shuffleManager.unregisterShuffle(shuffleId) + case _ => + } + } + + test("SPARK-47764: Cleanup shuffle dependencies - DoNotCleanup mode") { + val plan = spark.range(100).repartition(10).logicalPlan + val df = Dataset.ofRows(spark, plan, DoNotCleanup) + df.collect() + + val blockManager = spark.sparkContext.env.blockManager + assert(blockManager.migratableResolver.getStoredShuffles().nonEmpty) + assert(blockManager.diskBlockManager.getAllBlocks().nonEmpty) + cleanupShuffles() + } + + test("SPARK-47764: Cleanup shuffle dependencies - SkipMigration mode") { + val plan = spark.range(100).repartition(10).logicalPlan + val df = Dataset.ofRows(spark, plan, SkipMigration) + df.collect() + + val blockManager = spark.sparkContext.env.blockManager + assert(blockManager.migratableResolver.getStoredShuffles().isEmpty) + assert(blockManager.diskBlockManager.getAllBlocks().nonEmpty) + cleanupShuffles() + } + + test("SPARK-47764: Cleanup shuffle dependencies - RemoveShuffleFiles mode") { + val plan = spark.range(100).repartition(10).logicalPlan + val df = Dataset.ofRows(spark, plan, RemoveShuffleFiles) + df.collect() + + val blockManager = spark.sparkContext.env.blockManager + assert(blockManager.migratableResolver.getStoredShuffles().isEmpty) + assert(blockManager.diskBlockManager.getAllBlocks().isEmpty) + cleanupShuffles() + } + test("SPARK-35378: Return UnsafeRow in CommandResultExecCheck execute methods") { val plan = spark.sql("SHOW FUNCTIONS").queryExecution.executedPlan assert(plan.isInstanceOf[CommandResultExec]) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala index 48860f381efa8..b8a109919f8f6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala @@ -29,12 +29,13 @@ import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.launcher.SparkLauncher import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart} import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart import org.apache.spark.sql.types._ import org.apache.spark.util.ThreadUtils import org.apache.spark.util.Utils.REDACTION_REPLACEMENT_TEXT -class SQLExecutionSuite extends SparkFunSuite { +class SQLExecutionSuite extends SparkFunSuite with SQLConfHelper { test("concurrent query execution (SPARK-10548)") { val conf = new SparkConf() @@ -194,9 +195,9 @@ class SQLExecutionSuite extends SparkFunSuite { start.physicalPlanDescription.toLowerCase(Locale.ROOT).contains("project") }) spark.sql("SELECT 1").collect() - spark.sql("SET k2 = v2") - spark.sql("SET redaction.password = 123") - spark.sql("SELECT 1").collect() + withSQLConf("k2" -> "v2", "redaction.password" -> "123") { + spark.sql("SELECT 1").collect() + } spark.sparkContext.listenerBus.waitUntilEmpty() assert(index.get() == 2) } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 1888844b9b986..f54a4f4606061 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -91,7 +91,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { "objName" -> s"`$SESSION_CATALOG_NAME`.`default`.`jtv1`", "tempObj" -> "VIEW", "tempObjName" -> "`temp_jtv1`")) - val globalTempDB = spark.sharedState.globalTempViewManager.database + val globalTempDB = spark.sharedState.globalTempDB sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") checkError( exception = intercept[AnalysisException] { @@ -899,46 +899,48 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { test("resolve a view when the dataTypes of referenced table columns changed") { withTable("tab1") { - spark.range(1, 10).selectExpr("id", "id + 1 id1").write.saveAsTable("tab1") - withView("testView") { - sql("CREATE VIEW testView AS SELECT * FROM tab1") - - // Allow casting from IntegerType to LongType - val df = (1 until 10).map(i => (i, i + 1)).toDF("id", "id1") - df.write.format("json").mode(SaveMode.Overwrite).saveAsTable("tab1") - checkAnswer(sql("SELECT * FROM testView ORDER BY id1"), (1 to 9).map(i => Row(i, i + 1))) - - // Casting from DoubleType to LongType might truncate, throw an AnalysisException. 
- val df2 = (1 until 10).map(i => (i.toDouble, i.toDouble)).toDF("id", "id1") - df2.write.format("json").mode(SaveMode.Overwrite).saveAsTable("tab1") - checkError( - exception = intercept[AnalysisException](sql("SELECT * FROM testView")), - errorClass = "CANNOT_UP_CAST_DATATYPE", - parameters = Map( - "expression" -> s"$SESSION_CATALOG_NAME.default.tab1.id", - "sourceType" -> "\"DOUBLE\"", - "targetType" -> "\"BIGINT\"", - "details" -> ("The type path of the target object is:\n\n" + - "You can either add an explicit cast to the input data or " + - "choose a higher precision type of the field in the target object") + withSQLConf("spark.sql.legacy.viewSchemaCompensation" -> "false") { + spark.range(1, 10).selectExpr("id", "id + 1 id1").write.saveAsTable("tab1") + withView("testView") { + sql("CREATE VIEW testView AS SELECT * FROM tab1") + + // Allow casting from IntegerType to LongType + val df = (1 until 10).map(i => (i, i + 1)).toDF("id", "id1") + df.write.format("json").mode(SaveMode.Overwrite).saveAsTable("tab1") + checkAnswer(sql("SELECT * FROM testView ORDER BY id1"), (1 to 9).map(i => Row(i, i + 1))) + + // Casting from DoubleType to LongType might truncate, throw an AnalysisException. + val df2 = (1 until 10).map(i => (i.toDouble, i.toDouble)).toDF("id", "id1") + df2.write.format("json").mode(SaveMode.Overwrite).saveAsTable("tab1") + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM testView")), + errorClass = "CANNOT_UP_CAST_DATATYPE", + parameters = Map( + "expression" -> s"$SESSION_CATALOG_NAME.default.tab1.id", + "sourceType" -> "\"DOUBLE\"", + "targetType" -> "\"BIGINT\"", + "details" -> ("The type path of the target object is:\n\n" + + "You can either add an explicit cast to the input data or " + + "choose a higher precision type of the field in the target object") + ) ) - ) - // Can't cast from ArrayType to LongType, throw an AnalysisException. - val df3 = (1 until 10).map(i => (i, Seq(i))).toDF("id", "id1") - df3.write.format("json").mode(SaveMode.Overwrite).saveAsTable("tab1") - checkError( - exception = intercept[AnalysisException](sql("SELECT * FROM testView")), - errorClass = "CANNOT_UP_CAST_DATATYPE", - parameters = Map( - "expression" -> s"$SESSION_CATALOG_NAME.default.tab1.id1", - "sourceType" -> "\"ARRAY\"", - "targetType" -> "\"BIGINT\"", - "details" -> ("The type path of the target object is:\n\n" + - "You can either add an explicit cast to the input data or " + - "choose a higher precision type of the field in the target object") + // Can't cast from ArrayType to LongType, throw an AnalysisException. 
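+ // (The view still records BIGINT for `id1`, and an array column cannot be up-cast to it.)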
+ val df3 = (1 until 10).map(i => (i, Seq(i))).toDF("id", "id1") + df3.write.format("json").mode(SaveMode.Overwrite).saveAsTable("tab1") + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM testView")), + errorClass = "CANNOT_UP_CAST_DATATYPE", + parameters = Map( + "expression" -> s"$SESSION_CATALOG_NAME.default.tab1.id1", + "sourceType" -> "\"ARRAY\"", + "targetType" -> "\"BIGINT\"", + "details" -> ("The type path of the target object is:\n\n" + + "You can either add an explicit cast to the input data or " + + "choose a higher precision type of the field in the target object") + ) ) - ) + } } } } @@ -1100,7 +1102,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { test("local temp view refers global temp view") { withGlobalTempView("v1") { withTempView("v2") { - val globalTempDB = spark.sharedState.globalTempViewManager.database + val globalTempDB = spark.sharedState.globalTempDB sql("CREATE GLOBAL TEMPORARY VIEW v1 AS SELECT 1") sql(s"CREATE TEMPORARY VIEW v2 AS SELECT * FROM ${globalTempDB}.v1") checkAnswer(sql("SELECT * FROM v2"), Seq(Row(1))) @@ -1111,7 +1113,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { test("global temp view refers local temp view") { withTempView("v1") { withGlobalTempView("v2") { - val globalTempDB = spark.sharedState.globalTempViewManager.database + val globalTempDB = spark.sharedState.globalTempDB sql("CREATE TEMPORARY VIEW v1 AS SELECT 1") sql(s"CREATE GLOBAL TEMPORARY VIEW v2 AS SELECT * FROM v1") checkAnswer(sql(s"SELECT * FROM ${globalTempDB}.v2"), Seq(Row(1))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala index d2740f9eac789..e75413b804f48 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala @@ -536,7 +536,7 @@ class LocalTempViewTestSuite extends TempViewTestSuite with SharedSparkSession { } class GlobalTempViewTestSuite extends TempViewTestSuite with SharedSparkSession { - private def db: String = spark.sharedState.globalTempViewManager.database + private def db: String = spark.sharedState.globalTempDB override protected def viewTypeString: String = "GLOBAL TEMPORARY VIEW" override protected def formattedViewName(viewName: String): String = { s"$db.$viewName" @@ -736,7 +736,8 @@ class PersistedViewTestSuite extends SQLViewTestSuite with SharedSparkSession { Seq(true, false).foreach { serde => withView(viewName) { createView(viewName, "SELECT 1 AS a") - val expected = s"CREATE VIEW ${formattedViewName(viewName)} ( a) AS SELECT 1 AS a" + val expected = s"CREATE VIEW ${formattedViewName(viewName)} ( a) " + + "WITH SCHEMA COMPENSATION AS SELECT 1 AS a" assert(getShowCreateDDL(formattedViewName(viewName), serde) == expected) } } @@ -748,7 +749,7 @@ class PersistedViewTestSuite extends SQLViewTestSuite with SharedSparkSession { withView(viewName) { createView(viewName, "SELECT 1 AS a, 2 AS b", Seq("a", "b COMMENT 'b column'")) val expected = s"CREATE VIEW ${formattedViewName(viewName)}" + - s" ( a, b COMMENT 'b column') AS SELECT 1 AS a, 2 AS b" + s" ( a, b COMMENT 'b column') WITH SCHEMA COMPENSATION AS SELECT 1 AS a, 2 AS b" assert(getShowCreateDDL(formattedViewName(viewName), serde) == expected) } } @@ -764,7 +765,7 @@ class PersistedViewTestSuite extends SQLViewTestSuite with SharedSparkSession { val expected = s"CREATE VIEW 
${formattedViewName(viewName)} ( c1 COMMENT 'bla', c2)" + " COMMENT 'table comment'" + " TBLPROPERTIES ( 'prop1' = 'value1', 'prop2' = 'value2')" + - " AS SELECT 1 AS c1, '2' AS c2" + " WITH SCHEMA COMPENSATION AS SELECT 1 AS c1, '2' AS c2" assert(getShowCreateDDL(formattedViewName(viewName), serde) == expected) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index d949342106159..928d732f2a160 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -130,7 +130,8 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkSession { assert(sorter.numSpills > 0) // Merging spilled files should not throw assertion error - sorter.writePartitionedMapOutput(0, 0, mapOutputWriter) + sorter.writePartitionedMapOutput(0, 0, mapOutputWriter, + taskContext.taskMetrics.shuffleWriteMetrics) } test("SPARK-10403: unsafe row serializer with SortShuffleManager") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 3aaf61ffba465..4d2d465828924 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -785,6 +785,16 @@ class WholeStageCodegenSuite extends QueryTest with SharedSparkSession } test("SPARK-26680: Stream in groupBy does not cause StackOverflowError") { + @scala.annotation.nowarn("cat=deprecation") + val groupByCols = Stream(col("key")) + val df = Seq((1, 2), (2, 3), (1, 3)).toDF("key", "value") + .groupBy(groupByCols: _*) + .max("value") + + checkAnswer(df, Seq(Row(1, 3), Row(2, 3))) + } + + test("SPARK-45685: LazyList in groupBy does not cause StackOverflowError") { val groupByCols = LazyList(col("key")) val df = Seq((1, 2), (2, 3), (1, 3)).toDF("key", "value") .groupBy(groupByCols: _*) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 39f6aa8505b32..93df399731d42 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -25,11 +25,15 @@ import org.scalatest.PrivateMethodTester import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkException +import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart} -import org.apache.spark.sql.{Dataset, QueryTest, Row, SparkSession, Strategy} +import org.apache.spark.shuffle.sort.SortShuffleManager +import org.apache.spark.sql.{DataFrame, Dataset, QueryTest, Row, SparkSession, Strategy} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan} -import org.apache.spark.sql.execution.{CollectLimitExec, ColumnarToRowExec, LocalTableScanExec, PartialReducerPartitionSpec, QueryExecution, ReusedSubqueryExec, ShuffledRowRDD, SortExec, SparkPlan, 
SparkPlanInfo, UnionExec} +import org.apache.spark.sql.execution.{CollectLimitExec, ColumnarToRowExec, EmptyRelationExec, PartialReducerPartitionSpec, QueryExecution, ReusedSubqueryExec, ShuffledRowRDD, SortExec, SparkPlan, SparkPlanInfo, UnaryExecNode, UnionExec} import org.apache.spark.sql.execution.aggregate.BaseAggregateExec import org.apache.spark.sql.execution.columnar.{InMemoryTableScanExec, InMemoryTableScanLike} import org.apache.spark.sql.execution.command.DataWritingCommandExec @@ -61,7 +65,8 @@ class AdaptiveQueryExecSuite setupTestData() - private def runAdaptiveAndVerifyResult(query: String): (SparkPlan, SparkPlan) = { + private def runAdaptiveAndVerifyResult(query: String, + skipCheckAnswer: Boolean = false): (SparkPlan, SparkPlan) = { var finalPlanCnt = 0 var hasMetricsEvent = false val listener = new SparkListener { @@ -85,8 +90,10 @@ class AdaptiveQueryExecSuite assert(planBefore.toString.startsWith("AdaptiveSparkPlan isFinalPlan=false")) val result = dfAdaptive.collect() withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { - val df = sql(query) - checkAnswer(df, result.toImmutableArraySeq) + if (!skipCheckAnswer) { + val df = sql(query) + checkAnswer(df, result.toImmutableArraySeq) + } } val planAfter = dfAdaptive.queryExecution.executedPlan assert(planAfter.toString.startsWith("AdaptiveSparkPlan isFinalPlan=true")) @@ -158,6 +165,12 @@ class AdaptiveQueryExecSuite } } + private def findTopLevelUnion(plan: SparkPlan): Seq[UnionExec] = { + collect(plan) { + case l: UnionExec => l + } + } + private def findReusedExchange(plan: SparkPlan): Seq[ReusedExchangeExec] = { collectWithSubqueries(plan) { case ShuffleQueryStageExec(_, e: ReusedExchangeExec, _) => e @@ -897,6 +910,92 @@ class AdaptiveQueryExecSuite } } + test("SPARK-47148: AQE should avoid to materialize ShuffleQueryStage on the cancellation") { + def createJoinedDF(): DataFrame = { + val df = spark.range(5).toDF("col") + val df2 = spark.range(10).toDF("col").coalesce(2) + val df3 = spark.range(15).toDF("col").filter(Symbol("col") >= 2) + df.join(df2, Seq("col")).join(df3, Seq("col")) + } + + try { + spark.experimental.extraStrategies = TestProblematicCoalesceStrategy :: Nil + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val joinedDF = createJoinedDF() + + val error = intercept[SparkException] { + joinedDF.collect() + } + assert(error.getMessage() contains "ProblematicCoalesce execution is failed") + + val adaptivePlan = joinedDF.queryExecution.executedPlan.asInstanceOf[AdaptiveSparkPlanExec] + + // All QueryStages should be based on ShuffleQueryStageExec + val shuffleQueryStageExecs = collect(adaptivePlan) { + case sqse: ShuffleQueryStageExec => sqse + } + assert(shuffleQueryStageExecs.length == 3, s"Physical Plan should include " + + s"3 ShuffleQueryStages. Physical Plan: $adaptivePlan") + shuffleQueryStageExecs.foreach(sqse => assert(sqse.name.contains("ShuffleQueryStageExec-"))) + // First ShuffleQueryStage is materialized so it needs to be canceled. + assert(shuffleQueryStageExecs(0).shuffle.isMaterializationStarted(), + "Materialization should be started.") + // Second ShuffleQueryStage materialization is failed so + // it is excluded from the cancellation due to earlyFailedStage. 
+ assert(shuffleQueryStageExecs(1).shuffle.isMaterializationStarted(), + "Materialization should be started but it is failed.") + // Last ShuffleQueryStage is not materialized yet so it does not require + // to be canceled and it is just skipped from the cancellation. + assert(!shuffleQueryStageExecs(2).shuffle.isMaterializationStarted(), + "Materialization should not be started.") + } + } finally { + spark.experimental.extraStrategies = Nil + } + } + + test("SPARK-47148: Check if BroadcastQueryStage materialization is started") { + def createJoinedDF(): DataFrame = { + spark.range(10).toDF("col1").createTempView("t1") + spark.range(5).coalesce(2).toDF("col2").createTempView("t2") + spark.range(15).toDF("col3").filter(Symbol("col3") >= 2).createTempView("t3") + sql("SELECT /*+ BROADCAST(t3) */ * FROM (SELECT /*+ BROADCAST(t2) */ * FROM t1 " + + "INNER JOIN t2 ON t1.col1 = t2.col2) t JOIN t3 ON t.col1 = t3.col3;") + } + withTempView("t1", "t2", "t3") { + try { + spark.experimental.extraStrategies = TestProblematicCoalesceStrategy :: Nil + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + val joinedDF = createJoinedDF() + + val error = intercept[SparkException] { + joinedDF.collect() + } + assert(error.getMessage() contains "ProblematicCoalesce execution is failed") + + val adaptivePlan = + joinedDF.queryExecution.executedPlan.asInstanceOf[AdaptiveSparkPlanExec] + + // All QueryStages should be based on BroadcastQueryStageExec + val broadcastQueryStageExecs = collect(adaptivePlan) { + case bqse: BroadcastQueryStageExec => bqse + } + assert(broadcastQueryStageExecs.length == 2, adaptivePlan) + broadcastQueryStageExecs.foreach { bqse => + assert(bqse.name.contains("BroadcastQueryStageExec-")) + // Both BroadcastQueryStages are materialized at the beginning. 
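// A self-contained sketch of the test technique used in the two SPARK-47148 tests here:
// make execution fail on purpose, then inspect the AdaptiveSparkPlanExec that was built
// anyway. Illustrative only; the failing UDF and query below are arbitrary stand-ins, not
// the patch's TestProblematicCoalesceStrategy.
import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.apache.spark.sql.functions.udf

object FailedExecutionPlanSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("failed-plan-sketch")
      .config("spark.sql.adaptive.enabled", "true")
      .getOrCreate()
    import spark.implicits._

    // A UDF that always fails, so the final stage of the query cannot complete.
    val boom = udf((k: Long) => if (k >= 0) throw new RuntimeException("boom") else k)
    val df = spark.range(10).groupBy(($"id" % 2).as("k")).count().select(boom($"k"))

    try df.collect() catch { case _: SparkException => () } // the failure is expected

    // The executed plan is still available after the failure; with AQE enabled it is an
    // AdaptiveSparkPlanExec whose query stages can be examined, which is what the
    // surrounding assertions do via isMaterializationStarted().
    df.queryExecution.executedPlan match {
      case adaptive: AdaptiveSparkPlanExec => println(adaptive.treeString)
      case other => println(other.treeString)
    }
    spark.stop()
  }
}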
+ assert(bqse.broadcast.isMaterializationStarted(), + s"${bqse.name}' s materialization should be started.") + } + } + } finally { + spark.experimental.extraStrategies = Nil + } + } + } + test("SPARK-30403: AQE should handle InSubquery") { withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", @@ -1551,13 +1650,13 @@ class AdaptiveQueryExecSuite val (plan1, adaptivePlan1) = runAdaptiveAndVerifyResult( "SELECT key FROM testData WHERE key = 0 ORDER BY key, value") assert(findTopLevelSort(plan1).size == 1) - assert(stripAQEPlan(adaptivePlan1).isInstanceOf[LocalTableScanExec]) + assert(stripAQEPlan(adaptivePlan1).isInstanceOf[EmptyRelationExec]) val (plan2, adaptivePlan2) = runAdaptiveAndVerifyResult( "SELECT key FROM (SELECT * FROM testData WHERE value = 'no_match' ORDER BY key)" + " WHERE key > rand()") assert(findTopLevelSort(plan2).size == 1) - assert(stripAQEPlan(adaptivePlan2).isInstanceOf[LocalTableScanExec]) + assert(stripAQEPlan(adaptivePlan2).isInstanceOf[EmptyRelationExec]) } } @@ -1565,18 +1664,18 @@ class AdaptiveQueryExecSuite withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { val (plan1, adaptivePlan1) = runAdaptiveAndVerifyResult( "SELECT key, count(*) FROM testData WHERE value = 'no_match' GROUP BY key") - assert(!plan1.isInstanceOf[LocalTableScanExec]) - assert(stripAQEPlan(adaptivePlan1).isInstanceOf[LocalTableScanExec]) + assert(!plan1.isInstanceOf[EmptyRelationExec]) + assert(stripAQEPlan(adaptivePlan1).isInstanceOf[EmptyRelationExec]) val (plan2, adaptivePlan2) = runAdaptiveAndVerifyResult( "SELECT key, count(*) FROM testData WHERE value = 'no_match' GROUP BY key limit 1") - assert(!plan2.isInstanceOf[LocalTableScanExec]) - assert(stripAQEPlan(adaptivePlan2).isInstanceOf[LocalTableScanExec]) + assert(!plan2.isInstanceOf[EmptyRelationExec]) + assert(stripAQEPlan(adaptivePlan2).isInstanceOf[EmptyRelationExec]) val (plan3, adaptivePlan3) = runAdaptiveAndVerifyResult( "SELECT count(*) FROM testData WHERE value = 'no_match'") - assert(!plan3.isInstanceOf[LocalTableScanExec]) - assert(!stripAQEPlan(adaptivePlan3).isInstanceOf[LocalTableScanExec]) + assert(!plan3.isInstanceOf[EmptyRelationExec]) + assert(!stripAQEPlan(adaptivePlan3).isInstanceOf[EmptyRelationExec]) } } @@ -1597,7 +1696,7 @@ class AdaptiveQueryExecSuite |""".stripMargin) checkNumUnion(plan1, 1) checkNumUnion(adaptivePlan1, 0) - assert(!stripAQEPlan(adaptivePlan1).isInstanceOf[LocalTableScanExec]) + assert(!stripAQEPlan(adaptivePlan1).isInstanceOf[EmptyRelationExec]) val (plan2, adaptivePlan2) = runAdaptiveAndVerifyResult( """ @@ -1607,7 +1706,7 @@ class AdaptiveQueryExecSuite |""".stripMargin) checkNumUnion(plan2, 1) checkNumUnion(adaptivePlan2, 0) - assert(stripAQEPlan(adaptivePlan2).isInstanceOf[LocalTableScanExec]) + assert(stripAQEPlan(adaptivePlan2).isInstanceOf[EmptyRelationExec]) } } @@ -1876,8 +1975,8 @@ class AdaptiveQueryExecSuite .map(_.getMessage.getFormattedMessage) .filter(_.startsWith("Materialize query stage")) .toArray - assert(materializeLogs(0).startsWith("Materialize query stage BroadcastQueryStageExec")) - assert(materializeLogs(1).startsWith("Materialize query stage ShuffleQueryStageExec")) + assert(materializeLogs(0).startsWith("Materialize query stage: BroadcastQueryStageExec-1")) + assert(materializeLogs(1).startsWith("Materialize query stage: ShuffleQueryStageExec-0")) } test("SPARK-34899: Use origin plan if we can not coalesce shuffle partition") { @@ -2410,6 +2509,28 @@ class AdaptiveQueryExecSuite } } + test("SPARK-48037: Fix SortShuffleWriter lacks shuffle write 
related metrics " + + "resulting in potentially inaccurate data") { + withTable("t3") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.SHUFFLE_PARTITIONS.key -> (SortShuffleManager + .MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE + 1).toString) { + sql("CREATE TABLE t3 USING PARQUET AS SELECT id FROM range(2)") + val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( + """ + |SELECT id, count(*) + |FROM t3 + |GROUP BY id + |LIMIT 1 + |""".stripMargin, skipCheckAnswer = true) + // The shuffle stage produces two rows and the limit operator should not been optimized out. + assert(findTopLevelLimit(plan).size == 1) + assert(findTopLevelLimit(adaptivePlan).size == 1) + } + } + } + test("SPARK-37063: OptimizeSkewInRebalancePartitions support optimize non-root node") { withTempView("v") { withSQLConf( @@ -2680,6 +2801,35 @@ class AdaptiveQueryExecSuite } } + test("SPARK-48155: AQEPropagateEmptyRelation check remained child for join") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + // Before SPARK-48155, since the AQE will call ValidateSparkPlan, + // all AQE optimize rule won't work and return the origin plan. + // After SPARK-48155, Spark avoid invalid propagate of empty relation. + // Then the UNION first child empty relation can be propagate correctly + // and the JOIN won't be propagated since will generated a invalid plan. + val (_, adaptivePlan) = runAdaptiveAndVerifyResult( + """ + |SELECT /*+ BROADCAST(t3) */ t3.b, count(t3.a) FROM testData2 t1 + |INNER JOIN ( + | SELECT * FROM testData2 + | WHERE b = 0 + | UNION ALL + | SELECT * FROM testData2 + | WHErE b != 0 + |) t2 + |ON t1.b = t2.b AND t1.a = 0 + |RIGHT OUTER JOIN testData2 t3 + |ON t1.a > t3.a + |GROUP BY t3.b + """.stripMargin + ) + assert(findTopLevelBroadcastNestedLoopJoin(adaptivePlan).size == 1) + assert(findTopLevelUnion(adaptivePlan).size == 0) + } + } + test("SPARK-39915: Dataset.repartition(N) may not create N partitions") { withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "6") { // partitioning: HashPartitioning @@ -2703,7 +2853,7 @@ class AdaptiveQueryExecSuite // shuffleOrigin: REPARTITION_BY_NUM assert(spark.range(0).repartition(5).rdd.getNumPartitions == 5) // shuffleOrigin: REBALANCE_PARTITIONS_BY_NONE - assert(spark.range(0).repartition().rdd.getNumPartitions == 0) + assert(spark.range(0).repartition().rdd.getNumPartitions == 1) // through project assert(spark.range(0).selectExpr("id % 3 as c1", "id % 7 as c2") .repartition(5).select($"c2").rdd.getNumPartitions == 5) @@ -2900,3 +3050,26 @@ private case class SimpleShuffleSortCostEvaluator() extends CostEvaluator { SimpleCost(cost) } } + +/** + * Helps to simulate ExchangeQueryStageExec materialization failure. 
+ */ +private object TestProblematicCoalesceStrategy extends Strategy { + private case class TestProblematicCoalesceExec(numPartitions: Int, child: SparkPlan) + extends UnaryExecNode { + override protected def doExecute(): RDD[InternalRow] = + throw new SparkException("ProblematicCoalesce execution is failed") + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): TestProblematicCoalesceExec = + copy(child = newChild) + } + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = { + plan match { + case org.apache.spark.sql.catalyst.plans.logical.Repartition( + numPartitions, false, child) => + TestProblematicCoalesceExec(numPartitions, planLater(child)) :: Nil + case _ => Nil + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveRuleContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveRuleContextSuite.scala new file mode 100644 index 0000000000000..04c9e6c946b45 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveRuleContextSuite.scala @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{SparkSession, SparkSessionExtensionsProvider} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.{ColumnarRule, RangeExec, SparkPlan, SparkStrategy} +import org.apache.spark.sql.execution.aggregate.HashAggregateExec +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec + +class AdaptiveRuleContextSuite extends SparkFunSuite with AdaptiveSparkPlanHelper { + + private def stop(spark: SparkSession): Unit = { + spark.stop() + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } + + private def withSession( + builders: Seq[SparkSessionExtensionsProvider])(f: SparkSession => Unit): Unit = { + val builder = SparkSession.builder().master("local[1]") + builders.foreach(builder.withExtensions) + val spark = builder.getOrCreate() + try f(spark) finally { + stop(spark) + } + } + + test("test adaptive rule context") { + withSession( + Seq(_.injectRuntimeOptimizerRule(_ => MyRuleContextForRuntimeOptimization), + _.injectPlannerStrategy(_ => MyRuleContextForPlannerStrategy), + _.injectQueryPostPlannerStrategyRule(_ => MyRuleContextForPostPlannerStrategyRule), + _.injectQueryStagePrepRule(_ => MyRuleContextForPreQueryStageRule), + _.injectQueryStageOptimizerRule(_ => MyRuleContextForQueryStageRule), + _.injectColumnar(_ => MyRuleContextForColumnarRule))) { spark => + val df = spark.range(1, 10, 1, 3).selectExpr("id % 3 as c").groupBy("c").count() + df.collect() + assert(collectFirst(df.queryExecution.executedPlan) { + case s: ShuffleExchangeExec if s.numPartitions == 2 => s + }.isDefined) + } + } + + test("test adaptive rule context with subquery") { + withSession( + Seq(_.injectQueryStagePrepRule(_ => MyRuleContextForQueryStageWithSubquery))) { spark => + spark.sql("select (select count(*) from range(10)), id from range(10)").collect() + } + } +} + +object MyRuleContext { + def checkAndGetRuleContext(): AdaptiveRuleContext = { + val ruleContextOpt = AdaptiveRuleContext.get() + assert(ruleContextOpt.isDefined) + ruleContextOpt.get + } + + def checkRuleContextForQueryStage(plan: SparkPlan): SparkPlan = { + val ruleContext = checkAndGetRuleContext() + assert(!ruleContext.isSubquery) + val stage = plan.find(_.isInstanceOf[ShuffleQueryStageExec]) + if (stage.isDefined && stage.get.asInstanceOf[ShuffleQueryStageExec].isMaterialized) { + assert(ruleContext.isFinalStage) + assert(!ruleContext.configs().get("spark.sql.shuffle.partitions").contains("2")) + } else { + assert(!ruleContext.isFinalStage) + assert(ruleContext.configs().get("spark.sql.shuffle.partitions").contains("2")) + } + plan + } +} + +object MyRuleContextForRuntimeOptimization extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + MyRuleContext.checkAndGetRuleContext() + plan + } +} + +object MyRuleContextForPlannerStrategy extends SparkStrategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = { + plan match { + case _: LogicalQueryStage => + val ruleContext = MyRuleContext.checkAndGetRuleContext() + assert(!ruleContext.configs().get("spark.sql.shuffle.partitions").contains("2")) + Nil + case _ => Nil + } + } +} + +object MyRuleContextForPostPlannerStrategyRule extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + val ruleContext = MyRuleContext.checkAndGetRuleContext() + if 
(plan.find(_.isInstanceOf[RangeExec]).isDefined) { + ruleContext.setConfig("spark.sql.shuffle.partitions", "2") + } + plan + } +} + +object MyRuleContextForPreQueryStageRule extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + val ruleContext = MyRuleContext.checkAndGetRuleContext() + assert(!ruleContext.isFinalStage) + plan + } +} + +object MyRuleContextForQueryStageRule extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + MyRuleContext.checkRuleContextForQueryStage(plan) + } +} + +object MyRuleContextForColumnarRule extends ColumnarRule { + override def preColumnarTransitions: Rule[SparkPlan] = { + plan: SparkPlan => { + if (plan.isInstanceOf[AdaptiveSparkPlanExec]) { + // skip if we are not inside AQE + assert(AdaptiveRuleContext.get().isEmpty) + plan + } else { + MyRuleContext.checkRuleContextForQueryStage(plan) + } + } + } + + override def postColumnarTransitions: Rule[SparkPlan] = { + plan: SparkPlan => { + if (plan.isInstanceOf[AdaptiveSparkPlanExec]) { + // skip if we are not inside AQE + assert(AdaptiveRuleContext.get().isEmpty) + plan + } else { + MyRuleContext.checkRuleContextForQueryStage(plan) + } + } + } +} + +object MyRuleContextForQueryStageWithSubquery extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + val ruleContext = MyRuleContext.checkAndGetRuleContext() + if (plan.exists(_.isInstanceOf[HashAggregateExec])) { + assert(ruleContext.isSubquery) + if (plan.exists(_.isInstanceOf[RangeExec])) { + assert(!ruleContext.isFinalStage) + } else { + assert(ruleContext.isFinalStage) + } + } else { + assert(!ruleContext.isSubquery) + } + plan + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala index 7a93c7c495e26..86e9320ae9cde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala @@ -19,12 +19,12 @@ package org.apache.spark.sql.execution.benchmark import scala.concurrent.duration._ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} -import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.catalyst.util.{CollationFactory, CollationSupport} import org.apache.spark.unsafe.types.UTF8String abstract class CollationBenchmarkBase extends BenchmarkBase { protected val collationTypes: Seq[String] = - Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI") + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI") def generateSeqInput(n: Long): Seq[UTF8String] @@ -36,18 +36,19 @@ abstract class CollationBenchmarkBase extends BenchmarkBase { utf8Strings.size * 10, warmupTime = 10.seconds, output = output) - collationTypes.foreach(collationType => { + collationTypes.foreach { collationType => { val collation = CollationFactory.fetchCollation(collationType) benchmark.addCase(s"$collationType") { _ => - sublistStrings.foreach(s1 => - utf8Strings.foreach(s => - (0 to 10).foreach(_ => - collation.equalsFunction(s, s1).booleanValue()) - ) - ) + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 3).foreach { _ => + collation.equalsFunction(s, s1).booleanValue() + } + } + } } } - ) + } benchmark.run() } @@ -59,19 +60,19 @@ abstract class CollationBenchmarkBase extends BenchmarkBase { utf8Strings.size * 10, warmupTime = 10.seconds, output = output) - 
collationTypes.foreach(collationType => { + collationTypes.foreach { collationType => { val collation = CollationFactory.fetchCollation(collationType) benchmark.addCase(s"$collationType") { _ => - sublistStrings.foreach(s1 => - utf8Strings.foreach(s => - (0 to 10).foreach(_ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 3).foreach { _ => collation.comparator.compare(s, s1) - ) - ) - ) + } + } + } } } - ) + } benchmark.run() } @@ -85,19 +86,103 @@ abstract class CollationBenchmarkBase extends BenchmarkBase { utf8Strings.size * 10, warmupTime = 10.seconds, output = output) - collationTypes.foreach(collationType => { + collationTypes.foreach { collationType => { val collation = CollationFactory.fetchCollation(collationType) benchmark.addCase(s"$collationType") { _ => - sublistStrings.foreach(_ => - utf8Strings.foreach(s => - (0 to 10).foreach(_ => + sublistStrings.foreach { _ => + utf8Strings.foreach { s => + (0 to 3).foreach { _ => collation.hashFunction.applyAsLong(s) - ) - ) - ) + } + } + } + } + } + } + benchmark.run() + } + + def benchmarkContains( + collationTypes: Seq[String], + utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings + + val benchmark = new Benchmark( + "collation unit benchmarks - contains", + utf8Strings.size * 10, + warmupTime = 10.seconds, + output = output) + collationTypes.foreach { collationType => { + val collation = CollationFactory.fetchCollation(collationType) + benchmark.addCase(s"$collationType") { _ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 3).foreach { _ => + CollationSupport.Contains.exec( + s, s1, CollationFactory.collationNameToId(collation.collationName) + ) + } + } + } } } - ) + } + benchmark.run() + } + + def benchmarkStartsWith( + collationTypes: Seq[String], + utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings + + val benchmark = new Benchmark( + "collation unit benchmarks - startsWith", + utf8Strings.size * 10, + warmupTime = 10.seconds, + output = output) + collationTypes.foreach { collationType => { + val collation = CollationFactory.fetchCollation(collationType) + benchmark.addCase(s"$collationType") { _ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 3).foreach { _ => + CollationSupport.StartsWith.exec( + s, s1, CollationFactory.collationNameToId(collation.collationName) + ) + } + } + } + } + } + } + benchmark.run() + } + + def benchmarkEndsWith( + collationTypes: Seq[String], + utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings + + val benchmark = new Benchmark( + "collation unit benchmarks - endsWith", + utf8Strings.size * 10, + warmupTime = 10.seconds, + output = output) + collationTypes.foreach { collationType => { + val collation = CollationFactory.fetchCollation(collationType) + benchmark.addCase(s"$collationType") { _ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 3).foreach { _ => + CollationSupport.EndsWith.exec( + s, s1, CollationFactory.collationNameToId(collation.collationName) + ) + } + } + } + } + } + } benchmark.run() } } @@ -127,9 +212,13 @@ object CollationBenchmark extends CollationBenchmarkBase { } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { - benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L)) - benchmarkUTFStringCompare(collationTypes, generateSeqInput(10000L)) - benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L)) + val inputs = generateSeqInput(10000L) + 
benchmarkUTFStringEquals(collationTypes, inputs) + benchmarkUTFStringCompare(collationTypes, inputs) + benchmarkUTFStringHashFunction(collationTypes, inputs) + benchmarkContains(collationTypes, inputs) + benchmarkStartsWith(collationTypes, inputs) + benchmarkEndsWith(collationTypes, inputs) } } @@ -152,8 +241,12 @@ object CollationNonASCIIBenchmark extends CollationBenchmarkBase { } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { - benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L)) - benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L)) - benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L)) + val inputs = generateSeqInput(4000L) + benchmarkUTFStringEquals(collationTypes, inputs) + benchmarkUTFStringCompare(collationTypes, inputs) + benchmarkUTFStringHashFunction(collationTypes, inputs) + benchmarkContains(collationTypes, inputs) + benchmarkStartsWith(collationTypes, inputs) + benchmarkEndsWith(collationTypes, inputs) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index ff01221d0d7c7..94e88a96f37e0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -127,6 +127,13 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { private def withParquetVersions(f: String => Unit): Unit = Seq("V1", "V2").foreach(f) + private def getExpr(dataType: DataType = IntegerType): String = dataType match { + case BooleanType => "CASE WHEN value % 2 = 0 THEN true ELSE false END" + case ByteType => "cast(value % 128 as byte)" + case ShortType => "cast(value % 32768 as short)" + case _ => s"cast(value % ${Int.MaxValue} as ${dataType.sql})" + } + def numericScanBenchmark(values: Int, dataType: DataType): Unit = { // Benchmarks running through spark sql. 
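// A minimal sketch of the idea behind the getExpr helper introduced above: derive each
// test column with a modulus so the generated values always fit the target type, and the
// cast can no longer overflow when ANSI mode is enabled. Illustrative only; the view name
// and column aliases are arbitrary.
import org.apache.spark.sql.SparkSession

object BoundedCastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("bounded-cast-sketch").getOrCreate()
    spark.conf.set("spark.sql.ansi.enabled", "true")

    spark.range(0, 1000)
      .selectExpr(
        "cast(id % 128 as byte) as b",     // stays within Byte range
        "cast(id % 32768 as short) as s",  // stays within Short range
        "cast(id % 2147483647 as int) as i")
      .createOrReplaceTempView("t1")

    // Safe to aggregate even with ANSI enabled, because no cast can overflow.
    spark.sql("SELECT sum(b), sum(s), sum(i) FROM t1").show()
    spark.stop()
  }
}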
val sqlBenchmark = new Benchmark( @@ -143,12 +150,14 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { withTempPath { dir => withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ - spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") + spark.range(values).map(_ => Random.nextLong()) + .selectExpr(getExpr(dataType) + " as id") + .createOrReplaceTempView("t1") - prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + prepareTable(dir, spark.sql(s"SELECT id FROM t1")) val query = dataType match { - case BooleanType => "sum(cast(id as bigint))" + case BooleanType => "sum(if(cast(id as boolean), 1, 0))" case _ => "sum(id)" } @@ -291,7 +300,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") prepareTable(dir, - spark.sql(s"SELECT named_struct('f', CAST(value as ${dataType.sql})) as col FROM t1"), + spark.sql(s"SELECT named_struct('f', ${getExpr(dataType)}) as col FROM t1"), onlyParquetOrc = true) sqlBenchmark.addCase(s"SQL ORC MR") { _ => @@ -416,7 +425,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { prepareTable( dir, - spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) + spark.sql(s"SELECT ${getExpr()} c1, CAST(value as STRING) AS c2 FROM t1")) benchmark.addCase("SQL CSV") { _ => spark.sql("select sum(c1), sum(length(c2)) from csvTable").noop() @@ -512,7 +521,8 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { import spark.implicits._ spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") - prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) + prepareTable(dir, + spark.sql(s"SELECT value % 2 AS p, ${getExpr()} id FROM t1"), Some("p")) benchmark.addCase("Data column - CSV") { _ => spark.sql("select sum(id) from csvTable").noop() @@ -710,7 +720,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ val middle = width / 2 - val selectExpr = (1 to width).map(i => s"value as c$i") + val selectExpr = (1 to width).map(i => s"${getExpr()} as c$i") spark.range(values).map(_ => Random.nextLong()).toDF() .selectExpr(selectExpr: _*).createOrReplaceTempView("t1") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala index 6359f1b5f4f47..e5cb3ef8a04c7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala @@ -47,6 +47,12 @@ object DateTimeBenchmark extends SqlBasedBenchmark { .noop() } + private def doBenchmarkAnsiOff(cardinality: Int, exprs: String*): Unit = { + withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") { + doBenchmark(cardinality, exprs: _*) + } + } + private def run(cardinality: Int, name: String, exprs: String*): Unit = { codegenBenchmark(name, cardinality) { doBenchmark(cardinality, exprs: _*) @@ -75,7 +81,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { doBenchmark(N, s"$dt + interval 1 month 2 day") } benchmark.addCase("date + interval(m, d, ms)") { _ => - doBenchmark(N, s"$dt + interval 1 month 2 day 5 hour") + doBenchmarkAnsiOff(N, s"$dt + interval 
1 month 2 day 5 hour") } benchmark.addCase("date - interval(m)") { _ => doBenchmark(N, s"$dt - interval 1 month") @@ -84,7 +90,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { doBenchmark(N, s"$dt - interval 1 month 2 day") } benchmark.addCase("date - interval(m, d, ms)") { _ => - doBenchmark(N, s"$dt - interval 1 month 2 day 5 hour") + doBenchmarkAnsiOff(N, s"$dt - interval 1 month 2 day 5 hour") } benchmark.addCase("timestamp + interval(m)") { _ => doBenchmark(N, s"$ts + interval 1 month") @@ -161,7 +167,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { } val dateExpr = "cast(timestamp_seconds(id) as date)" Seq("year", "yyyy", "yy", "mon", "month", "mm").foreach { level => - run(N, s"trunc $level", s"trunc('$level', $dateExpr)") + run(N, s"trunc $level", s"trunc($dateExpr, '$level')") } } runBenchmark("Parsing") { @@ -171,7 +177,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { run(n, "to timestamp str", timestampStrExpr) run(n, "to_timestamp", s"to_timestamp($timestampStrExpr, $pattern)") run(n, "to_unix_timestamp", s"to_unix_timestamp($timestampStrExpr, $pattern)") - val dateStrExpr = "concat('2019-01-', lpad(mod(id, 25), 2, '0'))" + val dateStrExpr = "concat('2019-01-', lpad(mod(id, 25) + 1, 2, '0'))" run(n, "to date str", dateStrExpr) run(n, "to_date", s"to_date($dateStrExpr, 'yyyy-MM-dd')") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala new file mode 100644 index 0000000000000..76ebd7f41677b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark for encode + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "sql/Test/runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain " + * Results will be written to "benchmarks/EncodeBenchmark-results.txt". 
+ * }}} + */ +object EncodeBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + private val N = 10L * 1000 * 1000 + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + withTempPath { path => + // scalastyle:off nonascii + val exprs = Seq( + "", + "Spark", + "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。", + "το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως", + "세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark", + "Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。") + // scalastyle:off nonascii + + spark.range(N).map { i => + val idx = (i % 6).toInt + val str = exprs(idx) + (str, str * 3, str * 5, str * 9, "") + }.write.parquet(path.getCanonicalPath) + + val benchmark = new Benchmark("encode", N, output = output) + def addBenchmarkCase(charset: String): Unit = { + benchmark.addCase(charset) { _ => + spark.read.parquet(path.getCanonicalPath).selectExpr( + s"encode(_1, '$charset')", + s"encode(_2, '$charset')", + s"encode(_3, '$charset')", + s"encode(_4, '$charset')", + s"encode(_5, '$charset')").noop() + } + } + addBenchmarkCase("UTF-32") + addBenchmarkCase("UTF-16") + addBenchmarkCase("UTF-8") + benchmark.run() + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala index faaea51c0028d..a3d4cd12a1962 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala @@ -46,14 +46,14 @@ object InExpressionBenchmark extends SqlBasedBenchmark { private def runByteBenchmark(numItems: Int, numRows: Long, minNumIters: Int): Unit = { val name = s"$numItems bytes" val values = (Byte.MinValue until Byte.MinValue + numItems).map(v => s"${v}Y") - val df = spark.range(0, numRows).select($"id".cast(ByteType)) + val df = spark.range(0, numRows).selectExpr("mod(id, 256) as id") runBenchmark(name, df, values, numRows, minNumIters) } private def runShortBenchmark(numItems: Int, numRows: Long, minNumIters: Int): Unit = { val name = s"$numItems shorts" val values = (1 to numItems).map(v => s"${v}S") - val df = spark.range(0, numRows).select($"id".cast(ShortType)) + val df = spark.range(0, numRows).selectExpr("mod(id, 32768) as id") runBenchmark(name, df, values, numRows, minNumIters) } @@ -64,14 +64,14 @@ object InExpressionBenchmark extends SqlBasedBenchmark { require(isLookupSwitch(rangeSize, numItems)) val name = s"$numItems shorts (non-compact)" val values = (Short.MinValue until maxValue by step).map(v => s"${v}S") - val df = spark.range(0, numRows).select($"id".cast(ShortType)) + val df = spark.range(0, numRows).selectExpr("mod(id, 32768) as id") runBenchmark(name, df, values, numRows, minNumIters) } private def runIntBenchmark(numItems: Int, numRows: Long, minNumIters: Int): Unit = { val name = s"$numItems ints" val values = 1 to numItems - val df = spark.range(0, numRows).select($"id".cast(IntegerType)) + val df = spark.range(0, numRows).selectExpr("mod(id, 2147483648) as id") runBenchmark(name, df, values, numRows, minNumIters) } @@ -82,7 +82,7 @@ object InExpressionBenchmark extends SqlBasedBenchmark { require(isLookupSwitch(rangeSize, numItems)) val name = s"$numItems ints (non-compact)" val values = Int.MinValue until maxValue.toInt by step.toInt - val df = spark.range(0, numRows).select($"id".cast(IntegerType)) + val df = spark.range(0, numRows).selectExpr("mod(id, 2147483648) as id") 
runBenchmark(name, df, values, numRows, minNumIters) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MetadataStructBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MetadataStructBenchmark.scala index 38fff24abe506..1ed3292a69fa8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MetadataStructBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MetadataStructBenchmark.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.datasources.FileFormat -import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.functions.{concat, lit} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -39,7 +39,7 @@ object MetadataStructBenchmark extends SqlBasedBenchmark { spark.range(0, NUM_ROWS, 1, 1).toDF("id") .withColumn("num1", $"id" + 10) .withColumn("num2", $"id" / 10) - .withColumn("str", lit("a sample string ") + $"id".cast("string")) + .withColumn("str", concat(lit("a sample string "), $"id".cast("string"))) .write.format(format).save(dir.getAbsolutePath) val df = spark.read.format(format).load(dir.getAbsolutePath) f(df) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index c76ad0434d4f6..d70e25bb026e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -58,6 +58,7 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark with Logging { .set("spark.sql.crossJoin.enabled", "true") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.kryo.registrationRequired", "true") + .set("spark.sql.ansi.enabled", "false") SparkSession.builder().config(conf).getOrCreate() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala index f39057013e64b..bdb118b91fa28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.types.PhysicalDataType +import org.apache.spark.sql.types.StringType class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0)) @@ -28,9 +29,9 @@ class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[LongColumnStats], LONG, Array(Long.MaxValue, Long.MinValue, 0)) testColumnStats(classOf[FloatColumnStats], FLOAT, Array(Float.MaxValue, Float.MinValue, 0)) testColumnStats(classOf[DoubleColumnStats], DOUBLE, Array(Double.MaxValue, Double.MinValue, 0)) - testColumnStats(classOf[StringColumnStats], STRING, Array(null, null, 0)) testDecimalColumnStats(Array(null, null, 0)) testIntervalColumnStats(Array(null, null, 0)) + testStringColumnStats(Array(null, null, 0)) def testColumnStats[T <: PhysicalDataType, U <: ColumnStats]( columnStatsClass: Class[U], @@ 
-141,4 +142,60 @@ class ColumnStatsSuite extends SparkFunSuite { } } } + + def testStringColumnStats[T <: PhysicalDataType, U <: ColumnStats]( + initialStatistics: Array[Any]): Unit = { + + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach(collation => { + val columnType = STRING(StringType(collation)) + + test(s"STRING($collation): empty") { + val columnStats = new StringColumnStats(StringType(collation).collationId) + columnStats.collectedStatistics.zip(initialStatistics).foreach { + case (actual, expected) => assert(actual === expected) + } + } + + test(s"STRING($collation): non-empty") { + import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ + + val columnStats = new StringColumnStats(StringType(collation).collationId) + val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1)) + rows.foreach(columnStats.gatherStats(_, 0)) + + val values = rows.take(10).map(_.get(0, + ColumnarDataTypeUtils.toLogicalDataType(columnType.dataType))) + val ordering = PhysicalDataType.ordering( + ColumnarDataTypeUtils.toLogicalDataType(columnType.dataType)) + val stats = columnStats.collectedStatistics + + assertResult(values.min(ordering), "Wrong lower bound")(stats(0)) + assertResult(values.max(ordering), "Wrong upper bound")(stats(1)) + assertResult(10, "Wrong null count")(stats(2)) + assertResult(20, "Wrong row count")(stats(3)) + assertResult(stats(4), "Wrong size in bytes") { + rows.map { row => + if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) + }.sum + } + } + }) + + test("STRING(UTF8_LCASE): collation-defined ordering") { + import org.apache.spark.sql.catalyst.expressions.GenericInternalRow + import org.apache.spark.unsafe.types.UTF8String + + val columnStats = new StringColumnStats(StringType("UTF8_LCASE").collationId) + val rows = Seq("b", "a", "C", "A").map(str => { + val row = new GenericInternalRow(1) + row(0) = UTF8String.fromString(str) + row + }) + rows.foreach(columnStats.gatherStats(_, 0)) + + val stats = columnStats.collectedStatistics + assertResult(UTF8String.fromString("a"), "Wrong lower bound")(stats(0)) + assertResult(UTF8String.fromString("C"), "Wrong upper bound")(stats(1)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala index d79ac8dc35459..a95bda9bf71df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.catalyst.types.{PhysicalArrayType, PhysicalDataType, PhysicalMapType, PhysicalStructType} +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -40,7 +41,9 @@ class ColumnTypeSuite extends SparkFunSuite { val checks = Map( NULL -> 0, BOOLEAN -> 1, BYTE -> 1, SHORT -> 2, INT -> 4, LONG -> 8, FLOAT -> 4, DOUBLE -> 8, COMPACT_DECIMAL(15, 10) -> 8, LARGE_DECIMAL(20, 10) -> 12, - STRING -> 8, BINARY -> 16, STRUCT_TYPE -> 20, ARRAY_TYPE -> 28, MAP_TYPE -> 68, + STRING(StringType) -> 8, STRING(StringType("UTF8_LCASE")) -> 8, + 
STRING(StringType("UNICODE")) -> 8, STRING(StringType("UNICODE_CI")) -> 8, + BINARY -> 16, STRUCT_TYPE -> 20, ARRAY_TYPE -> 28, MAP_TYPE -> 68, CALENDAR_INTERVAL -> 16) checks.foreach { case (columnType, expectedSize) => @@ -73,7 +76,12 @@ class ColumnTypeSuite extends SparkFunSuite { checkActualSize(LONG, Long.MaxValue, 8) checkActualSize(FLOAT, Float.MaxValue, 4) checkActualSize(DOUBLE, Double.MaxValue, 8) - checkActualSize(STRING, "hello", 4 + "hello".getBytes(StandardCharsets.UTF_8).length) + Seq( + "UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI" + ).foreach(collation => { + checkActualSize(STRING(StringType(collation)), + "hello", 4 + "hello".getBytes(StandardCharsets.UTF_8).length) + }) checkActualSize(BINARY, Array.fill[Byte](4)(0.toByte), 4 + 4) checkActualSize(COMPACT_DECIMAL(15, 10), Decimal(0, 15, 10), 8) checkActualSize(LARGE_DECIMAL(20, 10), Decimal(0, 20, 10), 5) @@ -93,7 +101,10 @@ class ColumnTypeSuite extends SparkFunSuite { testNativeColumnType(FLOAT) testNativeColumnType(DOUBLE) testNativeColumnType(COMPACT_DECIMAL(15, 10)) - testNativeColumnType(STRING) + testNativeColumnType(STRING(StringType)) // UTF8_BINARY + testNativeColumnType(STRING(StringType("UTF8_LCASE"))) + testNativeColumnType(STRING(StringType("UNICODE"))) + testNativeColumnType(STRING(StringType("UNICODE_CI"))) testColumnType(NULL) testColumnType(BINARY) @@ -104,11 +115,18 @@ class ColumnTypeSuite extends SparkFunSuite { testColumnType(CALENDAR_INTERVAL) def testNativeColumnType[T <: PhysicalDataType](columnType: NativeColumnType[T]): Unit = { - testColumnType[T#InternalType](columnType) + val typeName = columnType match { + case s: STRING => + val collation = CollationFactory.fetchCollation(s.collationId).collationName + Some(if (collation == "UTF8_BINARY") "STRING" else s"STRING($collation)") + case _ => None + } + testColumnType[T#InternalType](columnType, typeName) } - def testColumnType[JvmType](columnType: ColumnType[JvmType]): Unit = { - + def testColumnType[JvmType]( + columnType: ColumnType[JvmType], + typeName: Option[String] = None): Unit = { val proj = UnsafeProjection.create( Array[DataType](ColumnarDataTypeUtils.toLogicalDataType(columnType.dataType))) val converter = CatalystTypeConverters.createToScalaConverter( @@ -116,8 +134,9 @@ class ColumnTypeSuite extends SparkFunSuite { val seq = (0 until 4).map(_ => proj(makeRandomRow(columnType)).copy()) val totalSize = seq.map(_.getSizeInBytes).sum val bufferSize = Math.max(DEFAULT_BUFFER_SIZE, totalSize) + val testName = typeName.getOrElse(columnType.toString) - test(s"$columnType append/extract") { + test(s"$testName append/extract") { val buffer = ByteBuffer.allocate(bufferSize).order(ByteOrder.nativeOrder()) seq.foreach(r => columnType.append(columnType.getField(r, 0), buffer)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala index e7b509c087b79..d08c34056f565 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala @@ -50,7 +50,7 @@ object ColumnarTestUtils { case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() - case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) + case _: STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case 
CALENDAR_INTERVAL => new CalendarInterval(Random.nextInt(), Random.nextInt(), Random.nextLong()) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala index 169d9356c00cc..ee622793ee0a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.catalyst.types.{PhysicalArrayType, PhysicalMapType, PhysicalStructType} +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( @@ -41,21 +42,33 @@ object TestNullableColumnAccessor { class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ - Seq( + val stringTypes = Seq( + STRING(StringType), // UTF8_BINARY + STRING(StringType("UTF8_LCASE")), + STRING(StringType("UNICODE")), + STRING(StringType("UNICODE_CI"))) + val otherTypes = Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, - STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), + BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(PhysicalStructType(Array(StructField("a", StringType)))), ARRAY(PhysicalArrayType(IntegerType, true)), MAP(PhysicalMapType(IntegerType, StringType, true)), CALENDAR_INTERVAL) - .foreach { + + stringTypes.foreach(s => { + val collation = CollationFactory.fetchCollation(s.collationId).collationName + val typeName = if (collation == "UTF8_BINARY") "STRING" else s"STRING($collation)" + testNullableColumnAccessor(s, Some(typeName)) + }) + otherTypes.foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( - columnType: ColumnType[JvmType]): Unit = { + columnType: ColumnType[JvmType], + testTypeName: Option[String] = None): Unit = { - val typeName = columnType.getClass.getSimpleName.stripSuffix("$") + val typeName = testTypeName.getOrElse(columnType.getClass.getSimpleName.stripSuffix("$")) val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala index 22f557e49ded5..609212c95e987 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.catalyst.types.{PhysicalArrayType, PhysicalMapType, PhysicalStructType} +import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) @@ -39,21 +40,33 @@ object TestNullableColumnBuilder { class NullableColumnBuilderSuite extends SparkFunSuite { import 
org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ - Seq( + val stringTypes = Seq( + STRING(StringType), // UTF8_BINARY + STRING(StringType("UTF8_LCASE")), + STRING(StringType("UNICODE")), + STRING(StringType("UNICODE_CI"))) + val otherTypes = Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, - STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), + BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(PhysicalStructType(Array(StructField("a", StringType)))), ARRAY(PhysicalArrayType(IntegerType, true)), MAP(PhysicalMapType(IntegerType, StringType, true)), CALENDAR_INTERVAL) - .foreach { + + stringTypes.foreach(s => { + val collation = CollationFactory.fetchCollation(s.collationId).collationName + val typeName = if (collation == "UTF8_BINARY") "STRING" else s"STRING($collation)" + testNullableColumnBuilder(s, Some(typeName)) + }) + otherTypes.foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( - columnType: ColumnType[JvmType]): Unit = { + columnType: ColumnType[JvmType], + testTypeName: Option[String] = None): Unit = { - val typeName = columnType.getClass.getSimpleName.stripSuffix("$") + val typeName = testTypeName.getOrElse(columnType.getClass.getSimpleName.stripSuffix("$")) val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType]( ColumnarDataTypeUtils.toLogicalDataType(dataType))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala index 2da0adf439dae..05ae575305299 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala @@ -27,6 +27,7 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.types.PhysicalDataType import org.apache.spark.sql.execution.columnar.{BOOLEAN, INT, LONG, NativeColumnType, SHORT, STRING} +import org.apache.spark.sql.types.StringType import org.apache.spark.util.Utils._ /** @@ -231,8 +232,8 @@ object CompressionSchemeBenchmark extends BenchmarkBase with AllCompressionSchem } testData.rewind() - runEncodeBenchmark("STRING Encode", iters, count, STRING, testData) - runDecodeBenchmark("STRING Decode", iters, count, STRING, testData) + runEncodeBenchmark("STRING Encode", iters, count, STRING(StringType), testData) + runDecodeBenchmark("STRING Decode", iters, count, STRING(StringType), testData) } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala index 10d5e8a0eb9a3..2b2bc7e761368 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala @@ -25,19 +25,27 @@ import org.apache.spark.sql.catalyst.types.PhysicalDataType import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector +import 
org.apache.spark.sql.types.StringType class DictionaryEncodingSuite extends SparkFunSuite { val nullValue = -1 testDictionaryEncoding(new IntColumnStats, INT) testDictionaryEncoding(new LongColumnStats, LONG) - testDictionaryEncoding(new StringColumnStats, STRING, false) + Seq( + "UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI" + ).foreach(collation => { + val dt = StringType(collation) + val typeName = if (collation == "UTF8_BINARY") "STRING" else s"STRING($collation)" + testDictionaryEncoding(new StringColumnStats(dt), STRING(dt), false, Some(typeName)) + }) def testDictionaryEncoding[T <: PhysicalDataType]( columnStats: ColumnStats, columnType: NativeColumnType[T], - testDecompress: Boolean = true): Unit = { + testDecompress: Boolean = true, + testTypeName: Option[String] = None): Unit = { - val typeName = columnType.getClass.getSimpleName.stripSuffix("$") + val typeName = testTypeName.getOrElse(columnType.getClass.getSimpleName.stripSuffix("$")) def buildDictionary(buffer: ByteBuffer) = { (0 until buffer.getInt()).map(columnType.extract(buffer) -> _.toShort).toMap diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala index 00f242a6b9c4b..9b0067fd29832 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.types.PhysicalDataType import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector +import org.apache.spark.sql.types.StringType class RunLengthEncodingSuite extends SparkFunSuite { val nullValue = -1 @@ -31,14 +32,21 @@ class RunLengthEncodingSuite extends SparkFunSuite { testRunLengthEncoding(new ShortColumnStats, SHORT) testRunLengthEncoding(new IntColumnStats, INT) testRunLengthEncoding(new LongColumnStats, LONG) - testRunLengthEncoding(new StringColumnStats, STRING, false) + Seq( + "UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI" + ).foreach(collation => { + val dt = StringType(collation) + val typeName = if (collation == "UTF8_BINARY") "STRING" else s"STRING($collation)" + testRunLengthEncoding(new StringColumnStats(dt), STRING(dt), false, Some(typeName)) + }) def testRunLengthEncoding[T <: PhysicalDataType]( columnStats: ColumnStats, columnType: NativeColumnType[T], - testDecompress: Boolean = true): Unit = { + testDecompress: Boolean = true, + testTypeName: Option[String] = None): Unit = { - val typeName = columnType.getClass.getSimpleName.stripSuffix("$") + val typeName = testTypeName.getOrElse(columnType.getClass.getSimpleName.stripSuffix("$")) def skeleton(uniqueValueCount: Int, inputRuns: Seq[(Int, Int)]): Unit = { // ------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceSetPropertiesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceSetPropertiesSuiteBase.scala index c28c7b9db0436..7f5b3de4865c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceSetPropertiesSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceSetPropertiesSuiteBase.scala @@ -83,10 +83,19 @@ trait 
AlterNamespaceSetPropertiesSuiteBase extends QueryTest with DDLCommandTest CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => withNamespace(ns) { sql(s"CREATE NAMESPACE $ns") - val exception = intercept[ParseException] { - sql(s"ALTER NAMESPACE $ns SET PROPERTIES ('$key'='dummyVal')") - } - assert(exception.getMessage.contains(s"$key is a reserved namespace property")) + val sqlText = s"ALTER NAMESPACE $ns SET PROPERTIES ('$key'='dummyVal')" + checkErrorMatchPVals( + exception = intercept[ParseException] { + sql(sqlText) + }, + errorClass = "UNSUPPORTED_FEATURE.SET_NAMESPACE_PROPERTY", + parameters = Map("property" -> key, "msg" -> ".*"), + sqlState = None, + context = ExpectedContext( + fragment = sqlText, + start = 0, + stop = 46 + ns.length + key.length) + ) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceUnsetPropertiesParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceUnsetPropertiesParserSuite.scala new file mode 100644 index 0000000000000..72d307c816664 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceUnsetPropertiesParserSuite.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
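For anyone tracing the `ExpectedContext` values in the reserved-property checks above: the `stop` field is simply the zero-based index of the statement's last character, which is why it is expressed as a fixed offset plus the namespace and key lengths. A minimal standalone sketch of that arithmetic, with hypothetical `ns` and `key` values:

object ExpectedContextOffsetSketch extends App {
  val ns = "spark_catalog.ns1" // hypothetical namespace name
  val key = "location"         // hypothetical reserved property key
  val sqlText = s"ALTER NAMESPACE $ns SET PROPERTIES ('$key'='dummyVal')"
  // The fixed text contributes 47 characters, so the last character sits at index
  // 46 + ns.length + key.length -- exactly the stop offset asserted above.
  assert(sqlText.length - 1 == 46 + ns.length + key.length)
}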
+ */ +package org.apache.spark.sql.execution.command + +import org.apache.spark.SparkThrowable +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedNamespace} +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkSqlParser +import org.apache.spark.sql.test.SharedSparkSession + +class AlterNamespaceUnsetPropertiesParserSuite extends AnalysisTest with SharedSparkSession { + + private lazy val parser = new SparkSqlParser() + + private def parseException(sqlText: String): SparkThrowable = { + intercept[ParseException](sql(sqlText).collect()) + } + + private def parsePlan(sqlText: String): LogicalPlan = { + parser.parsePlan(sqlText) + } + + test("unset namespace properties") { + Seq("DATABASE", "SCHEMA", "NAMESPACE").foreach { nsToken => + Seq("PROPERTIES", "DBPROPERTIES").foreach { propToken => + comparePlans( + parsePlan(s"ALTER $nsToken a.b.c UNSET $propToken ('a', 'b', 'c')"), + UnsetNamespacePropertiesCommand( + UnresolvedNamespace(Seq("a", "b", "c")), Seq("a", "b", "c"))) + + comparePlans( + parsePlan(s"ALTER $nsToken a.b.c UNSET $propToken ('a')"), + UnsetNamespacePropertiesCommand(UnresolvedNamespace(Seq("a", "b", "c")), Seq("a"))) + } + } + } + + test("property values must not be set") { + val sql = "ALTER NAMESPACE my_db UNSET PROPERTIES('key_without_value', 'key_with_value'='x')" + checkError( + exception = parseException(sql), + errorClass = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> "Values should not be specified for key(s): [key_with_value]"), + context = ExpectedContext( + fragment = sql, + start = 0, + stop = 80)) + } + + test("not support clause - IF EXISTS") { + Seq("DATABASE", "SCHEMA", "NAMESPACE").foreach { nsToken => + Seq("PROPERTIES", "DBPROPERTIES").foreach { propToken => + val sql = s"ALTER $nsToken a.b.c UNSET $propToken IF EXISTS ('a', 'b', 'c')" + checkError( + exception = parseException(sql), + errorClass = "PARSE_SYNTAX_ERROR", + parameters = Map("error" -> "'IF'", "hint" -> ": missing '('") + ) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceUnsetPropertiesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceUnsetPropertiesSuiteBase.scala new file mode 100644 index 0000000000000..1d43cc5938487 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterNamespaceUnsetPropertiesSuiteBase.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.{AnalysisException, QueryTest} +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces} +import org.apache.spark.sql.internal.SQLConf + +/** + * This base suite contains unified tests for the `ALTER NAMESPACE ... UNSET PROPERTIES` command + * that check V1 and V2 table catalogs. The tests that cannot run for all supported catalogs are + * located in more specific test suites: + * + * - V2 table catalog tests: + * `org.apache.spark.sql.execution.command.v2.AlterNamespaceUnsetPropertiesSuite` + * - V1 table catalog tests: + * `org.apache.spark.sql.execution.command.v1.AlterNamespaceUnsetPropertiesSuiteBase` + * - V1 In-Memory catalog: + * `org.apache.spark.sql.execution.command.v1.AlterNamespaceUnsetPropertiesSuite` + * - V1 Hive External catalog: + * `org.apache.spark.sql.hive.execution.command.AlterNamespaceUnsetPropertiesSuite` + */ +trait AlterNamespaceUnsetPropertiesSuiteBase extends QueryTest with DDLCommandTestUtils { + override val command = "ALTER NAMESPACE ... UNSET PROPERTIES" + + protected def namespace: String + + protected def getProperties(namespace: String): String = { + val propsRow = sql(s"DESCRIBE NAMESPACE EXTENDED $namespace") + .toDF("key", "value") + .where("key like 'Properties%'") + .collect() + assert(propsRow.length == 1) + propsRow(0).getString(1) + } + + test("namespace does not exist") { + val ns = "not_exist" + val e = intercept[AnalysisException] { + sql(s"ALTER NAMESPACE $catalog.$ns UNSET PROPERTIES ('d')") + } + checkError(e, + errorClass = "SCHEMA_NOT_FOUND", + parameters = Map("schemaName" -> s"`$ns`")) + } + + test("basic test") { + val ns = s"$catalog.$namespace" + withNamespace(ns) { + sql(s"CREATE NAMESPACE $ns") + assert(getProperties(ns) === "") + sql(s"ALTER NAMESPACE $ns SET PROPERTIES ('a'='a', 'b'='b', 'c'='c')") + assert(getProperties(ns) === "((a,a), (b,b), (c,c))") + sql(s"ALTER NAMESPACE $ns UNSET PROPERTIES ('b')") + assert(getProperties(ns) === "((a,a), (c,c))") + + // unset non-existent properties + // it will be successful, ignoring non-existent properties + sql(s"ALTER NAMESPACE $ns UNSET PROPERTIES ('b')") + assert(getProperties(ns) === "((a,a), (c,c))") + } + } + + test("test reserved properties") { + import SupportsNamespaces._ + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + val ns = s"$catalog.$namespace" + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + withNamespace(ns) { + sql(s"CREATE NAMESPACE $ns") + val sqlText = s"ALTER NAMESPACE $ns UNSET PROPERTIES ('$key')" + checkErrorMatchPVals( + exception = intercept[ParseException] { + sql(sqlText) + }, + errorClass = "UNSUPPORTED_FEATURE.SET_NAMESPACE_PROPERTY", + parameters = Map("property" -> key, "msg" -> ".*"), + sqlState = None, + context = ExpectedContext( + fragment = sqlText, + start = 0, + stop = 37 + ns.length + key.length) + ) + } + } + } + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + withNamespace(ns) { + // Set the location explicitly because v2 catalog may not set the default location. + // Without this, `meta.get(key)` below may return null. 
+ sql(s"CREATE NAMESPACE $ns LOCATION 'tmp/prop_test'") + assert(getProperties(ns) === "") + sql(s"ALTER NAMESPACE $ns UNSET PROPERTIES ('$key')") + assert(getProperties(ns) === "", s"$key is a reserved namespace property and ignored") + val meta = spark.sessionState.catalogManager.catalog(catalog) + .asNamespaceCatalog.loadNamespaceMetadata(namespace.split('.')) + assert(!meta.get(key).contains("foo"), + "reserved properties should not have side effects") + } + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala index 02c9d318bb46f..ef9ae47847405 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableDropPartitionSuiteBase.scala @@ -236,7 +236,7 @@ trait AlterTableDropPartitionSuiteBase extends QueryTest with DDLCommandTestUtil checkCachedRelation("v1", Seq(Row(0, 0), Row(3, 3))) } - val v2 = s"${spark.sharedState.globalTempViewManager.database}.v2" + val v2 = s"${spark.sharedState.globalTempDB}.v2" withGlobalTempView("v2") { sql(s"CREATE GLOBAL TEMP VIEW v2 AS SELECT * FROM $t") cacheRelation(v2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala index 0aaeb8d2160c3..d91085956e330 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableRenamePartitionSuiteBase.scala @@ -222,7 +222,7 @@ trait AlterTableRenamePartitionSuiteBase extends QueryTest with DDLCommandTestUt checkCachedRelation("v1", Seq(Row(0, 2), Row(1, 3))) } - val v2 = s"${spark.sharedState.globalTempViewManager.database}.v2" + val v2 = s"${spark.sharedState.globalTempDB}.v2" withGlobalTempView("v2") { sql(s"CREATE GLOBAL TEMP VIEW v2 AS SELECT * FROM $t") cacheRelation(v2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesParserSuite.scala new file mode 100644 index 0000000000000..78abd1a8b7fd3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesParserSuite.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.SparkThrowable +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedTable} +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.logical.SetTableProperties +import org.apache.spark.sql.test.SharedSparkSession + +class AlterTableSetTblPropertiesParserSuite extends AnalysisTest with SharedSparkSession { + + private def parseException(sqlText: String): SparkThrowable = { + intercept[ParseException](sql(sqlText).collect()) + } + + // ALTER TABLE table_name SET TBLPROPERTIES ('comment' = new_comment); + test("alter table: alter table properties") { + val sql1_table = "ALTER TABLE table_name SET TBLPROPERTIES ('test' = 'test', " + + "'comment' = 'new_comment')" + comparePlans( + parsePlan(sql1_table), + SetTableProperties( + UnresolvedTable(Seq("table_name"), "ALTER TABLE ... SET TBLPROPERTIES", true), + Map("test" -> "test", "comment" -> "new_comment"))) + } + + test("alter table - property values must be set") { + val sql = "ALTER TABLE my_tab SET TBLPROPERTIES('key_without_value', 'key_with_value'='x')" + checkError( + exception = parseException(sql), + errorClass = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> "Values must be specified for key(s): [key_without_value]"), + context = ExpectedContext( + fragment = sql, + start = 0, + stop = 78)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala new file mode 100644 index 0000000000000..64b70d709b93f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableSetTblPropertiesSuiteBase.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.{AnalysisException, QueryTest} +import org.apache.spark.sql.catalyst.TableIdentifier + +/** + * This base suite contains unified tests for the `ALTER TABLE .. SET TBLPROPERTIES` + * command that check V1 and V2 table catalogs. 
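As a companion to the `AlterTableSetTblPropertiesParserSuite` above: a minimal sketch of the parse-and-inspect round trip those tests rely on, using only `CatalystSqlParser` (the table name `t` is hypothetical, and no catalog or session is needed):

import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.catalyst.plans.logical.SetTableProperties

CatalystSqlParser.parsePlan("ALTER TABLE t SET TBLPROPERTIES ('k' = 'v')") match {
  // The parser produces an unresolved SetTableProperties node carrying the property map.
  case SetTableProperties(_, props) => assert(props == Map("k" -> "v"))
  case other => sys.error(s"unexpected plan: $other")
}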
The tests that cannot run for all supported + * catalogs are located in more specific test suites: + * + * - V2 table catalog tests: + * `org.apache.spark.sql.execution.command.v2.AlterTableSetTblPropertiesSuite` + * - V1 table catalog tests: + * `org.apache.spark.sql.execution.command.v1.AlterTableSetTblPropertiesSuiteBase` + * - V1 In-Memory catalog: + * `org.apache.spark.sql.execution.command.v1.AlterTableSetTblPropertiesSuite` + * - V1 Hive External catalog: + * `org.apache.spark.sql.hive.execution.command.AlterTableSetTblPropertiesSuite` + */ +trait AlterTableSetTblPropertiesSuiteBase extends QueryTest with DDLCommandTestUtils { + override val command = "ALTER TABLE .. SET TBLPROPERTIES" + + def checkTblProps(tableIdent: TableIdentifier, expectedTblProps: Map[String, String]): Unit + + test("alter table set tblproperties") { + withNamespaceAndTable("ns", "tbl") { t => + sql(s"CREATE TABLE $t (col1 int, col2 string, a int, b int) $defaultUsing") + val tableIdent = TableIdentifier("tbl", Some("ns"), Some(catalog)) + checkTblProps(tableIdent, Map.empty[String, String]) + + sql(s"ALTER TABLE $t SET TBLPROPERTIES ('k1' = 'v1', 'k2' = 'v2', 'k3' = 'v3')") + checkTblProps(tableIdent, Map("k1" -> "v1", "k2" -> "v2", "k3" -> "v3")) + + sql(s"USE $catalog.ns") + sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('k1' = 'v1', 'k2' = 'v2', 'k3' = 'v3')") + checkTblProps(tableIdent, Map("k1" -> "v1", "k2" -> "v2", "k3" -> "v3")) + + sql(s"ALTER TABLE $t SET TBLPROPERTIES ('k1' = 'v1', 'k2' = 'v8')") + checkTblProps(tableIdent, Map("k1" -> "v1", "k2" -> "v8", "k3" -> "v3")) + + // table to alter does not exist + checkError( + exception = intercept[AnalysisException] { + sql("ALTER TABLE does_not_exist SET TBLPROPERTIES ('winner' = 'loser')") + }, + errorClass = "TABLE_OR_VIEW_NOT_FOUND", + parameters = Map("relationName" -> "`does_not_exist`"), + context = ExpectedContext(fragment = "does_not_exist", start = 12, stop = 25) + ) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index e4df8c64e59ef..505f0b4bdea62 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.execution.command import org.apache.spark.SparkThrowable -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, GlobalTempView, LocalTempView, UnresolvedAttribute, UnresolvedFunctionName, UnresolvedIdentifier} +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, GlobalTempView, LocalTempView, SchemaCompensation, UnresolvedAttribute, UnresolvedFunctionName, UnresolvedIdentifier} import org.apache.spark.sql.catalyst.catalog.{ArchiveResource, FileResource, FunctionResource, JarResource} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans @@ -36,6 +37,9 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { super.parseException(parser.parsePlan)(sqlText) } + private def intercept(sqlCommand: String, messages: String*): Unit = + interceptParseException(parser.parsePlan)(sqlCommand, messages: _*)() + private def compareTransformQuery(sql: String, expected: LogicalPlan): Unit = { val plan = parser.parsePlan(sql).asInstanceOf[ScriptTransformation].copy(ioschema = null) comparePlans(plan, expected, 
checkAnalysis = false) @@ -103,18 +107,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { stop = 98)) } - test("alter table - property values must be set") { - val sql = "ALTER TABLE my_tab SET TBLPROPERTIES('key_without_value', 'key_with_value'='x')" - checkError( - exception = parseException(sql), - errorClass = "_LEGACY_ERROR_TEMP_0035", - parameters = Map("message" -> "Values must be specified for key(s): [key_without_value]"), - context = ExpectedContext( - fragment = sql, - start = 0, - stop = 78)) - } - test("alter table unset properties - property values must NOT be set") { val sql = "ALTER TABLE my_tab UNSET TBLPROPERTIES('key_without_value', 'key_with_value'='x')" checkError( @@ -522,7 +514,8 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { Some("SELECT * FROM tab1"), parser.parsePlan("SELECT * FROM tab1"), false, - false) + false, + SchemaCompensation) comparePlans(parsed1, expected1) val v2 = "CREATE TEMPORARY VIEW a AS SELECT * FROM tab1" @@ -569,7 +562,8 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { Some("SELECT * FROM tab1"), parser.parsePlan("SELECT * FROM tab1"), false, - true) + true, + SchemaCompensation) comparePlans(parsed1, expected1) val v2 = @@ -839,4 +833,44 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { parser.parsePlan("SHOW CATALOGS LIKE 'defau*'"), ShowCatalogsCommand(Some("defau*"))) } + + test("Create SQL functions") { + comparePlans( + parser.parsePlan("CREATE TEMP FUNCTION foo() RETURNS INT RETURN 1"), + CreateSQLFunctionCommand( + FunctionIdentifier("foo"), + inputParamText = None, + returnTypeText = "INT", + exprText = Some("1"), + queryText = None, + comment = None, + isDeterministic = None, + containsSQL = None, + isTableFunc = false, + isTemp = true, + ignoreIfExists = false, + replace = false)) + intercept("CREATE FUNCTION foo() RETURNS INT RETURN 1", + "Operation not allowed: creating persistent SQL functions is not supported") + } + + test("create SQL functions with unsupported routine characteristics") { + intercept("CREATE FUNCTION foo() RETURNS INT LANGUAGE blah RETURN 1", + "Operation not allowed: Unsupported language for user defined functions: blah") + + intercept("CREATE FUNCTION foo() RETURNS INT SPECIFIC foo1 RETURN 1", + "Operation not allowed: SQL function with SPECIFIC name is not supported") + + intercept("CREATE FUNCTION foo() RETURNS INT NO SQL RETURN 1", + "Operation not allowed: SQL function with NO SQL is not supported") + + intercept("CREATE FUNCTION foo() RETURNS INT NO SQL CONTAINS SQL RETURN 1", + "Found duplicate clauses: SQL DATA ACCESS") + + intercept("CREATE FUNCTION foo() RETURNS INT RETURNS NULL ON NULL INPUT RETURN 1", + "Operation not allowed: SQL function with RETURNS NULL ON NULL INPUT is not supported") + + intercept("CREATE FUNCTION foo() RETURNS INT SQL SECURITY INVOKER RETURN 1", + "Operation not allowed: SQL function with SQL SECURITY INVOKER is not supported") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index e8af606d797e3..553b68bec52fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -327,10 +327,6 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { protected val reversedProperties = Seq(PROP_OWNER) - test("alter table: set properties (datasource 
table)") { - testSetProperties(isDatasourceTable = true) - } - test("alter table: unset properties (datasource table)") { testUnsetProperties(isDatasourceTable = true) } @@ -1117,40 +1113,6 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { ) } - protected def testSetProperties(isDatasourceTable: Boolean): Unit = { - if (!isUsingHiveMetastore) { - assert(isDatasourceTable, "InMemoryCatalog only supports data source tables") - } - val catalog = spark.sessionState.catalog - val tableIdent = TableIdentifier("tab1", Some("dbx")) - createDatabase(catalog, "dbx") - createTable(catalog, tableIdent, isDatasourceTable) - def getProps: Map[String, String] = { - if (isUsingHiveMetastore) { - normalizeCatalogTable(catalog.getTableMetadata(tableIdent)).properties - } else { - catalog.getTableMetadata(tableIdent).properties - } - } - assert(getProps.isEmpty) - // set table properties - sql("ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('andrew' = 'or14', 'kor' = 'bel')") - assert(getProps == Map("andrew" -> "or14", "kor" -> "bel")) - // set table properties without explicitly specifying database - catalog.setCurrentDatabase("dbx") - sql("ALTER TABLE tab1 SET TBLPROPERTIES ('kor' = 'belle', 'kar' = 'bol')") - assert(getProps == Map("andrew" -> "or14", "kor" -> "belle", "kar" -> "bol")) - // table to alter does not exist - checkError( - exception = intercept[AnalysisException] { - sql("ALTER TABLE does_not_exist SET TBLPROPERTIES ('winner' = 'loser')") - }, - errorClass = "TABLE_OR_VIEW_NOT_FOUND", - parameters = Map("relationName" -> "`does_not_exist`"), - context = ExpectedContext(fragment = "does_not_exist", start = 12, stop = 25) - ) - } - protected def testUnsetProperties(isDatasourceTable: Boolean): Unit = { if (!isUsingHiveMetastore) { assert(isDatasourceTable, "InMemoryCatalog only supports data source tables") @@ -2260,7 +2222,7 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { ) withGlobalTempView("src") { - val globalTempDB = spark.sharedState.globalTempViewManager.database + val globalTempDB = spark.sharedState.globalTempDB sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") sql(s"CREATE TABLE t4 LIKE $globalTempDB.src USING parquet") val table = catalog.getTableMetadata(TableIdentifier("t4")) @@ -2437,6 +2399,100 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase { ) } } + + test("Change column collation") { + withTable("t1", "t2", "t3", "t4") { + // Plain `StringType`. + sql("CREATE TABLE t1(col STRING) USING parquet") + sql("INSERT INTO t1 VALUES ('a')") + checkAnswer(sql("SELECT COLLATION(col) FROM t1"), Row("UTF8_BINARY")) + sql("ALTER TABLE t1 ALTER COLUMN col TYPE STRING COLLATE UTF8_LCASE") + checkAnswer(sql("SELECT COLLATION(col) FROM t1"), Row("UTF8_LCASE")) + + // Invalid "ALTER COLUMN" to Integer. + val alterInt = "ALTER TABLE t1 ALTER COLUMN col TYPE INTEGER" + checkError( + exception = intercept[AnalysisException] { + sql(alterInt) + }, + errorClass = "NOT_SUPPORTED_CHANGE_COLUMN", + parameters = Map( + "originType" -> "\"STRING COLLATE UTF8_LCASE\"", + "originName" -> "`col`", + "table" -> "`spark_catalog`.`default`.`t1`", + "newType" -> "\"INT\"", + "newName" -> "`col`" + ), + context = ExpectedContext(fragment = alterInt, start = 0, stop = alterInt.length - 1) + ) + + // `ArrayType` with collation. 
+ sql("CREATE TABLE t2(col ARRAY) USING parquet") + sql("INSERT INTO t2 VALUES (ARRAY('a'))") + checkAnswer(sql("SELECT COLLATION(col[0]) FROM t2"), Row("UTF8_BINARY")) + sql("ALTER TABLE t2 ALTER COLUMN col TYPE ARRAY") + checkAnswer(sql("SELECT COLLATION(col[0]) FROM t2"), Row("UTF8_LCASE")) + + // `MapType` with collation. + sql("CREATE TABLE t3(col MAP) USING parquet") + sql("INSERT INTO t3 VALUES (MAP('k', 'v'))") + checkAnswer(sql("SELECT COLLATION(col['k']) FROM t3"), Row("UTF8_BINARY")) + sql( + """ + |ALTER TABLE t3 ALTER COLUMN col TYPE + |MAP""".stripMargin) + checkAnswer(sql("SELECT COLLATION(col['k']) FROM t3"), Row("UTF8_LCASE")) + + // Invalid change of map key collation. + val alterMap = + "ALTER TABLE t3 ALTER COLUMN col TYPE " + + "MAP" + checkError( + exception = intercept[AnalysisException] { + sql(alterMap) + }, + errorClass = "NOT_SUPPORTED_CHANGE_COLUMN", + parameters = Map( + "originType" -> "\"MAP\"", + "originName" -> "`col`", + "table" -> "`spark_catalog`.`default`.`t3`", + "newType" -> "\"MAP\"", + "newName" -> "`col`" + ), + context = ExpectedContext(fragment = alterMap, start = 0, stop = alterMap.length - 1) + ) + + // `StructType` with collation. + sql("CREATE TABLE t4(col STRUCT) USING parquet") + sql("INSERT INTO t4 VALUES (NAMED_STRUCT('a', 'value'))") + checkAnswer(sql("SELECT COLLATION(col.a) FROM t4"), Row("UTF8_BINARY")) + sql("ALTER TABLE t4 ALTER COLUMN col TYPE STRUCT") + checkAnswer(sql("SELECT COLLATION(col.a) FROM t4"), Row("UTF8_LCASE")) + } + } + + test("Invalid collation change on partition and bucket columns") { + withTable("t1", "t2") { + sql("CREATE TABLE t1(col STRING, i INTEGER) USING parquet PARTITIONED BY (col)") + checkError( + exception = intercept[AnalysisException] { + sql("ALTER TABLE t1 ALTER COLUMN col TYPE STRING COLLATE UTF8_LCASE") + }, + errorClass = "CANNOT_ALTER_PARTITION_COLUMN", + sqlState = "428FR", + parameters = Map("tableName" -> "`spark_catalog`.`default`.`t1`", "columnName" -> "`col`") + ) + sql("CREATE TABLE t2(col STRING) USING parquet CLUSTERED BY (col) INTO 1 BUCKETS") + checkError( + exception = intercept[AnalysisException] { + sql("ALTER TABLE t2 ALTER COLUMN col TYPE STRING COLLATE UTF8_LCASE") + }, + errorClass = "CANNOT_ALTER_COLLATION_BUCKET_COLUMN", + sqlState = "428FR", + parameters = Map("tableName" -> "`spark_catalog`.`default`.`t2`", "columnName" -> "`col`") + ) + } + } } object FakeLocalFsFileSystem { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 28ea2c9bec1ab..f004ab7137f79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -1586,7 +1586,7 @@ class PlanResolutionSuite extends AnalysisTest { // basic val sql1 = s""" - |MERGE INTO $target AS target + |MERGE WITH SCHEMA EVOLUTION INTO $target AS target |USING $source AS source |ON target.i = source.i |WHEN MATCHED AND (target.s='delete') THEN DELETE @@ -1608,12 +1608,14 @@ class PlanResolutionSuite extends AnalysisTest { insertAssigns)), Seq(DeleteAction(Some(EqualTo(ndl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(nul: AttributeReference, StringLiteral("update"))), - notMatchedBySourceUpdateAssigns))) => + notMatchedBySourceUpdateAssigns)), + withSchemaEvolution) => checkMergeConditionResolution(target, source, 
mergeCondition) checkMatchedClausesResolution(target, source, Some(dl), Some(ul), updateAssigns) checkNotMatchedClausesResolution(target, source, Some(il), insertAssigns) checkNotMatchedBySourceClausesResolution(target, Some(ndl), Some(nul), notMatchedBySourceUpdateAssigns) + assert(withSchemaEvolution === true) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } @@ -1638,11 +1640,13 @@ class PlanResolutionSuite extends AnalysisTest { StringLiteral("update"))), updateAssigns)), Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), insertAssigns)), - Seq()) => + Seq(), + withSchemaEvolution) => checkMergeConditionResolution(target, source, mergeCondition) checkMatchedClausesResolution(target, source, Some(dl), Some(ul), updateAssigns, starInUpdate = true) checkNotMatchedClausesResolution(target, source, Some(il), insertAssigns) + assert(withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } @@ -1663,11 +1667,13 @@ class PlanResolutionSuite extends AnalysisTest { mergeCondition, Seq(UpdateAction(None, updateAssigns)), Seq(InsertAction(None, insertAssigns)), - Seq()) => + Seq(), + withSchemaEvolution) => checkMergeConditionResolution(target, source, mergeCondition) checkMatchedClausesResolution(target, source, None, None, updateAssigns, starInUpdate = true) checkNotMatchedClausesResolution(target, source, None, insertAssigns) + assert(withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } @@ -1692,12 +1698,14 @@ class PlanResolutionSuite extends AnalysisTest { Seq(DeleteAction(Some(_)), UpdateAction(None, updateAssigns)), Seq(InsertAction(None, insertAssigns)), Seq(DeleteAction(Some(EqualTo(_: AttributeReference, StringLiteral("delete")))), - UpdateAction(None, notMatchedBySourceUpdateAssigns))) => + UpdateAction(None, notMatchedBySourceUpdateAssigns)), + withSchemaEvolution) => checkMergeConditionResolution(target, source, mergeCondition) checkMatchedClausesResolution(target, source, None, None, updateAssigns) checkNotMatchedClausesResolution(target, source, None, insertAssigns) checkNotMatchedBySourceClausesResolution(target, None, None, notMatchedBySourceUpdateAssigns) + assert(withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } @@ -1727,12 +1735,14 @@ class PlanResolutionSuite extends AnalysisTest { insertAssigns)), Seq(DeleteAction(Some(EqualTo(ndl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(nul: AttributeReference, StringLiteral("update"))), - notMatchedBySourceUpdateAssigns))) => + notMatchedBySourceUpdateAssigns)), + withSchemaEvolution) => checkMergeConditionResolution(target, source, mergeCondition) checkMatchedClausesResolution(target, source, Some(dl), Some(ul), updateAssigns) checkNotMatchedClausesResolution(target, source, Some(il), insertAssigns) checkNotMatchedBySourceClausesResolution(target, Some(ndl), Some(nul), notMatchedBySourceUpdateAssigns) + assert(withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } @@ -1764,12 +1774,14 @@ class PlanResolutionSuite extends AnalysisTest { insertAssigns)), Seq(DeleteAction(Some(EqualTo(ndl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(nul: AttributeReference, StringLiteral("update"))), - notMatchedBySourceUpdateAssigns))) => + notMatchedBySourceUpdateAssigns)), + withSchemaEvolution) => 
checkMergeConditionResolution(target, source, mergeCondition) checkMatchedClausesResolution(target, source, Some(dl), Some(ul), updateAssigns) checkNotMatchedClausesResolution(target, source, Some(il), insertAssigns) checkNotMatchedBySourceClausesResolution(target, Some(ndl), Some(nul), notMatchedBySourceUpdateAssigns) + assert(withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } @@ -1837,6 +1849,7 @@ class PlanResolutionSuite extends AnalysisTest { case other => fail("unexpected second not matched by source action " + other) } + assert(m.withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) @@ -1905,6 +1918,7 @@ class PlanResolutionSuite extends AnalysisTest { Seq(Assignment(_: AttributeReference, Literal(42, IntegerType)))) => case other => fail("unexpected second not matched by source action " + other) } + assert(m.withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) @@ -2009,6 +2023,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(m.matchedActions.length == 2) assert(m.notMatchedActions.length == 1) assert(m.notMatchedBySourceActions.length == 2) + assert(m.withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) @@ -2045,7 +2060,8 @@ class PlanResolutionSuite extends AnalysisTest { Seq(InsertAction( Some(EqualTo(il: AttributeReference, StringLiteral("a"))), insertAssigns)), - Seq(DeleteAction(Some(_)), UpdateAction(None, secondUpdateAssigns))) => + Seq(DeleteAction(Some(_)), UpdateAction(None, secondUpdateAssigns)), + withSchemaEvolution) => val ti = target.output.find(_.name == "i").get val ts = target.output.find(_.name == "s").get val si = source.output.find(_.name == "i").get @@ -2064,6 +2080,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(secondUpdateAssigns.size == 1) // UPDATE key is resolved with target table only, so column `s` is not ambiguous. 
assert(secondUpdateAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(withSchemaEvolution === false) case p => fail("Expect MergeIntoTable, but got:\n" + p.treeString) } @@ -2150,7 +2167,8 @@ class PlanResolutionSuite extends AnalysisTest { _, Seq(), Seq(), - notMatchedBySourceActions) => + notMatchedBySourceActions, + withSchemaEvolution) => assert(notMatchedBySourceActions.length == 2) notMatchedBySourceActions(0) match { case DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("b")))) => @@ -2171,6 +2189,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(us.sameRef(ti)) case other => fail("unexpected second not matched by source action " + other) } + assert(withSchemaEvolution === false) } val sql7 = @@ -2205,6 +2224,7 @@ class PlanResolutionSuite extends AnalysisTest { case u: MergeIntoTable => assert(u.targetTable.isInstanceOf[UnresolvedRelation]) assert(u.sourceTable.isInstanceOf[UnresolvedRelation]) + assert(u.withSchemaEvolution === false) case _ => fail("Expect MergeIntoTable, but got:\n" + parsed.treeString) } @@ -2283,6 +2303,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(s2.functionName == "varcharTypeWriteSideCheck") case other => fail("Expect UpdateAction, but got: " + other) } + assert(m.withSchemaEvolution === false) case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) } } @@ -2304,12 +2325,14 @@ class PlanResolutionSuite extends AnalysisTest { _, Seq(DeleteAction(None)), Seq(InsertAction(None, insertAssigns)), - Nil) => + Nil, + withSchemaEvolution) => // There is only one assignment, the missing col is not filled with default value assert(insertAssigns.size == 1) // Special case: Spark does not resolve any columns in MERGE if table accepts any schema. assert(insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") assert(insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "DEFAULT") + assert(withSchemaEvolution === false) case l => fail("Expected unresolved MergeIntoTable, but got:\n" + l.treeString) } @@ -2824,11 +2847,9 @@ class PlanResolutionSuite extends AnalysisTest { assert(desc.viewText.isEmpty) assert(desc.viewQueryColumnNames.isEmpty) assert(desc.storage.locationUri.isEmpty) - assert(desc.storage.inputFormat == - Some("org.apache.hadoop.mapred.TextInputFormat")) - assert(desc.storage.outputFormat == - Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(desc.storage.inputFormat.isEmpty) + assert(desc.storage.outputFormat.isEmpty) + assert(desc.storage.serde.isEmpty) assert(desc.storage.properties.isEmpty) assert(desc.properties.isEmpty) assert(desc.comment.isEmpty) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala index c88217221ab76..d6b91bcf3eb8e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesSuiteBase.scala @@ -351,6 +351,7 @@ trait ShowTablesSuiteBase extends QueryTest with DDLCommandTestUtils { |Created By: |Type: VIEW |View Text: SELECT id FROM $catalog.$namespace.$table + |View Schema Mode: BINDING |View Catalog and Namespace: spark_catalog.default |View Query Output Columns: [id] |Schema: root @@ -377,6 +378,7 @@ trait ShowTablesSuiteBase extends QueryTest 
with DDLCommandTestUtils { |Created By: |Type: VIEW |View Text: SELECT id FROM $catalog.$namespace.$table + |View Schema Mode: BINDING |View Catalog and Namespace: spark_catalog.default |View Query Output Columns: [id] |Schema: root @@ -394,6 +396,7 @@ trait ShowTablesSuiteBase extends QueryTest with DDLCommandTestUtils { |Created By: |Type: VIEW |View Text: SELECT id FROM $catalog.$namespace.$table + |View Schema Mode: BINDING |View Catalog and Namespace: spark_catalog.default |View Query Output Columns: [id] |Schema: root diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala index facbfa3dedf8c..982c568d09a79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/TruncateTableSuiteBase.scala @@ -206,7 +206,7 @@ trait TruncateTableSuiteBase extends QueryTest with DDLCommandTestUtils { ) } - val v2 = s"${spark.sharedState.globalTempViewManager.database}.v2" + val v2 = s"${spark.sharedState.globalTempDB}.v2" withGlobalTempView("v2") { sql(s"CREATE GLOBAL TEMP VIEW v2 AS SELECT * FROM $t") checkError( @@ -245,7 +245,7 @@ trait TruncateTableSuiteBase extends QueryTest with DDLCommandTestUtils { checkCachedRelation("v1", Seq(Row(0, 0, 0))) } - val v2 = s"${spark.sharedState.globalTempViewManager.database}.v2" + val v2 = s"${spark.sharedState.globalTempDB}.v2" withGlobalTempView("v2") { sql(s"INSERT INTO $t PARTITION (width = 10, length = 10) SELECT 10") sql(s"CREATE GLOBAL TEMP VIEW v2 AS SELECT * FROM $t") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterNamespaceUnsetPropertiesSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterNamespaceUnsetPropertiesSuiteBase.scala new file mode 100644 index 0000000000000..da7fdbba16b0b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterNamespaceUnsetPropertiesSuiteBase.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command.v1 + +import org.apache.spark.sql.execution.command + +/** + * This base suite contains unified tests for the `ALTER NAMESPACE ... UNSET PROPERTIES` command + * that checks V1 table catalogs. 
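Referring back to the `PlanResolutionSuite` changes above: the new `withSchemaEvolution` field on `MergeIntoTable` is set by an optional clause between `MERGE` and `INTO`. A hedged syntax sketch with hypothetical table names (actually executing it additionally requires a catalog whose tables accept MERGE):

// Without the WITH SCHEMA EVOLUTION clause, the tests above assert the flag stays false.
spark.sql(
  """MERGE WITH SCHEMA EVOLUTION INTO target_tbl AS target
    |USING source_tbl AS source
    |ON target.i = source.i
    |WHEN MATCHED THEN UPDATE SET *
    |WHEN NOT MATCHED THEN INSERT *
    |""".stripMargin)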
The tests that cannot run for all V1 catalogs are located in more + * specific test suites: + * + * - V1 In-Memory catalog: + * `org.apache.spark.sql.execution.command.v1.AlterNamespaceUnsetPropertiesSuite` + * - V1 Hive External catalog: + * `org.apache.spark.sql.hive.execution.command.AlterNamespaceUnsetPropertiesSuite` + */ +trait AlterNamespaceUnsetPropertiesSuiteBase extends command.AlterNamespaceUnsetPropertiesSuiteBase + with command.TestsV1AndV2Commands { + override def namespace: String = "db" +} + +/** + * The class contains tests for the `ALTER NAMESPACE ... UNSET PROPERTIES` command to + * check V1 In-Memory table catalog. + */ +class AlterNamespaceUnsetPropertiesSuite extends AlterNamespaceUnsetPropertiesSuiteBase + with CommandSuiteBase { + override def commandVersion: String = + super[AlterNamespaceUnsetPropertiesSuiteBase].commandVersion +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableAddPartitionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableAddPartitionSuite.scala index 71f04159638aa..dac99c8ff7023 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableAddPartitionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableAddPartitionSuite.scala @@ -133,7 +133,7 @@ trait AlterTableAddPartitionSuiteBase extends command.AlterTableAddPartitionSuit checkCachedRelation("v1", Seq(Row(0, 0), Row(0, 1), Row(0, 2))) } - val v2 = s"${spark.sharedState.globalTempViewManager.database}.v2" + val v2 = s"${spark.sharedState.globalTempDB}.v2" withGlobalTempView("v2") { sql(s"CREATE GLOBAL TEMP VIEW v2 AS SELECT * FROM $t") cacheRelation(v2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetTblPropertiesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetTblPropertiesSuite.scala new file mode 100644 index 0000000000000..e74e5d4fc9ea5 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableSetTblPropertiesSuite.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command.v1 + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.execution.command +import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION + +/** + * This base suite contains unified tests for the `ALTER TABLE .. SET TBLPROPERTIES` + * command that check V1 table catalogs. 
The tests that cannot run for all V1 catalogs + * are located in more specific test suites: + * + * - V1 In-Memory catalog: + * `org.apache.spark.sql.execution.command.v1.AlterTableSetTblPropertiesSuite` + * - V1 Hive External catalog: + * `org.apache.spark.sql.hive.execution.command.AlterTableSetTblPropertiesSuite` + */ +trait AlterTableSetTblPropertiesSuiteBase extends command.AlterTableSetTblPropertiesSuiteBase { + + private[sql] lazy val sessionCatalog = spark.sessionState.catalog + + private def isUsingHiveMetastore: Boolean = { + spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive" + } + + private def normalizeTblProps(props: Map[String, String]): Map[String, String] = { + props.filterNot(p => Seq("transient_lastDdlTime").contains(p._1)) + } + + private def getTableProperties(tableIdent: TableIdentifier): Map[String, String] = { + sessionCatalog.getTableMetadata(tableIdent).properties + } + + override def checkTblProps(tableIdent: TableIdentifier, + expectedTblProps: Map[String, String]): Unit = { + val actualTblProps = getTableProperties(tableIdent) + if (isUsingHiveMetastore) { + assert(normalizeTblProps(actualTblProps) == expectedTblProps) + } else { + assert(actualTblProps == expectedTblProps) + } + } +} + +class AlterTableSetTblPropertiesSuite + extends AlterTableSetTblPropertiesSuiteBase with CommandSuiteBase diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala index 4b4742910bd18..9be802b5f1fea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala @@ -205,6 +205,7 @@ class ShowTablesSuite extends ShowTablesSuiteBase with CommandSuiteBase { |Type: VIEW |View Text: SELECT id FROM $catalog.$namespace.$table |View Original Text: SELECT id FROM $catalog.$namespace.$table + |View Schema Mode: COMPENSATION |View Catalog and Namespace: $catalog.$namespace |View Query Output Columns: [id] |Schema: root diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterNamespaceUnsetPropertiesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterNamespaceUnsetPropertiesSuite.scala new file mode 100644 index 0000000000000..352238eda2eab --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterNamespaceUnsetPropertiesSuite.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command.v2 + +import org.apache.spark.sql.execution.command + +/** + * The class contains tests for the `ALTER NAMESPACE ... 
UNSET PROPERTIES` command to check V2 + * table catalogs. + */ +class AlterNamespaceUnsetPropertiesSuite extends command.AlterNamespaceUnsetPropertiesSuiteBase + with CommandSuiteBase { + + override def namespace: String = "ns1.ns2" +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableAddPartitionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableAddPartitionSuite.scala index b733666f0d84a..defa026c0e281 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableAddPartitionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableAddPartitionSuite.scala @@ -105,7 +105,7 @@ class AlterTableAddPartitionSuite checkCachedRelation("v1", Seq(Row(0, 0), Row(0, 1), Row(1, 2))) } - val v2 = s"${spark.sharedState.globalTempViewManager.database}.v2" + val v2 = s"${spark.sharedState.globalTempDB}.v2" withGlobalTempView(v2) { sql(s"CREATE GLOBAL TEMP VIEW v2 AS SELECT * FROM $t") cacheRelation(v2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableSetTblPropertiesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableSetTblPropertiesSuite.scala new file mode 100644 index 0000000000000..7d7b2ad8686ee --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableSetTblPropertiesSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command.v2 + +import scala.jdk.CollectionConverters._ + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.connector.catalog.{Identifier, Table} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper +import org.apache.spark.sql.execution.command + +/** + * The class contains tests for the `ALTER TABLE .. SET TBLPROPERTIES` command to + * check V2 table catalogs. 
+ */ +class AlterTableSetTblPropertiesSuite + extends command.AlterTableSetTblPropertiesSuiteBase with CommandSuiteBase { + + private def normalizeTblProps(props: Map[String, String]): Map[String, String] = { + props.filterNot(p => Seq("provider", "owner").contains(p._1)) + } + + private def getTableMetadata(tableIndent: TableIdentifier): Table = { + val nameParts = tableIndent.nameParts + val v2Catalog = spark.sessionState.catalogManager.catalog(nameParts.head).asTableCatalog + val namespace = nameParts.drop(1).init.toArray + v2Catalog.loadTable(Identifier.of(namespace, nameParts.last)) + } + + override def checkTblProps(tableIdent: TableIdentifier, + expectedTblProps: Map[String, String]): Unit = { + val actualTblProps = getTableMetadata(tableIdent).properties.asScala.toMap + assert(normalizeTblProps(actualTblProps) === expectedTblProps) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala index 2b9ec97bace1e..9f0396ab60e32 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala @@ -327,8 +327,10 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { test("SPARK-41636: selectFilters returns predicates in deterministic order") { - val predicates = Seq(EqualTo($"id", 1), EqualTo($"id", 2), - EqualTo($"id", 3), EqualTo($"id", 4), EqualTo($"id", 5), EqualTo($"id", 6)) + val idColAttribute = AttributeReference("id", IntegerType)() + val predicates = Seq(EqualTo(idColAttribute, 1), EqualTo(idColAttribute, 2), + EqualTo(idColAttribute, 3), EqualTo(idColAttribute, 4), EqualTo(idColAttribute, 5), + EqualTo(idColAttribute, 6)) val (unhandledPredicates, pushedFilters, handledFilters) = DataSourceStrategy.selectFilters(FakeRelation(), predicates) @@ -338,4 +340,21 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { }) assert(handledFilters.isEmpty) } + + test("SPARK-48431: Push filters on columns with UTF8_BINARY collation") { + val colAttr = $"col".string("UTF8_BINARY") + testTranslateFilter(EqualTo(colAttr, Literal("value")), Some(sources.EqualTo("col", "value"))) + testTranslateFilter(Not(EqualTo(colAttr, Literal("value"))), + Some(sources.Not(sources.EqualTo("col", "value")))) + testTranslateFilter(LessThan(colAttr, Literal("value")), + Some(sources.LessThan("col", "value"))) + testTranslateFilter(LessThan(colAttr, Literal("value")), Some(sources.LessThan("col", "value"))) + testTranslateFilter(LessThanOrEqual(colAttr, Literal("value")), + Some(sources.LessThanOrEqual("col", "value"))) + testTranslateFilter(GreaterThan(colAttr, Literal("value")), + Some(sources.GreaterThan("col", "value"))) + testTranslateFilter(GreaterThanOrEqual(colAttr, Literal("value")), + Some(sources.GreaterThanOrEqual("col", "value"))) + testTranslateFilter(IsNotNull(colAttr), Some(sources.IsNotNull("col"))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala index 110c330f16956..6399eb6da049f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala @@ -31,7 +31,7 @@ import 
org.mockito.Mockito.{mock, when} import org.apache.spark.{SparkException, SparkRuntimeException} import org.apache.spark.metrics.source.HiveCatalogMetrics -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} @@ -547,6 +547,66 @@ class FileIndexSuite extends SharedSparkSession { assert(fileIndex.leafFileStatuses.toSeq == statuses) } + test("SPARK-48649: Ignore invalid partitions") { + // Table: + // id part_col + // 1 1 + // 2 2 + val df = spark.range(1, 3, 1, 2).toDF("id") + .withColumn("part_col", col("id")) + + withTempPath { directoryPath => + df.write + .mode("overwrite") + .format("parquet") + .partitionBy("part_col") + .save(directoryPath.getCanonicalPath) + + // Rename one of the folders. + new File(directoryPath, "part_col=1").renameTo(new File(directoryPath, "undefined")) + + // By default, we expect the invalid path assertion to trigger. + val ex = intercept[AssertionError] { + spark.read + .format("parquet") + .load(directoryPath.getCanonicalPath) + .collect() + } + assert(ex.getMessage.contains("Conflicting directory structures detected")) + + // With the config enabled, we should only read the valid partition. + withSQLConf(SQLConf.IGNORE_INVALID_PARTITION_PATHS.key -> "true") { + assert( + spark.read + .format("parquet") + .load(directoryPath.getCanonicalPath) + .collect() === Seq(Row(2, 2))) + } + + // Data source option override takes precedence. + withSQLConf(SQLConf.IGNORE_INVALID_PARTITION_PATHS.key -> "true") { + val ex = intercept[AssertionError] { + spark.read + .format("parquet") + .option(FileIndexOptions.IGNORE_INVALID_PARTITION_PATHS, "false") + .load(directoryPath.getCanonicalPath) + .collect() + } + assert(ex.getMessage.contains("Conflicting directory structures detected")) + } + + // Data source option override takes precedence. 
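+ // Here the SQL conf disables ignoring invalid partition paths, but the per-read option re-enables it, so only the valid partition is returned.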
+ withSQLConf(SQLConf.IGNORE_INVALID_PARTITION_PATHS.key -> "false") { + assert( + spark.read + .format("parquet") + .option(FileIndexOptions.IGNORE_INVALID_PARTITION_PATHS, "true") + .load(directoryPath.getCanonicalPath) + .collect() === Seq(Row(2, 2))) + } + } + } + test("expire FileStatusCache if TTL is configured") { val previousValue = SQLConf.get.getConf(StaticSQLConf.METADATA_CACHE_TTL_SECONDS) try { @@ -585,9 +645,10 @@ class FileIndexSuite extends SharedSparkSession { } test("SPARK-40667: validate FileIndex Options") { - assert(FileIndexOptions.getAllOptions.size == 7) + assert(FileIndexOptions.getAllOptions.size == 8) // Please add validation on any new FileIndex options here assert(FileIndexOptions.isValidOption("ignoreMissingFiles")) + assert(FileIndexOptions.isValidOption("ignoreInvalidPartitionPaths")) assert(FileIndexOptions.isValidOption("timeZone")) assert(FileIndexOptions.isValidOption("recursiveFileLookup")) assert(FileIndexOptions.isValidOption("basePath")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala index 5256043289d5e..fefb16a351fdb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala @@ -161,7 +161,9 @@ class ParquetReadSchemaSuite with HideColumnInTheMiddleTest with AddNestedColumnTest with HideNestedColumnTest - with ChangePositionTest { + with ChangePositionTest + with IntegralTypeTest + with ToDoubleTypeTest { override val format: String = "parquet" @@ -183,7 +185,9 @@ class VectorizedParquetReadSchemaSuite with HideColumnInTheMiddleTest with AddNestedColumnTest with HideNestedColumnTest - with ChangePositionTest { + with ChangePositionTest + with IntegralTypeTest + with ToDoubleTypeTest { override val format: String = "parquet" @@ -205,7 +209,9 @@ class MergedParquetReadSchemaSuite with HideColumnInTheMiddleTest with AddNestedColumnTest with HideNestedColumnTest - with ChangePositionTest { + with ChangePositionTest + with IntegralTypeTest + with ToDoubleTypeTest { override val format: String = "parquet" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index 51c9b960a8eab..3762241719acd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -82,7 +82,7 @@ class SaveIntoDataSourceCommandSuite extends QueryTest with SharedSparkSession { val df = spark.range(1).selectExpr( "cast('a' as binary) a", "true b", "cast(1 as byte) c", "1.23 d", "'abc'", - "'abc' COLLATE UTF8_BINARY_LCASE") + "'abc' COLLATE UTF8_LCASE") dataSource.planForWriting(SaveMode.ErrorIfExists, df.logicalPlan) // Variant and Interval types are disallowed by default. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/V1WriteCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/V1WriteCommandSuite.scala index ce43edb79c127..04a7b4834f4b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/V1WriteCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/V1WriteCommandSuite.scala @@ -214,8 +214,8 @@ class V1WriteCommandSuite extends QueryTest with SharedSparkSession with V1Write val executedPlan = FileFormatWriter.executedPlan.get val plan = if (enabled) { - assert(executedPlan.isInstanceOf[WriteFilesExec]) - executedPlan.asInstanceOf[WriteFilesExec].child + assert(executedPlan.isInstanceOf[WriteFilesExecBase]) + executedPlan.asInstanceOf[WriteFilesExecBase].child } else { executedPlan.transformDown { case a: AdaptiveSparkPlanExec => a.executedPlan @@ -261,8 +261,8 @@ class V1WriteCommandSuite extends QueryTest with SharedSparkSession with V1Write val executedPlan = FileFormatWriter.executedPlan.get val plan = if (enabled) { - assert(executedPlan.isInstanceOf[WriteFilesExec]) - executedPlan.asInstanceOf[WriteFilesExec].child + assert(executedPlan.isInstanceOf[WriteFilesExecBase]) + executedPlan.asInstanceOf[WriteFilesExecBase].child } else { executedPlan.transformDown { case a: AdaptiveSparkPlanExec => a.executedPlan diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 22ea133ee19aa..f7ea8a735068e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -80,6 +80,7 @@ abstract class CSVSuite private val valueMalformedFile = "test-data/value-malformed.csv" private val badAfterGoodFile = "test-data/bad_after_good.csv" private val malformedRowFile = "test-data/malformedRow.csv" + private val charFile = "test-data/char.csv" /** Verifies data and schema. 
*/ private def verifyCars( @@ -1246,14 +1247,13 @@ abstract class CSVSuite val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0") for (pattern <- patterns) { withTempPath { path => - val actualPath = path.toPath.toUri.toURL.toString val ex = intercept[SparkException] { exp.write.format("csv").option("timestampNTZFormat", pattern).save(path.getAbsolutePath) } checkErrorMatchPVals( exception = ex, errorClass = "TASK_WRITE_FAILED", - parameters = Map("path" -> s"$actualPath.*")) + parameters = Map("path" -> s".*${path.getName}.*")) val msg = ex.getCause.getMessage assert( msg.contains("Unsupported field: OffsetSeconds") || @@ -1488,16 +1488,21 @@ abstract class CSVSuite val e = intercept[SparkException] { spark.read.csv(inputFile.toURI.toString).collect() } - checkError( + checkErrorMatchPVals( exception = e, errorClass = "FAILED_READ_FILE.NO_HINT", - parameters = Map("path" -> inputFile.toPath.toUri.toString) + parameters = Map("path" -> s".*${inputFile.getName}.*") ) assert(e.getCause.isInstanceOf[EOFException]) assert(e.getCause.getMessage === "Unexpected end of input stream") val e2 = intercept[SparkException] { spark.read.option("multiLine", true).csv(inputFile.toURI.toString).collect() } + checkErrorMatchPVals( + exception = e2, + errorClass = "FAILED_READ_FILE.NO_HINT", + parameters = Map("path" -> s".*${inputFile.getName}.*") + ) assert(e2.getCause.getCause.getCause.isInstanceOf[EOFException]) assert(e2.getCause.getCause.getCause.getMessage === "Unexpected end of input stream") } @@ -3342,6 +3347,29 @@ abstract class CSVSuite expected) } } + + test("SPARK-48241: CSV parsing failure with char/varchar type columns") { + withTable("charVarcharTable") { + spark.sql( + s""" + |CREATE TABLE charVarcharTable( + | color char(4), + | name varchar(10)) + |USING csv + |OPTIONS ( + | header "true", + | path "${testFile(charFile)}" + |) + """.stripMargin) + val expected = Seq( + Row("pink", "Bob"), + Row("blue", "Mike"), + Row("grey", "Tom")) + checkAnswer( + sql("SELECT * FROM charVarcharTable"), + expected) + } + } } class CSVv1Suite extends CSVSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index f3c332bab1833..9e5ecc08e24a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -1924,10 +1924,10 @@ abstract class JsonSuite val e = intercept[SparkException] { spark.read.json(inputFile.toURI.toString).collect() } - checkError( + checkErrorMatchPVals( exception = e, errorClass = "FAILED_READ_FILE.NO_HINT", - parameters = Map("path" -> inputFile.toPath.toUri.toString)) + parameters = Map("path" -> s".*${inputFile.getName}.*")) assert(e.getCause.isInstanceOf[EOFException]) assert(e.getCause.getMessage === "Unexpected end of input stream") val e2 = intercept[SparkException] { @@ -3039,14 +3039,13 @@ abstract class JsonSuite val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0") for (pattern <- patterns) { withTempPath { path => - val actualPath = path.toPath.toUri.toURL.toString val err = intercept[SparkException] { exp.write.option("timestampNTZFormat", pattern).json(path.getAbsolutePath) } checkErrorMatchPVals( exception = err, errorClass = "TASK_WRITE_FAILED", - parameters = Map("path" -> s"$actualPath.*")) + parameters = Map("path" -> s".*${path.getName}.*")) val msg = 
err.getCause.getMessage assert( @@ -3728,7 +3727,7 @@ abstract class JsonSuite } test("SPARK-40667: validate JSON Options") { - assert(JSONOptions.getAllOptions.size == 28) + assert(JSONOptions.getAllOptions.size == 29) // Please add validation on any new Json options here assert(JSONOptions.isValidOption("samplingRatio")) assert(JSONOptions.isValidOption("primitivesAsString")) @@ -3756,6 +3755,7 @@ abstract class JsonSuite assert(JSONOptions.isValidOption("columnNameOfCorruptRecord")) assert(JSONOptions.isValidOption("timeZone")) assert(JSONOptions.isValidOption("writeNonAsciiCharacterAsCodePoint")) + assert(JSONOptions.isValidOption("singleVariantColumn")) assert(JSONOptions.isValidOption("encoding")) assert(JSONOptions.isValidOption("charset")) // Please add validation on any new Json options with alternative here @@ -3864,6 +3864,64 @@ abstract class JsonSuite } } } + + test("SPARK-48148: values are unchanged when read as string") { + withTempPath { path => + def extractData( + jsonString: String, + expectedInexactData: Seq[String], + expectedExactData: Seq[String], + multiLine: Boolean = false): Unit = { + Seq(jsonString).toDF() + .repartition(1) + .write + .mode("overwrite") + .text(path.getAbsolutePath) + + withClue("Exact string parsing") { + withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "true") { + val df = spark.read + .schema("data STRING") + .option("multiLine", multiLine.toString) + .json(path.getAbsolutePath) + checkAnswer(df, expectedExactData.map(d => Row(d))) + } + } + + withClue("Inexact string parsing") { + withSQLConf(SQLConf.JSON_EXACT_STRING_PARSING.key -> "false") { + val df = spark.read + .schema("data STRING") + .option("multiLine", multiLine.toString) + .json(path.getAbsolutePath) + checkAnswer(df, expectedInexactData.map(d => Row(d))) + } + } + } + extractData( + """{"data": {"white": "space"}}""", + expectedInexactData = Seq("""{"white":"space"}"""), + expectedExactData = Seq("""{"white": "space"}""") + ) + extractData( + """{"data": ["white", "space"]}""", + expectedInexactData = Seq("""["white","space"]"""), + expectedExactData = Seq("""["white", "space"]""") + ) + val granularFloat = "-999.99999999999999999999999999999999995" + extractData( + s"""{"data": {"v": ${granularFloat}}}""", + expectedInexactData = Seq("""{"v":-1000.0}"""), + expectedExactData = Seq(s"""{"v": ${granularFloat}}""") + ) + extractData( + s"""{"data": {"white":\n"space"}}""", + expectedInexactData = Seq("""{"white":"space"}"""), + expectedExactData = Seq(s"""{"white":\n"space"}"""), + multiLine = true + ) + } + } } class JsonV1Suite extends JsonSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 9fb490dd823ad..02e1c70cc8cb7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -1208,7 +1208,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession test("SPARK-7837 Do not close output writer twice when commitTask() fails") { withSQLConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> - classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) { + classOf[SQLHadoopMapReduceCommitProtocol].getCanonicalName) { // Using a output committer that always fail when committing a task, so that both // `commitTask()` and `abortTask()` are invoked. 
val extraOptions = Map[String, String]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index fffc9e2b19246..baa11df302b04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -213,8 +213,8 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS // predicates because (a) in ParquetFilters, we ignore TimestampType and (b) parquet // does not read statistics from int96 fields, as they are unsigned. See // scalastyle:off line.size.limit - // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L419 - // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L348 + // https://github.com/apache/parquet-java/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L419 + // https://github.com/apache/parquet-java/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L348 // scalastyle:on line.size.limit // // Just to be defensive in case anything ever changes in parquet, this test checks diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 183c4f71df6c6..a6ad147c865d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -112,7 +112,8 @@ abstract class ParquetPartitionDiscoverySuite "hdfs://host:9000/path/a=10.5/b=hello") var exception = intercept[AssertionError] { - parsePartitions(paths.map(new Path(_)), true, Set.empty[Path], None, true, true, timeZoneId) + parsePartitions( + paths.map(new Path(_)), true, Set.empty[Path], None, true, true, timeZoneId, false) } assert(exception.getMessage().contains("Conflicting directory structures detected")) @@ -129,7 +130,8 @@ abstract class ParquetPartitionDiscoverySuite None, true, true, - timeZoneId) + timeZoneId, + false) // Valid paths = Seq( @@ -145,7 +147,8 @@ abstract class ParquetPartitionDiscoverySuite None, true, true, - timeZoneId) + timeZoneId, + false) // Valid paths = Seq( @@ -161,7 +164,8 @@ abstract class ParquetPartitionDiscoverySuite None, true, true, - timeZoneId) + timeZoneId, + false) // Invalid paths = Seq( @@ -177,7 +181,8 @@ abstract class ParquetPartitionDiscoverySuite None, true, true, - timeZoneId) + timeZoneId, + false) } assert(exception.getMessage().contains("Conflicting directory structures detected")) @@ -200,7 +205,8 @@ abstract class ParquetPartitionDiscoverySuite None, true, true, - timeZoneId) + timeZoneId, + false) } assert(exception.getMessage().contains("Conflicting directory structures detected")) } @@ -296,7 +302,8 @@ 
abstract class ParquetPartitionDiscoverySuite None, true, true, - timeZoneId) + timeZoneId, + false) assert(actualSpec.partitionColumns === spec.partitionColumns) assert(actualSpec.partitions.length === spec.partitions.length) actualSpec.partitions.zip(spec.partitions).foreach { case (actual, expected) => @@ -427,7 +434,7 @@ abstract class ParquetPartitionDiscoverySuite def check(paths: Seq[String], spec: PartitionSpec): Unit = { val actualSpec = parsePartitions(paths.map(new Path(_)), false, Set.empty[Path], None, - true, true, timeZoneId) + true, true, timeZoneId, false) assert(actualSpec === spec) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index a329d3fdc3cbc..4d413efe50430 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -369,16 +369,14 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> sqlConf) { - withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { - val exception = intercept[SparkException] { - testIgnoreCorruptFiles(options) - }.getCause - assert(exception.getMessage().contains("is not a Parquet file")) - val exception2 = intercept[SparkException] { - testIgnoreCorruptFilesWithoutSchemaInfer(options) - }.getCause - assert(exception2.getMessage().contains("is not a Parquet file")) - } + val exception = intercept[SparkException] { + testIgnoreCorruptFiles(options) + }.getCause + assert(exception.getMessage().contains("is not a Parquet file")) + val exception2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer(options) + }.getCause + assert(exception2.getMessage().contains("is not a Parquet file")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVectorizedSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVectorizedSuite.scala index 35e1a38376dd8..f2d04a9c28f2a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVectorizedSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetVectorizedSuite.scala @@ -502,7 +502,7 @@ class ParquetVectorizedSuite extends QueryTest with ParquetTest with SharedSpark val ty = parquetSchema.asGroupType().getType("a").asPrimitiveType() val cd = new ColumnDescriptor(Seq("a").toArray, ty, 0, maxDef) val repetitionLevels = Array.fill[Int](inputValues.length)(0) - val definitionLevels = inputValues.map(v => if (v == null) 0 else 1) + val definitionLevels = inputValues.map(v => if (v == null) 0 else maxDef) val memPageStore = new MemPageStore(expectedValues.length) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactorySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactorySuite.scala new file mode 100644 index 0000000000000..bd20307974416 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactorySuite.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.mockito.Mockito._ +import org.scalatest.PrivateMethodTester + +import org.apache.spark.SparkFunSuite +import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.sql.execution.datasources.WriteJobDescription +import org.apache.spark.util.SerializableConfiguration + +class FileWriterFactorySuite extends SparkFunSuite with PrivateMethodTester { + + test("SPARK-48484: V2Write uses different TaskAttemptIds for different task attempts") { + val jobDescription = mock(classOf[WriteJobDescription]) + when(jobDescription.serializableHadoopConf).thenReturn( + new SerializableConfiguration(new Configuration(false))) + val committer = mock(classOf[FileCommitProtocol]) + + val writerFactory = FileWriterFactory(jobDescription, committer) + val createTaskAttemptContext = + PrivateMethod[TaskAttemptContextImpl](Symbol("createTaskAttemptContext")) + + val attemptContext = + writerFactory.invokePrivate(createTaskAttemptContext(0, 1)) + val attemptContext1 = + writerFactory.invokePrivate(createTaskAttemptContext(0, 2)) + assert(attemptContext.getTaskAttemptID.getTaskID == attemptContext1.getTaskAttemptID.getTaskID) + assert(attemptContext.getTaskAttemptID.getId != attemptContext1.getTaskAttemptID.getId) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/DerbyTableCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/DerbyTableCatalogSuite.scala index e3714e6044955..d793ef526c47b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/DerbyTableCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/DerbyTableCatalogSuite.scala @@ -51,4 +51,12 @@ class DerbyTableCatalogSuite extends QueryTest with SharedSparkSession { checkAnswer(sql(s"SHOW TABLES IN derby.test1"), Row("test1", "TABLE2", false)) } } + + test("SPARK-48439: Calculate suitable precision and scale for DECIMAL type") { + withTable("derby.test1.table1") { + sql("CREATE TABLE derby.test1.table1 (c1 decimal(38, 18))") + sql("INSERT INTO derby.test1.table1 VALUES (1.123456789123456789)") + checkAnswer(sql("SELECT * FROM derby.test1.table1"), Row(1.12345678912)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala index f4e7921e88bc2..d2ff33e104772 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala @@ -200,7 +200,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { exception = intercept[AnalysisException] { sql(s"ALTER TABLE $tableName ADD COLUMNS (c3 DOUBLE)") }, - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", parameters = Map( "op" -> "add", "fieldNames" -> "`c3`", @@ -239,7 +239,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { exception = intercept[AnalysisException] { sql(s"ALTER TABLE $tableName RENAME COLUMN C TO C0") }, - errorClass = "FIELDS_ALREADY_EXISTS", + errorClass = "FIELD_ALREADY_EXISTS", parameters = Map( "op" -> "rename", "fieldNames" -> "`C0`", @@ -619,15 +619,15 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("CREATE TABLE with table property") { withTable("h2.test.new_table") { - checkError( + checkErrorMatchPVals( exception = intercept[AnalysisException] { sql("CREATE TABLE h2.test.new_table(i INT, j STRING)" + " TBLPROPERTIES('ENGINE'='tableEngineName')") }, - errorClass = "FAILED_JDBC.UNCLASSIFIED", + errorClass = "FAILED_JDBC.CREATE_TABLE", parameters = Map( - "url" -> "jdbc:", - "message" -> "Failed table creation: test.new_table")) + "url" -> "jdbc:.*", + "tableName" -> "`test`.`new_table`")) } } @@ -639,14 +639,14 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { } test("SPARK-42904: CREATE TABLE with char/varchar with invalid char length") { - checkError( + checkErrorMatchPVals( exception = intercept[AnalysisException]{ sql("CREATE TABLE h2.test.new_table(c CHAR(1000000001))") }, - errorClass = "FAILED_JDBC.UNCLASSIFIED", + errorClass = "FAILED_JDBC.CREATE_TABLE", parameters = Map( - "url" -> "jdbc:", - "message" -> "Failed table creation: test.new_table")) + "url" -> "jdbc:.*", + "tableName" -> "`test`.`new_table`")) } test("SPARK-42955: Skip classifyException and wrap AnalysisException for SparkThrowable") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceReadSuite.scala index c800168b507a8..e6cdd0dce9efa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSourceReadSuite.scala @@ -18,16 +18,18 @@ package org.apache.spark.sql.execution.datasources.v2.state import java.io.{File, FileWriter} +import org.apache.hadoop.conf.Configuration import org.scalatest.Assertions -import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.{SparkException, SparkUnsupportedOperationException} import org.apache.spark.io.CompressionCodec import org.apache.spark.sql.{AnalysisException, DataFrame, Encoders, Row} import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInternalRow} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil import org.apache.spark.sql.execution.streaming.{CommitLog, MemoryStream, OffsetSeqLog} -import org.apache.spark.sql.execution.streaming.state.{HDFSBackedStateStoreProvider, RocksDBStateStoreProvider, StateStore} +import org.apache.spark.sql.execution.streaming.state._ +import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf 
import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructType} @@ -194,6 +196,78 @@ class StateDataSourceNegativeTestSuite extends StateDataSourceTestBase { } } } + + test("ERROR: snapshotStartBatchId specified as a negative value") { + withTempDir { tempDir => + val exc = intercept[StateDataSourceInvalidOptionValueIsNegative] { + spark.read.format("statestore") + // trick to bypass getting the last committed batch before validating operator ID + .option(StateSourceOptions.BATCH_ID, 0) + .option(StateSourceOptions.SNAPSHOT_START_BATCH_ID, -1) + .load(tempDir.getAbsolutePath) + } + checkError(exc, "STDS_INVALID_OPTION_VALUE.IS_NEGATIVE", "42616", + Map("optionName" -> StateSourceOptions.SNAPSHOT_START_BATCH_ID)) + } + } + + test("ERROR: snapshotPartitionId specified as a negative value") { + withTempDir { tempDir => + val exc = intercept[StateDataSourceInvalidOptionValueIsNegative] { + spark.read.format("statestore") + // trick to bypass getting the last committed batch before validating operator ID + .option(StateSourceOptions.BATCH_ID, 0) + .option(StateSourceOptions.SNAPSHOT_PARTITION_ID, -1) + .load(tempDir.getAbsolutePath) + } + checkError(exc, "STDS_INVALID_OPTION_VALUE.IS_NEGATIVE", "42616", + Map("optionName" -> StateSourceOptions.SNAPSHOT_PARTITION_ID)) + } + } + + test("ERROR: snapshotStartBatchId specified without snapshotPartitionId or vice versa") { + withTempDir { tempDir => + val exc = intercept[StateDataSourceUnspecifiedRequiredOption] { + spark.read.format("statestore") + // trick to bypass getting the last committed batch before validating operator ID + .option(StateSourceOptions.BATCH_ID, 0) + .option(StateSourceOptions.SNAPSHOT_START_BATCH_ID, 0) + .load(tempDir.getAbsolutePath) + } + checkError(exc, "STDS_REQUIRED_OPTION_UNSPECIFIED", "42601", + Map("optionName" -> StateSourceOptions.SNAPSHOT_PARTITION_ID)) + } + + withTempDir { tempDir => + val exc = intercept[StateDataSourceUnspecifiedRequiredOption] { + spark.read.format("statestore") + // trick to bypass getting the last committed batch before validating operator ID + .option(StateSourceOptions.BATCH_ID, 0) + .option(StateSourceOptions.SNAPSHOT_PARTITION_ID, 0) + .load(tempDir.getAbsolutePath) + } + checkError(exc, "STDS_REQUIRED_OPTION_UNSPECIFIED", "42601", + Map("optionName" -> StateSourceOptions.SNAPSHOT_START_BATCH_ID)) + } + } + + test("ERROR: snapshotStartBatchId is greater than snapshotEndBatchId") { + withTempDir { tempDir => + val startBatchId = 1 + val endBatchId = 0 + val exc = intercept[StateDataSourceInvalidOptionValue] { + spark.read.format("statestore") + // trick to bypass getting the last committed batch before validating operator ID + .option(StateSourceOptions.SNAPSHOT_START_BATCH_ID, startBatchId) + .option(StateSourceOptions.BATCH_ID, endBatchId) + .load(tempDir.getAbsolutePath) + } + checkError(exc, "STDS_INVALID_OPTION_VALUE.WITH_MESSAGE", "42616", + Map( + "optionName" -> StateSourceOptions.SNAPSHOT_START_BATCH_ID, + "message" -> s"value should be less than or equal to $endBatchId")) + } + } } /** @@ -301,35 +375,138 @@ class StateDataSourceSQLConfigSuite extends StateDataSourceTestBase { } class HDFSBackedStateDataSourceReadSuite extends StateDataSourceReadSuite { + override protected def newStateStoreProvider(): HDFSBackedStateStoreProvider = + new HDFSBackedStateStoreProvider + override def beforeAll(): Unit = { super.beforeAll() spark.conf.set(SQLConf.STATE_STORE_PROVIDER_CLASS.key, - classOf[HDFSBackedStateStoreProvider].getName) + 
newStateStoreProvider().getClass.getName) + // make sure we have a snapshot for every two delta files + // HDFS maintenance task will not count the latest delta file, which has the same version + // as the snapshot version + spark.conf.set(SQLConf.STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT.key, 1) + } + + test("ERROR: snapshot of version not found") { + testSnapshotNotFound() + } + + test("provider.replayReadStoreFromSnapshot(snapshotVersion, endVersion)") { + testGetReadStoreWithStartVersion() + } + + test("option snapshotPartitionId") { + testSnapshotPartitionId() + } + + test("snapshotStatBatchId on limit state") { + testSnapshotOnLimitState("hdfs") + } + + test("snapshotStatBatchId on aggregation state") { + testSnapshotOnAggregateState("hdfs") + } + + test("snapshotStatBatchId on deduplication state") { + testSnapshotOnDeduplicateState("hdfs") + } + + test("snapshotStatBatchId on join state") { + testSnapshotOnJoinState("hdfs", 1) + testSnapshotOnJoinState("hdfs", 2) } } class RocksDBStateDataSourceReadSuite extends StateDataSourceReadSuite { + override protected def newStateStoreProvider(): RocksDBStateStoreProvider = + new RocksDBStateStoreProvider + override def beforeAll(): Unit = { super.beforeAll() spark.conf.set(SQLConf.STATE_STORE_PROVIDER_CLASS.key, - classOf[RocksDBStateStoreProvider].getName) + newStateStoreProvider().getClass.getName) spark.conf.set("spark.sql.streaming.stateStore.rocksdb.changelogCheckpointing.enabled", "false") } } -class RocksDBWithChangelogCheckpointStateDataSourceReaderSuite extends StateDataSourceReadSuite { +class RocksDBWithChangelogCheckpointStateDataSourceReaderSuite extends +StateDataSourceReadSuite { + override protected def newStateStoreProvider(): RocksDBStateStoreProvider = + new RocksDBStateStoreProvider + override def beforeAll(): Unit = { super.beforeAll() spark.conf.set(SQLConf.STATE_STORE_PROVIDER_CLASS.key, - classOf[RocksDBStateStoreProvider].getName) + newStateStoreProvider().getClass.getName) spark.conf.set("spark.sql.streaming.stateStore.rocksdb.changelogCheckpointing.enabled", "true") + // make sure we have a snapshot for every other checkpoint + // RocksDB maintenance task will count the latest checkpoint, so we need to set it to 2 + spark.conf.set(SQLConf.STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT.key, 2) + } + + test("ERROR: snapshot of version not found") { + testSnapshotNotFound() + } + + test("provider.getReadStore(snapshotVersion, endVersion)") { + testGetReadStoreWithStartVersion() + } + + test("option snapshotPartitionId") { + testSnapshotPartitionId() + } + + test("snapshotStatBatchId on limit state") { + testSnapshotOnLimitState("rocksdb") + } + + test("snapshotStatBatchId on aggregation state") { + testSnapshotOnAggregateState("rocksdb") + } + + test("snapshotStatBatchId on deduplication state") { + testSnapshotOnDeduplicateState("rocksdb") + } + + test("snapshotStatBatchId on join state") { + testSnapshotOnJoinState("rocksdb", 1) + testSnapshotOnJoinState("rocksdb", 2) } } abstract class StateDataSourceReadSuite extends StateDataSourceTestBase with Assertions { + import testImplicits._ + import StateStoreTestsHelper._ + + protected val keySchema: StructType = StateStoreTestsHelper.keySchema + protected val valueSchema: StructType = StateStoreTestsHelper.valueSchema + + protected def newStateStoreProvider(): StateStoreProvider + + /** + * Calls the overridable [[newStateStoreProvider]] to create the state store provider instance. + * Initialize it with the configuration set by child classes. 
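+ * The provider is initialized with the test key and value schemas, a no-prefix key state encoder, and column families disabled.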
+ * + * @param checkpointDir path to store state information + * @return instance of class extending [[StateStoreProvider]] + */ + private def getNewStateStoreProvider(checkpointDir: String): StateStoreProvider = { + val provider = newStateStoreProvider() + provider.init( + StateStoreId(checkpointDir, 0, 0), + keySchema, + valueSchema, + NoPrefixKeyStateEncoderSpec(keySchema), + useColumnFamilies = false, + StateStoreConf(spark.sessionState.conf), + new Configuration) + provider + } + test("simple aggregation, state ver 1") { testStreamingAggregation(1) } @@ -796,4 +973,228 @@ abstract class StateDataSourceReadSuite extends StateDataSourceTestBase with Ass testForSide("right") } } + + protected def testSnapshotNotFound(): Unit = { + withTempDir { tempDir => + val provider = getNewStateStoreProvider(tempDir.getAbsolutePath) + for (i <- 1 to 4) { + val store = provider.getStore(i - 1) + put(store, "a", i, i) + store.commit() + provider.doMaintenance() // create a snapshot every other delta file + } + + val exc = intercept[SparkException] { + provider.asInstanceOf[SupportsFineGrainedReplay] + .replayReadStateFromSnapshot(1, 2) + } + checkError(exc, "CANNOT_LOAD_STATE_STORE.UNCATEGORIZED") + } + } + + protected def testGetReadStoreWithStartVersion(): Unit = { + withTempDir { tempDir => + val provider = getNewStateStoreProvider(tempDir.getAbsolutePath) + for (i <- 1 to 4) { + val store = provider.getStore(i - 1) + put(store, "a", i, i) + store.commit() + provider.doMaintenance() + } + + val result = + provider.asInstanceOf[SupportsFineGrainedReplay] + .replayReadStateFromSnapshot(2, 3) + + assert(get(result, "a", 1).get == 1) + assert(get(result, "a", 2).get == 2) + assert(get(result, "a", 3).get == 3) + assert(get(result, "a", 4).isEmpty) + + provider.close() + } + } + + protected def testSnapshotPartitionId(): Unit = { + withTempDir { tempDir => + val inputData = MemoryStream[Int] + val df = inputData.toDF().limit(10) + + testStream(df)( + StartStream(checkpointLocation = tempDir.getAbsolutePath), + AddData(inputData, 1, 2, 3, 4), + CheckLastBatch(1, 2, 3, 4) + ) + + val stateDf = spark.read.format("statestore") + .option(StateSourceOptions.SNAPSHOT_START_BATCH_ID, 0) + .option(StateSourceOptions.SNAPSHOT_PARTITION_ID, 0) + .option(StateSourceOptions.BATCH_ID, 0) + .load(tempDir.getAbsolutePath) + + // should result in only one partition && should not throw error in planning stage + assert(stateDf.rdd.getNumPartitions == 1) + + // should throw error when partition id is out of range + val stateDfError = spark.read.format("statestore") + .option(StateSourceOptions.SNAPSHOT_START_BATCH_ID, 0) + .option( + StateSourceOptions.SNAPSHOT_PARTITION_ID, 1) + .option(StateSourceOptions.BATCH_ID, 0) + .load(tempDir.getAbsolutePath) + + val exc = intercept[StateStoreSnapshotPartitionNotFound] { + stateDfError.show() + } + assert(exc.getErrorClass === "CANNOT_LOAD_STATE_STORE.SNAPSHOT_PARTITION_ID_NOT_FOUND") + } + } + + private def testSnapshotStateDfAgainstStateDf(resourceDir: File): Unit = { + val stateSnapshotDf = spark.read.format("statestore") + .option("snapshotPartitionId", 0) + .option("snapshotStartBatchId", 1) + .load(resourceDir.getAbsolutePath) + + val stateDf = spark.read.format("statestore") + .load(resourceDir.getAbsolutePath) + .filter(col("partition_id") === 0) + + checkAnswer(stateSnapshotDf, stateDf) + } + + protected def testSnapshotOnLimitState(providerName: String): Unit = { + /** The golden files are generated by: + withSQLConf({ + SQLConf.STREAMING_MAINTENANCE_INTERVAL.key -> 
"100" + }) { + val inputData = MemoryStream[(Int, Long)] + val query = inputData.toDF().limit(10) + testStream(query)( + StartStream(checkpointLocation = <...>), + AddData(inputData, (1, 1L), (2, 2L), (3, 3L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (4, 4L), (5, 5L), (6, 6L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (7, 7L), (8, 8L), (9, 9L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (10, 10L), (11, 11L), (12, 12L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) } + ) + } + */ + val resourceUri = this.getClass.getResource( + s"/structured-streaming/checkpoint-version-4.0.0/$providerName/limit/" + ).toURI + + testSnapshotStateDfAgainstStateDf(new File(resourceUri)) + } + + protected def testSnapshotOnAggregateState(providerName: String): Unit = { + /** The golden files are generated by: + withSQLConf({ + SQLConf.STREAMING_MAINTENANCE_INTERVAL.key -> "100" + }) { + val inputData = MemoryStream[(Int, Long)] + val query = inputData.toDF().groupBy("_1").count() + testStream(query, OutputMode.Update)( + StartStream(checkpointLocation = <...>), + AddData(inputData, (1, 1L), (2, 2L), (3, 3L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (2, 2L), (3, 3L), (4, 4L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (3, 3L), (4, 4L), (5, 5L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (4, 4L), (5, 5L), (6, 6L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) } + ) + } + */ + val resourceUri = this.getClass.getResource( + s"/structured-streaming/checkpoint-version-4.0.0/$providerName/dedup/" + ).toURI + + testSnapshotStateDfAgainstStateDf(new File(resourceUri)) + } + + protected def testSnapshotOnDeduplicateState(providerName: String): Unit = { + /** The golden files are generated by: + withSQLConf({ + SQLConf.STREAMING_MAINTENANCE_INTERVAL.key -> "100" + }) { + val inputData = MemoryStream[(Int, Long)] + val query = inputData.toDF().dropDuplicates("_1") + testStream(query)( + StartStream(checkpointLocation = <...>), + AddData(inputData, (1, 1L), (2, 2L), (3, 3L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (2, 2L), (3, 3L), (4, 4L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (3, 3L), (4, 4L), (5, 5L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (4, 4L), (5, 5L), (6, 6L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) } + ) + } + */ + val resourceUri = this.getClass.getResource( + s"/structured-streaming/checkpoint-version-4.0.0/$providerName/dedup/" + ).toURI + + testSnapshotStateDfAgainstStateDf(new File(resourceUri)) + } + + protected def testSnapshotOnJoinState(providerName: String, stateVersion: Int): Unit = { + /** The golden files are generated by: + withSQLConf({ + SQLConf.STREAMING_JOIN_STATE_FORMAT_VERSION.key -> stateVersion.toString + SQLConf.STREAMING_MAINTENANCE_INTERVAL.key -> "100" + }) { + val inputData = MemoryStream[(Int, Long)] + val query = getStreamStreamJoinQuery(inputData) + testStream(query)( + StartStream(checkpointLocation = <...>), + AddData(inputData, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L)), + ProcessAllAvailable(), + 
Execute { _ => Thread.sleep(2000) }, + AddData(inputData, (11, 11L), (12, 12L), (13, 13L), (14, 14L), (15, 15L)), + ProcessAllAvailable(), + Execute { _ => Thread.sleep(2000) } + ) + } + */ + val resourceUri = this.getClass.getResource( + s"/structured-streaming/checkpoint-version-4.0.0/$providerName/join$stateVersion/" + ).toURI + + val resourceDir = new File(resourceUri) + + val stateSnapshotDf = spark.read.format("statestore") + .option("snapshotPartitionId", 2) + .option("snapshotStartBatchId", 1) + .option("joinSide", "left") + .load(resourceDir.getAbsolutePath) + + val stateDf = spark.read.format("statestore") + .option("joinSide", "left") + .load(resourceDir.getAbsolutePath) + .filter(col("partition_id") === 2) + + checkAnswer(stateSnapshotDf, stateDf) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala index 3c408ab8ee90e..930cc29878108 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala @@ -77,6 +77,20 @@ class XmlSuite override def excluded: Seq[String] = Seq( s"Propagate Hadoop configs from $dataSourceFormat options to underlying file system") + private val baseOptions = Map("rowTag" -> "ROW") + + private def readData( + xmlString: String, + schemaOpt: Option[StructType], + options: Map[String, String] = Map.empty): DataFrame = { + val ds = spark.createDataset(spark.sparkContext.parallelize(Seq(xmlString)))(Encoders.STRING) + if (schemaOpt.isDefined) { + spark.read.schema(schemaOpt.get).options(options).xml(ds) + } else { + spark.read.options(options).xml(ds) + } + } + // Tests test("DSL test") { @@ -252,10 +266,10 @@ class XmlSuite .xml(inputFile) .collect() } - checkError( + checkErrorMatchPVals( exception = exceptionInParsing, errorClass = "FAILED_READ_FILE.NO_HINT", - parameters = Map("path" -> Path.of(inputFile).toUri.toString)) + parameters = Map("path" -> s".*$inputFile.*")) checkError( exception = exceptionInParsing.getCause.asInstanceOf[SparkException], errorClass = "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION", @@ -284,10 +298,10 @@ class XmlSuite .xml(inputFile) .show() } - checkError( + checkErrorMatchPVals( exception = exceptionInParsing, errorClass = "FAILED_READ_FILE.NO_HINT", - parameters = Map("path" -> Path.of(inputFile).toUri.toString)) + parameters = Map("path" -> s".*$inputFile.*")) checkError( exception = exceptionInParsing.getCause.asInstanceOf[SparkException], errorClass = "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION", @@ -1206,14 +1220,16 @@ class XmlSuite } test("test XSD validation") { - val basketDF = spark.read - .option("rowTag", "basket") - .option("inferSchema", true) - .option("rowValidationXSDPath", getTestResourcePath(resDir + "basket.xsd") - .replace("file:/", "/")) - .xml(getTestResourcePath(resDir + "basket.xml")) - // Mostly checking it doesn't fail - assert(basketDF.selectExpr("entry[0].key").head().getLong(0) === 9027) + Seq("basket.xsd", "include-example/first.xsd").foreach { xsdFile => + val basketDF = spark.read + .option("rowTag", "basket") + .option("inferSchema", true) + .option("rowValidationXSDPath", getTestResourcePath(resDir + xsdFile) + .replace("file:/", "/")) + .xml(getTestResourcePath(resDir + "basket.xml")) + // Mostly checking it doesn't fail + assert(basketDF.selectExpr("entry[0].key").head().getLong(0) === 9027) + } } test("test XSD 
validation with validation error") { @@ -1279,26 +1295,6 @@ class XmlSuite assert(result.select("decoded._foo").head().getString(0) === "bar") } - /* - test("from_xml array basic test") { - val xmlData = - """12345dave guy - |67890other guy""".stripMargin - val df = Seq((8, xmlData)).toDF("number", "payload") - val xmlSchema = ArrayType( - StructType( - StructField("pid", IntegerType) :: - StructField("name", StringType) :: Nil)) - val expectedSchema = df.schema.add("decoded", xmlSchema) - val result = df.withColumn("decoded", - from_xml(df.col("payload"), xmlSchema)) - assert(expectedSchema === result.schema) - // TODO: ArrayType and MapType support in from_xml - // assert(result.selectExpr("decoded[0].pid").head().getInt(0) === 12345) - // assert(result.selectExpr("decoded[1].pid").head().getInt(1) === 67890) - } - */ - test("from_xml error test") { // XML contains error val xmlData = @@ -2445,7 +2441,6 @@ class XmlSuite val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0") for (pattern <- patterns) { withTempPath { path => - val actualPath = path.toPath.toUri.toURL.toString val err = intercept[SparkException] { exp.write.option("timestampNTZFormat", pattern) .option("rowTag", "ROW").xml(path.getAbsolutePath) @@ -2453,7 +2448,7 @@ class XmlSuite checkErrorMatchPVals( exception = err, errorClass = "TASK_WRITE_FAILED", - parameters = Map("path" -> s"$actualPath.*")) + parameters = Map("path" -> s".*${path.getName}.*")) val msg = err.getCause.getMessage assert( msg.contains("Unsupported field: OffsetSeconds") || @@ -2625,6 +2620,18 @@ class XmlSuite val expectedResults3 = Seq.range(1, 18).map(Row(_)) checkAnswer(results3, expectedResults3) + + val results4 = spark.read.format("xml") + .option("rowTag", "ROW") + .load(getTestResourcePath(resDir + "cdata-no-ignore.xml")) + + val expectedResults4 = Seq( + Row("1"), + Row("2"), + Row("3"), + Row("4"), + Row("5")) + checkAnswer(results4, expectedResults4) } test("capture values interspersed between elements - nested struct") { @@ -2960,11 +2967,10 @@ class XmlSuite .mode(SaveMode.Overwrite) .xml(path) } - val actualPath = Path.of(dir.getAbsolutePath).toUri.toURL.toString.stripSuffix("/") checkErrorMatchPVals( exception = e, errorClass = "TASK_WRITE_FAILED", - parameters = Map("path" -> s"$actualPath.*")) + parameters = Map("path" -> s".*${dir.getName}.*")) assert(e.getCause.isInstanceOf[XMLStreamException]) assert(e.getCause.getMessage.contains(errorMsg)) } @@ -3013,6 +3019,149 @@ class XmlSuite } } } + + ///////////////////////////////////// + // Projection, sorting, filtering // + ///////////////////////////////////// + test("select with string xml object") { + val xmlString = + s""" + | + | John + | 3 + | + |""".stripMargin + val schema = new StructType() + .add("name", StringType) + .add("metadata", StringType) + val df = readData(xmlString, Some(schema), baseOptions) + checkAnswer(df.select("name"), Seq(Row("John"))) + } + + test("select with duplicate field name in string xml object") { + val xmlString = + s""" + | + | c + | d + | + |""".stripMargin + val schema = new StructType() + .add("a", StringType) + .add("b", StringType) + val df = readData(xmlString, Some(schema), baseOptions) + val dfWithSchemaInference = readData(xmlString, None, baseOptions) + Seq(df, dfWithSchemaInference).foreach { df => + checkAnswer(df.select("b"), Seq(Row("d"))) + } + } + + test("select nested struct objects") { + val xmlString = + s""" + | + | + | + | 1 + | 2 + | + | + | + |""".stripMargin + val schema = new StructType() + .add( + 
"struct", + new StructType() + .add("innerStruct", new StructType().add("field1", LongType).add("field2", LongType)) + ) + val df = readData(xmlString, Some(schema), baseOptions) + val dfWithSchemaInference = readData(xmlString, None, baseOptions) + Seq(df, dfWithSchemaInference).foreach { df => + checkAnswer(df.select("struct"), Seq(Row(Row(Row(1, 2))))) + checkAnswer(df.select("struct.innerStruct"), Seq(Row(Row(1, 2)))) + } + } + + test("select a struct of lists") { + val xmlString = + s""" + | + | + | 1 + | 2 + | 3 + | + | + |""".stripMargin + val schema = new StructType() + .add( + "struct", + new StructType() + .add("array", ArrayType(StructType(StructField("field", LongType) :: Nil)))) + + val df = readData(xmlString, Some(schema), baseOptions) + val dfWithSchemaInference = readData(xmlString, None, baseOptions) + Seq(df, dfWithSchemaInference).foreach { df => + checkAnswer(df.select("struct"), Seq(Row(Row(Array(Row(1), Row(2), Row(3)))))) + checkAnswer(df.select("struct.array"), Seq(Row(Array(Row(1), Row(2), Row(3))))) + } + } + + test("select complex objects") { + val xmlString = + s""" + | + | 1 + | + | value2 + | + | 3 + | + | value4 + | + | 5 + | 1 + | value6 + | 2 + | 7 + | + | value8 + | string + | 9 + | + | value10 + | + | + | 3 + | 11 + | 4 + | + | string + | value12 + | + | 13 + | 3 + | value14 + | + | 15 + | + | + | value16 + | + | + |""".stripMargin + val df = readData(xmlString, None, baseOptions ++ Map("valueTag" -> "VALUE")) + checkAnswer(df.select("struct1.VALUE"), Seq(Row(Seq("value2", "15")))) + checkAnswer(df.select("struct1.struct2.array1"), Seq(Row(Seq( + Row(Seq("value4", "value8", "9"), "string", Row(Seq("5", "value6", "7"), Seq(1, 2))), + Row(Seq("value12"), "string", Row(Seq("11"), Seq(3, 4))) + )))) + checkAnswer(df.select("struct1.struct2.array1.struct3"), Seq(Row(Seq( + Row(Seq("5", "value6", "7"), Seq(1, 2)), + Row(Seq("11"), Seq(3, 4)) + )))) + checkAnswer(df.select("struct1.struct2.array1.string"), Seq(Row(Seq("string", "string")))) + } } // Mock file system that checks the number of open files diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlParserUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlParserUtilsSuite.scala index a4ac25b036c41..ad5b176f71f7c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlParserUtilsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlParserUtilsSuite.scala @@ -73,17 +73,34 @@ final class StaxXmlParserUtilsSuite extends SparkFunSuite with BeforeAndAfterAll val input = Sam Mad Dog Smith1 922 - val parser = factory.createXMLEventReader(new StringReader(input.toString)) - // We assume here it's reading the value within `id` field. 
- StaxXmlParserUtils.skipUntil(parser, XMLStreamConstants.CHARACTERS) - StaxXmlParserUtils.skipChildren(parser) - assert(parser.nextEvent().asEndElement().getName.getLocalPart === "info") - parser.next() - StaxXmlParserUtils.skipChildren(parser) - assert(parser.nextEvent().asEndElement().getName.getLocalPart === "abc") - parser.next() - StaxXmlParserUtils.skipChildren(parser) - assert(parser.nextEvent().asEndElement().getName.getLocalPart === "test") + val xmlOptions = new XmlOptions() + // skip the entire row + val parser1 = factory.createXMLEventReader(new StringReader(input.toString)) + StaxXmlParserUtils.skipUntil(parser1, XMLStreamConstants.START_ELEMENT) + StaxXmlParserUtils.skipChildren(parser1, "ROW", xmlOptions) + assert(parser1.peek().getEventType === XMLStreamConstants.END_DOCUMENT) + + // skip and respectively + val parser2 = factory.createXMLEventReader(new StringReader(input.toString)) + StaxXmlParserUtils.skipUntil(parser2, XMLStreamConstants.CHARACTERS) + // skip + val elementName1 = + StaxXmlParserUtils.getName(parser2.nextEvent().asStartElement().getName, xmlOptions) + StaxXmlParserUtils.skipChildren(parser2, elementName1, xmlOptions) + assert(parser2.peek().getEventType === XMLStreamConstants.START_ELEMENT) + val elementName2 = + StaxXmlParserUtils.getName(parser2.peek().asStartElement().getName, xmlOptions) + assert( + StaxXmlParserUtils + .getName(parser2.peek().asStartElement().getName, xmlOptions) == elementName2 + ) + // skip + parser2.nextEvent() + StaxXmlParserUtils.skipChildren(parser2, elementName2, xmlOptions) + assert(parser2.peek().getEventType === XMLStreamConstants.END_ELEMENT) + assert( + StaxXmlParserUtils.getName(parser2.peek().asEndElement().getName, xmlOptions) == "info" + ) } test("XML Input Factory disables DTD parsing") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonStreamingDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonStreamingDataSourceSuite.scala index 6f4bd1888fbb4..2d1449bd96cb5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonStreamingDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonStreamingDataSourceSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, Row} import org.apache.spark.sql.IntegratedUDFTestUtils.{createUserDefinedPythonDataSource, shouldTestPandasUDFs} import org.apache.spark.sql.execution.datasources.v2.python.{PythonDataSourceV2, PythonMicroBatchStream, PythonStreamingSourceOffset} -import org.apache.spark.sql.execution.streaming.{MemoryStream, ProcessingTimeTrigger} +import org.apache.spark.sql.execution.streaming.{CommitLog, MemoryStream, OffsetSeqLog, ProcessingTimeTrigger} import org.apache.spark.sql.streaming.StreamingQueryException import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -36,11 +36,11 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { val waitTimeout = 15.seconds - protected def simpleDataStreamReaderScript: String = + protected def testDataStreamReaderScript: String = """ |from pyspark.sql.datasource import DataSourceStreamReader, InputPartition | - |class SimpleDataStreamReader(DataSourceStreamReader): + |class TestDataStreamReader(DataSourceStreamReader): | current = 0 | def initialOffset(self): | return {"offset": {"partition-1": 0}} @@ -57,6 +57,43 @@ class PythonStreamingDataSourceSuite extends 
PythonDataSourceSuiteBase { | yield (partition.value,) |""".stripMargin + protected def simpleDataStreamReaderScript: String = + """ + |from pyspark.sql.datasource import SimpleDataSourceStreamReader + | + |class SimpleDataStreamReader(SimpleDataSourceStreamReader): + | def initialOffset(self): + | return {"partition-1": 0} + | def read(self, start: dict): + | start_idx = start["partition-1"] + | it = iter([(i, ) for i in range(start_idx, start_idx + 2)]) + | return (it, {"partition-1": start_idx + 2}) + | def readBetweenOffsets(self, start: dict, end: dict): + | start_idx = start["partition-1"] + | end_idx = end["partition-1"] + | return iter([(i, ) for i in range(start_idx, end_idx)]) + |""".stripMargin + + protected def simpleDataStreamReaderWithEmptyBatchScript: String = + """ + |from pyspark.sql.datasource import SimpleDataSourceStreamReader + | + |class SimpleDataStreamReader(SimpleDataSourceStreamReader): + | def initialOffset(self): + | return {"partition-1": 0} + | def read(self, start: dict): + | start_idx = start["partition-1"] + | if start_idx % 4 == 0: + | it = iter([(i, ) for i in range(start_idx, start_idx + 2)]) + | else: + | it = iter([]) + | return (it, {"partition-1": start_idx + 2}) + | def readBetweenOffsets(self, start: dict, end: dict): + | start_idx = start["partition-1"] + | end_idx = end["partition-1"] + | return iter([(i, ) for i in range(start_idx, end_idx)]) + |""".stripMargin + protected def errorDataStreamReaderScript: String = """ |from pyspark.sql.datasource import DataSourceStreamReader, InputPartition @@ -117,11 +154,11 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { val dataSourceScript = s""" |from pyspark.sql.datasource import DataSource - |$simpleDataStreamReaderScript + |$testDataStreamReaderScript | |class $dataSourceName(DataSource): | def streamReader(self, schema): - | return SimpleDataStreamReader() + | return TestDataStreamReader() |""".stripMargin val inputSchema = StructType.fromDDL("input BINARY") @@ -144,7 +181,7 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { stream.stop() } - test("Read from simple data stream source") { + test("SimpleDataSourceStreamReader run query and restart") { assume(shouldTestPandasUDFs) val dataSourceScript = s""" @@ -154,9 +191,264 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { |class $dataSourceName(DataSource): | def schema(self) -> str: | return "id INT" - | def streamReader(self, schema): + | def simpleStreamReader(self, schema): + | return SimpleDataStreamReader() + |""".stripMargin + val dataSource = createUserDefinedPythonDataSource(dataSourceName, dataSourceScript) + spark.dataSource.registerPython(dataSourceName, dataSource) + assert(spark.sessionState.dataSourceManager.dataSourceExists(dataSourceName)) + withTempDir { dir => + val path = dir.getAbsolutePath + val checkpointDir = new File(path, "checkpoint") + val df = spark.readStream.format(dataSourceName).load() + + val stopSignal1 = new CountDownLatch(1) + + val q1 = df + .writeStream + .option("checkpointLocation", checkpointDir.getAbsolutePath) + .foreachBatch((df: DataFrame, batchId: Long) => { + df.cache() + checkAnswer(df, Seq(Row(batchId * 2), Row(batchId * 2 + 1))) + if (batchId == 10) stopSignal1.countDown() + }) + .start() + stopSignal1.await() + assert(q1.recentProgress.forall(_.numInputRows == 2)) + q1.stop() + q1.awaitTermination() + + val stopSignal2 = new CountDownLatch(1) + val q2 = df + .writeStream + .option("checkpointLocation", 
checkpointDir.getAbsolutePath) + .foreachBatch((df: DataFrame, batchId: Long) => { + df.cache() + checkAnswer(df, Seq(Row(batchId * 2), Row(batchId * 2 + 1))) + if (batchId == 20) stopSignal2.countDown() + }).start() + stopSignal2.await() + assert(q2.recentProgress.forall(_.numInputRows == 2)) + q2.stop() + q2.awaitTermination() + } + } + + // Verify that the prefetch and cache pattern of SimpleDataSourceStreamReader handles empty + // data batches correctly. + test("SimpleDataSourceStreamReader read empty batch") { + assume(shouldTestPandasUDFs) + val dataSourceScript = + s""" + |from pyspark.sql.datasource import DataSource + |$simpleDataStreamReaderWithEmptyBatchScript + | + |class $dataSourceName(DataSource): + | def schema(self) -> str: + | return "id INT" + | def simpleStreamReader(self, schema): | return SimpleDataStreamReader() |""".stripMargin + val dataSource = createUserDefinedPythonDataSource(dataSourceName, dataSourceScript) + spark.dataSource.registerPython(dataSourceName, dataSource) + assert(spark.sessionState.dataSourceManager.dataSourceExists(dataSourceName)) + withTempDir { dir => + val path = dir.getAbsolutePath + val checkpointDir = new File(path, "checkpoint") + val df = spark.readStream.format(dataSourceName).load() + + val stopSignal = new CountDownLatch(1) + + val q = df + .writeStream + .option("checkpointLocation", checkpointDir.getAbsolutePath) + .foreachBatch((df: DataFrame, batchId: Long) => { + df.cache() + if (batchId % 2 == 0) { + checkAnswer(df, Seq(Row(batchId * 2), Row(batchId * 2 + 1))) + } else { + assert(df.isEmpty) + } + if (batchId == 10) stopSignal.countDown() + }) + .start() + stopSignal.await() + q.stop() + q.awaitTermination() + } + } + + test("SimpleDataSourceStreamReader read exactly once") { + assume(shouldTestPandasUDFs) + val dataSourceScript = + s""" + |from pyspark.sql.datasource import DataSource + |$simpleDataStreamReaderScript + | + |class $dataSourceName(DataSource): + | def schema(self) -> str: + | return "id INT" + | def simpleStreamReader(self, schema): + | return SimpleDataStreamReader() + |""".stripMargin + val dataSource = createUserDefinedPythonDataSource(dataSourceName, dataSourceScript) + spark.dataSource.registerPython(dataSourceName, dataSource) + assert(spark.sessionState.dataSourceManager.dataSourceExists(dataSourceName)) + withTempDir { dir => + val path = dir.getAbsolutePath + val checkpointDir = new File(path, "checkpoint") + val outputDir = new File(path, "output") + val df = spark.readStream.format(dataSourceName).load() + var lastBatchId = 0 + // Restart the streaming query multiple times to verify the exactly-once guarantee. + for (i <- 1 to 5) { + + if (i % 2 == 0) { + // Remove the last entry of the commit log to test replaying a microbatch during restart. + val offsetLog = new OffsetSeqLog( + spark, new File(checkpointDir, "offsets").getCanonicalPath) + val commitLog = new CommitLog( + spark, new File(checkpointDir, "commits").getCanonicalPath) + commitLog.purgeAfter(offsetLog.getLatest().get._1 - 1) + } + + val q = df + .writeStream + .option("checkpointLocation", checkpointDir.getAbsolutePath) + .format("json") + .start(outputDir.getAbsolutePath) + + while (q.recentProgress.length < 5) { + Thread.sleep(200) + } + q.stop() + q.awaitTermination() + lastBatchId = q.lastProgress.batchId.toInt + } + assert(lastBatchId > 20) + val rowCount = spark.read.format("json").load(outputDir.getAbsolutePath).count() + // There may be one uncommitted batch that is not recorded in query progress.
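+ // (its output is already written to the sink even though no commit entry was recorded for it)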
+ // The number of batches can be lastBatchId + 1 or lastBatchId + 2. + assert(rowCount == 2 * (lastBatchId + 1) || rowCount == 2 * (lastBatchId + 2)) + checkAnswer(spark.read.format("json").load(outputDir.getAbsolutePath), + (0 until rowCount.toInt).map(Row(_))) + } + } + + test("initialOffset() method not implemented in SimpleDataSourceStreamReader") { + assume(shouldTestPandasUDFs) + val initialOffsetNotImplementedScript = + s""" + |from pyspark.sql.datasource import DataSource + |from pyspark.sql.datasource import SimpleDataSourceStreamReader + |class ErrorDataStreamReader(SimpleDataSourceStreamReader): + | ... + | + |class $errorDataSourceName(DataSource): + | def simpleStreamReader(self, schema): + | return ErrorDataStreamReader() + |""".stripMargin + val inputSchema = StructType.fromDDL("input BINARY") + + val dataSource = + createUserDefinedPythonDataSource(errorDataSourceName, initialOffsetNotImplementedScript) + spark.dataSource.registerPython(errorDataSourceName, dataSource) + val pythonDs = new PythonDataSourceV2 + pythonDs.setShortName("ErrorDataSource") + + def testMicroBatchStreamError(action: String, msg: String) + (func: PythonMicroBatchStream => Unit): Unit = { + val stream = new PythonMicroBatchStream( + pythonDs, errorDataSourceName, inputSchema, CaseInsensitiveStringMap.empty()) + val err = intercept[SparkException] { + func(stream) + } + checkErrorMatchPVals(err, + errorClass = "PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR", + parameters = Map( + "action" -> action, + "msg" -> "(.|\\n)*" + )) + assert(err.getMessage.contains(msg)) + assert(err.getMessage.contains("ErrorDataSource")) + stream.stop() + } + + testMicroBatchStreamError( + "initialOffset", "[NOT_IMPLEMENTED] initialOffset is not implemented") { + stream => stream.initialOffset() + } + + // Users don't need to implement latestOffset for SimpleDataSourceStreamReader. + // The latestOffset method of the simple stream reader invokes initialOffset() and read(), + // so the method reported as not implemented is initialOffset().
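+ // (the action is still reported as latestOffset, the JVM-side call that failed, while the error message names the missing Python method)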
+ testMicroBatchStreamError( + "latestOffset", "[NOT_IMPLEMENTED] initialOffset is not implemented") { + stream => stream.latestOffset() + } + } + + test("read() method throws error in SimpleDataSourceStreamReader") { + assume(shouldTestPandasUDFs) + val initialOffsetNotImplementedScript = + s""" + |from pyspark.sql.datasource import DataSource + |from pyspark.sql.datasource import SimpleDataSourceStreamReader + |class ErrorDataStreamReader(SimpleDataSourceStreamReader): + | def initialOffset(self): + | return {"partition": 1} + | def read(self, start): + | raise Exception("error reading available data") + | + |class $errorDataSourceName(DataSource): + | def simpleStreamReader(self, schema): + | return ErrorDataStreamReader() + |""".stripMargin + val inputSchema = StructType.fromDDL("input BINARY") + + val dataSource = + createUserDefinedPythonDataSource(errorDataSourceName, initialOffsetNotImplementedScript) + spark.dataSource.registerPython(errorDataSourceName, dataSource) + val pythonDs = new PythonDataSourceV2 + pythonDs.setShortName("ErrorDataSource") + + def testMicroBatchStreamError(action: String, msg: String) + (func: PythonMicroBatchStream => Unit): Unit = { + val stream = new PythonMicroBatchStream( + pythonDs, errorDataSourceName, inputSchema, CaseInsensitiveStringMap.empty()) + val err = intercept[SparkException] { + func(stream) + } + checkErrorMatchPVals(err, + errorClass = "PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR", + parameters = Map( + "action" -> action, + "msg" -> "(.|\\n)*" + )) + assert(err.getMessage.contains(msg)) + assert(err.getMessage.contains("ErrorDataSource")) + stream.stop() + } + + testMicroBatchStreamError( + "latestOffset", "Exception: error reading available data") { + stream => stream.latestOffset() + } + } + + test("Read from test data stream source") { + assume(shouldTestPandasUDFs) + val dataSourceScript = + s""" + |from pyspark.sql.datasource import DataSource + |$testDataStreamReaderScript + | + |class $dataSourceName(DataSource): + | def schema(self) -> str: + | return "id INT" + | def streamReader(self, schema): + | return TestDataStreamReader() + |""".stripMargin val dataSource = createUserDefinedPythonDataSource(dataSourceName, dataSourceScript) spark.dataSource.registerPython(dataSourceName, dataSource) @@ -178,6 +470,42 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { q.awaitTermination() } + // Verify that the socket between the Python runner and the JVM doesn't time out with a large trigger interval. + test("Read from test data stream source, trigger interval=20 seconds") { + assume(shouldTestPandasUDFs) + val dataSourceScript = + s""" + |from pyspark.sql.datasource import DataSource + |$testDataStreamReaderScript + | + |class $dataSourceName(DataSource): + | def schema(self) -> str: + | return "id INT" + | def streamReader(self, schema): + | return TestDataStreamReader() + |""".stripMargin + + val dataSource = createUserDefinedPythonDataSource(dataSourceName, dataSourceScript) + spark.dataSource.registerPython(dataSourceName, dataSource) + assert(spark.sessionState.dataSourceManager.dataSourceExists(dataSourceName)) + val df = spark.readStream.format(dataSourceName).load() + + val stopSignal = new CountDownLatch(1) + + val q = df.writeStream.foreachBatch((df: DataFrame, batchId: Long) => { + // checkAnswer may materialize the dataframe more than once + // Cache here to make sure the numInputRows metric is consistent.
+ df.cache() + checkAnswer(df, Seq(Row(batchId * 2), Row(batchId * 2 + 1))) + if (batchId >= 2) stopSignal.countDown() + }).trigger(ProcessingTimeTrigger(20 * 1000)).start() + stopSignal.await() + assert(q.recentProgress.forall(_.numInputRows == 2)) + q.stop() + q.awaitTermination() + assert(q.exception.isEmpty) + } + test("Streaming data source read with custom partitions") { assume(shouldTestPandasUDFs) val dataSourceScript = @@ -188,7 +516,7 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { | self.start = start | self.end = end | - |class SimpleDataStreamReader(DataSourceStreamReader): + |class TestDataStreamReader(DataSourceStreamReader): | current = 0 | def initialOffset(self): | return {"offset": 0} @@ -210,7 +538,7 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { | return "id INT" | | def streamReader(self, schema): - | return SimpleDataStreamReader() + | return TestDataStreamReader() |""".stripMargin val dataSource = createUserDefinedPythonDataSource(dataSourceName, dataSourceScript) spark.dataSource.registerPython(dataSourceName, dataSource) @@ -303,7 +631,6 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { assert(err.getMessage.contains("error reading data")) } - test("Method not implemented in stream reader") { assume(shouldTestPandasUDFs) val dataSourceScript = @@ -476,6 +803,46 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase { } } + // Verify that the commit runner works correctly with a large trigger interval. + test(s"data source stream write, trigger interval=20 seconds") { + assume(shouldTestPandasUDFs) + val dataSource = + createUserDefinedPythonDataSource(dataSourceName, simpleDataStreamWriterScript) + spark.dataSource.registerPython(dataSourceName, dataSource) + val inputData = MemoryStream[Int](numPartitions = 3) + val df = inputData.toDF() + withTempDir { dir => + val path = dir.getAbsolutePath + val checkpointDir = new File(path, "checkpoint") + checkpointDir.mkdir() + val outputDir = new File(path, "output") + outputDir.mkdir() + val q = df + .writeStream + .format(dataSourceName) + .option("checkpointLocation", checkpointDir.getAbsolutePath) + .trigger(ProcessingTimeTrigger(20 * 1000)) + .start(outputDir.getAbsolutePath) + def resultDf: DataFrame = spark.read.format("json") + .load(outputDir.getAbsolutePath) + + inputData.addData(1 to 3) + eventually(timeout(waitTimeout * 5)) { + assert(q.lastProgress.batchId >= 1) + } + checkAnswer(resultDf, (1 to 3).map(Row(_))) + + inputData.addData(4 to 6) + eventually(timeout(waitTimeout * 5)) { + assert(q.lastProgress.batchId >= 2) + } + checkAnswer(resultDf, (1 to 6).map(Row(_))) + q.stop() + q.awaitTermination() + assert(q.exception.isEmpty) + } + } + test("streaming sink write commit and abort") { assume(shouldTestPandasUDFs) // The data source write the number of rows and partitions into batchId.json in diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala index 3101281251b1b..2e56ad0ab4160 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.execution.python -import org.apache.spark.sql.{IntegratedUDFTestUtils, QueryTest} -import org.apache.spark.sql.functions.count +import org.apache.spark.sql.{AnalysisException,
IntegratedUDFTestUtils, QueryTest} +import org.apache.spark.sql.functions.{array, count, transform} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.LongType @@ -112,4 +112,16 @@ class PythonUDFSuite extends QueryTest with SharedSparkSession { val pandasTestUDF = TestGroupedAggPandasUDF(name = udfName) assert(df.agg(pandasTestUDF(df("id"))).schema.fieldNames.exists(_.startsWith(udfName))) } + + test("SPARK-48706: Negative test case for Python UDF in higher order functions") { + assume(shouldTestPythonUDFs) + checkError( + exception = intercept[AnalysisException] { + spark.range(1).select(transform(array("id"), x => pythonTestUDF(x))).collect() + }, + errorClass = "UNSUPPORTED_FEATURE.LAMBDA_FUNCTION_WITH_PYTHON_UDF", + parameters = Map("funcName" -> "\"pyUDF(namedlambdavariable())\""), + context = ExpectedContext( + "transform", s".*${this.getClass.getSimpleName}.*")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala index 989597ae041db..1eaf1d24056da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.python import org.apache.spark.api.python.PythonEvalType import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.{Add, Alias, Expression, FunctionTableSubqueryArgumentExpression, Literal} +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, OneRowRelation, Project, Repartition, RepartitionByExpression, Sort, SubqueryAlias} import org.apache.spark.sql.functions.lit import org.apache.spark.sql.internal.SQLConf @@ -363,4 +364,29 @@ class PythonUDTFSuite extends QueryTest with SharedSparkSession { Row("abc")) } } + + test("SPARK-48180: Analyzer bug with multiple ORDER BY items for input table argument") { + assume(shouldTestPythonUDFs) + spark.udtf.registerPython("testUDTF", pythonUDTF) + checkError( + exception = intercept[ParseException](sql( + """ + |SELECT * FROM testUDTF( + | TABLE(SELECT 1 AS device_id, 2 AS data_ds) + | WITH SINGLE PARTITION + | ORDER BY device_id, data_ds) + |""".stripMargin)), + errorClass = "_LEGACY_ERROR_TEMP_0064", + parameters = Map("msg" -> + ("The table function call includes a table argument with an invalid " + + "partitioning/ordering specification: the ORDER BY clause included multiple " + + "expressions without parentheses surrounding them; please add parentheses around these " + + "expressions and then retry the query again")), + context = ExpectedContext( + fragment = "TABLE(SELECT 1 AS device_id, 2 AS data_ds)\n " + + "WITH SINGLE PARTITION\n " + + "ORDER BY device_id, data_ds", + start = 27, + stop = 122)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/MapStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/MapStateSuite.scala index 572fc2429273b..5b304c55dd5a7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/MapStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/MapStateSuite.scala @@ -17,12 +17,14 @@ package org.apache.spark.sql.execution.streaming.state +import 
java.time.Duration import java.util.UUID +import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.execution.streaming.{ImplicitGroupingKeyTracker, StatefulProcessorHandleImpl} -import org.apache.spark.sql.streaming.{ListState, MapState, TimeMode, ValueState} +import org.apache.spark.sql.execution.streaming.{ImplicitGroupingKeyTracker, MapStateImplWithTTL, StatefulProcessorHandleImpl} +import org.apache.spark.sql.streaming.{ListState, MapState, TimeMode, TTLConfig, ValueState} import org.apache.spark.sql.types.{BinaryType, StructType} /** @@ -167,4 +169,90 @@ class MapStateSuite extends StateVariableSuiteBase { assert(mapTestState2.iterator().toList === List(("k2", 4))) } } + + test("test Map state TTL") { + tryWithProviderResource(newStoreProviderWithStateVariable(true)) { provider => + val store = provider.getStore(0) + val timestampMs = 10 + val handle = new StatefulProcessorHandleImpl(store, UUID.randomUUID(), + Encoders.STRING.asInstanceOf[ExpressionEncoder[Any]], TimeMode.ProcessingTime(), + batchTimestampMs = Some(timestampMs)) + + val ttlConfig = TTLConfig(ttlDuration = Duration.ofMinutes(1)) + val testState: MapStateImplWithTTL[String, String] = + handle.getMapState[String, String]("testState", Encoders.STRING, + Encoders.STRING, ttlConfig).asInstanceOf[MapStateImplWithTTL[String, String]] + ImplicitGroupingKeyTracker.setImplicitKey("test_key") + testState.updateValue("k1", "v1") + assert(testState.getValue("k1") === "v1") + assert(testState.getWithoutEnforcingTTL("k1").get === "v1") + + val ttlExpirationMs = timestampMs + 60000 + var ttlValue = testState.getTTLValue("k1") + assert(ttlValue.isDefined) + assert(ttlValue.get._2 === ttlExpirationMs) + var ttlStateValueIterator = testState.getKeyValuesInTTLState().map(_._2) + assert(ttlStateValueIterator.hasNext) + + // increment batchProcessingTime, or watermark and ensure expired value is not returned + val nextBatchHandle = new StatefulProcessorHandleImpl(store, UUID.randomUUID(), + Encoders.STRING.asInstanceOf[ExpressionEncoder[Any]], + TimeMode.ProcessingTime(), batchTimestampMs = Some(ttlExpirationMs)) + + val nextBatchTestState: MapStateImplWithTTL[String, String] = + nextBatchHandle.getMapState[String, String]( + "testState", Encoders.STRING, Encoders.STRING, ttlConfig) + .asInstanceOf[MapStateImplWithTTL[String, String]] + + ImplicitGroupingKeyTracker.setImplicitKey("test_key") + + // ensure get does not return the expired value + assert(!nextBatchTestState.exists()) + assert(nextBatchTestState.getValue("k1") === null) + + // ttl value should still exist in state + ttlValue = nextBatchTestState.getTTLValue("k1") + assert(ttlValue.isDefined) + assert(ttlValue.get._2 === ttlExpirationMs) + ttlStateValueIterator = nextBatchTestState.getKeyValuesInTTLState().map(_._2) + assert(ttlStateValueIterator.hasNext) + assert(ttlStateValueIterator.next() === ttlExpirationMs) + assert(ttlStateValueIterator.isEmpty) + + // getWithoutTTL should still return the expired value + assert(nextBatchTestState.getWithoutEnforcingTTL("k1").get === "v1") + + nextBatchTestState.clear() + assert(!nextBatchTestState.exists()) + assert(nextBatchTestState.getValue("k1") === null) + } + } + + test("test negative or zero TTL duration throws error") { + tryWithProviderResource(newStoreProviderWithStateVariable(true)) { provider => + val store = provider.getStore(0) + val batchTimestampMs = 10 + val handle = new 
StatefulProcessorHandleImpl(store, UUID.randomUUID(), + Encoders.STRING.asInstanceOf[ExpressionEncoder[Any]], + TimeMode.ProcessingTime(), batchTimestampMs = Some(batchTimestampMs)) + + Seq(null, Duration.ZERO, Duration.ofMinutes(-1)).foreach { ttlDuration => + val ttlConfig = TTLConfig(ttlDuration) + val ex = intercept[SparkUnsupportedOperationException] { + handle.getMapState[String, String]( + "testState", Encoders.STRING, Encoders.STRING, ttlConfig) + } + + checkError( + ex, + errorClass = "STATEFUL_PROCESSOR_TTL_DURATION_MUST_BE_POSITIVE", + parameters = Map( + "operationType" -> "update", + "stateName" -> "testState" + ), + matchPVals = true + ) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala index ab2afa1b8a617..ea54fb997ca2e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/RocksDBSuite.scala @@ -19,8 +19,11 @@ package org.apache.spark.sql.execution.streaming.state import java.io._ import java.nio.charset.Charset +import java.util.concurrent.Executors import scala.collection.mutable +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration._ import scala.language.implicitConversions import org.apache.commons.io.FileUtils @@ -874,6 +877,41 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared ) } + testWithChangelogCheckpointingEnabled("RocksDBFileManager: " + + "background snapshot upload doesn't acquire RocksDB instance lock") { + // Create a custom ExecutionContext + implicit val ec: ExecutionContext = ExecutionContext + .fromExecutor(Executors.newSingleThreadExecutor()) + + val remoteDir = Utils.createTempDir().toString + val conf = dbConf.copy(lockAcquireTimeoutMs = 10000, minDeltasForSnapshot = 0) + new File(remoteDir).delete() // to make sure that the directory gets created + + withDB(remoteDir, conf = conf) { db => + db.load(0) + db.put("0", "0") + db.commit() + + // Acquire lock + db.load(1) + db.put("1", "1") + + // Run doMaintenance in another thread + val maintenanceFuture = Future { + db.doMaintenance() + } + + val timeout = 5.seconds + + // Ensure that maintenance task runs without being blocked by task thread + ThreadUtils.awaitResult(maintenanceFuture, timeout) + assert(snapshotVersionsPresent(remoteDir) == Seq(1)) + + // Release lock + db.commit() + } + } + testWithChangelogCheckpointingEnabled("RocksDBFileManager: read and write changelog") { val dfsRootDir = new File(Utils.createTempDir().getAbsolutePath + "/state/1/1") val fileManager = new RocksDBFileManager( @@ -1699,6 +1737,11 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared db.load(0) db.put("a", "1") db.commit() + if (boundedMemoryUsage == "true") { + assert(db.metricsOpt.get.totalMemUsageBytes === 0) + } else { + assert(db.metricsOpt.get.totalMemUsageBytes > 0) + } db.getWriteBufferManagerAndCache() } @@ -1709,6 +1752,11 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared db.load(0) db.put("a", "1") db.commit() + if (boundedMemoryUsage == "true") { + assert(db.metricsOpt.get.totalMemUsageBytes === 0) + } else { + assert(db.metricsOpt.get.totalMemUsageBytes > 0) + } db.getWriteBufferManagerAndCache() } @@ -1758,6 +1806,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with 
Shared db.remove("a") db.put("c", "3") db.commit() + assert(db.metricsOpt.get.totalMemUsageBytes === 0) } } finally { RocksDBMemoryManager.resetWriteBufferManagerAndCache @@ -1930,7 +1979,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } testWithChangelogCheckpointingEnabled("time travel 4 -" + - " validate successful RocksDB load") { + " validate successful RocksDB load when metadata file is overwritten") { val remoteDir = Utils.createTempDir().toString val conf = dbConf.copy(minDeltasForSnapshot = 2, compactOnCommit = false) new File(remoteDir).delete() // to make sure that the directory gets created @@ -1945,8 +1994,7 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared db.load(1) db.put("3", "3") - // do maintenance - upload any latest snapshots so far - // would fail to acquire lock and no snapshots would be uploaded + // upload any latest snapshots so far db.doMaintenance() db.commit() // upload newly created snapshot 2.zip @@ -1958,6 +2006,47 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared } } + testWithChangelogCheckpointingEnabled("time travel 5 -" + + "validate successful RocksDB load when metadata file is not overwritten") { + // Ensure commit doesn't modify the latestSnapshot that doMaintenance will upload + val fmClass = "org.apache.spark.sql.execution.streaming.state." + + "NoOverwriteFileSystemBasedCheckpointFileManager" + withTempDir { dir => + val conf = dbConf.copy(minDeltasForSnapshot = 0) // create snapshot every commit + val hadoopConf = new Configuration() + hadoopConf.set(STREAMING_CHECKPOINT_FILE_MANAGER_CLASS.parent.key, fmClass) + + val remoteDir = dir.getCanonicalPath + withDB(remoteDir, conf = conf, hadoopConf = hadoopConf) { db => + db.load(0) + db.put("a", "1") + db.commit() + + // load previous version, and recreate the snapshot + db.load(0) + db.put("a", "1") + + // upload version 1 snapshot created above + db.doMaintenance() + assert(snapshotVersionsPresent(remoteDir) == Seq(1)) + + db.commit() // create snapshot again + + // load version 1 - should succeed + withDB(remoteDir, version = 1, conf = conf, hadoopConf = hadoopConf) { db => + } + + // upload recently created snapshot + db.doMaintenance() + assert(snapshotVersionsPresent(remoteDir) == Seq(1)) + + // load version 1 again - should succeed + withDB(remoteDir, version = 1, conf = conf, hadoopConf = hadoopConf) { db => + } + } + } + } + test("validate Rocks DB SST files do not have a VersionIdMismatch" + " when metadata file is not overwritten - scenario 1") { val fmClass = "org.apache.spark.sql.execution.streaming.state." 
+ @@ -2257,7 +2346,11 @@ class RocksDBSuite extends AlsoTestWithChangelogCheckpointingEnabled with Shared numKeys: Int): Unit = { val checkpointDir = Utils.createTempDir().getAbsolutePath // local dir to create checkpoints generateFiles(checkpointDir, fileToLengths) - fileManager.saveCheckpointToDfs(checkpointDir, version, numKeys) + fileManager.saveCheckpointToDfs( + checkpointDir, + version, + numKeys, + fileManager.captureFileMapReference()) } def loadAndVerifyCheckpointFiles( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala index a089a05469f75..feab7a5fa3b0a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala @@ -19,10 +19,12 @@ package org.apache.spark.sql.execution.streaming.state import java.util.UUID -import scala.util.Random +import scala.util.{Random, Try} import org.apache.hadoop.conf.Configuration +import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.sql.execution.streaming.StatefulOperatorStateInfo import org.apache.spark.sql.execution.streaming.state.StateStoreTestsHelper.newDir import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -65,12 +67,12 @@ class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { private val keySchemaWithCollation = new StructType() .add(StructField("key1", IntegerType, nullable = true)) - .add(StructField("key2", StringType("UTF8_BINARY_LCASE"), nullable = true)) + .add(StructField("key2", StringType("UTF8_LCASE"), nullable = true)) .add(StructField("key3", structSchema, nullable = true)) private val valueSchemaWithCollation = new StructType() .add(StructField("value1", IntegerType, nullable = true)) - .add(StructField("value2", StringType("UTF8_BINARY_LCASE"), nullable = true)) + .add(StructField("value2", StringType("UTF8_LCASE"), nullable = true)) .add(StructField("value3", structSchema, nullable = true)) // Checks on adding/removing (nested) field. 
@@ -253,9 +255,9 @@ class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { test("SPARK-47776: checking for compatibility with collation change in key") { verifyException(keySchema, valueSchema, keySchemaWithCollation, valueSchema, - ignoreValueSchema = false) + ignoreValueSchema = false, keyCollationChecks = true) verifyException(keySchemaWithCollation, valueSchema, keySchema, valueSchema, - ignoreValueSchema = false) + ignoreValueSchema = false, keyCollationChecks = true) } test("SPARK-47776: checking for compatibility with collation change in value") { @@ -287,47 +289,47 @@ class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { StructType(newFields) } - private def runSchemaChecker( - dir: String, - queryId: UUID, - newKeySchema: StructType, - newValueSchema: StructType, - ignoreValueSchema: Boolean): Unit = { - // in fact, Spark doesn't support online state schema change, so need to check - // schema only once for each running of JVM - val providerId = StateStoreProviderId( - StateStoreId(dir, opId, partitionId), queryId) - - new StateSchemaCompatibilityChecker(providerId, hadoopConf) - .check(newKeySchema, newValueSchema, ignoreValueSchema = ignoreValueSchema) - } - private def verifyException( oldKeySchema: StructType, oldValueSchema: StructType, newKeySchema: StructType, newValueSchema: StructType, - ignoreValueSchema: Boolean = false): Unit = { + ignoreValueSchema: Boolean = false, + keyCollationChecks: Boolean = false): Unit = { val dir = newDir() - val queryId = UUID.randomUUID() - runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema, - ignoreValueSchema = ignoreValueSchema) - - val e = intercept[StateSchemaNotCompatible] { - runSchemaChecker(dir, queryId, newKeySchema, newValueSchema, - ignoreValueSchema = ignoreValueSchema) + val runId = UUID.randomUUID() + val stateInfo = StatefulOperatorStateInfo(dir, runId, opId, 0, 200) + val formatValidationForValue = !ignoreValueSchema + val extraOptions = Map(StateStoreConf.FORMAT_VALIDATION_CHECK_VALUE_CONFIG + -> formatValidationForValue.toString) + + val result = Try( + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(stateInfo, hadoopConf, + oldKeySchema, oldValueSchema, spark.sessionState, extraOptions) + ).toEither.fold(Some(_), _ => None) + + val ex = if (result.isDefined) { + result.get.asInstanceOf[SparkUnsupportedOperationException] + } else { + intercept[SparkUnsupportedOperationException] { + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(stateInfo, hadoopConf, + newKeySchema, newValueSchema, spark.sessionState, extraOptions) + } } - assert(e.getMessage.contains("Provided schema doesn't match to the schema for existing state!")) - assert(e.getMessage.contains(newKeySchema.toString())) - assert(e.getMessage.contains(oldKeySchema.toString())) - - if (ignoreValueSchema) { - assert(!e.getMessage.contains(newValueSchema.toString())) - assert(!e.getMessage.contains(oldValueSchema.toString())) + // collation checks are also performed in this path. so we need to check for them explicitly. 
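+ // (a collated key column trips the binary-inequality check, so that is the error asserted here rather than a schema mismatch)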
+ if (keyCollationChecks) { + assert(ex.getMessage.contains("Binary inequality column is not supported")) + assert(ex.getErrorClass === "STATE_STORE_UNSUPPORTED_OPERATION_BINARY_INEQUALITY") } else { - assert(e.getMessage.contains(newValueSchema.toString())) - assert(e.getMessage.contains(oldValueSchema.toString())) + if (ignoreValueSchema) { + // if value schema is ignored, the mismatch has to be on the key schema + assert(ex.getErrorClass === "STATE_STORE_KEY_SCHEMA_NOT_COMPATIBLE") + } else { + assert(ex.getErrorClass === "STATE_STORE_KEY_SCHEMA_NOT_COMPATIBLE" || + ex.getErrorClass === "STATE_STORE_VALUE_SCHEMA_NOT_COMPATIBLE") + } + assert(ex.getMessage.contains("does not match existing")) } } @@ -338,10 +340,16 @@ class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { newValueSchema: StructType, ignoreValueSchema: Boolean = false): Unit = { val dir = newDir() - val queryId = UUID.randomUUID() - runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema, - ignoreValueSchema = ignoreValueSchema) - runSchemaChecker(dir, queryId, newKeySchema, newValueSchema, - ignoreValueSchema = ignoreValueSchema) + val runId = UUID.randomUUID() + val stateInfo = StatefulOperatorStateInfo(dir, runId, opId, 0, 200) + val formatValidationForValue = !ignoreValueSchema + val extraOptions = Map(StateStoreConf.FORMAT_VALIDATION_CHECK_VALUE_CONFIG + -> formatValidationForValue.toString) + + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(stateInfo, hadoopConf, + oldKeySchema, oldValueSchema, spark.sessionState, extraOptions) + + StateSchemaCompatibilityChecker.validateAndMaybeEvolveStateSchema(stateInfo, hadoopConf, + newKeySchema, newValueSchema, spark.sessionState, extraOptions) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index 4523a14ca1ccd..2c4111ec026ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -30,6 +30,8 @@ import scala.util.Random import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ @@ -41,6 +43,7 @@ import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProj import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorSuite.withCoordinatorRef +import org.apache.spark.sql.execution.streaming.state.StateStoreValueRowFormatValidationFailure import org.apache.spark.sql.functions.count import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -388,6 +391,44 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] } } + test("SPARK-48105: state store unload/close happens during the maintenance") { + tryWithProviderResource( + newStoreProvider(opId = Random.nextInt(), partition = 0, minDeltasForSnapshot = 1)) { + provider => + val store = provider.getStore(0).asInstanceOf[provider.HDFSBackedStateStore] + val values = (1 to 20) + val keys = values.map(i => ("a" + i)) + keys.zip(values).map{case (k, 
v) => put(store, k, 0, v)} + // commit state store with 20 keys. + store.commit() + // get the state store iterator: mimic the case in which the iterator is held by the + // maintenance thread. + val storeIterator = store.iterator() + + // the store iterator should still be valid as the maintenance thread may already + // hold it and be doing snapshotting even though the state store is unloaded. + val outputKeys = new mutable.ArrayBuffer[String] + val outputValues = new mutable.ArrayBuffer[Int] + var cnt = 0 + while (storeIterator.hasNext) { + if (cnt == 10) { + // Mimic the case where the provider is loaded in another executor in the middle of + // iteration. When this happens, the provider will be unloaded and closed in + // the current executor. + provider.close() + } + val unsafeRowPair = storeIterator.next() + val (key, _) = keyRowToData(unsafeRowPair.key) + outputKeys.append(key) + outputValues.append(valueRowToData(unsafeRowPair.value)) + + cnt = cnt + 1 + } + assert(keys.sorted === outputKeys.sorted) + assert(values.sorted === outputValues.sorted) + } + } + test("maintenance") { val conf = new SparkConf() .setMaster("local") @@ -1568,12 +1609,12 @@ abstract class StateStoreSuiteBase[ProviderClass <: StateStoreProvider] // By default, when there is an invalid pair of value row and value schema, it should throw val keyRow = dataToKeyRow("key", 1) val valueRow = dataToValueRow(2) - val e = intercept[InvalidUnsafeRowException] { + val e = intercept[StateStoreValueRowFormatValidationFailure] { // Here valueRow doesn't match with prefixKeySchema StateStoreProvider.validateStateRowFormat( keyRow, keySchema, valueRow, keySchema, getDefaultStoreConf()) } - assert(e.getMessage.contains("The streaming query failed by state format invalidation")) + assert(e.getMessage.contains("The streaming query failed to validate written state")) // When sqlConf.stateStoreFormatValidationEnabled is set to false and // StateStoreConf.FORMAT_VALIDATION_CHECK_VALUE_CONFIG is set to true, @@ -1588,6 +1629,30 @@ abstract class StateStoreSuiteBase[ProviderClass <: StateStoreProvider] keyRow, keySchema, valueRow, keySchema, storeConf) } + test("test serialization and deserialization of NoPrefixKeyStateEncoderSpec") { + implicit val formats: DefaultFormats.type = DefaultFormats + val encoderSpec = NoPrefixKeyStateEncoderSpec(keySchema) + val jsonMap = JsonMethods.parse(encoderSpec.json).extract[Map[String, Any]] + val deserializedEncoderSpec = KeyStateEncoderSpec.fromJson(keySchema, jsonMap) + assert(encoderSpec == deserializedEncoderSpec) + } + + test("test serialization and deserialization of PrefixKeyScanStateEncoderSpec") { + implicit val formats: DefaultFormats.type = DefaultFormats + val encoderSpec = PrefixKeyScanStateEncoderSpec(keySchema, 1) + val jsonMap = JsonMethods.parse(encoderSpec.json).extract[Map[String, Any]] + val deserializedEncoderSpec = KeyStateEncoderSpec.fromJson(keySchema, jsonMap) + assert(encoderSpec == deserializedEncoderSpec) + } + + test("test serialization and deserialization of RangeKeyScanStateEncoderSpec") { + implicit val formats: DefaultFormats.type = DefaultFormats + val encoderSpec = RangeKeyScanStateEncoderSpec(keySchema, Seq(1)) + val jsonMap = JsonMethods.parse(encoderSpec.json).extract[Map[String, Any]] + val deserializedEncoderSpec = KeyStateEncoderSpec.fromJson(keySchema, jsonMap) + assert(encoderSpec == deserializedEncoderSpec) + } + /** Return a new provider with a random id */ def newStoreProvider(): ProviderClass diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StatefulProcessorHandleSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StatefulProcessorHandleSuite.scala index aafbf4df60af7..52bdb0213c7e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StatefulProcessorHandleSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StatefulProcessorHandleSuite.scala @@ -255,6 +255,24 @@ class StatefulProcessorHandleSuite extends StateVariableSuiteBase { } } + test("ttl States are populated for mapState and timeMode=ProcessingTime") { + tryWithProviderResource(newStoreProviderWithStateVariable(true)) { provider => + val store = provider.getStore(0) + val handle = new StatefulProcessorHandleImpl(store, + UUID.randomUUID(), keyExprEncoder, TimeMode.ProcessingTime(), + batchTimestampMs = Some(10)) + + val mapStateWithTTL = handle.getMapState("testState", + Encoders.STRING, Encoders.STRING, TTLConfig(Duration.ofHours(1))) + + // create another state without TTL, this should not be captured in the handle + handle.getMapState("testState", Encoders.STRING, Encoders.STRING) + + assert(handle.ttlStates.size() === 1) + assert(handle.ttlStates.get(0) === mapStateWithTTL) + } + } + test("ttl States are not populated for timeMode=None") { tryWithProviderResource(newStoreProviderWithStateVariable(true)) { provider => val store = provider.getStore(0) @@ -263,6 +281,7 @@ class StatefulProcessorHandleSuite extends StateVariableSuiteBase { handle.getValueState("testValueState", Encoders.STRING) handle.getListState("testListState", Encoders.STRING) + handle.getMapState("testMapState", Encoders.STRING, Encoders.STRING) assert(handle.ttlStates.isEmpty) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 7cce6086c6fd8..aca968745d198 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -476,6 +476,180 @@ class ColumnVectorSuite extends SparkFunSuite with SQLHelper { assert(testVector.getDoubles(0, 3)(2) == 1342.17729d) } + def check(expected: Seq[Any], testVector: WritableColumnVector): Unit = { + expected.zipWithIndex.foreach { + case (v: Integer, idx) => + assert(testVector.getInt(idx) == v) + assert(testVector.getInts(0, testVector.capacity)(idx) == v) + case (v: Short, idx) => + assert(testVector.getShort(idx) == v) + assert(testVector.getShorts(0, testVector.capacity)(idx) == v) + case (v: Byte, idx) => + assert(testVector.getByte(idx) == v) + assert(testVector.getBytes(0, testVector.capacity)(idx) == v) + case (v: Long, idx) => + assert(testVector.getLong(idx) == v) + assert(testVector.getLongs(0, testVector.capacity)(idx) == v) + case (v: Float, idx) => + assert(testVector.getFloat(idx) == v) + assert(testVector.getFloats(0, testVector.capacity)(idx) == v) + case (v: Double, idx) => + assert(testVector.getDouble(idx) == v) + assert(testVector.getDoubles(0, testVector.capacity)(idx) == v) + case (null, idx) => testVector.isNullAt(idx) + case (_, idx) => assert(false, s"Unexpected value at $idx") + } + + // Verify ColumnarArray.copy() works as expected + val arr = new ColumnarArray(testVector, 0, testVector.capacity) + assert(arr.toSeq(testVector.dataType) == expected) + 
assert(arr.copy().toSeq(testVector.dataType) == expected) + } + + testVectors("getInts with dictionary and nulls", 3, IntegerType) { testVector => + // Validate without dictionary + val expected = Seq(1, null, 3) + expected.foreach { + case i: Integer => testVector.appendInt(i) + case _ => testVector.appendNull() + } + check(expected, testVector) + + // Validate with dictionary + val expectedDictionary = Seq(7, null, 9) + val dictArray = (Seq(-1, -1) ++ expectedDictionary.map { + case i: Integer => i.toInt + case _ => -1 + }).toArray + val dict = new ColumnDictionary(dictArray) + testVector.setDictionary(dict) + testVector.reserveDictionaryIds(3) + testVector.getDictionaryIds.putInt(0, 2) + testVector.getDictionaryIds.putInt(1, -1) // This is a null, so the entry should be ignored + testVector.getDictionaryIds.putInt(2, 4) + check(expectedDictionary, testVector) + } + + testVectors("getShorts with dictionary and nulls", 3, ShortType) { testVector => + // Validate without dictionary + val expected = Seq(1.toShort, null, 3.toShort) + expected.foreach { + case i: Short => testVector.appendShort(i) + case _ => testVector.appendNull() + } + check(expected, testVector) + + // Validate with dictionary + val expectedDictionary = Seq(7.toShort, null, 9.toShort) + val dictArray = (Seq(-1, -1) ++ expectedDictionary.map { + case i: Short => i.toInt + case _ => -1 + }).toArray + val dict = new ColumnDictionary(dictArray) + testVector.setDictionary(dict) + testVector.reserveDictionaryIds(3) + testVector.getDictionaryIds.putInt(0, 2) + testVector.getDictionaryIds.putInt(1, -1) // This is a null, so the entry should be ignored + testVector.getDictionaryIds.putInt(2, 4) + check(expectedDictionary, testVector) + } + + testVectors("getBytes with dictionary and nulls", 3, ByteType) { testVector => + // Validate without dictionary + val expected = Seq(1.toByte, null, 3.toByte) + expected.foreach { + case i: Byte => testVector.appendByte(i) + case _ => testVector.appendNull() + } + check(expected, testVector) + + // Validate with dictionary + val expectedDictionary = Seq(7.toByte, null, 9.toByte) + val dictArray = (Seq(-1, -1) ++ expectedDictionary.map { + case i: Byte => i.toInt + case _ => -1 + }).toArray + val dict = new ColumnDictionary(dictArray) + testVector.setDictionary(dict) + testVector.reserveDictionaryIds(3) + testVector.getDictionaryIds.putInt(0, 2) + testVector.getDictionaryIds.putInt(1, -1) // This is a null, so the entry should be ignored + testVector.getDictionaryIds.putInt(2, 4) + check(expectedDictionary, testVector) + } + + testVectors("getLongs with dictionary and nulls", 3, LongType) { testVector => + // Validate without dictionary + val expected = Seq(2147483L, null, 2147485L) + expected.foreach { + case i: Long => testVector.appendLong(i) + case _ => testVector.appendNull() + } + check(expected, testVector) + + // Validate with dictionary + val expectedDictionary = Seq(2147483648L, null, 2147483650L) + val dictArray = (Seq(-1L, -1L) ++ expectedDictionary.map { + case i: Long => i + case _ => -1L + }).toArray + val dict = new ColumnDictionary(dictArray) + testVector.setDictionary(dict) + testVector.reserveDictionaryIds(3) + testVector.getDictionaryIds.putInt(0, 2) + testVector.getDictionaryIds.putInt(1, -1) // This is a null, so the entry should be ignored + testVector.getDictionaryIds.putInt(2, 4) + check(expectedDictionary, testVector) + } + + testVectors("getFloats with dictionary and nulls", 3, FloatType) { testVector => + // Validate without dictionary + val expected = Seq(1.1f, 
null, 3.3f) + expected.foreach { + case i: Float => testVector.appendFloat(i) + case _ => testVector.appendNull() + } + check(expected, testVector) + + // Validate with dictionary + val expectedDictionary = Seq(0.1f, null, 0.3f) + val dictArray = (Seq(-1f, -1f) ++ expectedDictionary.map { + case i: Float => i + case _ => -1f + }).toArray + val dict = new ColumnDictionary(dictArray) + testVector.setDictionary(dict) + testVector.reserveDictionaryIds(3) + testVector.getDictionaryIds.putInt(0, 2) + testVector.getDictionaryIds.putInt(1, -1) // This is a null, so the entry should be ignored + testVector.getDictionaryIds.putInt(2, 4) + check(expectedDictionary, testVector) + } + + testVectors("getDoubles with dictionary and nulls", 3, DoubleType) { testVector => + // Validate without dictionary + val expected = Seq(1.1d, null, 3.3d) + expected.foreach { + case i: Double => testVector.appendDouble(i) + case _ => testVector.appendNull() + } + check(expected, testVector) + + // Validate with dictionary + val expectedDictionary = Seq(1342.17727d, null, 1342.17729d) + val dictArray = (Seq(-1d, -1d) ++ expectedDictionary.map { + case i: Double => i + case _ => -1d + }).toArray + val dict = new ColumnDictionary(dictArray) + testVector.setDictionary(dict) + testVector.reserveDictionaryIds(3) + testVector.getDictionaryIds.putInt(0, 2) + testVector.getDictionaryIds.putInt(1, -1) // This is a null, so the entry should be ignored + testVector.getDictionaryIds.putInt(2, 4) + check(expectedDictionary, testVector) + } + test("[SPARK-22092] off-heap column vector reallocation corrupts array data") { withVector(new OffHeapColumnVector(8, arrayType)) { testVector => val data = testVector.arrayData() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala index 19251330cffe3..bf5d1b24af219 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala @@ -141,7 +141,10 @@ class ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession { // Examples demonstrate alternative syntax, see SPARK-45574 "org.apache.spark.sql.catalyst.expressions.Cast", // Examples demonstrate alternative syntax, see SPARK-47012 - "org.apache.spark.sql.catalyst.expressions.Collate" + "org.apache.spark.sql.catalyst.expressions.Collate", + classOf[ShiftLeft].getName, + classOf[ShiftRight].getName, + classOf[ShiftRightUnsigned].getName ) spark.sessionState.functionRegistry.listFunction().foreach { funcId => val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) @@ -222,6 +225,9 @@ class ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession { // Throws an error "org.apache.spark.sql.catalyst.expressions.RaiseErrorExpressionBuilder", "org.apache.spark.sql.catalyst.expressions.AssertTrue", + // Requires dynamic class loading not available in this test suite. 
+ "org.apache.spark.sql.catalyst.expressions.FromAvro", + "org.apache.spark.sql.catalyst.expressions.ToAvro", classOf[CurrentUser].getName, // The encrypt expression includes a random initialization vector to its encrypted result classOf[AesEncrypt].getName) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfEntrySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfEntrySuite.scala index cd6894ee43711..26011af37bf42 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfEntrySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfEntrySuite.scala @@ -107,7 +107,7 @@ class SQLConfEntrySuite extends SparkFunSuite { test("stringConf") { val key = "spark.sql.SQLConfEntrySuite.string" - val confEntry = buildConf(key).stringConf.createWithDefault(null) + val confEntry = buildConf(key).stringConf.createWithDefault("") assert(conf.getConf(confEntry, "abc") === "abc") conf.setConf(confEntry, "abcd") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 18a06e83c076f..404ec865c1b00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -121,16 +121,12 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { test(s"SPARK-35168: ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} should respect" + s" ${SQLConf.SHUFFLE_PARTITIONS.key}") { - spark.sessionState.conf.clear() - try { - sql(s"SET ${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key}=true") - sql(s"SET ${SQLConf.COALESCE_PARTITIONS_ENABLED.key}=true") - sql(s"SET ${SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key}=1") - sql(s"SET ${SQLConf.SHUFFLE_PARTITIONS.key}=2") + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.COALESCE_PARTITIONS_ENABLED.key -> "true", + SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key -> "1", + SQLConf.SHUFFLE_PARTITIONS.key -> "2") { checkAnswer(sql(s"SET ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}"), Row(SQLConf.SHUFFLE_PARTITIONS.key, "2")) - } finally { - spark.sessionState.conf.clear() } } @@ -243,9 +239,9 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { } test("invalid conf value") { - spark.sessionState.conf.clear() val e = intercept[IllegalArgumentException] { - sql(s"set ${SQLConf.CASE_SENSITIVE.key}=10") + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "10") { + } } assert(e.getMessage === s"${SQLConf.CASE_SENSITIVE.key} should be boolean, but was 10") } @@ -506,7 +502,7 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { } test("SPARK-47765: set collation") { - Seq("UNICODE", "UNICODE_CI", "utf8_binary_lcase", "utf8_binary").foreach { collation => + Seq("UNICODE", "UNICODE_CI", "utf8_lcase", "utf8_binary").foreach { collation => sql(s"set collation $collation") assert(spark.conf.get(SQLConf.DEFAULT_COLLATION) === collation.toUpperCase(Locale.ROOT)) } @@ -519,8 +515,16 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { parameters = Map( "confValue" -> "UNICODE_C", "confName" -> "spark.sql.session.collation.default", - "proposal" -> "UNICODE_CI" + "proposals" -> "UNICODE" )) + + withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") { + checkError( + exception = intercept[AnalysisException](sql(s"SET COLLATION UNICODE_CI")), + errorClass = "UNSUPPORTED_FEATURE.COLLATION", + parameters = Map.empty + ) + } } test("SPARK-43028: config not found 
error") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 88bb53cc7488d..e4116b565818e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -915,7 +915,7 @@ class JDBCSuite extends QueryTest with SharedSparkSession { test("DB2Dialect type mapping") { val db2Dialect = JdbcDialects.get("jdbc:db2://127.0.0.1/db") assert(db2Dialect.getJDBCType(StringType).map(_.databaseTypeDefinition).get == "CLOB") - assert(db2Dialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "CHAR(1)") + assert(db2Dialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "BOOLEAN") assert(db2Dialect.getJDBCType(ShortType).map(_.databaseTypeDefinition).get == "SMALLINT") assert(db2Dialect.getJDBCType(ByteType).map(_.databaseTypeDefinition).get == "SMALLINT") // test db2 dialect mappings on read @@ -955,6 +955,27 @@ class JDBCSuite extends QueryTest with SharedSparkSession { Some(DoubleType)) assert(mySqlDialect.getCatalystType(java.sql.Types.CHAR, "JSON", Int.MaxValue, metadata) === Some(StringType)) + assert(mySqlDialect.getCatalystType(java.sql.Types.TIMESTAMP, "DATETIME", 1, + metadata.putBoolean("isTimestampNTZ", false)) === Some(TimestampType)) + assert(mySqlDialect.getCatalystType(java.sql.Types.TIMESTAMP, "DATETIME", 1, + metadata.putBoolean("isTimestampNTZ", true)) === Some(TimestampNTZType)) + withSQLConf(SQLConf.LEGACY_MYSQL_TIMESTAMPNTZ_MAPPING_ENABLED.key -> "true") { + // in legacy mode, fallback to common mapping + assert(mySqlDialect.getCatalystType(java.sql.Types.TIMESTAMP, "TIMESTAMP", 1, + metadata.putBoolean("isTimestampNTZ", true)) === None) + mySqlDialect.getJDBCType(TimestampNTZType).foreach { jdbcType => + assert(jdbcType.databaseTypeDefinition === "TIMESTAMP") + } + } + withSQLConf(SQLConf.LEGACY_MYSQL_TIMESTAMPNTZ_MAPPING_ENABLED.key -> "false") { + Seq(true, false).foreach(isTimestampNTZ => { + assert(mySqlDialect.getCatalystType(java.sql.Types.TIMESTAMP, "TIMESTAMP", 1, + metadata.putBoolean("isTimestampNTZ", isTimestampNTZ)) === Some(TimestampType)) + }) + mySqlDialect.getJDBCType(TimestampNTZType).foreach { jdbcType => + assert(jdbcType.databaseTypeDefinition === "DATETIME") + } + } } test("SPARK-35446: MySQLDialect type mapping of float") { @@ -1333,6 +1354,7 @@ class JDBCSuite extends QueryTest with SharedSparkSession { assert(getJdbcType(oracleDialect, ByteType) == "NUMBER(3)") assert(getJdbcType(oracleDialect, ShortType) == "NUMBER(5)") assert(getJdbcType(oracleDialect, StringType) == "VARCHAR2(255)") + assert(getJdbcType(oracleDialect, VarcharType(100)) == "VARCHAR2(100)") assert(getJdbcType(oracleDialect, BinaryType) == "BLOB") assert(getJdbcType(oracleDialect, DateType) == "DATE") assert(getJdbcType(oracleDialect, TimestampType) == "TIMESTAMP WITH LOCAL TIME ZONE") @@ -1371,9 +1393,9 @@ class JDBCSuite extends QueryTest with SharedSparkSession { test("SPARK-16387: Reserved SQL words are not escaped by JDBC writer") { val df = spark.createDataset(Seq("a", "b", "c")).toDF("order") val schema = JdbcUtils.schemaString( + JdbcDialects.get("jdbc:mysql://localhost:3306/temp"), df.schema, - df.sparkSession.sessionState.conf.caseSensitiveAnalysis, - "jdbc:mysql://localhost:3306/temp") + df.sparkSession.sessionState.conf.caseSensitiveAnalysis) assert(schema.contains("`order` LONGTEXT")) } @@ -1455,16 +1477,11 @@ class JDBCSuite extends QueryTest with 
SharedSparkSession { } } - test("SPARK-15648: teradataDialect StringType data mapping") { - val teradataDialect = JdbcDialects.get("jdbc:teradata://127.0.0.1/db") - assert(teradataDialect.getJDBCType(StringType). - map(_.databaseTypeDefinition).get == "VARCHAR(255)") - } - - test("SPARK-15648: teradataDialect BooleanType data mapping") { - val teradataDialect = JdbcDialects.get("jdbc:teradata://127.0.0.1/db") - assert(teradataDialect.getJDBCType(BooleanType). - map(_.databaseTypeDefinition).get == "CHAR(1)") + test("SPARK-48399: TeradataDialect jdbc data mapping") { + val dialect = JdbcDialects.get("jdbc:teradata://127.0.0.1/db") + assert(dialect.getJDBCType(StringType).map(_.databaseTypeDefinition).get == "VARCHAR(255)") + assert(dialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "CHAR(1)") + assert(dialect.getJDBCType(ByteType).map(_.databaseTypeDefinition).get == "BYTEINT") } test("SPARK-38846: TeradataDialect catalyst type mapping") { @@ -2181,4 +2198,12 @@ class JDBCSuite extends QueryTest with SharedSparkSession { dialect = JdbcDialects.get("jdbc:dummy:dummy_host:dummy_port/dummy_db") assert(dialect === NoopDialect) } + + test("SPARK-47882: createTableColumnTypes need to be mapped to database types") { + val dialect = JdbcDialects.get("jdbc:oracle:dummy_host:dummy_port/dummy_db") + val schema = new StructType().add("b", "boolean") + val schemaStr = + JdbcUtils.schemaString(dialect, schema, caseSensitive = false, Some("b boolean")) + assert(schemaStr === """"b" NUMBER(1) """) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala index 1b3672cdba5a4..e1a7971b283cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala @@ -369,6 +369,20 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel } } + test("null value for option exception") { + val df = spark.read + .option("pushDownOffset", null) + .table("h2.test.employee") + checkError( + exception = intercept[AnalysisException] { + df.collect() + }, + errorClass = "NULL_DATA_SOURCE_OPTION", + parameters = Map( + "option" -> "pushDownOffset") + ) + } + test("simple scan with OFFSET") { val df1 = spark.read .table("h2.test.employee") @@ -1305,7 +1319,7 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel val df5 = spark.table("h2.test.address").filter($"email".startsWith("abc_'%")) checkFiltersRemoved(df5) checkPushedInfo(df5, - raw"PushedFilters: [EMAIL IS NOT NULL, EMAIL LIKE 'abc\_\'\%%' ESCAPE '\']") + raw"PushedFilters: [EMAIL IS NOT NULL, EMAIL LIKE 'abc\_''\%%' ESCAPE '\']") checkAnswer(df5, Seq(Row("abc_'%def@gmail.com"))) val df6 = spark.table("h2.test.address").filter($"email".endsWith("_def@gmail.com")) @@ -1336,7 +1350,7 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel val df10 = spark.table("h2.test.address").filter($"email".endsWith("_'%def@gmail.com")) checkFiltersRemoved(df10) checkPushedInfo(df10, - raw"PushedFilters: [EMAIL IS NOT NULL, EMAIL LIKE '%\_\'\%def@gmail.com' ESCAPE '\']") + raw"PushedFilters: [EMAIL IS NOT NULL, EMAIL LIKE '%\_''\%def@gmail.com' ESCAPE '\']") checkAnswer(df10, Seq(Row("abc_'%def@gmail.com"))) val df11 = spark.table("h2.test.address").filter($"email".contains("c_d")) @@ -1364,7 +1378,7 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel val df15 = 
spark.table("h2.test.address").filter($"email".contains("c_'%d")) checkFiltersRemoved(df15) checkPushedInfo(df15, - raw"PushedFilters: [EMAIL IS NOT NULL, EMAIL LIKE '%c\_\'\%d%' ESCAPE '\']") + raw"PushedFilters: [EMAIL IS NOT NULL, EMAIL LIKE '%c\_''\%d%' ESCAPE '\']") checkAnswer(df15, Seq(Row("abc_'%def@gmail.com"))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 0d9dc2f76faf1..76a092b552f98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -406,19 +406,21 @@ class JDBCWriteSuite extends SharedSparkSession with BeforeAndAfter { test("SPARK-10849: test schemaString - from createTableColumnTypes option values") { def testCreateTableColDataTypes(types: Seq[String]): Unit = { + val dialect = JdbcDialects.get(url1) val colTypes = types.zipWithIndex.map { case (t, i) => (s"col$i", t) } val schema = colTypes .foldLeft(new StructType())((schema, colType) => schema.add(colType._1, colType._2)) val createTableColTypes = colTypes.map { case (col, dataType) => s"$col $dataType" }.mkString(", ") - val expectedSchemaStr = - colTypes.map { case (col, dataType) => s""""$col" $dataType """ }.mkString(", ") + val expectedSchemaStr = schema.map { f => + s""""${f.name}" ${JdbcUtils.getJdbcType(f.dataType, dialect).databaseTypeDefinition} """ + }.mkString(", ") assert(JdbcUtils.schemaString( + dialect, schema, spark.sessionState.conf.caseSensitiveAnalysis, - url1, Option(createTableColTypes)) == expectedSchemaStr) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 93698fdd7bc0f..e3e385e9d1810 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -23,7 +23,7 @@ import java.time.{Duration, Period} import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataOutputStream, Path, RawLocalFileSystem} -import org.apache.spark.{SparkArithmeticException, SparkException} +import org.apache.spark.{SparkArithmeticException, SparkException, SparkRuntimeException} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} @@ -953,10 +953,10 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { spark.sessionState.catalog.createTable(newTable, false) sql("INSERT INTO TABLE test_table SELECT 1, 'a'") - val msg = intercept[SparkException] { + val msg = intercept[SparkRuntimeException] { sql("INSERT INTO TABLE test_table SELECT 2, null") - }.getCause.getMessage - assert(msg.contains("Null value appeared in non-nullable field")) + } + assert(msg.getErrorClass == "NOT_NULL_ASSERT_VIOLATION") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index fd3d59af7e6b8..ca4f2a7f26ced 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -2305,7 +2305,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } // batch 5 will trigger list operation though the batch 4 should have 1 
unseen file: - // 1 is smaller than the threshold (refer FileStreamSource.DISCARD_UNSEEN_FILES_RATIO), + // 1 is smaller than the threshold (refer FileStreamOptions.discardCachedInputRatio), // hence unseen files for batch 4 will be discarded. val offsetBatch = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) .asInstanceOf[FileStreamSourceOffset] @@ -2357,6 +2357,142 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("Options for caching unread files") { + withCountListingLocalFileSystemAsLocalFileSystem { + withThreeTempDirs { case (src, meta, tmp) => + val options = Map("latestFirst" -> "false", "maxFilesPerTrigger" -> "10", + "maxCachedFiles" -> "12", "discardCachedInputRatio" -> "0.1") + val scheme = CountListingLocalFileSystem.scheme + val source = new FileStreamSource(spark, s"$scheme:///${src.getCanonicalPath}/*/*", "text", + StructType(Nil), Seq.empty, meta.getCanonicalPath, options) + val _metadataLog = PrivateMethod[FileStreamSourceLog](Symbol("metadataLog")) + val metadataLog = source invokePrivate _metadataLog() + + def verifyBatch( + offset: FileStreamSourceOffset, + expectedBatchId: Long, + inputFiles: Seq[File], + expectedFileOffset: Int, + expectedFilesInBatch: Int, + expectedListingCount: Int): Unit = { + val batchId = offset.logOffset + assert(batchId === expectedBatchId) + + val files = metadataLog.get(batchId).getOrElse(Array.empty[FileEntry]) + assert(files.forall(_.batchId == batchId)) + + val actualInputFiles = files.map { p => p.sparkPath.toUri.getPath } + val expectedInputFiles = inputFiles.slice( + expectedFileOffset, + expectedFileOffset + expectedFilesInBatch + ) + .map(_.getCanonicalPath) + assert(actualInputFiles === expectedInputFiles) + + assert(expectedListingCount === CountListingLocalFileSystem.pathToNumListStatusCalled + .get(src.getCanonicalPath).map(_.get()).getOrElse(0)) + } + + CountListingLocalFileSystem.resetCount() + + // provide 44 files in src, with sequential "last modified" to guarantee ordering + val inputFiles = (0 to 43).map { idx => + val f = createFile(idx.toString, new File(src, idx.toString), tmp) + f.setLastModified(idx * 10000) + f + } + + // first 3 batches only perform 1 listing + // batch 0 processes 10 (12 cached) + // batch 1 processes 10 from cache (2 cached) + // batch 2 processes 2 from cache (0 cached) since + // discardCachedInputRatio is less than threshold + val offsetBatch0 = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + verifyBatch(offsetBatch0, expectedBatchId = 0, inputFiles, + expectedFileOffset = 0, expectedFilesInBatch = 10, expectedListingCount = 1) + val offsetBatch1 = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + verifyBatch(offsetBatch1, expectedBatchId = 1, inputFiles, + expectedFileOffset = 10, expectedFilesInBatch = 10, expectedListingCount = 1) + val offsetBatch2 = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + verifyBatch(offsetBatch2, expectedBatchId = 2, inputFiles, + expectedFileOffset = 20, expectedFilesInBatch = 2, expectedListingCount = 1) + + // next 3 batches perform another listing + // batch 3 processes 10 (12 cached) + // batch 4 processes 10 from cache (2 cached) + // batch 5 processes 2 from cache (0 cached) since + // discardCachedInputRatio is less than threshold + val offsetBatch3 = source.latestOffset(FileStreamSourceOffset(-1L), 
ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + verifyBatch(offsetBatch3, expectedBatchId = 3, inputFiles, + expectedFileOffset = 22, expectedFilesInBatch = 10, expectedListingCount = 2) + val offsetBatch4 = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + verifyBatch(offsetBatch4, expectedBatchId = 4, inputFiles, + expectedFileOffset = 32, expectedFilesInBatch = 10, expectedListingCount = 2) + val offsetBatch5 = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + verifyBatch(offsetBatch5, expectedBatchId = 5, inputFiles, + expectedFileOffset = 42, expectedFilesInBatch = 2, expectedListingCount = 2) + + // validate no remaining files and another listing is performed + val offsetBatch = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(10)) + .asInstanceOf[FileStreamSourceOffset] + assert(5 === offsetBatch.logOffset) + assert(3 === CountListingLocalFileSystem.pathToNumListStatusCalled + .get(src.getCanonicalPath).map(_.get()).getOrElse(0)) + } + } + } + + test("SPARK-48314: Don't cache unread files when using Trigger.AvailableNow") { + withCountListingLocalFileSystemAsLocalFileSystem { + withThreeTempDirs { case (src, meta, tmp) => + val options = Map("latestFirst" -> "false", "maxFilesPerTrigger" -> "5", + "maxCachedFiles" -> "2") + val scheme = CountListingLocalFileSystem.scheme + val source = new FileStreamSource(spark, s"$scheme:///${src.getCanonicalPath}/*/*", "text", + StructType(Nil), Seq.empty, meta.getCanonicalPath, options) + val _metadataLog = PrivateMethod[FileStreamSourceLog](Symbol("metadataLog")) + val metadataLog = source invokePrivate _metadataLog() + + // provide 20 files in src, with sequential "last modified" to guarantee ordering + (0 to 19).map { idx => + val f = createFile(idx.toString, new File(src, idx.toString), tmp) + f.setLastModified(idx * 10000) + f + } + + source.prepareForTriggerAvailableNow() + CountListingLocalFileSystem.resetCount() + + var offset = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(5)) + .asInstanceOf[FileStreamSourceOffset] + var files = metadataLog.get(offset.logOffset).getOrElse(Array.empty[FileEntry]) + + // All files are already tracked in allFilesForTriggerAvailableNow + assert(0 === CountListingLocalFileSystem.pathToNumListStatusCalled + .get(src.getCanonicalPath).map(_.get()).getOrElse(0)) + // Should be 5 files in the batch based on maxFiles limit + assert(files.length == 5) + + // Reading again leverages the files already tracked in allFilesForTriggerAvailableNow, + // so no more listings need to happen + offset = source.latestOffset(FileStreamSourceOffset(-1L), ReadLimit.maxFiles(5)) + .asInstanceOf[FileStreamSourceOffset] + files = metadataLog.get(offset.logOffset).getOrElse(Array.empty[FileEntry]) + + assert(0 === CountListingLocalFileSystem.pathToNumListStatusCalled + .get(src.getCanonicalPath).map(_.get()).getOrElse(0)) + // Should be 5 files in the batch since cached files are ignored + assert(files.length == 5) + } + } + } + test("SPARK-31962: file stream source shouldn't allow modifiedBefore/modifiedAfter") { def formatTime(time: LocalDateTime): String = { time.format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 32822994c81cb..b4f29fa9f01ac 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -41,7 +41,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.{LocalLimitExec, SimpleMode, SparkPlan} import org.apache.spark.sql.execution.command.ExplainCommand import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.sources.{ContinuousMemoryStream, MemorySink} +import org.apache.spark.sql.execution.streaming.sources.{ContinuousMemoryStream, ForeachBatchUserFuncException, MemorySink} import org.apache.spark.sql.execution.streaming.state.{KeyStateEncoderSpec, StateStore, StateStoreConf, StateStoreId, StateStoreProvider} import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ @@ -1185,6 +1185,17 @@ class StreamSuite extends StreamTest { checkAnswer(spark.sql("select * from output"), Row("true")) } + private val py4JInterruptedExceptions = Seq( + classOf[InterruptedException].getName, + classOf[InterruptedIOException].getName, + classOf[ClosedByInterruptException].getName).map { s => + new py4j.Py4JException( + s""" + |py4j.protocol.Py4JJavaError: An error occurred while calling o44.count. + |: $s + |""".stripMargin) + } + for (e <- Seq( new InterruptedException, new InterruptedIOException, @@ -1192,16 +1203,8 @@ class StreamSuite extends StreamTest { new UncheckedIOException("test", new ClosedByInterruptException), new ExecutionException("test", new InterruptedException), new UncheckedExecutionException("test", new InterruptedException)) ++ - Seq( - classOf[InterruptedException].getName, - classOf[InterruptedIOException].getName, - classOf[ClosedByInterruptException].getName).map { s => - new py4j.Py4JException( - s""" - |py4j.protocol.Py4JJavaError: An error occurred while calling o44.count. - |: $s - |""".stripMargin) - }) { + py4JInterruptedExceptions ++ + py4JInterruptedExceptions.map { e => ForeachBatchUserFuncException(e) }) { test(s"view ${e.getClass.getSimpleName} [${e.getMessage}] as a normal query stop") { ThrowingExceptionInCreateSource.createSourceLatch = new CountDownLatch(1) ThrowingExceptionInCreateSource.exception = e @@ -1323,6 +1326,36 @@ class StreamSuite extends StreamTest { } } } + + test("isInterruptionException should correctly unwrap classic py4j InterruptedException") { + val e1 = new py4j.Py4JException( + """ + |py4j.protocol.Py4JJavaError: An error occurred while calling o1073599.sql. + |: java.util.concurrent.ExecutionException: java.lang.InterruptedException + |""".stripMargin) + val febError1 = ForeachBatchUserFuncException(e1) + assert(StreamExecution.isInterruptionException(febError1, spark.sparkContext)) + + // scalastyle:off line.size.limit + val e2 = new py4j.Py4JException( + """ + |py4j.protocol.Py4JJavaError: An error occurred while calling o2141502.saveAsTable. + |: org.apache.spark.SparkException: Job aborted. 
+ |at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:882) + |at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$1(FileFormatWriter.scala:334) + | + |org.apache.spark.sql.execution.streaming.StreamExecution.withAttributionTags(StreamExecution.scala:82) + |at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:339) + |at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.$anonfun$run$2(StreamExecution.scala:262) + |at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) + |at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:262) + |*Caused by: java.lang.InterruptedException + |at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1000)* + |""".stripMargin) + // scalastyle:on line.size.limit + val febError2 = ForeachBatchUserFuncException(e2) + assert(StreamExecution.isInterruptionException(febError2, spark.sparkContext)) + } } abstract class FakeSource extends StreamSourceProvider { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index d7401897ff6a4..7439c7ab6d6e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -346,7 +346,8 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - extraOptions: Map[String, String] = Map.empty)(actions: StreamAction*): Unit = synchronized { + extraOptions: Map[String, String] = Map.empty, + sink: MemorySink = new MemorySink())(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -359,7 +360,6 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with var currentStream: StreamExecution = null var lastStream: StreamExecution = null val awaiting = new mutable.HashMap[Int, OffsetV2]() // source index -> offset to wait for - val sink = new MemorySink val resetConfValues = mutable.Map[String, Option[String]]() val defaultCheckpointLocation = Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 8d79cf4af7717..bcf0d4ac46655 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -25,7 +25,7 @@ import scala.annotation.tailrec import org.apache.commons.io.FileUtils import org.scalatest.Assertions -import org.apache.spark.{SparkEnv, SparkException} +import org.apache.spark.{SparkEnv, SparkException, SparkUnsupportedOperationException} import org.apache.spark.rdd.BlockRDD import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow @@ -36,7 +36,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import 
org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.MemorySink -import org.apache.spark.sql.execution.streaming.state.{StateSchemaNotCompatible, StateStore, StreamingAggregationStateManager} +import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreValueSchemaNotCompatible, StreamingAggregationStateManager} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode._ @@ -782,11 +782,11 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { testStream(aggregated, Update())( StartStream(checkpointLocation = tempDir.getAbsolutePath), AddData(inputData, 21), - ExpectFailure[SparkException] { e => + ExpectFailure[StateStoreValueSchemaNotCompatible] { e => val stateSchemaExc = findStateSchemaNotCompatible(e) assert(stateSchemaExc.isDefined) val msg = stateSchemaExc.get.getMessage - assert(msg.contains("Provided schema doesn't match to the schema for existing state")) + assert(msg.contains("does not match existing")) // other verifications are presented in StateStoreSuite } ) @@ -909,9 +909,10 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { } @tailrec - private def findStateSchemaNotCompatible(exc: Throwable): Option[StateSchemaNotCompatible] = { + private def findStateSchemaNotCompatible(exc: Throwable): + Option[SparkUnsupportedOperationException] = { exc match { - case e1: StateSchemaNotCompatible => Some(e1) + case e1: SparkUnsupportedOperationException => Some(e1) case e1 if e1.getCause != null => findStateSchemaNotCompatible(e1.getCause) case _ => None } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index 5c3d8d877f390..854893b1f033e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -21,11 +21,13 @@ import java.io.File import org.apache.commons.io.FileUtils +import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StringType import org.apache.spark.tags.SlowSQLTest import org.apache.spark.util.Utils @@ -451,28 +453,29 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { } } - test("SPARK-39650: recovery from checkpoint having all columns as value schema") { - // NOTE: We are also changing the schema of input compared to the checkpoint. In the checkpoint - // we define the input schema as (String, Int). - val inputData = MemoryStream[(String, Int, String)] - val dedupe = inputData.toDS().dropDuplicates("_1") + Seq("3.3.0", "3.5.1").foreach { sparkVersion => + test("SPARK-39650: recovery from checkpoint having all columns as value schema " + + s"with sparkVersion=$sparkVersion") { + // NOTE: We are also changing the schema of input compared to the checkpoint. + // In the checkpoint we define the input schema as (String, Int). 
+ val inputData = MemoryStream[(String, Int, String)] + val dedupe = inputData.toDS().dropDuplicates("_1") - // The fix will land after Spark 3.3.0, hence we can check backward compatibility with - // checkpoint being built from Spark 3.3.0. - val resourceUri = this.getClass.getResource( - "/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/").toURI + val resourcePath = "/structured-streaming/checkpoint-version-" + sparkVersion + + "-streaming-deduplication/" + val resourceUri = this.getClass.getResource(resourcePath).toURI - val checkpointDir = Utils.createTempDir().getCanonicalFile - // Copy the checkpoint to a temp dir to prevent changes to the original. - // Not doing this will lead to the test passing on the first run, but fail subsequent runs. - FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. + FileUtils.copyDirectory(new File(resourceUri), checkpointDir) - inputData.addData(("a", 1, "dummy")) - inputData.addData(("a", 2, "dummy"), ("b", 3, "dummy")) + inputData.addData(("a", 1, "dummy")) + inputData.addData(("a", 2, "dummy"), ("b", 3, "dummy")) - testStream(dedupe, Append)( - StartStream(checkpointLocation = checkpointDir.getAbsolutePath), - /* + testStream(dedupe, Append)( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + /* Note: The checkpoint was generated using the following input in Spark version 3.3.0 AddData(inputData, ("a", 1)), CheckLastBatch(("a", 1)), @@ -480,8 +483,95 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { CheckLastBatch(("b", 3)) */ - AddData(inputData, ("a", 5, "a"), ("b", 2, "b"), ("c", 9, "c")), - CheckLastBatch(("c", 9, "c")) + AddData(inputData, ("a", 5, "a"), ("b", 2, "b"), ("c", 9, "c")), + CheckLastBatch(("c", 9, "c")) + ) + } + } + + Seq("3.3.0", "3.5.1").foreach { sparkVersion => + test("SPARK-39650: recovery from checkpoint with changes on key schema " + + s"are not allowed with sparkVersion=$sparkVersion") { + // NOTE: We are also changing the schema of input compared to the checkpoint. + // In the checkpoint we define the input schema as (String, Int). + val inputData = MemoryStream[(String, Int, String)] + val dedupe = inputData.toDS().dropDuplicates("_1", "_2") + + val resourcePath = "/structured-streaming/checkpoint-version-" + sparkVersion + + "-streaming-deduplication/" + val resourceUri = this.getClass.getResource(resourcePath).toURI + + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. 
+ FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + + inputData.addData(("a", 1, "dummy")) + inputData.addData(("a", 2, "dummy"), ("b", 3, "dummy")) + + // trying to evolve the key schema is not allowed and should throw an exception + val ex = intercept[StreamingQueryException] { + testStream(dedupe, Append)( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + AddData(inputData, ("a", 5, "a"), ("b", 2, "b"), ("c", 9, "c")), + CheckLastBatch(("a", 5, "a"), ("b", 2, "b"), ("c", 9, "c")) + ) + } + + // verify that the key schema not compatible error is thrown + checkError( + ex.getCause.asInstanceOf[SparkUnsupportedOperationException], + errorClass = "STATE_STORE_KEY_SCHEMA_NOT_COMPATIBLE", + parameters = Map("storedKeySchema" -> ".*", + "newKeySchema" -> ".*"), + matchPVals = true + ) + } + } + + test("collation aware deduplication") { + val inputData = MemoryStream[(String, Int)] + val result = inputData.toDF() + .select(col("_1") + .try_cast(StringType("UTF8_BINARY")).as("str"), + col("_2").as("int")) + .dropDuplicates("str") + + testStream(result, Append)( + AddData(inputData, "a" -> 1), + CheckLastBatch("a" -> 1), + assertNumStateRows(total = 1, updated = 1, droppedByWatermark = 0), + AddData(inputData, "a" -> 2), // Dropped + CheckLastBatch(), + assertNumStateRows(total = 1, updated = 0, droppedByWatermark = 0), + // scalastyle:off + AddData(inputData, "ä" -> 1), + CheckLastBatch("ä" -> 1), + // scalastyle:on + assertNumStateRows(total = 2, updated = 1, droppedByWatermark = 0) + ) + } + + test("non-binary collation aware deduplication not supported") { + val inputData = MemoryStream[(String)] + val result = inputData.toDF() + .select(col("value") + .try_cast(StringType("UTF8_LCASE")).as("str")) + .dropDuplicates("str") + + val ex = intercept[StreamingQueryException] { + testStream(result, Append)( + AddData(inputData, "a"), + CheckLastBatch("a")) + } + + checkError( + ex.getCause.asInstanceOf[SparkUnsupportedOperationException], + errorClass = "STATE_STORE_UNSUPPORTED_OPERATION_BINARY_INEQUALITY", + parameters = Map( + "schema" -> ".+\"str\":\"spark.UTF8_LCASE\".+" + ), + matchPVals = true ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index e05cb4d3c35ce..5e9bdad8fd825 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -27,6 +27,7 @@ import scala.util.Random import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfter +import org.apache.spark.SparkUnsupportedOperationException import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} @@ -688,6 +689,146 @@ class StreamingInnerJoinSuite extends StreamingJoinSuite { ) } + test("SPARK-48687 - restore the stream-stream inner join query from Spark 3.5 and " + + "changing the join condition (key schema) should fail the query") { + // NOTE: We are also changing the schema of input compared to the checkpoint. + // In the checkpoint we define the input schema as (Int, Long), which does not have name + // in both left and right. 
+ val inputStream = MemoryStream[(Int, Long, String)] + val df = inputStream.toDS() + .select(col("_1").as("value"), timestamp_seconds($"_2").as("timestamp"), + col("_3").as("name")) + + val leftStream = df.select(col("value").as("leftId"), + col("timestamp").as("leftTime"), col("name").as("leftName")) + + val rightStream = df + // Introduce misses for ease of debugging + .where(col("value") % 2 === 0) + .select(col("value").as("rightId"), + col("timestamp").as("rightTime"), col("name").as("rightName")) + + val query = leftStream + .withWatermark("leftTime", "5 seconds") + .join( + rightStream.withWatermark("rightTime", "5 seconds"), + expr("rightId = leftId AND leftName = rightName AND rightTime >= leftTime AND " + + "rightTime <= leftTime + interval 5 seconds"), + joinType = "inner") + .select(col("leftId"), col("leftTime").cast("int"), + col("leftName"), + col("rightId"), col("rightTime").cast("int"), + col("rightName")) + + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-3.5.1-streaming-join/").toURI + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. + FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + inputStream.addData((1, 1L, "a"), (2, 2L, "b"), (3, 3L, "c"), (4, 4L, "d"), (5, 5L, "e")) + + val ex = intercept[StreamingQueryException] { + testStream(query)( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + /* + Note: The checkpoint was generated using the following input in Spark version 3.5.1 + The base query is different because it does not use the leftName/rightName columns + as part of the join keys/condition that is used as part of the key schema. + + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + // batch 1 - global watermark = 0 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L) + // right: (2, 2L), (4, 4L) + CheckNewAnswer((2, 2L, 2, 2L), (4, 4L, 4, 4L)), + */ + AddData(inputStream, (6, 6L, "a"), (7, 7L, "a"), (8, 8L, "a"), (9, 9L, "a"), + (10, 10L, "a")), + CheckNewAnswer((6, 6L, "a", 6, 6L, "a"), (8, 8L, "a", 8, 8L, "a"), + (10, 10L, "a", 10, 10L, "a")) + ) + } + + checkError( + ex.getCause.asInstanceOf[SparkUnsupportedOperationException], + errorClass = "STATE_STORE_KEY_SCHEMA_NOT_COMPATIBLE", + parameters = Map("storedKeySchema" -> ".*", + "newKeySchema" -> ".*"), + matchPVals = true + ) + } + + test("SPARK-48687 - restore the stream-stream inner join query from Spark 3.5 and " + + "changing the value schema should fail the query") { + // NOTE: We are also changing the schema of input compared to the checkpoint. + // In the checkpoint we define the input schema as (Int, Long), which does not have name + // in both left and right. 
+ val inputStream = MemoryStream[(Int, Long, String)] + val df = inputStream.toDS() + .select(col("_1").as("value"), timestamp_seconds($"_2").as("timestamp"), + col("_3").as("name")) + + val leftStream = df.select(col("value").as("leftId"), + col("timestamp").as("leftTime"), col("name").as("leftName")) + + val rightStream = df + // Introduce misses for ease of debugging + .where(col("value") % 2 === 0) + .select(col("value").as("rightId"), + col("timestamp").as("rightTime"), col("name").as("rightName")) + + val query = leftStream + .withWatermark("leftTime", "5 seconds") + .join( + rightStream.withWatermark("rightTime", "5 seconds"), + expr("rightId = leftId AND rightTime >= leftTime AND " + + "rightTime <= leftTime + interval 5 seconds"), + joinType = "inner") + .select(col("leftId"), col("leftTime").cast("int"), + col("leftName"), + col("rightId"), col("rightTime").cast("int"), + col("rightName")) + + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-3.5.1-streaming-join/").toURI + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. + FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + inputStream.addData((1, 1L, "a"), (2, 2L, "b"), (3, 3L, "c"), (4, 4L, "d"), (5, 5L, "e")) + + val ex = intercept[StreamingQueryException] { + testStream(query)( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + /* + Note: The checkpoint was generated using the following input in Spark version 3.5.1 + The base query is different because it does not use the leftName/rightName columns + as part of the generated output that is used as part of the value schema. 
+ + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + // batch 1 - global watermark = 0 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L) + // right: (2, 2L), (4, 4L) + CheckNewAnswer((2, 2L, 2, 2L), (4, 4L, 4, 4L)), + */ + AddData(inputStream, (6, 6L, "a"), (7, 7L, "a"), (8, 8L, "a"), (9, 9L, "a"), + (10, 10L, "a")), + CheckNewAnswer((6, 6L, "a", 6, 6L, "a"), (8, 8L, "a", 8, 8L, "a"), + (10, 10L, "a", 10, 10L, "a")) + ) + } + + checkError( + ex.getCause.asInstanceOf[SparkUnsupportedOperationException], + errorClass = "STATE_STORE_VALUE_SCHEMA_NOT_COMPATIBLE", + parameters = Map("storedValueSchema" -> ".*", + "newValueSchema" -> ".*"), + matchPVals = true + ) + } + test("SPARK-35896: metrics in StateOperatorProgress are output correctly") { val input1 = MemoryStream[Int] val input2 = MemoryStream[Int] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala index 3423b8b8cb287..3d8c20af3b384 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala @@ -31,11 +31,24 @@ import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInterna import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.types.{BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType, StructType, TimestampType} +/** + * To run the test suite: + * {{{ + * build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite" + * }}} + * + * To re-generate the golden file with size limit under 10Mb, run: + * {{{ + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite" + * -Dspark.sql.test.randomDataGenerator.maxStrLen=100 + * -Dspark.sql.test.randomDataGenerator.maxArraySize=4 + * }}} + */ class StreamingQueryHashPartitionVerifySuite extends StreamTest { - // Configs for golden file - private val goldenFileURI = - this.getClass.getResource("/structured-streaming/partition-tests/").toURI + // A golden file directory in `src/test` instead of `target` directory. 
+ private val goldenFileURI = getWorkspaceFilePath( + "sql", "core", "src", "test", "resources", "structured-streaming", "partition-tests").toUri private val schemaFileName = "randomSchemas" // files for storing random input schemas private val rowAndPartIdFilename = @@ -152,9 +165,6 @@ class StreamingQueryHashPartitionVerifySuite extends StreamTest { val rowAndPartIdFile = new File(goldenFileURI.getPath, rowAndPartIdFilename) if (regenerateGoldenFiles) { - // To limit the golden file size under 10Mb, please set the final val MAX_STR_LEN: Int = 100 - // and final val MAX_ARR_SIZE: Int = 4 in org.apache.spark.sql.RandomDataGenerator - val random = new Random() val schemas = getRandomSchemas(random) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryOptimizationCorrectnessSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryOptimizationCorrectnessSuite.scala index efc84c8e4c7cf..782badaef924f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryOptimizationCorrectnessSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryOptimizationCorrectnessSuite.scala @@ -21,7 +21,8 @@ import java.sql.Timestamp import org.apache.spark.sql.Row import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.functions.{lit, window} +import org.apache.spark.sql.functions.{expr, lit, window} +import org.apache.spark.sql.internal.SQLConf /** * This test ensures that any optimizations done by Spark SQL optimizer are @@ -416,4 +417,111 @@ class StreamingQueryOptimizationCorrectnessSuite extends StreamTest { ) } } + + test("SPARK-48267: regression test, stream-stream union followed by stream-batch join") { + withTempDir { dir => + val input1 = MemoryStream[Int] + val input2 = MemoryStream[Int] + + val df1 = input1.toDF().withColumn("code", lit(1)) + val df2 = input2.toDF().withColumn("code", lit(null)) + + // NOTE: The column 'ref_code' is known to be non-nullable. + val batchDf = spark.range(1, 5).select($"id".as("ref_code")) + + val unionDf = df1.union(df2) + .join(batchDf, expr("code = ref_code")) + .select("value") + + testStream(unionDf)( + StartStream(checkpointLocation = dir.getAbsolutePath), + + AddData(input1, 1, 2, 3), + CheckNewAnswer(1, 2, 3), + + AddData(input2, 1, 2, 3), + // Before SPARK-47305, this test failed with the error message below: + // org.apache.spark.sql.streaming.StreamingQueryException: Stream-stream join without + // equality predicate is not supported.; + // Join Inner + // :- StreamingDataSourceV2ScanRelation[value#3] MemoryStreamDataSource + // +- LocalRelation + // Note that LocalRelation is actually a batch source (Range) but due to + // a bug, it was incorrectly marked as streaming. SPARK-47305 fixed the bug.
+ CheckNewAnswer() + ) + } + } + + test("SPARK-48481: DISTINCT with empty stream source should retain AGGREGATE") { + def doTest(numExpectedStatefulOperatorsForOneEmptySource: Int): Unit = { + withTempView("tv1", "tv2") { + val inputStream1 = MemoryStream[Int] + val ds1 = inputStream1.toDS() + ds1.createOrReplaceTempView("tv1") + + val inputStream2 = MemoryStream[Int] + val ds2 = inputStream2.toDS() + ds2.createOrReplaceTempView("tv2") + + // DISTINCT is rewritten to AGGREGATE, hence an AGGREGATE for each source + val unioned = spark.sql( + """ + | WITH u AS ( + | SELECT DISTINCT value AS value FROM tv1 + | ), v AS ( + | SELECT DISTINCT value AS value FROM tv2 + | ) + | SELECT value FROM u UNION ALL SELECT value FROM v + |""".stripMargin + ) + + testStream(unioned, OutputMode.Update())( + MultiAddData(inputStream1, 1, 1, 2)(inputStream2, 1, 1, 2), + CheckNewAnswer(1, 2, 1, 2), + Execute { qe => + val stateOperators = qe.lastProgress.stateOperators + // Aggregate should be the "stateful" one + assert(stateOperators.length === 2) + stateOperators.zipWithIndex.foreach { case (op, id) => + assert(op.numRowsUpdated === 2, s"stateful OP ID: $id") + } + }, + AddData(inputStream2, 2, 2, 3), + // NOTE: this is probably far from user expectation to have 2 as output given the user + // intends deduplication, but the behavior is still correct with the rewritten node and + // output mode: Aggregate & Update mode. + // TODO: Probably we should disallow DISTINCT or rewrite to + // dropDuplicates(WithinWatermark) for streaming sources? + CheckNewAnswer(2, 3), + Execute { qe => + val stateOperators = qe.lastProgress.stateOperators + // Aggregate should be the "stateful" one + assert(stateOperators.length === numExpectedStatefulOperatorsForOneEmptySource) + val opWithUpdatedRows = stateOperators.zipWithIndex.filterNot(_._1.numRowsUpdated == 0) + assert(opWithUpdatedRows.length === 1) + // If this were dropDuplicates, numRowsUpdated should have been 1.
+ assert(opWithUpdatedRows.head._1.numRowsUpdated === 2, + s"stateful OP ID: ${opWithUpdatedRows.head._2}") + }, + AddData(inputStream1, 4, 4, 5), + CheckNewAnswer(4, 5), + Execute { qe => + val stateOperators = qe.lastProgress.stateOperators + assert(stateOperators.length === numExpectedStatefulOperatorsForOneEmptySource) + val opWithUpdatedRows = stateOperators.zipWithIndex.filterNot(_._1.numRowsUpdated == 0) + assert(opWithUpdatedRows.length === 1) + assert(opWithUpdatedRows.head._1.numRowsUpdated === 2, + s"stateful OP ID: ${opWithUpdatedRows.head._2}") + } + ) + } + } + + doTest(numExpectedStatefulOperatorsForOneEmptySource = 2) + + withSQLConf(SQLConf.STREAMING_OPTIMIZE_ONE_ROW_PLAN_ENABLED.key -> "true") { + doTest(numExpectedStatefulOperatorsForOneEmptySource = 1) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala index dce27bdc5d1ca..e748ae8e7d7df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala @@ -172,6 +172,7 @@ class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { test("StreamingQueryProgress - json") { assert(compact(parse(testProgress1.json)) === testProgress1.json) assert(compact(parse(testProgress2.json)) === testProgress2.json) + assert(compact(parse(testProgress3.json)) === testProgress3.json) } test("StreamingQueryProgress - toString") { @@ -499,6 +500,28 @@ object StreamingQueryStatusAndProgressSuite { "event_b2" -> row(schema2, 200L, "fzo", "baz")).asJava) ) + val testProgress3 = new StreamingQueryProgress( + id = UUID.randomUUID, + runId = UUID.randomUUID, + name = "myName", + timestamp = "2024-05-28T00:00:00.233Z", + batchId = 2L, + batchDuration = 0L, + durationMs = null, + eventTime = null, + stateOperators = Array(new StateOperatorProgress(operatorName = "op1", + numRowsTotal = 0, numRowsUpdated = 1, allUpdatesTimeMs = 1, numRowsRemoved = 2, + allRemovalsTimeMs = 34, commitTimeMs = 23, memoryUsedBytes = 3, numRowsDroppedByWatermark = 0, + numShufflePartitions = 2, numStateStoreInstances = 2, + customMetrics = new java.util.HashMap(Map("stateOnCurrentVersionSizeBytes" -> 2L, + "loadedMapCacheHitCount" -> 1L, "loadedMapCacheMissCount" -> 0L) + .transform((_, v) => long2Long(v)).asJava) + )), + sources = Array(), + sink = SinkProgress("sink", None), + observedMetrics = null + ) + val testStatus = new StreamingQueryStatus("active", true, false) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 504c0b334e426..061b353879d14 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -1364,10 +1364,39 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi ) } + test("Collation aware streaming") { + withTable("parquet_streaming_tbl") { + spark.sql( + """ + |CREATE TABLE parquet_streaming_tbl + |( + | key STRING COLLATE UTF8_LCASE, + | value_stream INTEGER + |) USING parquet""".stripMargin) + + val streamDf = spark.readStream.table("parquet_streaming_tbl") + val filteredDf = streamDf.filter("key = 'aaa'") + + val clock = new 
StreamManualClock() + testStream(filteredDf)( + StartStream(triggerClock = clock, trigger = Trigger.ProcessingTime(100)), + Execute { _ => + spark.createDataFrame(Seq("aaa" -> 1, "AAA" -> 2, "bbb" -> 3, "aa" -> 4)) + .toDF("key", "value_stream") + .write.format("parquet").mode(SaveMode.Append) + .saveAsTable("parquet_streaming_tbl") + }, + AdvanceManualClock(150), + waitUntilBatchProcessed(clock), + CheckLastBatch(("aaa", 1), ("AAA", 2)) + ) + } + } + test("SPARK-47776: streaming aggregation having binary inequality column in the grouping " + "key must be disallowed") { val tableName = "parquet_dummy_tbl" - val collationName = "UTF8_BINARY_LCASE" + val collationName = "UTF8_LCASE" withTable(tableName) { sql( @@ -1396,13 +1425,30 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi ex.getCause.asInstanceOf[SparkUnsupportedOperationException], errorClass = "STATE_STORE_UNSUPPORTED_OPERATION_BINARY_INEQUALITY", parameters = Map( - "schema" -> ".+\"type\":\"string collate UTF8_BINARY_LCASE\".+" + "schema" -> ".+\"c1\":\"spark.UTF8_LCASE\".+" ), matchPVals = true ) } } + test("SPARK-48447: check state store provider class before invoking the constructor") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> classOf[Object].getCanonicalName) { + val input = MemoryStream[Int] + input.addData(1) + val query = input.toDF().limit(2).writeStream + .trigger(Trigger.AvailableNow()) + .format("console") + .start() + val ex = intercept[StreamingQueryException] { + query.processAllAvailable() + } + assert(ex.getMessage.contains( + s"The given State Store Provider ${classOf[Object].getCanonicalName} does not " + + "extend org.apache.spark.sql.execution.streaming.state.StateStoreProvider.")) + } + } + private def checkExceptionMessage(df: DataFrame): Unit = { withTempDir { outputDir => withTempDir { checkpointDir => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala index 4827d06d64d07..8a9d4d42ef2b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala @@ -23,11 +23,11 @@ import scala.annotation.tailrec import org.apache.commons.io.FileUtils -import org.apache.spark.SparkException +import org.apache.spark.{SparkException, SparkUnsupportedOperationException} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.Complete import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.execution.streaming.state.{InvalidUnsafeRowException, StateSchemaNotCompatible} +import org.apache.spark.sql.execution.streaming.state.{StateStoreKeyRowFormatValidationFailure, StateStoreValueRowFormatValidationFailure} import org.apache.spark.sql.functions._ import org.apache.spark.tags.SlowSQLTest import org.apache.spark.util.Utils @@ -253,8 +253,9 @@ class StreamingStateStoreFormatCompatibilitySuite extends StreamTest { @tailrec private def findStateSchemaException(exc: Throwable): Boolean = { exc match { - case _: StateSchemaNotCompatible => true - case _: InvalidUnsafeRowException => true + case _: SparkUnsupportedOperationException => true + case _: StateStoreKeyRowFormatValidationFailure => true + case _: StateStoreValueRowFormatValidationFailure => true case e1 if 
e1.getCause != null => findStateSchemaException(e1.getCause) case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala new file mode 100644 index 0000000000000..bf46c802fdea4 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithMapStateTTLSuite.scala @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import java.time.Duration + +import org.apache.spark.sql.Encoders +import org.apache.spark.sql.execution.streaming.{MapStateImplWithTTL, MemoryStream} +import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.util.StreamManualClock + +class MapStateSingleKeyTTLProcessor(ttlConfig: TTLConfig) + extends StatefulProcessor[String, InputEvent, OutputEvent] { + + @transient private var _mapState: MapStateImplWithTTL[String, Int] = _ + + override def init( + outputMode: OutputMode, + timeMode: TimeMode): Unit = { + _mapState = getHandle + .getMapState("mapState", Encoders.STRING, Encoders.scalaInt, ttlConfig) + .asInstanceOf[MapStateImplWithTTL[String, Int]] + } + + override def handleInputRows( + key: String, + inputRows: Iterator[InputEvent], + timerValues: TimerValues, + expiredTimerInfo: ExpiredTimerInfo): Iterator[OutputEvent] = { + var results = List[OutputEvent]() + + for (row <- inputRows) { + val resultIter = processRow(row, _mapState) + resultIter.foreach { r => + results = r :: results + } + } + + results.iterator + } + + def processRow( + row: InputEvent, + mapState: MapStateImplWithTTL[String, Int]): Iterator[OutputEvent] = { + var results = List[OutputEvent]() + val key = row.key + val userKey = "key" + if (row.action == "get") { + if (mapState.containsKey(userKey)) { + results = OutputEvent(key, mapState.getValue(userKey), isTTLValue = false, -1) :: results + } + } else if (row.action == "get_without_enforcing_ttl") { + val currState = mapState.getWithoutEnforcingTTL(userKey) + if (currState.isDefined) { + results = OutputEvent(key, currState.get, isTTLValue = false, -1) :: results + } + } else if (row.action == "get_ttl_value_from_state") { + val ttlValue = mapState.getTTLValue(userKey) + if (ttlValue.isDefined) { + val value = ttlValue.get._1 + val ttlExpiration = ttlValue.get._2 + results = OutputEvent(key, value, isTTLValue = true, ttlExpiration) :: results + } + } else if (row.action == "put") { + mapState.updateValue(userKey, row.value) + } else if (row.action == "get_values_in_ttl_state") { + val ttlValues = mapState.getKeyValuesInTTLState() + ttlValues.foreach { v => + 
results = OutputEvent(key, -1, isTTLValue = true, ttlValue = v._2) :: results + } + } + + results.iterator + } +} + +case class MapInputEvent( + key: String, + userKey: String, + action: String, + value: Int) + +case class MapOutputEvent( + key: String, + userKey: String, + value: Int, + isTTLValue: Boolean, + ttlValue: Long) + +class MapStateTTLProcessor(ttlConfig: TTLConfig) + extends StatefulProcessor[String, MapInputEvent, MapOutputEvent] { + + @transient private var _mapState: MapStateImplWithTTL[String, Int] = _ + + override def init( + outputMode: OutputMode, + timeMode: TimeMode): Unit = { + _mapState = getHandle + .getMapState("mapState", Encoders.STRING, Encoders.scalaInt, ttlConfig) + .asInstanceOf[MapStateImplWithTTL[String, Int]] + } + + override def handleInputRows( + key: String, + inputRows: Iterator[MapInputEvent], + timerValues: TimerValues, + expiredTimerInfo: ExpiredTimerInfo): Iterator[MapOutputEvent] = { + var results = List[MapOutputEvent]() + + for (row <- inputRows) { + val resultIter = processRow(row, _mapState) + resultIter.foreach { r => + results = r :: results + } + } + + results.iterator + } + + def processRow( + row: MapInputEvent, + mapState: MapStateImplWithTTL[String, Int]): Iterator[MapOutputEvent] = { + var results = List[MapOutputEvent]() + val key = row.key + val userKey = row.userKey + if (row.action == "get") { + if (mapState.containsKey(userKey)) { + results = MapOutputEvent(key, userKey, mapState.getValue(userKey), + isTTLValue = false, -1) :: results + } + } else if (row.action == "get_without_enforcing_ttl") { + val currState = mapState.getWithoutEnforcingTTL(userKey) + if (currState.isDefined) { + results = MapOutputEvent(key, userKey, currState.get, isTTLValue = false, -1) :: results + } + } else if (row.action == "get_ttl_value_from_state") { + val ttlValue = mapState.getTTLValue(userKey) + if (ttlValue.isDefined) { + val value = ttlValue.get._1 + val ttlExpiration = ttlValue.get._2 + results = MapOutputEvent(key, userKey, value, isTTLValue = true, ttlExpiration) :: results + } + } else if (row.action == "put") { + mapState.updateValue(userKey, row.value) + } else if (row.action == "get_values_in_ttl_state") { + val ttlValues = mapState.getKeyValuesInTTLState() + ttlValues.foreach { elem => + results = MapOutputEvent(key, elem._1, -1, isTTLValue = true, ttlValue = elem._2) :: results + } + } else if (row.action == "iterator") { + val iter = mapState.iterator() + iter.foreach { elem => + results = MapOutputEvent(key, elem._1, elem._2, isTTLValue = false, -1) :: results + } + } + + results.iterator + } +} + +class TransformWithMapStateTTLSuite extends TransformWithStateTTLTest { + + import testImplicits._ + override def getProcessor(ttlConfig: TTLConfig): + StatefulProcessor[String, InputEvent, OutputEvent] = { + new MapStateSingleKeyTTLProcessor(ttlConfig) + } + + override def getStateTTLMetricName: String = "numMapStateWithTTLVars" + + test("validate state is evicted with multiple user keys") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + + val inputStream = MemoryStream[MapInputEvent] + val ttlConfig = TTLConfig(ttlDuration = Duration.ofMinutes(1)) + val result = inputStream.toDS() + .groupByKey(x => x.key) + .transformWithState( + new MapStateTTLProcessor(ttlConfig), + TimeMode.ProcessingTime(), + OutputMode.Append()) + + val clock = new StreamManualClock + testStream(result)( + StartStream(Trigger.ProcessingTime("1 second"), triggerClock = 
clock), + AddData(inputStream, MapInputEvent("k1", "key1", "put", 1)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(), + AddData(inputStream, MapInputEvent("k1", "key1", "get", -1)), + AdvanceManualClock(30 * 1000), + CheckNewAnswer(MapOutputEvent("k1", "key1", 1, isTTLValue = false, -1)), + AddData(inputStream, MapInputEvent("k1", "key2", "put", 2)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(), + // advance clock to expire first key + AdvanceManualClock(30 * 1000), + AddData(inputStream, MapInputEvent("k1", "key1", "get", -1), + MapInputEvent("k1", "key2", "get", -1)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer(MapOutputEvent("k1", "key2", 2, isTTLValue = false, -1)), + StopStream + ) + } + } + + test("verify iterator doesn't return expired keys") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + + val inputStream = MemoryStream[MapInputEvent] + val ttlConfig = TTLConfig(ttlDuration = Duration.ofMinutes(1)) + val result = inputStream.toDS() + .groupByKey(x => x.key) + .transformWithState( + new MapStateTTLProcessor(ttlConfig), + TimeMode.ProcessingTime(), + OutputMode.Append()) + + val clock = new StreamManualClock + testStream(result)( + StartStream(Trigger.ProcessingTime("1 second"), triggerClock = clock), + AddData(inputStream, + MapInputEvent("k1", "key1", "put", 1), + MapInputEvent("k1", "key2", "put", 2) + ), + AdvanceManualClock(1 * 1000), // batch timestamp: 1000 + CheckNewAnswer(), + AddData(inputStream, + MapInputEvent("k1", "key1", "get", -1), + MapInputEvent("k1", "key2", "get", -1) + ), + AdvanceManualClock(30 * 1000), // batch timestamp: 31000 + CheckNewAnswer( + MapOutputEvent("k1", "key1", 1, isTTLValue = false, -1), + MapOutputEvent("k1", "key2", 2, isTTLValue = false, -1) + ), + // get values from ttl state + AddData(inputStream, + MapInputEvent("k1", "", "get_values_in_ttl_state", -1) + ), + AdvanceManualClock(1 * 1000), // batch timestamp: 32000 + CheckNewAnswer( + MapOutputEvent("k1", "key1", -1, isTTLValue = true, 61000), + MapOutputEvent("k1", "key2", -1, isTTLValue = true, 61000) + ), + // advance clock to expire first two values + AdvanceManualClock(30 * 1000), // batch timestamp: 62000 + AddData(inputStream, + MapInputEvent("k1", "key3", "put", 3), + MapInputEvent("k1", "key4", "put", 4), + MapInputEvent("k1", "key5", "put", 5), + MapInputEvent("k1", "", "iterator", -1) + ), + AdvanceManualClock(1 * 1000), // batch timestamp: 63000 + CheckNewAnswer( + MapOutputEvent("k1", "key3", 3, isTTLValue = false, -1), + MapOutputEvent("k1", "key4", 4, isTTLValue = false, -1), + MapOutputEvent("k1", "key5", 5, isTTLValue = false, -1) + ), + AddData(inputStream, + MapInputEvent("k1", "", "get_values_in_ttl_state", -1) + ), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( + MapOutputEvent("k1", "key3", -1, isTTLValue = true, 123000), + MapOutputEvent("k1", "key4", -1, isTTLValue = true, 123000), + MapOutputEvent("k1", "key5", -1, isTTLValue = true, 123000) + ), + // get all values without enforcing ttl + AddData(inputStream, + MapInputEvent("k1", "key1", "get_without_enforcing_ttl", -1), + MapInputEvent("k1", "key2", "get_without_enforcing_ttl", -1), + MapInputEvent("k1", "key3", "get_without_enforcing_ttl", -1), + MapInputEvent("k1", "key4", "get_without_enforcing_ttl", -1), + MapInputEvent("k1", "key5", "get_without_enforcing_ttl", -1) + ), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( + MapOutputEvent("k1", "key3", 3, isTTLValue = false, -1), + 
MapOutputEvent("k1", "key4", 4, isTTLValue = false, -1), + MapOutputEvent("k1", "key5", 5, isTTLValue = false, -1) + ), + // check that updating a key updates its TTL + AddData(inputStream, MapInputEvent("k1", "key3", "put", 3)), + AdvanceManualClock(1 * 1000), + AddData(inputStream, MapInputEvent("k1", "", "get_values_in_ttl_state", -1)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( + MapOutputEvent("k1", "key3", -1, isTTLValue = true, 123000), + MapOutputEvent("k1", "key3", -1, isTTLValue = true, 126000), + MapOutputEvent("k1", "key4", -1, isTTLValue = true, 123000), + MapOutputEvent("k1", "key5", -1, isTTLValue = true, 123000) + ), + AddData(inputStream, MapInputEvent("k1", "key3", "get_ttl_value_from_state", -1)), + AdvanceManualClock(1 * 1000), + CheckNewAnswer( + MapOutputEvent("k1", "key3", 3, isTTLValue = true, 126000) + ), + StopStream + ) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala new file mode 100644 index 0000000000000..5388d6f1fb68a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming + +import java.sql.Timestamp +import java.time.{Instant, LocalDateTime, ZoneId} + +import org.apache.spark.{SparkRuntimeException, SparkThrowable} +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.ExtendedAnalysisException +import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamExecution} +import org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider +import org.apache.spark.sql.functions.window +import org.apache.spark.sql.internal.SQLConf + +case class InputEventRow( + key: String, + eventTime: Timestamp, + event: String) + +case class OutputRow( + key: String, + outputEventTime: Timestamp, + count: Int) + +class TestStatefulProcessor + extends StatefulProcessor[String, InputEventRow, OutputRow] { + override def init(outputMode: OutputMode, timeMode: TimeMode): Unit = {} + + override def handleInputRows( + key: String, + inputRows: Iterator[InputEventRow], + timerValues: TimerValues, + expiredTimerInfo: ExpiredTimerInfo): Iterator[OutputRow] = { + if (inputRows.isEmpty) { + Iterator.empty + } else { + var minEventTime = inputRows.next().eventTime + var count = 1 + inputRows.foreach { row => + if (row.eventTime.before(minEventTime)) { + minEventTime = row.eventTime + } + count += 1 + } + Iterator.single(OutputRow(key, minEventTime, count)) + } + } +} + +class InputCountStatefulProcessor[T] + extends StatefulProcessor[String, T, Int] { + override def init(outputMode: OutputMode, timeMode: TimeMode): Unit = {} + + override def handleInputRows( + key: String, + inputRows: Iterator[T], + timerValues: TimerValues, + expiredTimerInfo: ExpiredTimerInfo): Iterator[Int] = { + Iterator.single(inputRows.size) + } +} + +/** + * Emits output row with timestamp older than current watermark for batchId > 0. 
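+ * Used to verify that the query fails with EMITTING_ROWS_OLDER_THAN_WATERMARK_NOT_ALLOWED
+ * once the watermark advances past the emitted event time (1 ms after epoch).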
+ */ +class StatefulProcessorEmittingRowsOlderThanWatermark + extends StatefulProcessor[String, InputEventRow, OutputRow] { + override def init(outputMode: OutputMode, timeMode: TimeMode): Unit = {} + + override def handleInputRows( + key: String, + inputRows: Iterator[InputEventRow], + timerValues: TimerValues, + expiredTimerInfo: ExpiredTimerInfo): Iterator[OutputRow] = { + Iterator.single( + OutputRow( + key, + // always emit value with eventTime 1 which will fail after first batch, as + // watermark will move past 0L + Timestamp.from(Instant.ofEpochMilli(1)), + inputRows.size)) + } +} + +case class Window( + start: Timestamp, + end: Timestamp) + +case class AggEventRow( + window: Window, + count: Long) + +class TransformWithStateChainingSuite extends StreamTest { + import testImplicits._ + + test("watermark is propagated correctly for next stateful operator" + + " after transformWithState") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + .groupBy(window($"outputEventTime", "1 minute")) + .count() + .as[AggEventRow] + + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-01-01 00:00:00"), "e1")), + // watermark should be 1 minute behind `2024-01-01 00:00:00`, nothing is + // emitted as all records have timestamp > epoch + CheckNewAnswer(), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2023-12-31 23:59:00")) + }, + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + // global watermark should now be 1 minute behind `2024-02-01 00:00:00`. 
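+ // which closes the 2024-01-01 00:00:00 window and emits its aggregate below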
+ CheckNewAnswer(AggEventRow( + Window(timestamp("2024-01-01 00:00:00"), timestamp("2024-01-01 00:01:00")), 1) + ), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + }, + AddData(inputData, InputEventRow("k1", timestamp("2024-02-02 00:00:00"), "e1")), + CheckNewAnswer(AggEventRow( + Window(timestamp("2024-02-01 00:00:00"), timestamp("2024-02-01 00:01:00")), 1) + ) + ) + } + } + + test("passing eventTime column to transformWithState fails if" + + " no watermark is defined") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + + val ex = intercept[AnalysisException] { + inputData.toDS() + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + } + + checkError(ex, "CANNOT_ASSIGN_EVENT_TIME_COLUMN_WITHOUT_WATERMARK") + } + } + + test("missing eventTime column to transformWithState fails the query if" + + " another stateful operator is added") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + TimeMode.None(), + OutputMode.Append()) + .groupBy(window($"outputEventTime", "1 minute")) + .count() + + val ex = intercept[ExtendedAnalysisException] { + testStream(result, OutputMode.Append())( + StartStream() + ) + } + assert(ex.getMessage.contains("there are streaming aggregations on" + + " streaming DataFrames/DataSets without watermark")) + } + } + + test("chaining multiple transformWithState operators") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + .groupByKey(x => x.key) + .transformWithState( + new InputCountStatefulProcessor[OutputRow](), + TimeMode.None(), + OutputMode.Append() + ) + + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-01-01 00:00:00"), "e1")), + CheckNewAnswer(1), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2023-12-31 23:59:00")) + }, + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + CheckNewAnswer(1), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + }, + AddData(inputData, InputEventRow("k1", timestamp("2024-02-02 00:00:00"), "e1")), + CheckNewAnswer(1) + ) + } + } + + test("dropDuplicateWithWatermark after transformWithState operator" + + " fails if watermark column is not provided") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + TimeMode.None(), + OutputMode.Append()) + .dropDuplicatesWithinWatermark() + + val ex = intercept[ExtendedAnalysisException] { + testStream(result, OutputMode.Append())( + StartStream() + ) + } + 
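+ // without an assigned event-time column the transformWithState output carries no watermark,
+ // so the analyzer rejects dropDuplicatesWithinWatermark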
assert(ex.getMessage.contains("dropDuplicatesWithinWatermark is not supported on" + + " streaming DataFrames/DataSets without watermark")) + } + } + + test("dropDuplicateWithWatermark after transformWithState operator") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + .dropDuplicatesWithinWatermark() + + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1"), + InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"), 2)), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + } + ) + } + } + + test("query fails if the output dataset does not contain specified eventTimeColumn") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + + val ex = intercept[ExtendedAnalysisException] { + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "missingEventTimeColumn", + OutputMode.Append()) + + testStream(result, OutputMode.Append())( + StartStream() + ) + } + + checkError(ex, "UNRESOLVED_COLUMN.WITH_SUGGESTION", + parameters = Map( + "objectName" -> "`missingEventTimeColumn`", + "proposal" -> "`outputEventTime`, `count`, `key`")) + } + } + + test("query fails if the output dataset contains rows older than current watermark") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + val result = inputData.toDS() + .withWatermark("eventTime", "1 minute") + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new StatefulProcessorEmittingRowsOlderThanWatermark(), + "outputEventTime", + OutputMode.Append()) + + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + // after first batch, the rows are emitted with timestamp 1 ms after epoch + CheckNewAnswer(OutputRow("k1", Timestamp.from(Instant.ofEpochMilli(1)), 1)), + // this batch would fail now, because watermark will move past 1ms after epoch + AddData(inputData, InputEventRow("k1", timestamp("2024-02-02 00:00:00"), "e1")), + ExpectFailure[SparkRuntimeException] { ex => + checkError(ex.asInstanceOf[SparkThrowable], + "EMITTING_ROWS_OLDER_THAN_WATERMARK_NOT_ALLOWED", + parameters = Map("currentWatermark" -> "1706774340000", + "emittedRowEventTime" -> "1000")) + } + ) + } + } + + test("ensure that watermark delay is resolved from a view") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + inputData.toDS() + .withWatermark("eventTime", "1 minute") + .createTempView("tempViewWithWatermark") + + val result = spark.readStream.table("tempViewWithWatermark") + .as[InputEventRow] + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + + testStream(result, OutputMode.Append())( + AddData(inputData, 
InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"), 1)), + Execute("assertWatermarkEquals") { q => + assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00")) + } + ) + } + } + + test("ensure that query fails if there is no watermark when reading from a view") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName) { + val inputData = MemoryStream[InputEventRow] + inputData.toDS() + .createTempView("tempViewWithoutWatermark") + + val ex = intercept[AnalysisException] { + val result = spark.readStream.table("tempViewWithoutWatermark") + .as[InputEventRow] + .groupByKey(x => x.key) + .transformWithState[OutputRow]( + new TestStatefulProcessor(), + "outputEventTime", + OutputMode.Append()) + + testStream(result, OutputMode.Append())( + AddData(inputData, InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")), + ExpectFailure[SparkRuntimeException] { ex => + checkError(ex.asInstanceOf[AnalysisException], + "CANNOT_ASSIGN_EVENT_TIME_COLUMN_WITHOUT_WATERMARK") + } + ) + } + + checkError(ex, "CANNOT_ASSIGN_EVENT_TIME_COLUMN_WITHOUT_WATERMARK") + } + } + + private def timestamp(str: String): Timestamp = { + Timestamp.valueOf(str) + } + + private def assertWatermarkEquals(q: StreamExecution, watermark: Timestamp): Unit = { + val queryWatermark = getQueryWatermark(q) + assert(queryWatermark.isDefined) + assert(queryWatermark.get === watermark) + } + + private def getQueryWatermark(q: StreamExecution): Option[Timestamp] = { + import scala.jdk.CollectionConverters._ + val eventTimeMap = q.lastProgress.eventTime.asScala + val queryWatermark = eventTimeMap.get("watermark") + queryWatermark.map { v => + val instant = Instant.parse(v) + val local = LocalDateTime.ofInstant(instant, ZoneId.systemDefault()) + Timestamp.valueOf(local) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala index 0057af44d3e37..5e408dc999f82 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala @@ -25,7 +25,8 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, Encoders} import org.apache.spark.sql.catalyst.util.stringToFile import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithChangelogCheckpointingEnabled, RocksDBStateStoreProvider, StatefulProcessorCannotPerformOperationWithInvalidHandleState, StateStoreMultipleColumnFamiliesNotSupportedException} +import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchema.{KEY_ROW_SCHEMA, VALUE_ROW_SCHEMA} +import org.apache.spark.sql.execution.streaming.state.{AlsoTestWithChangelogCheckpointingEnabled, ColumnFamilySchemaV1, NoPrefixKeyStateEncoderSpec, RocksDBStateStoreProvider, StatefulProcessorCannotPerformOperationWithInvalidHandleState, StateSchemaV3File, StateStoreMultipleColumnFamiliesNotSupportedException} import org.apache.spark.sql.functions.timestamp_seconds import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.StreamManualClock @@ -784,6 +785,79 @@ class TransformWithStateSuite extends StateStoreMetricsTest } } } + + test("transformWithState - verify StateSchemaV3 serialization and deserialization" + + " works with one batch") { 
+ withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> + TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS.toString) { + withTempDir { checkpointDir => + val schema = List(ColumnFamilySchemaV1( + "countState", + KEY_ROW_SCHEMA, + VALUE_ROW_SCHEMA, + NoPrefixKeyStateEncoderSpec(KEY_ROW_SCHEMA), + None + )) + + val schemaFile = new StateSchemaV3File( + spark.sessionState.newHadoopConf(), checkpointDir.getCanonicalPath) + val path = schemaFile.addWithUUID(0, schema) + + assert(schemaFile.getWithPath(path) == schema) + } + } + } + + test("transformWithState - verify StateSchemaV3 serialization and deserialization" + + " works with multiple batches") { + withSQLConf(SQLConf.STATE_STORE_PROVIDER_CLASS.key -> + classOf[RocksDBStateStoreProvider].getName, + SQLConf.SHUFFLE_PARTITIONS.key -> + TransformWithStateSuiteUtils.NUM_SHUFFLE_PARTITIONS.toString) { + withTempDir { checkpointDir => + + val schema0 = List(ColumnFamilySchemaV1( + "countState", + KEY_ROW_SCHEMA, + VALUE_ROW_SCHEMA, + NoPrefixKeyStateEncoderSpec(KEY_ROW_SCHEMA), + None + )) + + val schema1 = List( + ColumnFamilySchemaV1( + "countState", + KEY_ROW_SCHEMA, + VALUE_ROW_SCHEMA, + NoPrefixKeyStateEncoderSpec(KEY_ROW_SCHEMA), + None + ), + ColumnFamilySchemaV1( + "mostRecent", + KEY_ROW_SCHEMA, + VALUE_ROW_SCHEMA, + NoPrefixKeyStateEncoderSpec(KEY_ROW_SCHEMA), + None + ) + ) + + val schemaFile = new StateSchemaV3File(spark.sessionState.newHadoopConf(), + checkpointDir.getCanonicalPath) + val path0 = schemaFile.addWithUUID(0, schema0) + + assert(schemaFile.getWithPath(path0) == schema0) + + // test the case where we are trying to add the schema after + // restarting after a few batches + val path1 = schemaFile.addWithUUID(3, schema1) + val latestSchema = schemaFile.getWithPath(path1) + + assert(latestSchema == schema1) + } + } + } } class TransformWithStateValidationSuite extends StateStoreMetricsTest { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 7da2bb47038ed..5fbf379644f6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -339,8 +339,7 @@ private[sql] trait SQLTestUtilsBase val tableIdent = spark.sessionState.sqlParser.parseTableIdentifier(tableName) val cascade = !spark.sessionState.catalog.isTempView(tableIdent) spark.sharedState.cacheManager.uncacheQuery( - spark, - spark.table(tableName).logicalPlan, + spark.table(tableName), cascade = cascade, blocking = true) } diff --git a/sql/gen-sql-functions-docs.py b/sql/gen-sql-functions-docs.py index 053e11d10295b..dc48a5a6155ed 100644 --- a/sql/gen-sql-functions-docs.py +++ b/sql/gen-sql-functions-docs.py @@ -163,7 +163,8 @@ def _make_pretty_examples(jspark, infos): pretty_output = "" for info in infos: - if info.examples.startswith("\n Examples:"): + if (info.examples.startswith("\n Examples:") + and info.name.lower() not in ("from_avro", "to_avro")): output = [] output.append("-- %s" % info.name) query_examples = filter(lambda x: x.startswith(" > "), info.examples.split("\n")) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java index 6481cf15075a7..b31d024eeeeb9 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java +++ 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java @@ -21,8 +21,11 @@ import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * AbstractService. @@ -30,7 +33,7 @@ */ public abstract class AbstractService implements Service { - private static final Logger LOG = LoggerFactory.getLogger(AbstractService.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(AbstractService.class); /** * Service state: initially {@link STATE#NOTINITED}. @@ -85,7 +88,7 @@ public synchronized void init(HiveConf hiveConf) { ensureCurrentState(STATE.NOTINITED); this.hiveConf = hiveConf; changeState(STATE.INITED); - LOG.info("Service:" + getName() + " is inited."); + LOG.info("Service:{} is inited.", MDC.of(LogKeys.SERVICE_NAME$.MODULE$, getName())); } /** @@ -100,7 +103,7 @@ public synchronized void start() { startTime = System.currentTimeMillis(); ensureCurrentState(STATE.INITED); changeState(STATE.STARTED); - LOG.info("Service:" + getName() + " is started."); + LOG.info("Service:{} is started.", MDC.of(LogKeys.SERVICE_NAME$.MODULE$, getName())); } /** @@ -121,7 +124,7 @@ public synchronized void stop() { } ensureCurrentState(STATE.STARTED); changeState(STATE.STOPPED); - LOG.info("Service:" + getName() + " is stopped."); + LOG.info("Service:{} is stopped.", MDC.of(LogKeys.SERVICE_NAME$.MODULE$, getName())); } @Override diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CompositeService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CompositeService.java index 55c1aa52b95ca..663bcdb86f9f6 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CompositeService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CompositeService.java @@ -23,8 +23,11 @@ import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * CompositeService. @@ -32,7 +35,7 @@ */ public class CompositeService extends AbstractService { - private static final Logger LOG = LoggerFactory.getLogger(CompositeService.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(CompositeService.class); private final List serviceList = new ArrayList(); @@ -70,7 +73,7 @@ public synchronized void start() { } super.start(); } catch (Throwable e) { - LOG.error("Error starting services " + getName(), e); + LOG.error("Error starting services {}", e, MDC.of(LogKeys.SERVICE_NAME$.MODULE$, getName())); // Note that the state of the failed service is still INITED and not // STARTED. 
Even though the last service is not started completely, still // call stop() on all services including failed service to make sure cleanup @@ -100,7 +103,7 @@ private synchronized void stop(int numOfServicesStarted) { try { service.stop(); } catch (Throwable t) { - LOG.info("Error stopping " + service.getName(), t); + LOG.info("Error stopping {}", t, MDC.of(LogKeys.SERVICE_NAME$.MODULE$, service.getName())); } } } @@ -123,7 +126,8 @@ public void run() { // Stop the Composite Service compositeService.stop(); } catch (Throwable t) { - LOG.info("Error stopping " + compositeService.getName(), t); + LOG.info("Error stopping {}", t, + MDC.of(LogKeys.SERVICE_NAME$.MODULE$, compositeService.getName())); } } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CookieSigner.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CookieSigner.java index 4b8d2cb1536cd..c315478939c8d 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CookieSigner.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/CookieSigner.java @@ -21,8 +21,9 @@ import java.security.NoSuchAlgorithmException; import org.apache.commons.codec.binary.Base64; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; /** * The cookie signer generates a signature based on SHA digest @@ -33,7 +34,7 @@ public class CookieSigner { private static final String SIGNATURE = "&s="; private static final String SHA_STRING = "SHA-256"; private byte[] secretBytes; - private static final Logger LOG = LoggerFactory.getLogger(CookieSigner.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(CookieSigner.class); /** * Constructor diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java index 434676aa8d215..92d733c563cab 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java @@ -18,15 +18,18 @@ package org.apache.hive.service; import org.apache.hadoop.hive.conf.HiveConf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * ServiceOperations. 
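 * Static utility methods for service lifecycle handling, e.g. stopQuietly(), which logs and swallows stop failures.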
* */ public final class ServiceOperations { - private static final Logger LOG = LoggerFactory.getLogger(ServiceOperations.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(ServiceOperations.class); private ServiceOperations() { } @@ -129,9 +132,8 @@ public static Exception stopQuietly(Service service) { try { stop(service); } catch (Exception e) { - LOG.warn("When stopping the service " + service.getName() - + " : " + e, - e); + LOG.warn("When stopping the service {}", e, + MDC.of(LogKeys.SERVICE_NAME$.MODULE$, service.getName())); return e; } return null; diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceUtils.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceUtils.java index 7552bda57dc0b..25db121207bbf 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceUtils.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceUtils.java @@ -18,7 +18,7 @@ import java.io.IOException; -import org.slf4j.Logger; +import org.apache.spark.internal.SparkLogger; public class ServiceUtils { @@ -52,7 +52,7 @@ public static int indexOfDomainMatch(String userName) { * @param log the log to record problems to at debug level. Can be null. * @param closeables the objects to close */ - public static void cleanup(Logger log, java.io.Closeable... closeables) { + public static void cleanup(SparkLogger log, java.io.Closeable... closeables) { for (java.io.Closeable c : closeables) { if (c != null) { try { diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java index e3316cef241c3..ecbda2661e960 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java @@ -27,9 +27,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.metastore.HiveMetaStore; -import org.apache.hadoop.hive.metastore.HiveMetaStore.HMSHandler; -import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.shims.HadoopShims.KerberosNameShim; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.hive.thrift.DBTokenStore; @@ -44,16 +42,19 @@ import org.apache.thrift.TProcessorFactory; import org.apache.thrift.transport.TTransportException; import org.apache.thrift.transport.TTransportFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * This class helps in some aspects of authentication. It creates the proper Thrift classes for the * given configuration as well as helps with authenticating requests. 
*/ public class HiveAuthFactory { - private static final Logger LOG = LoggerFactory.getLogger(HiveAuthFactory.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(HiveAuthFactory.class); public enum AuthTypes { NOSASL("NOSASL"), @@ -132,16 +133,15 @@ public HiveAuthFactory(HiveConf conf) throws TTransportException, IOException { HiveConf.ConfVars.METASTORE_CLUSTER_DELEGATION_TOKEN_STORE_CLS); if (tokenStoreClass.equals(DBTokenStore.class.getName())) { - HMSHandler baseHandler = new HiveMetaStore.HMSHandler( - "new db based metaserver", conf, true); - rawStore = baseHandler.getMS(); + // Follows https://issues.apache.org/jira/browse/HIVE-12270 + rawStore = Hive.class; } delegationTokenManager.startDelegationTokenSecretManager( conf, rawStore, ServerMode.HIVESERVER2); saslServer.setSecretManager(delegationTokenManager.getSecretManager()); } - catch (MetaException|IOException e) { + catch (IOException e) { throw new TTransportException("Failed to start token manager", e); } } @@ -288,9 +288,9 @@ public String verifyDelegationToken(String delegationToken) throws HiveSQLExcept try { return delegationTokenManager.verifyDelegationToken(delegationToken); } catch (IOException e) { - String msg = "Error verifying delegation token " + delegationToken; - LOG.error(msg, e); - throw new HiveSQLException(msg, "08S01", e); + String msg = "Error verifying delegation token"; + LOG.error(msg + " {}", e, MDC.of(LogKeys.TOKEN$.MODULE$, delegationToken)); + throw new HiveSQLException(msg + delegationToken, "08S01", e); } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java index 08a8258db06f2..e307bdab04498 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java @@ -39,8 +39,11 @@ import org.ietf.jgss.GSSManager; import org.ietf.jgss.GSSName; import org.ietf.jgss.Oid; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * Utility functions for HTTP mode authentication. 
@@ -50,7 +53,7 @@ public final class HttpAuthUtils { public static final String AUTHORIZATION = "Authorization"; public static final String BASIC = "Basic"; public static final String NEGOTIATE = "Negotiate"; - private static final Logger LOG = LoggerFactory.getLogger(HttpAuthUtils.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(HttpAuthUtils.class); private static final String COOKIE_ATTR_SEPARATOR = "&"; private static final String COOKIE_CLIENT_USER_NAME = "cu"; private static final String COOKIE_CLIENT_RAND_NUMBER = "rn"; @@ -109,7 +112,8 @@ public static String getUserNameFromCookieToken(String tokenStr) { Map map = splitCookieToken(tokenStr); if (!map.keySet().equals(COOKIE_ATTRIBUTES)) { - LOG.error("Invalid token with missing attributes " + tokenStr); + LOG.error("Invalid token with missing attributes {}", + MDC.of(LogKeys.TOKEN$.MODULE$, tokenStr)); return null; } return map.get(COOKIE_CLIENT_USER_NAME); @@ -129,7 +133,7 @@ private static Map splitCookieToken(String tokenStr) { String part = st.nextToken(); int separator = part.indexOf(COOKIE_KEY_VALUE_SEPARATOR); if (separator == -1) { - LOG.error("Invalid token string " + tokenStr); + LOG.error("Invalid token string {}", MDC.of(LogKeys.TOKEN$.MODULE$, tokenStr)); return null; } String key = part.substring(0, separator); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java index 175412ed98c6c..ef91f94eeec2b 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java @@ -30,6 +30,7 @@ import org.apache.thrift.TProcessorFactory; import org.apache.thrift.transport.TSaslClientTransport; import org.apache.thrift.transport.TTransport; +import org.apache.thrift.transport.TTransportException; public final class KerberosSaslHelper { @@ -68,8 +69,8 @@ public static TTransport createSubjectAssumedTransport(String principal, new TSaslClientTransport("GSSAPI", null, names[0], names[1], saslProps, null, underlyingTransport); return new TSubjectAssumingTransport(saslTransport); - } catch (SaslException se) { - throw new IOException("Could not instantiate SASL transport", se); + } catch (SaslException | TTransportException se) { + throw new IOException("Could not instantiate transport", se); } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java index c06f6ec34653f..5ac29950f4f85 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java @@ -38,6 +38,7 @@ import org.apache.thrift.transport.TSaslClientTransport; import org.apache.thrift.transport.TSaslServerTransport; import org.apache.thrift.transport.TTransport; +import org.apache.thrift.transport.TTransportException; import org.apache.thrift.transport.TTransportFactory; public final class PlainSaslHelper { @@ -64,7 +65,7 @@ public static TTransportFactory getPlainTransportFactory(String authTypeStr) } public static TTransport getPlainTransport(String username, String password, - TTransport underlyingTransport) throws SaslException { + TTransport underlyingTransport) throws SaslException, TTransportException { return new 
TSaslClientTransport("PLAIN", null, null, null, new HashMap(), new PlainCallbackHandler(username, password), underlyingTransport); } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java index 1205d21be6be6..e0091d6c04fe7 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java @@ -25,8 +25,9 @@ import org.apache.thrift.transport.TSaslServerTransport; import org.apache.thrift.transport.TSocket; import org.apache.thrift.transport.TTransport; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; /** * This class is responsible for setting the ipAddress for operations executed via HiveServer2. @@ -38,18 +39,19 @@ */ public class TSetIpAddressProcessor extends TCLIService.Processor { - private static final Logger LOGGER = LoggerFactory.getLogger(TSetIpAddressProcessor.class.getName()); + private static final SparkLogger LOGGER = SparkLoggerFactory.getLogger(TSetIpAddressProcessor.class); public TSetIpAddressProcessor(Iface iface) { super(iface); } @Override - public boolean process(final TProtocol in, final TProtocol out) throws TException { + public void process(final TProtocol in, final TProtocol out) throws TException { setIpAddress(in); setUserName(in); try { - return super.process(in, out); + super.process(in, out); + return; } finally { THREAD_LOCAL_USER_NAME.remove(); THREAD_LOCAL_IP_ADDRESS.remove(); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java index caccb0c4b76f7..86fb725d3a3cc 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java @@ -49,8 +49,11 @@ import org.apache.hive.service.rpc.thrift.TRowSet; import org.apache.hive.service.rpc.thrift.TTableSchema; import org.apache.hive.service.server.HiveServer2; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * CLIService. 
@@ -58,7 +61,7 @@ */ public class CLIService extends CompositeService implements ICLIService { - private static final Logger LOG = LoggerFactory.getLogger(CLIService.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(CLIService.class); public static final TProtocolVersion SERVER_VERSION; @@ -99,8 +102,9 @@ public synchronized void init(HiveConf hiveConf) { String principal = hiveConf.getVar(ConfVars.HIVE_SERVER2_SPNEGO_PRINCIPAL); String keyTabFile = hiveConf.getVar(ConfVars.HIVE_SERVER2_SPNEGO_KEYTAB); if (principal.isEmpty() || keyTabFile.isEmpty()) { - LOG.info("SPNego httpUGI not created, spNegoPrincipal: " + principal + - ", ketabFile: " + keyTabFile); + LOG.info("SPNego httpUGI not created, spNegoPrincipal: {}, keytabFile: {}", + MDC.of(LogKeys.PRINCIPAL$.MODULE$, principal), + MDC.of(LogKeys.KEYTAB_FILE$.MODULE$, keyTabFile)); } else { try { this.httpUGI = HiveAuthFactory.loginFromSpnegoKeytabAndReturnUGI(hiveConf); @@ -457,7 +461,8 @@ public OperationStatus getOperationStatus(OperationHandle opHandle) LOG.trace(opHandle + ": The background operation was cancelled", e); } catch (ExecutionException e) { // The background operation thread was aborted - LOG.warn(opHandle + ": The background operation was aborted", e); + LOG.warn("{}: The background operation was aborted", e, + MDC.of(LogKeys.OPERATION_HANDLE$.MODULE$, opHandle)); } catch (InterruptedException e) { // No op, this thread was interrupted // In this case, the call might return sooner than long polling timeout @@ -551,7 +556,7 @@ public String getDelegationToken(SessionHandle sessionHandle, HiveAuthFactory au String owner, String renewer) throws HiveSQLException { String delegationToken = sessionManager.getSession(sessionHandle) .getDelegationToken(authFactory, owner, renewer); - LOG.info(sessionHandle + ": getDelegationToken()"); + LOG.info("{}: getDelegationToken()", MDC.of(LogKeys.SESSION_HANDLE$.MODULE$, sessionHandle)); return delegationToken; } @@ -559,14 +564,14 @@ public String getDelegationToken(SessionHandle sessionHandle, HiveAuthFactory au public void cancelDelegationToken(SessionHandle sessionHandle, HiveAuthFactory authFactory, String tokenStr) throws HiveSQLException { sessionManager.getSession(sessionHandle).cancelDelegationToken(authFactory, tokenStr); - LOG.info(sessionHandle + ": cancelDelegationToken()"); + LOG.info("{}: cancelDelegationToken()", MDC.of(LogKeys.SESSION_HANDLE$.MODULE$, sessionHandle)); } @Override public void renewDelegationToken(SessionHandle sessionHandle, HiveAuthFactory authFactory, String tokenStr) throws HiveSQLException { sessionManager.getSession(sessionHandle).renewDelegationToken(authFactory, tokenStr); - LOG.info(sessionHandle + ": renewDelegationToken()"); + LOG.info("{}: renewDelegationToken()", MDC.of(LogKeys.SESSION_HANDLE$.MODULE$, sessionHandle)); } @Override diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java index 629d9abdac2c0..4331f6829fbf3 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java @@ -30,8 +30,11 @@ import org.apache.thrift.protocol.TCompactProtocol; import org.apache.thrift.protocol.TProtocol; import org.apache.thrift.transport.TIOStreamTransport; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import 
org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * ColumnBasedSet. @@ -44,7 +47,7 @@ public class ColumnBasedSet implements RowSet { private final List columns; private byte[] blob; private boolean isBlobBased = false; - public static final Logger LOG = LoggerFactory.getLogger(ColumnBasedSet.class); + public static final SparkLogger LOG = SparkLoggerFactory.getLogger(ColumnBasedSet.class); public ColumnBasedSet(TableSchema schema) { descriptors = schema.toTypeDescriptors(); @@ -68,7 +71,7 @@ public ColumnBasedSet(TRowSet tRowSet) throws TException { try { tvalue.read(protocol); } catch (TException e) { - LOG.error(e.getMessage(), e); + LOG.error("{}", e, MDC.of(LogKeys.ERROR$.MODULE$, e.getMessage())); throw new TException("Error reading column value from the row set blob", e); } columns.add(new ColumnBuffer(tvalue)); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java index 96c16beac7c4d..0b71b606b9d65 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java @@ -28,8 +28,11 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Multimap; import org.apache.hadoop.hive.metastore.TableType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * ClassicTableTypeMapping. 
@@ -40,7 +43,7 @@ */ public class ClassicTableTypeMapping implements TableTypeMapping { - private static final Logger LOG = LoggerFactory.getLogger(ClassicTableTypeMapping.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(ClassicTableTypeMapping.class); public enum ClassicTableTypes { TABLE, @@ -69,7 +72,8 @@ public ClassicTableTypeMapping() { public String[] mapToHiveType(String clientTypeName) { Collection hiveTableType = clientToHiveMap.get(clientTypeName.toUpperCase()); if (hiveTableType == null) { - LOG.warn("Not supported client table type " + clientTypeName); + LOG.warn("Not supported client table type {}", + MDC.of(LogKeys.TABLE_TYPE$.MODULE$, clientTypeName)); return new String[] {clientTypeName}; } return Iterables.toArray(hiveTableType, String.class); @@ -79,7 +83,8 @@ public String[] mapToHiveType(String clientTypeName) { public String mapToClientType(String hiveTypeName) { String clientTypeName = hiveToClientMap.get(hiveTypeName); if (clientTypeName == null) { - LOG.warn("Invalid hive table type " + hiveTypeName); + LOG.warn("Invalid hive table type {}", + MDC.of(LogKeys.TABLE_TYPE$.MODULE$, hiveTypeName)); return hiveTypeName; } return clientTypeName; diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/Operation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/Operation.java index ad42925207d69..f488a411c31f3 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/Operation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/Operation.java @@ -38,15 +38,18 @@ import org.apache.hive.service.rpc.thrift.TProtocolVersion; import org.apache.hive.service.rpc.thrift.TRowSet; import org.apache.hive.service.rpc.thrift.TTableSchema; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; public abstract class Operation { protected final HiveSession parentSession; private OperationState state = OperationState.INITIALIZED; private final OperationHandle opHandle; private HiveConf configuration; - public static final Logger LOG = LoggerFactory.getLogger(Operation.class.getName()); + public static final SparkLogger LOG = SparkLoggerFactory.getLogger(Operation.class); public static final FetchOrientation DEFAULT_FETCH_ORIENTATION = FetchOrientation.FETCH_NEXT; public static final long DEFAULT_FETCH_MAX_ROWS = 100; protected boolean hasResultSet; @@ -208,8 +211,8 @@ protected void createOperationLog() { // create log file try { if (operationLogFile.exists()) { - LOG.warn("The operation log file should not exist, but it is already there: " + - operationLogFile.getAbsolutePath()); + LOG.warn("The operation log file should not exist, but it is already there: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogFile.getAbsolutePath())); operationLogFile.delete(); } if (!operationLogFile.createNewFile()) { @@ -217,13 +220,15 @@ protected void createOperationLog() { // If it can be read/written, keep its contents and use it. 
if (!operationLogFile.canRead() || !operationLogFile.canWrite()) { LOG.warn("The already existed operation log file cannot be recreated, " + - "and it cannot be read or written: " + operationLogFile.getAbsolutePath()); + "and it cannot be read or written: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogFile.getAbsolutePath())); isOperationLogEnabled = false; return; } } } catch (Exception e) { - LOG.warn("Unable to create operation log file: " + operationLogFile.getAbsolutePath(), e); + LOG.warn("Unable to create operation log file: {}", e, + MDC.of(LogKeys.PATH$.MODULE$, operationLogFile.getAbsolutePath())); isOperationLogEnabled = false; return; } @@ -232,8 +237,8 @@ protected void createOperationLog() { try { operationLog = new OperationLog(opHandle.toString(), operationLogFile, parentSession.getHiveConf()); } catch (FileNotFoundException e) { - LOG.warn("Unable to instantiate OperationLog object for operation: " + - opHandle, e); + LOG.warn("Unable to instantiate OperationLog object for operation: {}", e, + MDC.of(LogKeys.OPERATION_HANDLE$.MODULE$, opHandle)); isOperationLogEnabled = false; return; } @@ -283,8 +288,9 @@ public void run() throws HiveSQLException { protected void cleanupOperationLog() { if (isOperationLogEnabled) { if (operationLog == null) { - LOG.error("Operation [ " + opHandle.getHandleIdentifier() + " ] " - + "logging is enabled, but its OperationLog object cannot be found."); + LOG.error("Operation [ {} ] logging is enabled, " + + "but its OperationLog object cannot be found.", + MDC.of(LogKeys.OPERATION_HANDLE_ID$.MODULE$, opHandle.getHandleIdentifier())); } else { operationLog.close(); } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java index bb68c840496ad..fd8266d1a9acc 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java @@ -40,15 +40,18 @@ import org.apache.hive.service.rpc.thrift.TRowSet; import org.apache.hive.service.rpc.thrift.TTableSchema; import org.apache.logging.log4j.core.Appender; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * OperationManager. 
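 * Tracks the operations created for each session and removes the ones that time out.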
* */ public class OperationManager extends AbstractService { - private static final Logger LOG = LoggerFactory.getLogger(OperationManager.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(OperationManager.class); private final Map handleToOperation = new HashMap(); @@ -289,7 +292,8 @@ public List removeExpiredOperations(OperationHandle[] handles) { for (OperationHandle handle : handles) { Operation operation = removeTimedOutOperation(handle); if (operation != null) { - LOG.warn("Operation " + handle + " is timed-out and will be closed"); + LOG.warn("Operation {} is timed-out and will be closed", + MDC.of(LogKeys.OPERATION_HANDLE$.MODULE$, handle)); removed.add(operation); } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java index e00d2705d4172..4b55453ec7a8b 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java @@ -22,6 +22,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -69,8 +70,11 @@ import org.apache.hive.service.rpc.thrift.TRowSet; import org.apache.hive.service.rpc.thrift.TTableSchema; import org.apache.hive.service.server.ThreadWithGarbageCleanup; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import static org.apache.hadoop.hive.conf.SystemVariables.ENV_PREFIX; import static org.apache.hadoop.hive.conf.SystemVariables.HIVECONF_PREFIX; @@ -91,7 +95,7 @@ public class HiveSessionImpl implements HiveSession { private String ipAddress; private static final String FETCH_WORK_SERDE_CLASS = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"; - private static final Logger LOG = LoggerFactory.getLogger(HiveSessionImpl.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(HiveSessionImpl.class); private SessionManager sessionManager; private OperationManager operationManager; private final Set opHandleSet = new HashSet(); @@ -116,7 +120,7 @@ public HiveSessionImpl(TProtocolVersion protocol, String username, String passwo ShimLoader.getHadoopShims().refreshDefaultQueue(hiveConf, username); } } catch (IOException e) { - LOG.warn("Error setting scheduler queue: " + e, e); + LOG.warn("Error setting scheduler queue: ", e); } // Set an explicit session name to control the download directory name hiveConf.set("hive.session.id", @@ -146,8 +150,8 @@ public void open(Map sessionConfMap) throws HiveSQLException { sessionState.loadAuxJars(); sessionState.loadReloadableAuxJars(); } catch (IOException e) { - String msg = "Failed to load reloadable jar file path: " + e; - LOG.error(msg, e); + String msg = "Failed to load reloadable jar file path."; + LOG.error("{}", e, MDC.of(LogKeys.ERROR$.MODULE$, msg)); throw new HiveSQLException(msg, e); } // Process global init file: .hiverc @@ -168,7 +172,7 @@ protected BufferedReader loadFile(String fileName) throws IOException { FileInputStream initStream = null; BufferedReader bufferedReader = null; initStream = new FileInputStream(fileName); - 
bufferedReader = new BufferedReader(new InputStreamReader(initStream)); + bufferedReader = new BufferedReader(new InputStreamReader(initStream, StandardCharsets.UTF_8)); return bufferedReader; } @@ -197,7 +201,8 @@ private void processGlobalInitFile() { hivercFile = new File(hivercFile, SessionManager.HIVERCFILE); } if (hivercFile.isFile()) { - LOG.info("Running global init file: " + hivercFile); + LOG.info("Running global init file: {}", + MDC.of(LogKeys.GLOBAL_INIT_FILE$.MODULE$, hivercFile)); int rc = processor.processFile(hivercFile.getAbsolutePath()); if (rc != 0) { LOG.error("Failed on initializing global .hiverc file"); @@ -297,28 +302,29 @@ private static void setConf(String varname, String key, String varvalue, boolean @Override public void setOperationLogSessionDir(File operationLogRootDir) { if (!operationLogRootDir.exists()) { - LOG.warn("The operation log root directory is removed, recreating: " + - operationLogRootDir.getAbsolutePath()); + LOG.warn("The operation log root directory is removed, recreating: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); if (!operationLogRootDir.mkdirs()) { - LOG.warn("Unable to create operation log root directory: " + - operationLogRootDir.getAbsolutePath()); + LOG.warn("Unable to create operation log root directory: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); } } if (!operationLogRootDir.canWrite()) { - LOG.warn("The operation log root directory is not writable: " + - operationLogRootDir.getAbsolutePath()); + LOG.warn("The operation log root directory is not writable: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); } sessionLogDir = new File(operationLogRootDir, sessionHandle.getHandleIdentifier().toString()); isOperationLogEnabled = true; if (!sessionLogDir.exists()) { if (!sessionLogDir.mkdir()) { - LOG.warn("Unable to create operation log session directory: " + - sessionLogDir.getAbsolutePath()); + LOG.warn("Unable to create operation log session directory: {}", + MDC.of(LogKeys.PATH$.MODULE$, sessionLogDir.getAbsolutePath())); isOperationLogEnabled = false; } } if (isOperationLogEnabled) { - LOG.info("Operation log session directory is created: " + sessionLogDir.getAbsolutePath()); + LOG.info("Operation log session directory is created: {}", + MDC.of(LogKeys.PATH$.MODULE$, sessionLogDir.getAbsolutePath())); } } @@ -653,7 +659,8 @@ public void close() throws HiveSQLException { try { operationManager.closeOperation(opHandle); } catch (Exception e) { - LOG.warn("Exception is thrown closing operation " + opHandle, e); + LOG.warn("Exception is thrown closing operation {}", e, + MDC.of(LogKeys.OPERATION_HANDLE$.MODULE$, opHandle)); } } opHandleSet.clear(); @@ -693,13 +700,14 @@ private void cleanupPipeoutFile() { (dir, name) -> name.startsWith(sessionID) && name.endsWith(".pipeout")); if (fileAry == null) { - LOG.error("Unable to access pipeout files in " + lScratchDir); + LOG.error("Unable to access pipeout files in {}", + MDC.of(LogKeys.LOCAL_SCRATCH_DIR$.MODULE$, lScratchDir)); } else { for (File file : fileAry) { try { FileUtils.forceDelete(file); } catch (Exception e) { - LOG.error("Failed to cleanup pipeout file: " + file, e); + LOG.error("Failed to cleanup pipeout file: {}", e, MDC.of(LogKeys.PATH$.MODULE$, file)); } } } @@ -710,7 +718,8 @@ private void cleanupSessionLogDir() { try { FileUtils.forceDelete(sessionLogDir); } catch (Exception e) { - LOG.error("Failed to cleanup session log dir: " + sessionHandle, e); + LOG.error("Failed to cleanup 
session log dir: {}", e, + MDC.of(LogKeys.SESSION_HANDLE$.MODULE$, sessionHandle)); } } } @@ -759,7 +768,8 @@ private void closeTimedOutOperations(List operations) { try { operation.close(); } catch (Exception e) { - LOG.warn("Exception is thrown closing timed-out operation " + operation.getHandle(), e); + LOG.warn("Exception is thrown closing timed-out operation {}", e, + MDC.of(LogKeys.OPERATION_HANDLE$.MODULE$, operation.getHandle())); } } } finally { diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java index 514b19eb7111a..0ec13424fd0f5 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java @@ -28,8 +28,6 @@ import org.apache.hive.service.auth.HiveAuthFactory; import org.apache.hive.service.cli.HiveSQLException; import org.apache.hive.service.rpc.thrift.TProtocolVersion; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * @@ -41,23 +39,13 @@ public class HiveSessionImplwithUGI extends HiveSessionImpl { private UserGroupInformation sessionUgi = null; private String delegationTokenStr = null; - private Hive sessionHive = null; private HiveSession proxySession = null; - static final Logger LOG = LoggerFactory.getLogger(HiveSessionImplwithUGI.class); public HiveSessionImplwithUGI(TProtocolVersion protocol, String username, String password, HiveConf hiveConf, String ipAddress, String delegationToken) throws HiveSQLException { super(protocol, username, password, hiveConf, ipAddress); setSessionUGI(username); setDelegationToken(delegationToken); - - // create a new metastore connection for this particular user session - Hive.set(null); - try { - sessionHive = Hive.getWithoutRegisterFns(getHiveConf()); - } catch (HiveException e) { - throw new HiveSQLException("Failed to setup metastore connection", e); - } } // setup appropriate UGI for the session @@ -85,15 +73,6 @@ public String getDelegationToken() { return this.delegationTokenStr; } - @Override - protected synchronized void acquire(boolean userAccess) { - super.acquire(userAccess); - // if we have a metastore connection with impersonation, then set it first - if (sessionHive != null) { - Hive.set(sessionHive); - } - } - /** * Close the file systems for the session and remove it from the FileSystem cache. * Cancel the session's delegation token and close the metastore connection diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/SessionManager.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/SessionManager.java index fa342feacc7f4..3f60fd00b82a7 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/SessionManager.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/SessionManager.java @@ -38,8 +38,11 @@ import org.apache.hive.service.rpc.thrift.TProtocolVersion; import org.apache.hive.service.server.HiveServer2; import org.apache.hive.service.server.ThreadFactoryWithGarbageCleanup; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * SessionManager. 
@@ -47,7 +50,7 @@ */ public class SessionManager extends CompositeService { - private static final Logger LOG = LoggerFactory.getLogger(SessionManager.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(SessionManager.class); public static final String HIVERCFILE = ".hiverc"; private HiveConf hiveConf; private final Map handleToSession = @@ -84,13 +87,15 @@ public synchronized void init(HiveConf hiveConf) { private void createBackgroundOperationPool() { int poolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS); - LOG.info("HiveServer2: Background operation thread pool size: " + poolSize); + LOG.info("HiveServer2: Background operation thread pool size: {}", + MDC.of(LogKeys.THREAD_POOL_SIZE$.MODULE$, poolSize)); int poolQueueSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_WAIT_QUEUE_SIZE); - LOG.info("HiveServer2: Background operation thread wait queue size: " + poolQueueSize); + LOG.info("HiveServer2: Background operation thread wait queue size: {}", + MDC.of(LogKeys.THREAD_POOL_WAIT_QUEUE_SIZE$.MODULE$, poolQueueSize)); long keepAliveTime = HiveConf.getTimeVar( hiveConf, ConfVars.HIVE_SERVER2_ASYNC_EXEC_KEEPALIVE_TIME, TimeUnit.SECONDS); - LOG.info( - "HiveServer2: Background operation thread keepalive time: " + keepAliveTime + " seconds"); + LOG.info("HiveServer2: Background operation thread keepalive time: {} ms", + MDC.of(LogKeys.THREAD_POOL_KEEPALIVE_TIME$.MODULE$, keepAliveTime * 1000L)); // Create a thread pool with #poolSize threads // Threads terminate when they are idle for more than the keepAliveTime @@ -115,26 +120,27 @@ private void initOperationLogRootDir() { isOperationLogEnabled = true; if (operationLogRootDir.exists() && !operationLogRootDir.isDirectory()) { - LOG.warn("The operation log root directory exists, but it is not a directory: " + - operationLogRootDir.getAbsolutePath()); + LOG.warn("The operation log root directory exists, but it is not a directory: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); isOperationLogEnabled = false; } if (!operationLogRootDir.exists()) { if (!operationLogRootDir.mkdirs()) { - LOG.warn("Unable to create operation log root directory: " + - operationLogRootDir.getAbsolutePath()); + LOG.warn("Unable to create operation log root directory: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); isOperationLogEnabled = false; } } if (isOperationLogEnabled) { - LOG.info("Operation log root directory is created: " + operationLogRootDir.getAbsolutePath()); + LOG.info("Operation log root directory is created: {}", + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); try { FileUtils.forceDeleteOnExit(operationLogRootDir); } catch (IOException e) { - LOG.warn("Failed to schedule cleanup HS2 operation logging root dir: " + - operationLogRootDir.getAbsolutePath(), e); + LOG.warn("Failed to schedule cleanup HS2 operation logging root dir: {}", e, + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); } } } @@ -164,12 +170,14 @@ public void run() { if (sessionTimeout > 0 && session.getLastAccessTime() + sessionTimeout <= current && (!checkOperation || session.getNoOperationTime() > sessionTimeout)) { SessionHandle handle = session.getSessionHandle(); - LOG.warn("Session " + handle + " is Timed-out (last access : " + - new Date(session.getLastAccessTime()) + ") and will be closed"); + LOG.warn("Session {} is Timed-out (last access : {}) and will be closed", + MDC.of(LogKeys.SESSION_HANDLE$.MODULE$, handle), + 
MDC.of(LogKeys.LAST_ACCESS_TIME$.MODULE$, new Date(session.getLastAccessTime()))); try { closeSession(handle); } catch (HiveSQLException e) { - LOG.warn("Exception is thrown closing session " + handle, e); + LOG.warn("Exception is thrown closing session {}", e, + MDC.of(LogKeys.SESSION_HANDLE$.MODULE$, handle)); } } else { session.closeExpiredOperations(); @@ -210,8 +218,9 @@ public synchronized void stop() { try { backgroundOperationPool.awaitTermination(timeout, TimeUnit.SECONDS); } catch (InterruptedException e) { - LOG.warn("HIVE_SERVER2_ASYNC_EXEC_SHUTDOWN_TIMEOUT = " + timeout + - " seconds has been exceeded. RUNNING background operations will be shut down", e); + LOG.warn("HIVE_SERVER2_ASYNC_EXEC_SHUTDOWN_TIMEOUT = {} ms has been exceeded. " + + "RUNNING background operations will be shut down", e, + MDC.of(LogKeys.TIMEOUT$.MODULE$, timeout * 1000)); } backgroundOperationPool = null; } @@ -223,8 +232,8 @@ private void cleanupLoggingRootDir() { try { FileUtils.forceDelete(operationLogRootDir); } catch (Exception e) { - LOG.warn("Failed to cleanup root dir of HS2 logging: " + operationLogRootDir - .getAbsolutePath(), e); + LOG.warn("Failed to cleanup root dir of HS2 logging: {}", e, + MDC.of(LogKeys.PATH$.MODULE$, operationLogRootDir.getAbsolutePath())); } } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java index 4d99496876fdc..c7fa7b5f3e0ac 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java @@ -91,16 +91,10 @@ protected void initializeServer() { // Server args int maxMessageSize = hiveConf.getIntVar(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_MAX_MESSAGE_SIZE); - int requestTimeout = (int) hiveConf.getTimeVar( - HiveConf.ConfVars.HIVE_SERVER2_THRIFT_LOGIN_TIMEOUT, TimeUnit.SECONDS); - int beBackoffSlotLength = (int) hiveConf.getTimeVar( - HiveConf.ConfVars.HIVE_SERVER2_THRIFT_LOGIN_BEBACKOFF_SLOT_LENGTH, TimeUnit.MILLISECONDS); TThreadPoolServer.Args sargs = new TThreadPoolServer.Args(serverSocket) .processorFactory(processorFactory).transportFactory(transportFactory) .protocolFactory(new TBinaryProtocol.Factory()) .inputProtocolFactory(new TBinaryProtocol.Factory(true, true, maxMessageSize, maxMessageSize)) - .requestTimeout(requestTimeout).requestTimeoutUnit(TimeUnit.SECONDS) - .beBackoffSlotLength(beBackoffSlotLength).beBackoffSlotLengthUnit(TimeUnit.MILLISECONDS) .executorService(executorService); // TCP Server diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java index 4b18e2950a3de..07af0013846ba 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java @@ -41,8 +41,11 @@ import org.apache.thrift.server.ServerContext; import org.apache.thrift.server.TServerEventHandler; import org.apache.thrift.transport.TTransport; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * 
ThriftCLIService. @@ -50,7 +53,7 @@ */ public abstract class ThriftCLIService extends AbstractService implements TCLIService.Iface, Runnable { - public static final Logger LOG = LoggerFactory.getLogger(ThriftCLIService.class.getName()); + public static final SparkLogger LOG = SparkLoggerFactory.getLogger(ThriftCLIService.class); protected CLIService cliService; private static final TStatus OK_STATUS = new TStatus(TStatusCode.SUCCESS_STATUS); @@ -83,6 +86,16 @@ public void setSessionHandle(SessionHandle sessionHandle) { public SessionHandle getSessionHandle() { return sessionHandle; } + + @Override + public T unwrap(Class aClass) { + return null; + } + + @Override + public boolean isWrapperFor(Class aClass) { + return false; + } } public ThriftCLIService(CLIService service, String serviceName) { @@ -106,7 +119,7 @@ public void deleteContext(ServerContext serverContext, try { cliService.closeSession(sessionHandle); } catch (HiveSQLException e) { - LOG.warn("Failed to close session: " + e, e); + LOG.warn("Failed to close session: ", e); } } } @@ -236,7 +249,8 @@ private TStatus notSupportTokenErrorStatus() { @Override public TOpenSessionResp OpenSession(TOpenSessionReq req) throws TException { - LOG.info("Client protocol version: " + req.getClient_protocol()); + LOG.info("Client protocol version: {}", + MDC.of(LogKeys.PROTOCOL_VERSION$.MODULE$, req.getClient_protocol())); TOpenSessionResp resp = new TOpenSessionResp(); try { SessionHandle sessionHandle = getSessionHandle(req, resp); @@ -272,7 +286,7 @@ public TSetClientInfoResp SetClientInfo(TSetClientInfoReq req) throws TException sb.append(e.getKey()).append(" = ").append(e.getValue()); } if (sb != null) { - LOG.info("{}", sb); + LOG.info("{}", MDC.of(LogKeys.SET_CLIENT_INFO_REQUEST$.MODULE$, sb)); } } return new TSetClientInfoResp(OK_STATUS); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java index b0bede741cb19..d9bf361fdef63 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java @@ -55,8 +55,11 @@ import org.ietf.jgss.GSSManager; import org.ietf.jgss.GSSName; import org.ietf.jgss.Oid; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; /** * @@ -66,7 +69,7 @@ public class ThriftHttpServlet extends TServlet { private static final long serialVersionUID = 1L; - public static final Logger LOG = LoggerFactory.getLogger(ThriftHttpServlet.class.getName()); + public static final SparkLogger LOG = SparkLoggerFactory.getLogger(ThriftHttpServlet.class); private final String authType; private final UserGroupInformation serviceUGI; private final UserGroupInformation httpUGI; @@ -174,7 +177,8 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response) } else { response.addCookie(hs2Cookie); } - LOG.info("Cookie added for clientUserName " + clientUserName); + LOG.info("Cookie added for clientUserName {}", + MDC.of(LogKeys.USER_NAME$.MODULE$, clientUserName)); } super.doPost(request, response); } @@ -228,7 +232,7 @@ private String getClientNameFromCookie(Cookie[] cookies) { String userName = HttpAuthUtils.getUserNameFromCookieToken(currValue); 
if (userName == null) { - LOG.warn("Invalid cookie token " + currValue); + LOG.warn("Invalid cookie token {}", MDC.of(LogKeys.TOKEN$.MODULE$, currValue)); continue; } //We have found a valid cookie in the client request. diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/HiveServer2.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/HiveServer2.java index ad5ca51b9e63d..46ee775e8dd49 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/HiveServer2.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/HiveServer2.java @@ -36,9 +36,11 @@ import org.apache.hive.service.cli.thrift.ThriftBinaryCLIService; import org.apache.hive.service.cli.thrift.ThriftCLIService; import org.apache.hive.service.cli.thrift.ThriftHttpCLIService; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; +import org.apache.spark.internal.LogKeys; +import org.apache.spark.internal.MDC; import org.apache.spark.util.ShutdownHookManager; import org.apache.spark.util.SparkExitCode; @@ -47,7 +49,7 @@ * */ public class HiveServer2 extends CompositeService { - private static final Logger LOG = LoggerFactory.getLogger(HiveServer2.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(HiveServer2.class); private CLIService cliService; private ThriftCLIService thriftCLIService; @@ -142,8 +144,8 @@ private static void startHiveServer2() throws Throwable { if (++attempts >= maxAttempts) { throw new Error("Max start attempts " + maxAttempts + " exhausted", throwable); } else { - LOG.warn("Error starting HiveServer2 on attempt " + attempts - + ", will retry in 60 seconds", throwable); + LOG.warn("Error starting HiveServer2 on attempt {}, will retry in 60 seconds", + throwable, MDC.of(LogKeys.NUM_RETRY$.MODULE$, attempts)); try { Thread.sleep(60L * 1000L); } catch (InterruptedException e) { @@ -159,7 +161,7 @@ public static void main(String[] args) { ServerOptionsProcessor oproc = new ServerOptionsProcessor("hiveserver2"); ServerOptionsProcessorResponse oprocResponse = oproc.parse(args); - HiveStringUtils.startupShutdownMessage(HiveServer2.class, args, LOG); + HiveStringUtils.startupShutdownMessage(HiveServer2.class, args, LOG.getSlf4jLogger()); // Call the executor which will execute the appropriate command based on the parsed options oprocResponse.getServerOptionsExecutor().execute(); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java index afaa1403bfdcd..16d8540b40560 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java @@ -22,8 +22,9 @@ import org.apache.hadoop.hive.metastore.HiveMetaStore; import org.apache.hadoop.hive.metastore.RawStore; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.spark.internal.SparkLogger; +import org.apache.spark.internal.SparkLoggerFactory; /** * A HiveServer2 thread used to construct new server threads. @@ -31,7 +32,7 @@ * when killed by its corresponding ExecutorService. 
*/ public class ThreadWithGarbageCleanup extends Thread { - private static final Logger LOG = LoggerFactory.getLogger(ThreadWithGarbageCleanup.class); + private static final SparkLogger LOG = SparkLoggerFactory.getLogger(ThreadWithGarbageCleanup.class); Map threadRawStoreMap = ThreadFactoryWithGarbageCleanup.getThreadRawStoreMap(); diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala index c0a0f2e42f5f7..66319fff2468a 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala @@ -25,7 +25,7 @@ import scala.language.implicitConversions import org.apache.hive.service.rpc.thrift._ import org.apache.spark.sql.Row -import org.apache.spark.sql.execution.HiveResult.{toHiveString, TimeFormatters} +import org.apache.spark.sql.execution.HiveResult._ import org.apache.spark.sql.types.{BinaryType, BooleanType, ByteType, DataType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType} object RowSetUtils { @@ -38,12 +38,11 @@ object RowSetUtils { startRowOffSet: Long, rows: Seq[Row], schema: Array[DataType], - protocolVersion: TProtocolVersion, - timeFormatters: TimeFormatters): TRowSet = { + protocolVersion: TProtocolVersion): TRowSet = { if (protocolVersion.getValue < TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V6.getValue) { - toRowBasedSet(startRowOffSet, rows, schema, timeFormatters) + toRowBasedSet(startRowOffSet, rows, schema, getTimeFormatters, getBinaryFormatter) } else { - toColumnBasedSet(startRowOffSet, rows, schema, timeFormatters) + toColumnBasedSet(startRowOffSet, rows, schema, getTimeFormatters, getBinaryFormatter) } } @@ -51,13 +50,14 @@ object RowSetUtils { startRowOffSet: Long, rows: Seq[Row], schema: Array[DataType], - timeFormatters: TimeFormatters): TRowSet = { + timeFormatters: TimeFormatters, + binaryFormatter: BinaryFormatter): TRowSet = { val tRows = rows.map { row => val tRow = new TRow() var j = 0 val columnSize = row.length while (j < columnSize) { - val columnValue = toTColumnValue(j, row, schema(j), timeFormatters) + val columnValue = toTColumnValue(j, row, schema(j), timeFormatters, binaryFormatter) tRow.addToColVals(columnValue) j += 1 } @@ -70,13 +70,14 @@ object RowSetUtils { startRowOffSet: Long, rows: Seq[Row], schema: Array[DataType], - timeFormatters: TimeFormatters): TRowSet = { + timeFormatters: TimeFormatters, + binaryFormatter: BinaryFormatter): TRowSet = { val rowSize = rows.length val tRowSet = new TRowSet(startRowOffSet, new java.util.ArrayList[TRow](rowSize)) var i = 0 val columnSize = schema.length while (i < columnSize) { - val tColumn = toTColumn(rows, i, schema(i), timeFormatters) + val tColumn = toTColumn(rows, i, schema(i), timeFormatters, binaryFormatter) tRowSet.addToColumns(tColumn) i += 1 } @@ -84,7 +85,11 @@ object RowSetUtils { } private def toTColumn( - rows: Seq[Row], ordinal: Int, typ: DataType, timeFormatters: TimeFormatters): TColumn = { + rows: Seq[Row], + ordinal: Int, + typ: DataType, + timeFormatters: TimeFormatters, + binaryFormatter: BinaryFormatter): TColumn = { val nulls = new java.util.BitSet() typ match { case BooleanType => @@ -137,7 +142,7 @@ object RowSetUtils { val value = if (row.isNullAt(ordinal)) { "" } else { - toHiveString((row.get(ordinal), typ), nested = true, timeFormatters) + toHiveString((row.get(ordinal), typ), 
nested = true, timeFormatters, binaryFormatter) } values.add(value) i += 1 @@ -170,7 +175,8 @@ object RowSetUtils { ordinal: Int, row: Row, dataType: DataType, - timeFormatters: TimeFormatters): TColumnValue = { + timeFormatters: TimeFormatters, + binaryFormatter: BinaryFormatter): TColumnValue = { dataType match { case BooleanType => val boolValue = new TBoolValue @@ -226,7 +232,8 @@ object RowSetUtils { case _ => val tStrValue = new TStringValue if (!row.isNullAt(ordinal)) { - val value = toHiveString((row.get(ordinal), dataType), nested = false, timeFormatters) + val value = toHiveString( + (row.get(ordinal), dataType), nested = false, timeFormatters, binaryFormatter) tStrValue.setValue(value) } TColumnValue.stringVal(tStrValue) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index f8f58cd422b67..51a5e88aa633e 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -30,12 +30,11 @@ import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession import org.apache.hive.service.rpc.thrift.{TCLIServiceConstants, TColumnDesc, TPrimitiveTypeEntry, TRowSet, TTableSchema, TTypeDesc, TTypeEntry, TTypeId, TTypeQualifiers, TTypeQualifierValue} -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{HIVE_OPERATION_STATE, STATEMENT_ID, TIMEOUT, USER_NAME} +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_SECOND -import org.apache.spark.sql.execution.HiveResult.getTimeFormatters import org.apache.spark.sql.internal.{SQLConf, VariableSubstitution} import org.apache.spark.sql.types._ import org.apache.spark.util.{Utils => SparkUtils} @@ -84,7 +83,7 @@ private[hive] class SparkExecuteStatementOperation( val sparkType = new StructType().add("Result", "string") SparkExecuteStatementOperation.toTTableSchema(sparkType) } else { - logInfo(s"Result Schema: ${result.schema.sql}") + logInfo(log"Result Schema: ${MDC(LogKeys.SCHEMA, result.schema.sql)}") SparkExecuteStatementOperation.toTTableSchema(result.schema) } } @@ -119,14 +118,16 @@ private[hive] class SparkExecuteStatementOperation( val rows = iter.take(maxRows).toList log.debug(s"Returning result set with ${rows.length} rows from offsets " + s"[${iter.getFetchStart}, ${iter.getPosition}) with $statementId") - RowSetUtils.toTRowSet(offset, rows, dataTypes, getProtocolVersion, getTimeFormatters) + RowSetUtils.toTRowSet(offset, rows, dataTypes, getProtocolVersion) } def getResultSetSchema: TTableSchema = resultSchema override def runInternal(): Unit = { setState(OperationState.PENDING) - logInfo(s"Submitting query '$redactedStatement' with $statementId") + logInfo( + log"Submitting query '${MDC(LogKeys.REDACTED_STATEMENT, redactedStatement)}' with " + + log"${MDC(LogKeys.STATEMENT_ID, statementId)}") HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, @@ -213,10 +214,12 @@ private[hive] 
class SparkExecuteStatementOperation( try { synchronized { if (getStatus.getState.isTerminal) { - logInfo(s"Query with $statementId in terminal state before it started running") + logInfo( + log"Query with ${MDC(LogKeys.STATEMENT_ID, statementId)} in terminal state " + + log"before it started running") return } else { - logInfo(s"Running query with $statementId") + logInfo(log"Running query with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) } } @@ -285,7 +288,9 @@ private[hive] class SparkExecuteStatementOperation( def timeoutCancel(): Unit = { synchronized { if (!getStatus.getState.isTerminal) { - logInfo(s"Query with $statementId timed out after $timeout seconds") + logInfo( + log"Query with ${MDC(LogKeys.STATEMENT_ID, statementId)} timed out " + + log"after ${MDC(LogKeys.TIMEOUT, timeout)} seconds") setState(OperationState.TIMEDOUT) cleanup() HiveThriftServer2.eventManager.onStatementTimeout(statementId) @@ -296,7 +301,7 @@ private[hive] class SparkExecuteStatementOperation( override def cancel(): Unit = { synchronized { if (!getStatus.getState.isTerminal) { - logInfo(s"Cancel query with $statementId") + logInfo(log"Cancel query with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.CANCELED) cleanup() HiveThriftServer2.eventManager.onStatementCanceled(statementId) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala index 01ef78cde8956..fd99a5b246d9b 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala @@ -22,7 +22,8 @@ import org.apache.hive.service.cli.OperationState import org.apache.hive.service.cli.operation.GetCatalogsOperation import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SQLContext /** @@ -40,7 +41,7 @@ private[hive] class SparkGetCatalogsOperation( override def runInternal(): Unit = { val logMsg = "Listing catalogs" - logInfo(s"$logMsg with $statementId") + logInfo(log"Listing catalogs with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. 
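Not part of the diff: a minimal sketch of the structured-logging pattern these Scala hunks migrate to, assuming Spark's internal `Logging`, `LogKeys`, and `MDC` are on the classpath; the class below is hypothetical. The Java hunks earlier in the diff use the Java-facing equivalent, `SparkLoggerFactory.getLogger(...)` together with `MDC.of(LogKeys.X$.MODULE$, ...)`.

```scala
import org.apache.spark.internal.{Logging, LogKeys, MDC}

// Hypothetical class, used only to illustrate the log"..." interpolator seen above.
class ExampleOperation(statementId: String) extends Logging {
  def run(): Unit = {
    // s"..." interpolation becomes log"...": each MDC(LogKeys.X, value) attaches a
    // structured key/value pair to the emitted record in addition to the message text.
    logInfo(log"Running query with ${MDC(LogKeys.STATEMENT_ID, statementId)}")
  }
}
```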
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala index 5dd8caf3f221d..507dfc2ec50eb 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala @@ -27,7 +27,8 @@ import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetColumnsOperation import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.SessionCatalog @@ -60,7 +61,14 @@ private[hive] class SparkGetColumnsOperation( // Do not change cmdStr. It's used for Hive auditing and authorization. val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName, tablePattern : $tableName" val logMsg = s"Listing columns '$cmdStr, columnName : $columnName'" - logInfo(s"$logMsg with $statementId") + + val catalogNameStr = if (catalogName == null) "null" else catalogName + val schemaNameStr = if (schemaName == null) "null" else schemaName + logInfo(log"Listing columns 'catalog : ${MDC(CATALOG_NAME, catalogNameStr)}, " + + log"schemaPattern : ${MDC(DATABASE_NAME, schemaNameStr)}, " + + log"tablePattern : ${MDC(TABLE_NAME, tableName)}, " + + log"columnName : ${MDC(COLUMN_NAME, columnName)}' " + + log"with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. @@ -101,7 +109,7 @@ private[hive] class SparkGetColumnsOperation( } // Global temporary views - val globalTempViewDb = catalog.globalTempViewManager.database + val globalTempViewDb = catalog.globalTempDatabase val databasePattern = Pattern.compile(CLIServiceUtils.patternToRegex(schemaName)) if (databasePattern.matcher(globalTempViewDb).matches()) { catalog.globalTempViewManager.listViewNames(tablePattern).foreach { globalTempView => diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala index 53a94a128c0ef..b060bf3d4ec8d 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala @@ -27,7 +27,7 @@ import org.apache.hive.service.cli.operation.GetFunctionsOperation import org.apache.hive.service.cli.operation.MetadataOperation.DEFAULT_HIVE_CATALOG import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.sql.SQLContext /** @@ -51,9 +51,11 @@ private[hive] class SparkGetFunctionsOperation( override def runInternal(): Unit = { // Do not change cmdStr. It's used for Hive auditing and authorization. 
- val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName" - val logMsg = s"Listing functions '$cmdStr, functionName : $functionName'" - logInfo(s"$logMsg with $statementId") + val cmdMDC = log"catalog : ${MDC(LogKeys.CATALOG_NAME, catalogName)}, " + + log"schemaPattern : ${MDC(LogKeys.DATABASE_NAME, schemaName)}" + val logMDC = log"Listing functions '" + cmdMDC + + log", functionName : ${MDC(LogKeys.FUNCTION_NAME, functionName)}'" + logInfo(logMDC + log" with ${MDC(LogKeys.STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader @@ -69,13 +71,13 @@ private[hive] class SparkGetFunctionsOperation( // authorize this call on the schema objects val privObjs = HivePrivilegeObjectUtils.getHivePrivDbObjects(matchingDbs.asJava) - authorizeMetaGets(HiveOperationType.GET_FUNCTIONS, privObjs, cmdStr) + authorizeMetaGets(HiveOperationType.GET_FUNCTIONS, privObjs, cmdMDC.message) } HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, - logMsg, + logMDC.message, statementId, parentSession.getUsername) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala index 45cfa86ba9343..db1cf201b2e92 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala @@ -25,7 +25,8 @@ import org.apache.hive.service.cli.operation.GetSchemasOperation import org.apache.hive.service.cli.operation.MetadataOperation.DEFAULT_HIVE_CATALOG import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SQLContext /** @@ -49,7 +50,13 @@ private[hive] class SparkGetSchemasOperation( // Do not change cmdStr. It's used for Hive auditing and authorization. val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName" val logMsg = s"Listing databases '$cmdStr'" - logInfo(s"$logMsg with $statementId") + + val catalogNameStr = if (catalogName == null) "null" else catalogName + val schemaNameStr = if (schemaName == null) "null" else schemaName + logInfo(log"Listing databases 'catalog : ${MDC(CATALOG_NAME, catalogNameStr)}, " + + log"schemaPattern : ${MDC(DATABASE_NAME, schemaNameStr)}' " + + log"with ${MDC(STATEMENT_ID, statementId)}") + setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. 
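Also not part of the diff: a sketch of the `MessageWithContext` composition used in the SparkGetFunctionsOperation hunk above, under the same assumptions and with hypothetical names. Concatenated `log"..."` fragments keep their MDC pairs, and `.message` recovers the plain rendered string for callers such as the Hive authorization helpers that still take a `String`.

```scala
import org.apache.spark.internal.{Logging, LogKeys, MDC}

class ExampleMetadataOperation(catalogName: String, schemaName: String) extends Logging {
  def describe(): String = {
    // log"..." fragments concatenate into one MessageWithContext, preserving every MDC.
    val cmdMDC = log"catalog : ${MDC(LogKeys.CATALOG_NAME, catalogName)}, " +
      log"schemaPattern : ${MDC(LogKeys.DATABASE_NAME, schemaName)}"
    logInfo(cmdMDC)
    // .message drops the structured context and returns the rendered text only.
    cmdMDC.message
  }
}
```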
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader @@ -72,7 +79,7 @@ private[hive] class SparkGetSchemasOperation( rowSet.addRow(Array[AnyRef](dbName, DEFAULT_HIVE_CATALOG)) } - val globalTempViewDb = sqlContext.sessionState.catalog.globalTempViewManager.database + val globalTempViewDb = sqlContext.sessionState.catalog.globalTempDatabase val databasePattern = Pattern.compile(CLIServiceUtils.patternToRegex(schemaName)) if (schemaName == null || schemaName.isEmpty || databasePattern.matcher(globalTempViewDb).matches()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala index 9e31b8baad78e..a0c6cd1dcd92f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala @@ -24,7 +24,8 @@ import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetTableTypesOperation import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.catalog.CatalogTableType @@ -44,7 +45,7 @@ private[hive] class SparkGetTableTypesOperation( override def runInternal(): Unit = { statementId = UUID.randomUUID().toString val logMsg = "Listing table types" - logInfo(s"$logMsg with $statementId") + logInfo(log"Listing table types with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. 
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala index 38aaed0be2148..9d90878050678 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala @@ -27,7 +27,8 @@ import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetTablesOperation import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.catalog.CatalogTableType._ @@ -57,7 +58,14 @@ private[hive] class SparkGetTablesOperation( val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName" val tableTypesStr = if (tableTypes == null) "null" else tableTypes.asScala.mkString(",") val logMsg = s"Listing tables '$cmdStr, tableTypes : $tableTypesStr, tableName : $tableName'" - logInfo(s"$logMsg with $statementId") + + val catalogNameStr = if (catalogName == null) "null" else catalogName + val schemaNameStr = if (schemaName == null) "null" else schemaName + logInfo(log"Listing tables 'catalog: ${MDC(CATALOG_NAME, catalogNameStr)}, " + + log"schemaPattern: ${MDC(DATABASE_NAME, schemaNameStr)}, " + + log"tableTypes: ${MDC(TABLE_TYPES, tableTypesStr)}, " + + log"tableName: ${MDC(TABLE_NAME, tableName)}' " + + log"with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. 
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader @@ -95,7 +103,7 @@ private[hive] class SparkGetTablesOperation( // Temporary views and global temporary views if (tableTypes == null || tableTypes.isEmpty || tableTypes.contains(VIEW.name)) { - val globalTempViewDb = catalog.globalTempViewManager.database + val globalTempViewDb = catalog.globalTempDatabase val databasePattern = Pattern.compile(CLIServiceUtils.patternToRegex(schemaName)) val tempViews = if (databasePattern.matcher(globalTempViewDb).matches()) { catalog.listTables(globalTempViewDb, tablePattern, includeLocalTempViews = true) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala index cecb0dec72c80..9ae62ed2fed74 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala @@ -26,7 +26,8 @@ import org.apache.hive.service.cli.OperationState import org.apache.hive.service.cli.operation.GetTypeInfoOperation import org.apache.hive.service.cli.session.HiveSession -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.SQLContext /** @@ -45,7 +46,7 @@ private[hive] class SparkGetTypeInfoOperation( override def runInternal(): Unit = { statementId = UUID.randomUUID().toString val logMsg = "Listing type info" - logInfo(s"$logMsg with $statementId") + logInfo(log"Listing type info with ${MDC(STATEMENT_ID, statementId)}") setState(OperationState.RUNNING) // Always use the latest class loader provided by executionHive's state. 
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala index d5874fe776655..11e4817fe2a4c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala @@ -22,7 +22,7 @@ import org.apache.hive.service.cli.operation.Operation import org.apache.spark.SparkContext import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{HIVE_OPERATION_TYPE, STATEMENT_ID} +import org.apache.spark.internal.LogKeys.{HIVE_OPERATION_TYPE, STATEMENT_ID} import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.CurrentUserContext.CURRENT_USER import org.apache.spark.sql.catalyst.catalog.CatalogTableType @@ -50,7 +50,7 @@ private[hive] trait SparkOperation extends Operation with Logging { abstract override def close(): Unit = { super.close() cleanup() - logInfo(s"Close statement with $statementId") + logInfo(log"Close statement with ${MDC(STATEMENT_ID, statementId)}") HiveThriftServer2.eventManager.onOperationClosed(statementId) } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index 888c086e90422..e64e1c283e27c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -41,7 +41,7 @@ import sun.misc.{Signal, SignalHandler} import org.apache.spark.{ErrorMessageFormat, SparkConf, SparkThrowable, SparkThrowableHelper} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.util.SQLKeywordUtils diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala index 7262bc22dc429..46537f75f1a11 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala @@ -33,8 +33,8 @@ import org.apache.hive.service.Service.STATE import org.apache.hive.service.auth.HiveAuthFactory import org.apache.hive.service.cli._ import org.apache.hive.service.server.HiveServer2 -import org.slf4j.Logger +import org.apache.spark.internal.SparkLogger import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.util.SQLKeywordUtils import org.apache.spark.sql.errors.QueryExecutionErrors @@ -113,10 +113,10 @@ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLC private[thriftserver] trait ReflectedCompositeService { this: AbstractService => - private val logInfo = (msg: String) => getAncestorField[Logger](this, 3, "LOG").info(msg) + private val logInfo = (msg: String) => 
getAncestorField[SparkLogger](this, 3, "LOG").info(msg) private val logError = (msg: String, e: Throwable) => - getAncestorField[Logger](this, 3, "LOG").error(msg, e) + getAncestorField[SparkLogger](this, 3, "LOG").error(msg, e) def initCompositeService(hiveConf: HiveConf): Unit = { // Emulating `CompositeService.init(hiveConf)` diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala index 29e468aaa9fe6..a9c5d3e250797 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.SparkThrowable import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.COMMAND +import org.apache.spark.internal.LogKeys.COMMAND import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.plans.logical.CommandResult import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala index 8b7e9b00cb52b..8d03d5f848b76 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala @@ -26,7 +26,7 @@ import org.apache.hive.service.server.HiveServer2 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.internal.config.Status.LIVE_ENTITY_UPDATE_PERIOD import org.apache.spark.scheduler._ import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.ExecutionState diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index 3f9eb02d39b01..026b2388c593c 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLQueryTestSuite import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.util.fileToString -import org.apache.spark.sql.execution.HiveResult.{getTimeFormatters, toHiveString, TimeFormatters} +import org.apache.spark.sql.execution.HiveResult.{getBinaryFormatter, getTimeFormatters, toHiveString, BinaryFormatter, TimeFormatters} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.TimestampTypes import org.apache.spark.sql.types._ @@ -131,13 +131,15 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ } // Run the SQL queries preparing them for comparison. 
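Not part of the diff: a rough sketch of how the new `BinaryFormatter` is threaded through `HiveResult.toHiveString` alongside `TimeFormatters`, mirroring the RowSetUtils and test-suite hunks; it assumes this branch's `HiveResult` API and uses hypothetical sample values.

```scala
import java.nio.charset.StandardCharsets
import java.sql.Timestamp

import org.apache.spark.sql.execution.HiveResult.{getBinaryFormatter, getTimeFormatters, toHiveString}
import org.apache.spark.sql.types.{BinaryType, TimestampType}

object FormatterSketch {
  def main(args: Array[String]): Unit = {
    // Formatters depend on the active SQL conf, so they are resolved once and reused per value.
    val timeFormatters = getTimeFormatters
    val binaryFormatter = getBinaryFormatter

    val ts = toHiveString((Timestamp.valueOf("2024-01-01 00:00:00"), TimestampType),
      false, timeFormatters, binaryFormatter)
    val bin = toHiveString(("spark".getBytes(StandardCharsets.UTF_8), BinaryType),
      false, timeFormatters, binaryFormatter)
    println(s"$ts\t$bin")
  }
}
```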
- val outputs: Seq[QueryTestOutput] = queries.map { sql => - val (_, output) = handleExceptions(getNormalizedResult(statement, sql)) - // We might need to do some query canonicalization in the future. - ExecutionOutput( - sql = sql, - schema = Some(""), - output = output.mkString("\n").replaceAll("\\s+$", "")) + val outputs: Seq[QueryTestOutput] = withSQLConf(configSet: _*) { + queries.map { sql => + val (_, output) = handleExceptions(getNormalizedResult(statement, sql)) + // We might need to do some query canonicalization in the future. + ExecutionOutput( + sql = sql, + schema = Some(""), + output = output.mkString("\n").replaceAll("\\s+$", "")) + } } // Read back the golden file. @@ -298,8 +300,9 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ val rs = statement.executeQuery(sql) val cols = rs.getMetaData.getColumnCount val timeFormatters = getTimeFormatters + val binaryFormatter = getBinaryFormatter val buildStr = () => (for (i <- 1 to cols) yield { - getHiveResult(rs.getObject(i), timeFormatters) + getHiveResult(rs.getObject(i), timeFormatters, binaryFormatter) }).mkString("\t") val answer = Iterator.continually(rs.next()).takeWhile(identity).map(_ => buildStr()).toSeq @@ -321,18 +324,20 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ upperCase.startsWith("(") } - private def getHiveResult(obj: Object, timeFormatters: TimeFormatters): String = { + private def getHiveResult( + obj: Object, timeFormatters: TimeFormatters, binaryFormatter: BinaryFormatter): String = { obj match { case null => - toHiveString((null, StringType), false, timeFormatters) + toHiveString((null, StringType), false, timeFormatters, binaryFormatter) case d: java.sql.Date => - toHiveString((d, DateType), false, timeFormatters) + toHiveString((d, DateType), false, timeFormatters, binaryFormatter) case t: Timestamp => - toHiveString((t, TimestampType), false, timeFormatters) + toHiveString((t, TimestampType), false, timeFormatters, binaryFormatter) case d: java.math.BigDecimal => - toHiveString((d, DecimalType.fromDecimal(Decimal(d))), false, timeFormatters) + toHiveString(( + d, DecimalType.fromDecimal(Decimal(d))), false, timeFormatters, binaryFormatter) case bin: Array[Byte] => - toHiveString((bin, BinaryType), false, timeFormatters) + toHiveString((bin, BinaryType), false, timeFormatters, binaryFormatter) case other => other.toString } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index 51123b17eeec1..e757487915bbf 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -35,7 +35,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { test("SPARK-29911: Uncache cached tables when session closed") { val cacheManager = spark.sharedState.cacheManager - val globalTempDB = spark.sharedState.globalTempViewManager.database + val globalTempDB = spark.sharedState.globalTempDB withJdbcStatement() { statement => statement.execute("CACHE TABLE tempTbl AS SELECT 1") } @@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { val sessionHandle = client.openSession(user, "") val infoValue = 
client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS) // scalastyle:off line.size.limit - assert(infoValue.getStringValue == "ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DELETE,DELIMITED,DESC,DESCRIBE,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DOUBLE,DROP,ELSE,END,ESCAPE,ESCAPED,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXPLAIN,EXPORT,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GLOBAL,GRANT,GROUP,GROUPING,HAVING,HOUR,HOURS,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INDEX,INDEXES,INNER,INPATH,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,IS,ITEMS,JOIN,KEYS,LAST,LATERAL,LAZY,LEADING,LEFT,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,MACRO,MAP,MATCHED,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PERCENTILE_CONT,PERCENTILE_DISC,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,REAL,RECORDREADER,RECORDWRITER,RECOVER,REDUCE,REFERENCES,REFRESH,RENAME,REPAIR,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,START,STATISTICS,STORED,STRATIFY,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UPDATE,USE,USER,USING,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WINDOW,WITH,WITHIN,X,YEAR,YEARS,ZONE") + assert(infoValue.getStringValue == 
"ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BEGIN,BETWEEN,BIGINT,BINARY,BINDING,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONSTRAINT,CONTAINS,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFINED,DEFINER,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTINCT,DISTRIBUTE,DIV,DOUBLE,DROP,ELSE,END,ESCAPE,ESCAPED,EVOLUTION,EXCEPT,EXCHANGE,EXCLUDE,EXECUTE,EXISTS,EXPLAIN,EXPORT,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GLOBAL,GRANT,GROUP,GROUPING,HAVING,HOUR,HOURS,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,JOIN,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEFT,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCATION,LOCK,LOCKS,LOGICAL,LONG,MACRO,MAP,MATCHED,MERGE,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NO,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROPERTIES,PURGE,QUARTER,QUERY,RANGE,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,REDUCE,REFERENCES,REFRESH,RENAME,REPAIR,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,START,STATISTICS,STORED,STRATIFY,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UPDATE,USE,USER,USING,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WEEK,WEEKS,WHEN,WHERE,WINDOW,WITH,WITHIN,X,YEAR,YEARS,ZONE") // scalastyle:on line.size.limit } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala index b552611b75d17..2b2cbec41d643 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala @@ -108,7 +108,7 @@ class UISeleniumSuite val baseURL = s"http://$localhost:$uiPort" val queries = Seq( - "CREATE TABLE test_map(key INT, value STRING)", + "CREATE TABLE test_map (key INT, value STRING) USING HIVE", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map") queries.foreach(statement.execute) diff --git 
a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt index cedff45f362d5..1e39455f294ab 100644 --- a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt +++ b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-hive2.3-results.txt @@ -1,11 +1,11 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor insert hive table benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -INSERT INTO DYNAMIC 3629 3775 207 0.0 354389.9 1.0X -INSERT INTO HYBRID 500 542 30 0.0 48850.2 7.3X -INSERT INTO STATIC 150 163 10 0.1 14611.0 24.3X -INSERT OVERWRITE DYNAMIC 3071 3249 252 0.0 299902.1 1.2X -INSERT OVERWRITE HYBRID 447 454 9 0.0 43634.8 8.1X -INSERT OVERWRITE STATIC 169 177 5 0.1 16521.3 21.5X +INSERT INTO DYNAMIC 3930 4221 411 0.0 383795.5 1.0X +INSERT INTO HYBRID 588 621 32 0.0 57447.2 6.7X +INSERT INTO STATIC 188 214 13 0.1 18338.3 20.9X +INSERT OVERWRITE DYNAMIC 3723 3853 183 0.0 363603.4 1.1X +INSERT OVERWRITE HYBRID 513 527 17 0.0 50096.3 7.7X +INSERT OVERWRITE STATIC 191 221 14 0.1 18612.1 20.6X diff --git a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt index 50b39ecf2fa87..6d925278cc405 100644 --- a/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt +++ b/sql/hive/benchmarks/InsertIntoHiveTableBenchmark-jdk21-hive2.3-results.txt @@ -1,11 +1,11 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor insert hive table benchmark: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -INSERT INTO DYNAMIC 3478 3720 342 0.0 339620.8 1.0X -INSERT INTO HYBRID 511 544 35 0.0 49878.9 6.8X -INSERT INTO STATIC 151 167 14 0.1 14758.4 23.0X -INSERT OVERWRITE DYNAMIC 3152 3338 263 0.0 307817.5 1.1X -INSERT OVERWRITE HYBRID 433 454 24 0.0 42330.3 8.0X -INSERT OVERWRITE STATIC 167 178 18 0.1 16325.7 20.8X +INSERT INTO DYNAMIC 3770 4154 543 0.0 368168.8 1.0X +INSERT INTO HYBRID 511 568 55 0.0 49868.7 7.4X +INSERT INTO STATIC 163 189 23 0.1 15947.9 23.1X +INSERT OVERWRITE DYNAMIC 3813 4094 397 0.0 372395.4 1.0X +INSERT OVERWRITE HYBRID 489 516 22 0.0 47714.3 7.7X +INSERT OVERWRITE STATIC 182 202 29 0.1 17768.5 20.7X diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt index 0126dc82c0137..26d129b66e825 100644 --- a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt +++ b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk21-results.txt @@ -2,44 +2,44 @@ Hive UDAF vs Spark AF ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor hive udaf vs spark af: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -hive udaf w/o group by 3122 3229 75 0.0 47645.3 1.0X -spark af w/o group by 19 24 4 3.4 297.4 160.2X -hive udaf w/ group by 1981 2031 38 0.0 30230.8 1.6X -spark af w/ group by w/o fallback 20 23 3 3.3 302.9 157.3X -spark af w/ group by w/ fallback 24 28 6 2.7 364.6 130.7X +hive udaf w/o group by 3276 3322 70 0.0 49983.2 1.0X +spark af w/o group by 20 25 4 3.3 300.3 166.5X +hive udaf w/ group by 2090 2101 7 0.0 31892.1 1.6X +spark af w/ group by w/o fallback 21 24 3 3.2 316.8 157.8X +spark af w/ group by w/ fallback 25 27 4 2.7 375.5 133.1X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - typed_count ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor object agg v.s. sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 22021 22273 310 4.8 210.0 1.0X -object agg w/ group by w/o fallback 6455 7304 390 16.2 61.6 3.4X -object agg w/ group by w/ fallback 14156 14380 225 7.4 135.0 1.6X -sort agg w/o group by 4084 4117 24 25.7 39.0 5.4X -object agg w/o group by w/o fallback 4054 4144 72 25.9 38.7 5.4X +sort agg w/ group by 22925 23221 419 4.6 218.6 1.0X +object agg w/ group by w/o fallback 7021 7103 64 14.9 67.0 3.3X +object agg w/ group by w/ fallback 14719 15622 1324 7.1 140.4 1.6X +sort agg w/o group by 3908 3946 27 26.8 37.3 5.9X +object agg w/o group by w/o fallback 3780 4011 331 27.7 36.0 6.1X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - percentile_approx ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 408 424 13 5.1 194.6 1.0X -object agg w/ group by w/o fallback 330 337 5 6.4 157.2 1.2X -object agg w/ group by w/ fallback 445 454 8 4.7 212.0 0.9X -sort agg w/o group by 276 282 3 7.6 131.8 1.5X -object agg w/o group by w/o fallback 266 273 4 7.9 126.8 1.5X +sort agg w/ group by 408 431 15 5.1 194.6 1.0X +object agg w/ group by w/o fallback 326 334 5 6.4 155.4 1.3X +object agg w/ group by w/ fallback 451 472 19 4.6 215.2 0.9X +sort agg w/o group by 274 281 4 7.6 130.8 1.5X +object agg w/o group by w/o fallback 273 277 5 7.7 130.4 1.5X diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt index 79409a2aba027..8fb04e97f4bc5 100644 --- a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt +++ b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt @@ -2,44 +2,44 @@ Hive UDAF vs Spark AF ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor hive udaf vs spark af: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hive udaf w/o group by 3496 3530 35 0.0 53350.3 1.0X -spark af w/o group by 20 26 4 3.3 304.8 175.1X -hive udaf w/ group by 2193 2205 10 0.0 33464.2 1.6X -spark af w/ group by w/o fallback 22 26 5 3.0 328.8 162.3X -spark af w/ group by w/ fallback 25 28 3 2.7 376.0 141.9X +hive udaf w/o group by 3375 3488 67 0.0 51493.5 1.0X +spark af w/o group by 21 26 4 3.2 313.8 164.1X +hive udaf w/ group by 2174 2193 14 0.0 33173.8 1.6X +spark af w/ group by w/o fallback 22 27 4 2.9 339.0 151.9X +spark af w/ group by w/ fallback 25 28 3 2.6 383.5 134.3X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - typed_count ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 24980 25578 846 4.2 238.2 1.0X -object agg w/ group by w/o fallback 7305 7525 163 14.4 69.7 3.4X -object agg w/ group by w/ fallback 14433 14586 147 7.3 137.6 1.7X -sort agg w/o group by 4331 4359 20 24.2 41.3 5.8X -object agg w/o group by w/o fallback 3997 4029 40 26.2 38.1 6.2X +sort agg w/ group by 23621 24285 938 4.4 225.3 1.0X +object agg w/ group by w/o fallback 6890 7186 232 15.2 65.7 3.4X +object agg w/ group by w/ fallback 14883 15203 299 7.0 141.9 1.6X +sort agg w/o group by 4104 4125 17 25.5 39.1 5.8X +object agg w/o group by w/o fallback 3695 3723 26 28.4 35.2 6.4X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - percentile_approx ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor object agg v.s. sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -sort agg w/ group by 419 427 9 5.0 199.7 1.0X -object agg w/ group by w/o fallback 340 349 8 6.2 162.0 1.2X -object agg w/ group by w/ fallback 467 476 7 4.5 222.6 0.9X -sort agg w/o group by 310 315 5 6.8 147.9 1.4X -object agg w/o group by w/o fallback 302 309 4 6.9 144.2 1.4X +sort agg w/ group by 404 412 6 5.2 192.8 1.0X +object agg w/ group by w/o fallback 332 339 6 6.3 158.5 1.2X +object agg w/ group by w/ fallback 461 467 8 4.6 219.6 0.9X +sort agg w/o group by 308 313 5 6.8 146.9 1.3X +object agg w/o group by w/o fallback 304 308 4 6.9 144.8 1.3X diff --git a/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt index ad32f1d3ee3ad..eedfc34b5ea13 100644 --- a/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt +++ b/sql/hive/benchmarks/OrcReadBenchmark-jdk21-results.txt @@ -2,221 +2,221 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 662 691 46 23.8 42.1 1.0X -Native ORC MR 784 812 28 20.1 49.9 0.8X -Native ORC Vectorized 92 120 35 171.5 5.8 7.2X +Hive built-in ORC 627 665 40 25.1 39.9 1.0X +Native ORC MR 699 703 4 22.5 44.4 0.9X +Native ORC Vectorized 61 81 21 258.1 3.9 10.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 692 724 31 22.7 44.0 1.0X -Native ORC MR 789 803 15 19.9 
50.1 0.9X -Native ORC Vectorized 73 94 17 214.7 4.7 9.4X +Hive built-in ORC 681 699 17 23.1 43.3 1.0X +Native ORC MR 792 803 14 19.9 50.3 0.9X +Native ORC Vectorized 72 86 16 217.6 4.6 9.4X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 685 708 21 23.0 43.5 1.0X -Native ORC MR 811 854 40 19.4 51.5 0.8X -Native ORC Vectorized 79 94 18 199.8 5.0 8.7X +Hive built-in ORC 741 764 29 21.2 47.1 1.0X +Native ORC MR 907 929 29 17.4 57.6 0.8X +Native ORC Vectorized 95 105 14 164.8 6.1 7.8X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 733 771 39 21.5 46.6 1.0X -Native ORC MR 803 819 15 19.6 51.1 0.9X -Native ORC Vectorized 113 128 16 138.8 7.2 6.5X +Hive built-in ORC 860 868 11 18.3 54.7 1.0X +Native ORC MR 831 871 37 18.9 52.8 1.0X +Native ORC Vectorized 93 104 15 169.9 5.9 9.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 804 815 15 19.6 51.1 1.0X -Native ORC MR 873 904 27 18.0 55.5 0.9X -Native ORC Vectorized 138 169 29 114.2 8.8 5.8X +Hive built-in ORC 803 841 34 19.6 51.1 1.0X +Native ORC MR 839 857 24 18.7 53.3 1.0X +Native ORC Vectorized 129 168 37 122.0 8.2 6.2X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 812 825 12 19.4 51.6 1.0X -Native ORC MR 934 943 13 16.8 59.4 0.9X -Native ORC Vectorized 188 213 24 83.6 12.0 4.3X +Hive built-in ORC 959 966 8 16.4 61.0 1.0X +Native ORC MR 997 1021 35 15.8 63.4 1.0X +Native ORC Vectorized 214 264 30 73.5 13.6 4.5X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1547 1557 15 6.8 147.5 1.0X -Native ORC MR 1469 1478 13 7.1 140.1 1.1X -Native ORC Vectorized 573 601 32 18.3 54.6 2.7X +Hive built-in ORC 1565 1567 2 6.7 
149.3 1.0X +Native ORC MR 1574 1602 40 6.7 150.1 1.0X +Native ORC Vectorized 656 660 6 16.0 62.6 2.4X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column - Hive built-in ORC 827 859 48 19.0 52.6 1.0X -Data column - Native ORC MR 977 1025 77 16.1 62.1 0.8X -Data column - Native ORC Vectorized 109 124 15 144.3 6.9 7.6X -Partition column - Hive built-in ORC 702 715 12 22.4 44.6 1.2X -Partition column - Native ORC MR 556 562 9 28.3 35.3 1.5X -Partition column - Native ORC Vectorized 31 41 8 510.7 2.0 26.8X -Both columns - Hive built-in ORC 888 933 40 17.7 56.5 0.9X -Both columns - Native ORC MR 1076 1083 10 14.6 68.4 0.8X -Both columns - Native ORC Vectorized 126 144 17 125.3 8.0 6.6X +Data column - Hive built-in ORC 893 933 35 17.6 56.8 1.0X +Data column - Native ORC MR 1154 1159 6 13.6 73.4 0.8X +Data column - Native ORC Vectorized 97 123 30 161.6 6.2 9.2X +Partition column - Hive built-in ORC 702 719 22 22.4 44.7 1.3X +Partition column - Native ORC MR 653 670 19 24.1 41.5 1.4X +Partition column - Native ORC Vectorized 34 47 11 456.3 2.2 25.9X +Both columns - Hive built-in ORC 1006 1019 20 15.6 63.9 0.9X +Both columns - Native ORC MR 1085 1096 15 14.5 69.0 0.8X +Both columns - Native ORC Vectorized 111 140 26 142.2 7.0 8.1X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 832 886 48 12.6 79.3 1.0X -Native ORC MR 821 825 6 12.8 78.3 1.0X -Native ORC Vectorized 173 189 14 60.7 16.5 4.8X +Hive built-in ORC 808 823 15 13.0 77.1 1.0X +Native ORC MR 791 794 4 13.3 75.4 1.0X +Native ORC Vectorized 124 137 15 84.4 11.8 6.5X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1421 1433 16 7.4 135.6 1.0X -Native ORC MR 1205 1215 14 8.7 114.9 1.2X -Native ORC Vectorized 285 314 29 36.8 27.2 5.0X +Hive built-in ORC 1404 1416 17 7.5 133.9 1.0X +Native ORC MR 1275 1283 11 8.2 121.6 1.1X +Native ORC Vectorized 310 327 16 33.8 29.6 4.5X -OpenJDK 64-Bit Server VM 
21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1261 1273 17 8.3 120.3 1.0X -Native ORC MR 1121 1131 14 9.4 106.9 1.1X -Native ORC Vectorized 341 357 13 30.8 32.5 3.7X +Hive built-in ORC 1196 1198 4 8.8 114.0 1.0X +Native ORC MR 1182 1182 0 8.9 112.7 1.0X +Native ORC Vectorized 346 373 35 30.3 33.0 3.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 840 842 2 12.5 80.1 1.0X -Native ORC MR 750 751 1 14.0 71.5 1.1X -Native ORC Vectorized 144 160 9 72.7 13.8 5.8X +Hive built-in ORC 741 769 25 14.1 70.7 1.0X +Native ORC MR 834 838 5 12.6 79.5 0.9X +Native ORC Vectorized 136 175 36 77.2 13.0 5.5X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 669 688 23 1.6 638.3 1.0X -Native ORC MR 85 100 13 12.3 81.4 7.8X -Native ORC Vectorized 36 46 9 29.3 34.1 18.7X +Hive built-in ORC 570 588 23 1.8 543.8 1.0X +Native ORC MR 84 102 21 12.5 80.0 6.8X +Native ORC Vectorized 29 36 8 35.8 27.9 19.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1221 1230 13 0.9 1164.4 1.0X -Native ORC MR 100 111 12 10.5 95.7 12.2X -Native ORC Vectorized 50 68 11 20.9 47.8 24.4X +Hive built-in ORC 1062 1069 10 1.0 1012.4 1.0X +Native ORC MR 91 109 21 11.5 87.2 11.6X +Native ORC Vectorized 37 48 8 28.3 35.4 28.6X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1834 1884 70 0.6 1749.1 1.0X -Native ORC MR 112 135 23 9.3 107.2 16.3X -Native ORC Vectorized 61 70 8 17.3 58.0 30.2X +Hive built-in ORC 1593 1665 101 0.7 1519.1 1.0X +Native ORC MR 101 110 9 10.4 96.2 15.8X +Native ORC Vectorized 45 52 6 23.2 43.1 35.3X 
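A note on reading the updated tables above: the derived columns appear to be simple functions of the measured best time, with "Relative" normalized against the first row of each table. A minimal Scala sketch of that reading, using the "Single Column Scan from 300 columns" rows above and assuming roughly 2^20 values per run (the value count is inferred from the numbers, not stated in the results files; all names below are illustrative):

object BenchmarkColumnsSketch {
  def main(args: Array[String]): Unit = {
    // Assumed number of values processed per run; an inference, not taken from the file.
    val valuesPerRun = 1 << 20
    val baselineBestMs = 1593.0   // "Hive built-in ORC", 300 columns (the 1.0X row above)
    val candidateBestMs = 101.0   // "Native ORC MR", same table

    val rateMPerSec = valuesPerRun / (candidateBestMs / 1000.0) / 1e6  // ~10.4 M/s
    val perRowNs = candidateBestMs * 1e6 / valuesPerRun                // ~96 ns
    val relative = baselineBestMs / candidateBestMs                    // ~15.8X

    println(f"Rate ~ $rateMPerSec%.1f M/s, Per Row ~ $perRowNs%.1f ns, Relative ~ $relative%.1fX")
  }
}

Small discrepancies against the printed values (for example 96.3 vs 96.2 ns) are consistent with the harness rounding from unrounded millisecond timings.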
================================================================================================ Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 279 338 41 3.8 266.0 1.0X -Native ORC MR 220 268 44 4.8 209.8 1.3X -Native ORC Vectorized 97 115 18 10.8 92.5 2.9X +Hive built-in ORC 290 350 48 3.6 276.9 1.0X +Native ORC MR 225 243 25 4.7 215.0 1.3X +Native ORC Vectorized 97 109 20 10.8 92.3 3.0X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 100 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 1869 1890 29 0.6 1782.7 1.0X -Native ORC MR 1684 1729 64 0.6 1606.4 1.1X -Native ORC Vectorized 869 913 56 1.2 828.5 2.2X +Hive built-in ORC 2077 2114 52 0.5 1981.2 1.0X +Native ORC MR 1778 1786 12 0.6 1695.4 1.2X +Native ORC Vectorized 893 941 45 1.2 851.8 2.3X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 300 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 5329 5433 148 0.2 5082.1 1.0X -Native ORC MR 5514 5545 43 0.2 5259.0 1.0X -Native ORC Vectorized 5450 5502 74 0.2 5197.3 1.0X +Hive built-in ORC 6108 6135 39 0.2 5824.6 1.0X +Native ORC MR 5695 5742 66 0.2 5431.5 1.1X +Native ORC Vectorized 5662 5701 55 0.2 5399.8 1.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 600 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 11352 11377 36 0.1 10826.0 1.0X -Native ORC MR 12781 12796 22 0.1 12188.7 0.9X -Native ORC Vectorized 12673 12779 151 0.1 12085.7 0.9X +Hive built-in ORC 12790 12832 60 0.1 12197.3 1.0X +Native ORC MR 12987 13006 27 0.1 12385.1 1.0X +Native ORC Vectorized 12870 12946 107 0.1 12274.1 1.0X ================================================================================================ Nested Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ 
-Hive built-in ORC 1793 1818 36 0.6 1710.3 1.0X -Native ORC MR 2053 2069 22 0.5 1958.2 0.9X -Native ORC Vectorized 539 569 37 1.9 513.7 3.3X +Hive built-in ORC 1907 1949 59 0.5 1818.9 1.0X +Native ORC MR 1645 1678 46 0.6 1569.2 1.2X +Native ORC Vectorized 549 566 26 1.9 523.8 3.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 30 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 4477 4533 78 0.2 4270.0 1.0X -Native ORC MR 4467 4484 25 0.2 4259.6 1.0X -Native ORC Vectorized 1470 1490 29 0.7 1402.0 3.0X +Hive built-in ORC 5234 5237 4 0.2 4991.9 1.0X +Native ORC MR 3998 4042 63 0.3 3812.4 1.3X +Native ORC Vectorized 1489 1494 7 0.7 1420.4 3.5X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 30 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 4417 4430 19 0.2 4212.1 1.0X -Native ORC MR 5039 5069 42 0.2 4805.9 0.9X -Native ORC Vectorized 1726 1737 16 0.6 1645.8 2.6X +Hive built-in ORC 5144 5174 42 0.2 4905.7 1.0X +Native ORC MR 4441 4510 99 0.2 4234.9 1.2X +Native ORC Vectorized 1793 1877 118 0.6 1710.3 2.9X diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt index 0cbddae8dc9ef..7cdd02dbb8129 100644 --- a/sql/hive/benchmarks/OrcReadBenchmark-results.txt +++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt @@ -2,221 +2,221 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 650 680 33 24.2 41.3 1.0X -Native ORC MR 755 818 73 20.8 48.0 0.9X -Native ORC Vectorized 91 108 12 172.9 5.8 7.1X +Hive built-in ORC 686 697 18 22.9 43.6 1.0X +Native ORC MR 792 845 62 19.9 50.4 0.9X +Native ORC Vectorized 82 99 13 192.4 5.2 8.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 657 699 56 23.9 41.8 1.0X -Native ORC MR 735 777 49 21.4 46.8 0.9X -Native ORC Vectorized 74 88 8 212.5 4.7 8.9X +Hive built-in ORC 785 805 25 20.0 49.9 1.0X +Native ORC MR 810 833 36 19.4 51.5 1.0X +Native ORC Vectorized 92 115 18 171.8 5.8 8.6X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL 
Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 732 735 3 21.5 46.5 1.0X -Native ORC MR 787 809 22 20.0 50.1 0.9X -Native ORC Vectorized 78 96 12 200.9 5.0 9.3X +Hive built-in ORC 827 847 18 19.0 52.6 1.0X +Native ORC MR 870 871 2 18.1 55.3 1.0X +Native ORC Vectorized 115 133 15 136.3 7.3 7.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 754 755 2 20.9 47.9 1.0X -Native ORC MR 795 804 8 19.8 50.6 0.9X -Native ORC Vectorized 112 124 13 140.4 7.1 6.7X +Hive built-in ORC 933 951 28 16.9 59.3 1.0X +Native ORC MR 897 908 10 17.5 57.0 1.0X +Native ORC Vectorized 113 128 11 139.3 7.2 8.3X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 795 798 3 19.8 50.5 1.0X -Native ORC MR 875 908 29 18.0 55.6 0.9X -Native ORC Vectorized 141 159 19 111.6 9.0 5.6X +Hive built-in ORC 879 882 5 17.9 55.9 1.0X +Native ORC MR 917 935 25 17.2 58.3 1.0X +Native ORC Vectorized 151 182 25 104.3 9.6 5.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 797 813 15 19.7 50.7 1.0X -Native ORC MR 866 878 13 18.2 55.1 0.9X -Native ORC Vectorized 183 191 8 86.1 11.6 4.4X +Hive built-in ORC 939 949 12 16.7 59.7 1.0X +Native ORC MR 1016 1039 32 15.5 64.6 0.9X +Native ORC Vectorized 248 259 10 63.5 15.7 3.8X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1422 1514 131 7.4 135.6 1.0X -Native ORC MR 1415 1449 47 7.4 134.9 1.0X -Native ORC Vectorized 569 580 8 18.4 54.2 2.5X +Hive built-in ORC 1577 1591 20 6.7 150.4 1.0X +Native ORC MR 1524 1539 21 6.9 145.3 1.0X +Native ORC Vectorized 630 661 24 16.6 60.1 2.5X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 
17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column - Hive built-in ORC 838 847 8 18.8 53.3 1.0X -Data column - Native ORC MR 937 965 42 16.8 59.6 0.9X -Data column - Native ORC Vectorized 109 124 11 144.5 6.9 7.7X -Partition column - Hive built-in ORC 594 617 16 26.5 37.8 1.4X -Partition column - Native ORC MR 599 618 18 26.3 38.1 1.4X -Partition column - Native ORC Vectorized 32 41 5 491.9 2.0 26.2X -Both columns - Hive built-in ORC 889 922 55 17.7 56.5 0.9X -Both columns - Native ORC MR 997 1003 8 15.8 63.4 0.8X -Both columns - Native ORC Vectorized 124 141 12 127.0 7.9 6.8X +Data column - Hive built-in ORC 1040 1043 4 15.1 66.1 1.0X +Data column - Native ORC MR 1190 1201 15 13.2 75.7 0.9X +Data column - Native ORC Vectorized 120 134 8 131.2 7.6 8.7X +Partition column - Hive built-in ORC 675 682 8 23.3 42.9 1.5X +Partition column - Native ORC MR 725 771 40 21.7 46.1 1.4X +Partition column - Native ORC Vectorized 45 52 6 353.1 2.8 23.4X +Both columns - Hive built-in ORC 1049 1078 41 15.0 66.7 1.0X +Both columns - Native ORC MR 1238 1321 118 12.7 78.7 0.8X +Both columns - Native ORC Vectorized 133 153 14 117.8 8.5 7.8X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 830 847 17 12.6 79.2 1.0X -Native ORC MR 789 791 3 13.3 75.3 1.1X -Native ORC Vectorized 123 130 8 85.2 11.7 6.7X +Hive built-in ORC 859 878 22 12.2 81.9 1.0X +Native ORC MR 855 877 21 12.3 81.5 1.0X +Native ORC Vectorized 145 161 19 72.1 13.9 5.9X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1350 1378 40 7.8 128.8 1.0X -Native ORC MR 1229 1254 35 8.5 117.2 1.1X -Native ORC Vectorized 282 289 5 37.2 26.9 4.8X +Hive built-in ORC 1465 1465 0 7.2 139.7 1.0X +Native ORC MR 1412 1438 36 7.4 134.7 1.0X +Native ORC Vectorized 326 355 25 32.2 31.1 4.5X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1219 1264 64 8.6 116.3 1.0X -Native ORC MR 1132 1140 12 9.3 107.9 1.1X -Native ORC Vectorized 336 362 29 31.2 32.1 3.6X +Hive built-in ORC 1270 1275 7 8.3 121.1 1.0X +Native ORC MR 1311 1318 9 8.0 125.0 1.0X +Native ORC Vectorized 371 378 5 28.3 35.3 3.4X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 748 758 13 14.0 71.3 1.0X -Native ORC MR 784 786 2 13.4 74.8 1.0X -Native ORC Vectorized 143 158 14 73.1 13.7 5.2X +Hive built-in ORC 775 779 3 13.5 73.9 1.0X +Native ORC MR 894 907 16 11.7 85.3 0.9X +Native ORC Vectorized 161 178 15 65.3 15.3 4.8X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 403 430 28 2.6 384.0 1.0X -Native ORC MR 88 96 9 11.9 84.3 4.6X -Native ORC Vectorized 36 43 5 28.8 34.7 11.1X +Hive built-in ORC 447 494 41 2.3 426.4 1.0X +Native ORC MR 106 119 15 9.9 100.8 4.2X +Native ORC Vectorized 40 51 9 26.1 38.3 11.1X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 717 734 26 1.5 684.1 1.0X -Native ORC MR 101 118 11 10.4 96.4 7.1X -Native ORC Vectorized 51 60 7 20.5 48.8 14.0X +Hive built-in ORC 772 804 33 1.4 736.2 1.0X +Native ORC MR 114 124 10 9.2 108.7 6.8X +Native ORC Vectorized 49 59 8 21.5 46.5 15.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 1052 1088 50 1.0 1003.6 1.0X -Native ORC MR 118 137 13 8.9 112.4 8.9X -Native ORC Vectorized 62 75 11 16.8 59.4 16.9X +Hive built-in ORC 1197 1206 12 0.9 1141.7 1.0X +Native ORC MR 131 151 18 8.0 125.1 9.1X +Native ORC Vectorized 61 72 8 17.3 58.0 19.7X ================================================================================================ Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 
17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 319 334 29 3.3 303.9 1.0X -Native ORC MR 278 298 19 3.8 265.0 1.1X -Native ORC Vectorized 103 136 20 10.1 98.6 3.1X +Hive built-in ORC 480 491 11 2.2 457.4 1.0X +Native ORC MR 366 387 20 2.9 348.7 1.3X +Native ORC Vectorized 152 175 13 6.9 145.1 3.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 100 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 2138 2169 45 0.5 2038.6 1.0X -Native ORC MR 1607 1667 84 0.7 1532.8 1.3X -Native ORC Vectorized 855 902 42 1.2 815.2 2.5X +Hive built-in ORC 3607 3669 89 0.3 3439.5 1.0X +Native ORC MR 1890 1921 43 0.6 1802.9 1.9X +Native ORC Vectorized 1259 1311 74 0.8 1200.6 2.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 300 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 6442 6476 48 0.2 6143.4 1.0X -Native ORC MR 5475 5493 26 0.2 5220.9 1.2X -Native ORC Vectorized 5492 5500 11 0.2 5237.5 1.2X +Hive built-in ORC 11342 11426 119 0.1 10816.1 1.0X +Native ORC MR 6475 6524 68 0.2 6175.5 1.8X +Native ORC Vectorized 6379 6408 41 0.2 6083.8 1.8X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Single Struct Column Scan with 600 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Hive built-in ORC 13992 14055 89 0.1 13344.0 1.0X -Native ORC MR 12827 12858 45 0.1 12232.5 1.1X -Native ORC Vectorized 12910 12950 58 0.1 12311.8 1.1X +Hive built-in ORC 24544 24920 532 0.0 23406.9 1.0X +Native ORC MR 15124 15472 492 0.1 14423.6 1.6X +Native ORC Vectorized 15066 15264 280 0.1 14368.4 1.6X ================================================================================================ Nested Struct scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 2584 2616 46 0.4 2464.2 1.0X -Native ORC MR 2355 2368 18 0.4 2246.2 1.1X -Native ORC Vectorized 660 662 2 1.6 629.3 3.9X +Hive built-in ORC 3951 3965 19 0.3 3768.2 1.0X +Native ORC MR 2319 2417 139 0.5 2211.5 1.7X +Native ORC Vectorized 743 769 27 1.4 708.8 5.3X 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 30 Elements, 10 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 6123 6123 1 0.2 5839.1 1.0X -Native ORC MR 5065 5235 240 0.2 4830.8 1.2X -Native ORC Vectorized 1533 1547 20 0.7 1461.8 4.0X +Hive built-in ORC 10020 10058 54 0.1 9555.4 1.0X +Native ORC MR 4704 4747 61 0.2 4486.5 2.1X +Native ORC Vectorized 2038 2092 76 0.5 1944.0 4.9X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1018-azure AMD EPYC 7763 64-Core Processor Nested Struct Scan with 10 Elements, 30 Fields: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Hive built-in ORC 5890 5899 13 0.2 5616.8 1.0X -Native ORC MR 5753 5836 118 0.2 5486.6 1.0X -Native ORC Vectorized 2154 2183 41 0.5 2053.8 2.7X +Hive built-in ORC 9520 9564 62 0.1 9079.2 1.0X +Native ORC MR 5648 5669 31 0.2 5386.2 1.7X +Native ORC Vectorized 3237 3253 22 0.3 3087.2 2.9X diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 3895d9dc5a634..5a325f5f56bfc 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -40,6 +40,10 @@ spark-core_${scala.binary.version} ${project.version} + + commons-lang + commons-lang + org.apache.spark spark-core_${scala.binary.version} @@ -113,12 +117,12 @@ ${hive.shims.scope} - org.apache.hive + ${hive.group} hive-llap-common ${hive.llap.scope} - org.apache.hive + ${hive.group} hive-llap-client ${hive.llap.scope} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 8c35e10b383f6..77ed81482396b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -34,7 +34,7 @@ import org.apache.thrift.TException import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{DATABASE_NAME, SCHEMA, SCHEMA2, TABLE_NAME} +import org.apache.spark.internal.LogKeys.{DATABASE_NAME, INCOMPATIBLE_TYPES, PROVIDER, SCHEMA, SCHEMA2, TABLE_NAME} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException @@ -49,6 +49,7 @@ import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SchemaUtils /** * A persistent implementation of the system catalog using Hive. @@ -233,12 +234,39 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat tableDefinition.storage.locationUri } + val hiveCompatibleSchema = tryGetHiveCompatibleSchema(tableDefinition.schema) + if (DDLUtils.isDatasourceTable(tableDefinition)) { + // To work around some hive metastore issues, e.g. 
not case-preserving, bad decimal type + // support, no column nullability, etc., we should do some extra works before saving table + // metadata into Hive metastore: + // 1. Put table metadata like table schema, partition columns, etc. in table properties. + // 2. Check if this table is hive compatible. + // 2.1 If it's not hive compatible, set location URI, schema, partition columns and bucket + // spec to empty and save table metadata to Hive. + // 2.2 If it's hive compatible, set serde information in table metadata and try to save + // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 + val tableProperties = tableMetaToTableProps(tableDefinition) + + // put table provider and partition provider in table properties. + tableProperties.put(DATASOURCE_PROVIDER, tableDefinition.provider.get) + if (tableDefinition.tracksPartitionsInCatalog) { + tableProperties.put(TABLE_PARTITION_PROVIDER, TABLE_PARTITION_PROVIDER_CATALOG) + } + + // we have to set the table schema here so that the table schema JSON + // string in the table properties still uses the original schema + val hiveTable = tableDefinition.copy( + schema = hiveCompatibleSchema, + properties = tableDefinition.properties ++ tableProperties + ) + createDataSourceTable( - tableDefinition.withNewStorage(locationUri = tableLocation), + hiveTable.withNewStorage(locationUri = tableLocation), ignoreIfExists) } else { val tableWithDataSourceProps = tableDefinition.copy( + schema = hiveCompatibleSchema, // We can't leave `locationUri` empty and count on Hive metastore to set a default table // location, because Hive metastore uses hive.metastore.warehouse.dir to generate default // table location for tables in default database, while we expect to use the location of @@ -268,23 +296,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val provider = table.provider.get val options = new SourceOptions(table.storage.properties) - // To work around some hive metastore issues, e.g. not case-preserving, bad decimal type - // support, no column nullability, etc., we should do some extra works before saving table - // metadata into Hive metastore: - // 1. Put table metadata like table schema, partition columns, etc. in table properties. - // 2. Check if this table is hive compatible. - // 2.1 If it's not hive compatible, set location URI, schema, partition columns and bucket - // spec to empty and save table metadata to Hive. - // 2.2 If it's hive compatible, set serde information in table metadata and try to save - // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 - val tableProperties = tableMetaToTableProps(table) - - // put table provider and partition provider in table properties. - tableProperties.put(DATASOURCE_PROVIDER, provider) - if (table.tracksPartitionsInCatalog) { - tableProperties.put(TABLE_PARTITION_PROVIDER, TABLE_PARTITION_PROVIDER_CATALOG) - } - // Ideally we should also put `locationUri` in table properties like provider, schema, etc. // However, in older version of Spark we already store table location in storage properties // with key "path". Here we keep this behaviour for backward compatibility. 
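The relocated comment above (steps 1, 2.1 and 2.2) describes a try-then-fall-back pattern: attempt to persist a Hive-compatible copy of the table, and if the metastore rejects it, persist the Spark SQL specific variant whose real schema lives only in the table properties. A generic sketch of that shape, with hypothetical names and a simplified signature (the actual logic operates on CatalogTable inside createDataSourceTable):

import scala.util.control.NonFatal

object SaveTableSketch {
  // `save` stands in for the metastore client call; T stands in for CatalogTable.
  def saveTable[T](hiveCompatible: Option[T], sparkSpecific: T)(save: T => Unit): Unit = {
    hiveCompatible match {
      case Some(table) =>
        try {
          save(table)            // step 2.2: Hive-compatible format with serde info set
        } catch {
          case NonFatal(_) =>
            save(sparkSpecific)  // Hive rejected it: fall back to step 2.1
        }
      case None =>
        save(sparkSpecific)      // step 2.1: Spark SQL specific format only
    }
  }
}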
@@ -303,8 +314,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat locationUri = None, properties = storagePropsWithLocation), schema = StructType(EMPTY_DATA_SCHEMA ++ table.partitionSchema), - bucketSpec = None, - properties = table.properties ++ tableProperties) + bucketSpec = None) } // converts the table metadata to Hive compatible format, i.e. set the serde information. @@ -326,8 +336,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat outputFormat = serde.outputFormat, serde = serde.serde, properties = storagePropsWithLocation - ), - properties = table.properties ++ tableProperties) + ) + ) } val qualifiedTableName = table.identifier.quotedString @@ -338,35 +348,37 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val (hiveCompatibleTable, logMessage) = maybeSerde match { case _ if options.skipHiveMetadata => val message = - s"Persisting data source table $qualifiedTableName into Hive metastore in" + - "Spark SQL specific format, which is NOT compatible with Hive." + log"Persisting data source table ${MDC(TABLE_NAME, qualifiedTableName)} into Hive " + + log"metastore in Spark SQL specific format, which is NOT compatible with Hive." (None, message) case _ if incompatibleTypes.nonEmpty => + val incompatibleTypesStr = incompatibleTypes.mkString(", ") val message = - s"Hive incompatible types found: ${incompatibleTypes.mkString(", ")}. " + - s"Persisting data source table $qualifiedTableName into Hive metastore in " + - "Spark SQL specific format, which is NOT compatible with Hive." + log"Hive incompatible types found: ${MDC(INCOMPATIBLE_TYPES, incompatibleTypesStr)}. " + + log"Persisting data source table ${MDC(TABLE_NAME, qualifiedTableName)} into Hive " + + log"metastore in Spark SQL specific format, which is NOT compatible with Hive." (None, message) // our bucketing is un-compatible with hive(different hash function) case Some(serde) if table.bucketSpec.nonEmpty => val message = - s"Persisting bucketed data source table $qualifiedTableName into " + - "Hive metastore in Spark SQL specific format, which is NOT compatible with " + - "Hive bucketed table. But Hive can read this table as a non-bucketed table." + log"Persisting bucketed data source table ${MDC(TABLE_NAME, qualifiedTableName)} into " + + log"Hive metastore in Spark SQL specific format, which is NOT compatible with " + + log"Hive bucketed table. But Hive can read this table as a non-bucketed table." (Some(newHiveCompatibleMetastoreTable(serde)), message) case Some(serde) => val message = - s"Persisting file based data source table $qualifiedTableName into " + - s"Hive metastore in Hive compatible format." + log"Persisting file based data source table ${MDC(TABLE_NAME, qualifiedTableName)} " + + log"into Hive metastore in Hive compatible format." (Some(newHiveCompatibleMetastoreTable(serde)), message) case _ => val message = - s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + - s"Persisting data source table $qualifiedTableName into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive." + log"Couldn't find corresponding Hive SerDe for data source provider " + + log"${MDC(PROVIDER, provider)}. Persisting data source table " + + log"${MDC(TABLE_NAME, qualifiedTableName)} into Hive metastore in " + + log"Spark SQL specific format, which is NOT compatible with Hive." 
(None, message) } @@ -667,6 +679,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val schemaProps = tableMetaToTableProps(oldTable, StructType(newDataSchema ++ oldTable.partitionSchema)).toMap + val hiveSchema = tryGetHiveCompatibleSchema(newDataSchema) + if (isDatasourceTable(oldTable)) { // For data source tables, first try to write it with the schema set; if that does not work, // try again with updated properties and the partition schema. This is a simplified version of // (for example, the schema does not match the data source schema, or does not match the // storage descriptor). try { - client.alterTableDataSchema(db, table, newDataSchema, schemaProps) + client.alterTableDataSchema(db, table, hiveSchema, schemaProps) } catch { case NonFatal(e) => val warningMessage = log"Could not alter schema of table " + @@ -684,10 +698,21 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat client.alterTableDataSchema(db, table, EMPTY_DATA_SCHEMA, schemaProps) } } else { - client.alterTableDataSchema(db, table, newDataSchema, schemaProps) + client.alterTableDataSchema(db, table, hiveSchema, schemaProps) } } + /** + * Tries to fix the schema so that all column data types are Hive-compatible, + * i.e. the types are converted to the types that Hive supports. + */ + private def tryGetHiveCompatibleSchema(schema: StructType): StructType = { + // Since collated strings do not exist in Hive as a type we need to replace them with + // the regular string type. However, as we save the original schema in the table + // properties we will be able to restore the original schema when reading back the table. + SchemaUtils.replaceCollatedStringWithString(schema).asInstanceOf[StructType] + } + /** Alter the statistics of a table. If `stats` is None, then remove all existing statistics.
*/ override def alterTableStats( db: String, @@ -790,7 +815,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val partColumnNames = getPartitionColumnsFromTableProperties(table) val reorderedSchema = reorderSchema(schema = schemaFromTableProps, partColumnNames) - if (DataTypeUtils.equalsIgnoreCaseAndNullability(reorderedSchema, table.schema) || + if (DataTypeUtils.equalsIgnoreCaseNullabilityAndCollation(reorderedSchema, table.schema) || options.respectSparkSchema) { hiveTable.copy( schema = reorderedSchema, @@ -1423,6 +1448,7 @@ object HiveExternalCatalog { case a: ArrayType => isHiveCompatibleDataType(a.elementType) case m: MapType => isHiveCompatibleDataType(m.keyType) && isHiveCompatibleDataType(m.valueType) + case st: StringType => st.isUTF8BinaryCollation case _ => true } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index a05043b63d51b..9f1954cbf6868 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.errors.DataTypeErrors.toSQLType import org.apache.spark.sql.execution.datasources.DaysWritable import org.apache.spark.sql.types import org.apache.spark.sql.types._ @@ -1126,7 +1127,7 @@ private[hive] trait HiveInspectors { private def decimalTypeInfo(decimalType: DecimalType): TypeInfo = decimalType match { case DecimalType.Fixed(precision, scale) => new DecimalTypeInfo(precision, scale) case dt => throw new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_3094", messageParameters = Map("dt" -> dt.catalogString)) + errorClass = "_LEGACY_ERROR_TEMP_3094", messageParameters = Map("dt" -> toSQLType(dt))) } def toTypeInfo: TypeInfo = dt match { @@ -1155,7 +1156,7 @@ private[hive] trait HiveInspectors { case _: YearMonthIntervalType => intervalYearMonthTypeInfo case dt => throw new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_3095", messageParameters = Map("dt" -> dt.catalogString)) + errorClass = "_LEGACY_ERROR_TEMP_3095", messageParameters = Map("dt" -> toSQLType(dt))) } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 5b3160c563043..60858089875a2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{FILE_FORMAT, FILE_FORMAT2, INFERENCE_MODE, TABLE_NAME} +import org.apache.spark.internal.LogKeys._ import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ @@ -339,8 +339,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val shouldInfer = (inferenceMode != NEVER_INFER) && !relation.tableMeta.schemaPreservesCase val tableName = relation.tableMeta.identifier.unquotedString if (shouldInfer) { - logInfo(s"Inferring 
case-sensitive schema for table $tableName (inference mode: " + - s"$inferenceMode)") + logInfo(log"Inferring case-sensitive schema for table ${MDC(TABLE_NAME, tableName)} " + + log"(inference mode: ${MDC(INFERENCE_MODE, inferenceMode)})") val fileIndex = fileIndexOpt.getOrElse { val rootPath = new Path(relation.tableMeta.location) new InMemoryFileIndex(sparkSession, Seq(rootPath), options, None) @@ -372,7 +372,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } private def updateDataSchema(identifier: TableIdentifier, newDataSchema: StructType): Unit = try { - logInfo(s"Saving case-sensitive schema for table ${identifier.unquotedString}") + logInfo( + log"Saving case-sensitive schema for table ${MDC(TABLE_NAME, identifier.unquotedString)}") sparkSession.sessionState.catalog.alterTableDataSchema(identifier, newDataSchema) } catch { case NonFatal(ex) => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index 416299b189cd5..979ff1e24ef5c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -114,6 +114,7 @@ class HiveSessionStateBuilder( TableCapabilityCheck +: CommandCheck +: CollationCheck +: + ViewSyncSchemaToMetaStore +: customCheckRules } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 5972a9df78ecc..e74cc088a1f66 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils, Ins import org.apache.spark.sql.execution.datasources.{CreateTable, DataSourceStrategy, HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation} import org.apache.spark.sql.hive.execution._ import org.apache.spark.sql.hive.execution.HiveScriptTransformationExec +import org.apache.spark.sql.hive.execution.InsertIntoHiveTable.BY_CTAS import org.apache.spark.sql.internal.HiveSerDe @@ -194,6 +195,8 @@ object HiveAnalysis extends Rule[LogicalPlan] { * - When writing to non-partitioned Hive-serde Parquet/Orc tables * - When writing to partitioned Hive-serde Parquet/Orc tables when * `spark.sql.hive.convertInsertingPartitionedTable` is true + * - When writing to unpartitioned Hive-serde Parquet/Orc tables when + * `spark.sql.hive.convertInsertingUnpartitionedTable` is true * - When writing to directory with Hive-serde * - When writing to non-partitioned Hive-serde Parquet/ORC tables using CTAS * - When scanning Hive-serde Parquet/ORC tables @@ -230,7 +233,8 @@ case class RelationConversions( case InsertIntoStatement( r: HiveTableRelation, partition, cols, query, overwrite, ifPartitionNotExists, byName) if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && - (!r.isPartitioned || conf.getConf(HiveUtils.CONVERT_INSERTING_PARTITIONED_TABLE)) + ((r.isPartitioned && conf.getConf(HiveUtils.CONVERT_INSERTING_PARTITIONED_TABLE)) || + (!r.isPartitioned && conf.getConf(HiveUtils.CONVERT_INSERTING_UNPARTITIONED_TABLE))) && isConvertible(r) => InsertIntoStatement(metastoreCatalog.convert(r, isWrite = true), partition, cols, query, overwrite, ifPartitionNotExists, byName) @@ -245,11 +249,11 @@ case class RelationConversions( //
that only matches table insertion inside Hive CTAS. // This pattern would not cause conflicts because this rule is always applied before // `HiveAnalysis` and both of these rules are running once. - case InsertIntoHiveTable( + case i @ InsertIntoHiveTable( tableDesc, _, query, overwrite, ifPartitionNotExists, _, _, _, _, _, _) if query.resolved && DDLUtils.isHiveTable(tableDesc) && tableDesc.partitionColumnNames.isEmpty && isConvertible(tableDesc) && - conf.getConf(HiveUtils.CONVERT_METASTORE_CTAS) => + conf.getConf(HiveUtils.CONVERT_METASTORE_CTAS) && i.getTagValue(BY_CTAS).isDefined => // validation is required to be done here before relation conversion. DDLUtils.checkTableColumns(tableDesc.copy(schema = query.schema)) val hiveTable = DDLUtils.readHiveTable(tableDesc) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 68f34bd2beb01..30201dcee552d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -37,7 +37,7 @@ import org.apache.hive.common.util.HiveVersionInfo import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.execution.command.DDLUtils @@ -74,7 +74,7 @@ private[spark] object HiveUtils extends Logging { val HIVE_METASTORE_VERSION = buildStaticConf("spark.sql.hive.metastore.version") .doc("Version of the Hive metastore. Available options are " + - "2.0.0 through 2.3.9 and " + + "2.0.0 through 2.3.10 and " + "3.0.0 through 3.1.3.") .version("1.4.0") .stringConf @@ -154,6 +154,16 @@ private[spark] object HiveUtils extends Logging { .booleanConf .createWithDefault(true) + val CONVERT_INSERTING_UNPARTITIONED_TABLE = + buildConf("spark.sql.hive.convertInsertingUnpartitionedTable") + .doc("When set to true, and `spark.sql.hive.convertMetastoreParquet` or " + + "`spark.sql.hive.convertMetastoreOrc` is true, the built-in ORC/Parquet writer is used " + + "to process inserting into unpartitioned ORC/Parquet tables created by using the Hive SQL " + + "syntax.") + .version("4.0.0") + .booleanConf + .createWithDefault(true) + val CONVERT_METASTORE_CTAS = buildConf("spark.sql.hive.convertMetastoreCtas") .doc("When set to true, Spark will try to use built-in data source writer " + "instead of Hive serde in CTAS.
This flag is effective only if " + @@ -287,7 +297,8 @@ private[spark] object HiveUtils extends Logging { protected[hive] def newClientForExecution( conf: SparkConf, hadoopConf: Configuration): HiveClientImpl = { - logInfo(s"Initializing execution hive, version $builtinHiveVersion") + logInfo(log"Initializing execution hive, version " + + log"${MDC(LogKeys.HIVE_METASTORE_VERSION, builtinHiveVersion)}") val loader = new IsolatedClientLoader( version = IsolatedClientLoader.hiveVersion(builtinHiveVersion), sparkConf = conf, @@ -321,7 +332,7 @@ private[spark] object HiveUtils extends Logging { if (file.getName == "*") { val files = file.getParentFile.listFiles() if (files == null) { - logWarning(log"Hive jar path '${MDC(PATH, file.getPath)}' does not exist.") + logWarning(log"Hive jar path '${MDC(LogKeys.PATH, file.getPath)}' does not exist.") Nil } else { files.filter(_.getName.toLowerCase(Locale.ROOT).endsWith(".jar")).map(_.toURI.toURL) @@ -332,6 +343,12 @@ private[spark] object HiveUtils extends Logging { } } + def logInitWithPath(jars: Seq[URL]): Unit = { + logInfo(log"Initializing HiveMetastoreConnection version " + + log"${MDC(LogKeys.HIVE_METASTORE_VERSION, hiveMetastoreVersion)} using paths: " + + log"${MDC(LogKeys.PATH, jars.mkString(", "))}") + } + val isolatedLoader = if (hiveMetastoreJars == "builtin") { if (builtinHiveVersion != hiveMetastoreVersion) { throw new IllegalArgumentException( @@ -342,7 +359,8 @@ private[spark] object HiveUtils extends Logging { } logInfo( - s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using Spark classes.") + log"Initializing HiveMetastoreConnection version " + + log"${MDC(LogKeys.HIVE_METASTORE_VERSION, hiveMetastoreVersion)} using Spark classes.") new IsolatedClientLoader( version = metaVersion, sparkConf = conf, @@ -355,7 +373,8 @@ private[spark] object HiveUtils extends Logging { } else if (hiveMetastoreJars == "maven") { // TODO: Support for loading the jars from an already downloaded location. 
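As a hedged, SQL-level sketch of the `spark.sql.hive.convertInsertingUnpartitionedTable` flag introduced above (the flag defaults to true; the table and view names below are hypothetical, assuming an unpartitioned Hive-serde Parquet table):

    // Hedged sketch: with the flag off, unpartitioned Hive-serde Parquet/ORC inserts fall back
    // to the Hive SerDe writer instead of Spark's built-in Parquet/ORC writer.
    spark.sql("SET spark.sql.hive.convertInsertingUnpartitionedTable=false")
    spark.sql("INSERT INTO TABLE plain_hive_parquet_tbl SELECT key, value FROM src") // hypothetical table/view names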
logInfo( - s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using maven.") + log"Initializing HiveMetastoreConnection version " + + log"${MDC(LogKeys.HIVE_METASTORE_VERSION, hiveMetastoreVersion)} using maven.") IsolatedClientLoader.forVersion( hiveMetastoreVersion = hiveMetastoreVersion, hadoopVersion = VersionInfo.getVersion, @@ -381,9 +400,7 @@ private[spark] object HiveUtils extends Logging { ).map(_.toUri.toURL) } - logInfo( - s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion " + - s"using path: ${jars.mkString(";")}") + logInitWithPath(jars) new IsolatedClientLoader( version = metaVersion, sparkConf = conf, @@ -402,9 +419,7 @@ private[spark] object HiveUtils extends Logging { addLocalHiveJars(new File(path)) } - logInfo( - s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion " + - s"using ${jars.mkString(":")}") + logInitWithPath(jars.toSeq) new IsolatedClientLoader( version = metaVersion, sparkConf = conf, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index 60970eecc2df1..a93c6bd6b4e9b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -37,7 +37,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => newInputClass} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD, UnionRDD} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 502cec3be9c82..11e077e891bd7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -47,7 +47,7 @@ import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil.SOURCE_SPARK import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey._ +import org.apache.spark.internal.LogKeys._ import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{DatabaseAlreadyExistsException, NoSuchDatabaseException, NoSuchPartitionException, NoSuchPartitionsException, NoSuchTableException, PartitionsAlreadyExistException} @@ -161,8 +161,9 @@ private[hive] class HiveClientImpl( // Log the default warehouse location. 
logInfo( - s"Warehouse location for Hive client (version ${version.fullVersion}) is " + - s"${conf.getVar(HiveConf.getConfVars("hive.metastore.warehouse.dir"))}") + log"Warehouse location for Hive client (version " + + log"${MDC(HIVE_CLIENT_VERSION, version.fullVersion)}) is " + + log"${MDC(PATH, conf.getVar(HiveConf.getConfVars("hive.metastore.warehouse.dir")))}") private def newState(): SessionState = { val hiveConf = newHiveConf(sparkConf, hadoopConf, extraConfig, Some(initClassLoader)) @@ -231,7 +232,7 @@ private[hive] class HiveClientImpl( caughtException = e logWarning( log"HiveClient got thrift exception, destroying client and retrying " + - log"${MDC(RETRY_COUNT, numTries)} times", e) + log"${MDC(NUM_RETRY, numTries)} times", e) clientLoader.cachedHive = null Thread.sleep(retryDelayMillis) } @@ -1339,6 +1340,15 @@ private[hive] object HiveClientImpl extends Logging { log"will be reset to 'mr' to disable useless hive logic") hiveConf.set("hive.execution.engine", "mr", SOURCE_SPARK) } + val cpType = hiveConf.get("datanucleus.connectionPoolingType") + // Bonecp might cause memory leak, it could affect some hive client versions we support + // See more details in HIVE-15551 + // Also, Bonecp is removed in Hive 4.0.0, see HIVE-23258 + // Here we use DBCP to replace bonecp instead of HikariCP as HikariCP was introduced in + // Hive 2.2.0 (see HIVE-13931) while the minium Hive we support is 2.0.0. + if ("bonecp".equalsIgnoreCase(cpType)) { + hiveConf.set("datanucleus.connectionPoolingType", "DBCP", SOURCE_SPARK) + } hiveConf } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 07daa29386282..c03fed4cc3184 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -38,7 +38,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde.serdeConstants import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{CONFIG, CONFIG2, CONFIG3} +import org.apache.spark.internal.LogKeys.{CONFIG, CONFIG2, CONFIG3} import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow} import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index e4bab4631ab19..b0570f5d30352 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.hive.shims.ShimLoader import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkSubmit import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{FALLBACK_VERSION, HADOOP_VERSION} +import org.apache.spark.internal.LogKeys.{FALLBACK_VERSION, HADOOP_VERSION, PATH} import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.hive.HiveUtils @@ -149,7 +149,7 @@ private[hive] object IsolatedClientLoader extends Logging { // TODO: Remove copy logic. 
val tempDir = Utils.createTempDir(namePrefix = s"hive-${version}") allFiles.foreach(f => FileUtils.copyFileToDirectory(f, tempDir)) - logInfo(s"Downloaded metastore jars to ${tempDir.getCanonicalPath}") + logInfo(log"Downloaded metastore jars to ${MDC(PATH, tempDir.getCanonicalPath)}") tempDir.listFiles().map(_.toURI.toURL).toImmutableArraySeq } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala index 564c87a0fca8e..d172af21a9170 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala @@ -59,13 +59,12 @@ package object client { "org.pentaho:pentaho-aggdesigner-algorithm")) // Since HIVE-23980, calcite-core included in Hive package jar. - case object v2_3 extends HiveVersion("2.3.9", + case object v2_3 extends HiveVersion("2.3.10", exclusions = Seq("org.apache.calcite:calcite-core", "org.apache.calcite:calcite-druid", "org.apache.calcite.avatica:avatica", - "com.fasterxml.jackson.core:*", "org.apache.curator:*", - "org.pentaho:pentaho-aggdesigner-algorithm", + "net.hydromatic:aggdesigner-algorithm", "org.apache.hive:hive-vector-code-gen")) // Since Hive 3.0, HookUtils uses org.apache.logging.log4j.util.Strings diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala index 811d186b17d26..154d07f80d898 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{CTEInChildren, CTERelationDe import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.command.{DataWritingCommand, LeafRunnableCommand} +import org.apache.spark.sql.hive.execution.InsertIntoHiveTable.BY_CTAS /** * Create table and insert the query result into it. @@ -98,13 +99,15 @@ case class CreateHiveTableAsSelectCommand( tableExists: Boolean): DataWritingCommand = { // For CTAS, there is no static partition values to insert. 
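Before the command body below, a hedged sketch (not the planner code itself) of the `TreeNodeTag` round trip this patch introduces: the CTAS command marks the `InsertIntoHiveTable` node it builds, and `RelationConversions` above only converts inserts that carry the mark.

    import org.apache.spark.sql.catalyst.trees.TreeNodeTag

    object CtasTagSketch { // hypothetical container, for illustration only
      val BY_CTAS = TreeNodeTag[Unit]("by_ctas")
    }
    // At CTAS planning time:  insertHive.setTagValue(CtasTagSketch.BY_CTAS, ())
    // At conversion time:     i.getTagValue(CtasTagSketch.BY_CTAS).isDefined // true only for CTAS-created inserts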
val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap - InsertIntoHiveTable( + val insertHive = InsertIntoHiveTable( tableDesc, partition, query, overwrite = false, ifPartitionNotExists = false, outputColumnNames = outputColumnNames) + insertHive.setTagValue(BY_CTAS, ()) + insertHive } override def argString(maxFields: Int): String = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala index 1b76478a5cf33..cabdddd4c475d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.mapred.{JobConf, Reporter} import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.LogKeys.CLASS_NAME import org.apache.spark.internal.config.SPECULATION_ENABLED import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala index a16191b72a8d3..16edfea67e38e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTempPath.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.hive.ql.exec.TaskRunner import org.apache.spark.SparkException import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.sql.SparkSession import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.hive.HiveExternalCatalog diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 4a92bfd840405..cf296e8be4f14 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.SparkPlan @@ -235,6 +236,12 @@ case class InsertIntoHiveTable( } object InsertIntoHiveTable extends V1WritesHiveUtils { + + /** + * A tag to identify if this command is created by a CTAS. 
+ */ + val BY_CTAS = TreeNodeTag[Unit]("by_ctas") + def apply( table: CatalogTable, partition: Map[String, Option[String]], diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala index 3e1bdff8c007b..d588e9f5bd5c4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.types.StructType @@ -59,8 +59,8 @@ private[hive] object OrcFileOperator extends Logging { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( - s"ORC file $path has empty schema, it probably contains no rows. " + - "Trying to read another ORC file to figure out the schema.") + log"ORC file ${MDC(PATH, path)} has empty schema, it probably contains no rows. " + + log"Trying to read another ORC file to figure out the schema.") false case _ => true } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/security/HiveDelegationTokenProvider.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/security/HiveDelegationTokenProvider.scala index 13ff721736b2c..0e357d5e39b26 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/security/HiveDelegationTokenProvider.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/security/HiveDelegationTokenProvider.scala @@ -33,7 +33,7 @@ import org.apache.hadoop.security.token.Token import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.CLASS_NAME +import org.apache.spark.internal.LogKeys.CLASS_NAME import org.apache.spark.internal.config.KEYTAB import org.apache.spark.security.HadoopDelegationTokenProvider import org.apache.spark.sql.hive.client.HiveClientImpl diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java index 9afe53fe825ca..e94a84d6446a9 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/test/Complex.java @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.hive.test; -import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; import org.apache.thrift.scheme.IScheme; import org.apache.thrift.scheme.SchemeFactory; import org.apache.thrift.scheme.StandardScheme; diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 89fe10d5c4bd9..d7918f8cbf4f0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -335,9 +335,10 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto options = Map.empty)(sparkSession = spark) val plan = LogicalRelation(relation, tableMeta) - 
spark.sharedState.cacheManager.cacheQuery(Dataset.ofRows(spark, plan)) + val df = Dataset.ofRows(spark, plan) + spark.sharedState.cacheManager.cacheQuery(df) - assert(spark.sharedState.cacheManager.lookupCachedData(plan).isDefined) + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) val sameCatalog = new CatalogFileIndex(spark, tableMeta, 0) val sameRelation = HadoopFsRelation( @@ -347,9 +348,9 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto bucketSpec = None, fileFormat = new ParquetFileFormat(), options = Map.empty)(sparkSession = spark) - val samePlan = LogicalRelation(sameRelation, tableMeta) + val samePlanDf = Dataset.ofRows(spark, LogicalRelation(sameRelation, tableMeta)) - assert(spark.sharedState.cacheManager.lookupCachedData(samePlan).isDefined) + assert(spark.sharedState.cacheManager.lookupCachedData(samePlanDf).isDefined) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index e413e0ee73cb9..8bb33e3383be1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -22,9 +22,10 @@ import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.execution.command.DDLUtils -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StringType, StructField, StructType} /** * Test suite for the [[HiveExternalCatalog]]. 
@@ -200,4 +201,44 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { assert(alteredTable.provider === Some("foo")) }) } + + test("write collated strings as regular strings in hive - but read them back as collated") { + val catalog = newBasicCatalog() + val tableName = "collation_tbl" + val columnName = "col1" + + val collationsSchema = StructType(Seq( + StructField(columnName, StringType("UNICODE")) + )) + val noCollationsSchema = StructType(Seq( + StructField(columnName, StringType) + )) + + val tableDDL = CatalogTable( + identifier = TableIdentifier(tableName, Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = storageFormat, + schema = collationsSchema, + provider = Some("hive")) + + catalog.createTable(tableDDL, ignoreIfExists = false) + + val rawTable = externalCatalog.getRawTable("db1", tableName) + assert(DataTypeUtils.sameType(rawTable.schema, noCollationsSchema)) + + val readBackTable = externalCatalog.getTable("db1", tableName) + assert(DataTypeUtils.sameType(readBackTable.schema, collationsSchema)) + + // perform alter table + val newSchema = StructType(Seq( + StructField("col1", StringType("UTF8_LCASE")) + )) + catalog.alterTableDataSchema("db1", tableName, newSchema) + + val alteredRawTable = externalCatalog.getRawTable("db1", tableName) + assert(DataTypeUtils.sameType(alteredRawTable.schema, noCollationsSchema)) + + val alteredTable = externalCatalog.getTable("db1", tableName) + assert(DataTypeUtils.sameType(alteredTable.schema, newSchema)) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 726341ffdf9e3..95baffdee06cb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -211,7 +211,6 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { tryDownloadSpark(version, sparkTestingDir.getCanonicalPath) } - // Extract major.minor for testing Spark 3.1.x and 3.0.x with metastore 2.3.9 and Java 11. 
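A hedged, SQL-level illustration of the behaviour the collation test above verifies: the raw Hive table stores a plain STRING column while Spark's table properties preserve the collation, so reads see the collated type again (table name hypothetical; DESCRIBE output shape not guaranteed):

    spark.sql("CREATE TABLE collated_tbl (col1 STRING COLLATE UNICODE) USING hive") // hypothetical table
    spark.sql("DESCRIBE collated_tbl").show() // expected to report the UNICODE-collated string type for col1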
val hiveMetastoreVersion = """^\d+\.\d+""".r.findFirstIn(hiveVersion).get val args = Seq( "--name", "prepare testing tables", diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala index e3b649f9a9f01..d84b9f7960231 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala @@ -76,7 +76,7 @@ class HiveSharedStateSuite extends SparkFunSuite { assert(client.getConf("hive.metastore.warehouse.dir", "") === qualifiedWHPath, "session level conf should be passed to catalog") - assert(state.globalTempViewManager.database === tmpDb) + assert(state.globalTempDB === tmpDb) val ss2 = builder.config("spark.foo", "bar2222").config(WAREHOUSE_PATH.key, invalidPath).getOrCreate() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index c7aa412959097..e88a37f019b7d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -149,7 +149,7 @@ class HiveSparkSubmitSuite "--conf", s"${EXECUTOR_MEMORY.key}=512m", "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", - "--conf", "spark.sql.hive.metastore.version=2.3.9", + "--conf", "spark.sql.hive.metastore.version=2.3.10", "--conf", "spark.sql.hive.metastore.jars=maven", "--driver-java-options", "-Dderby.system.durability=test", unusedJar.toString) @@ -370,7 +370,7 @@ class HiveSparkSubmitSuite "--master", "local-cluster[2,1,512]", "--conf", s"${EXECUTOR_MEMORY.key}=512m", "--conf", s"${LEGACY_TIME_PARSER_POLICY.key}=LEGACY", - "--conf", s"${HiveUtils.HIVE_METASTORE_VERSION.key}=2.3.9", + "--conf", s"${HiveUtils.HIVE_METASTORE_VERSION.key}=2.3.10", "--conf", s"${HiveUtils.HIVE_METASTORE_JARS.key}=maven", "--conf", s"spark.hadoop.javax.jdo.option.ConnectionURL=$metastore", unusedJar.toString) @@ -387,7 +387,7 @@ object SetMetastoreURLTest extends Logging { val builder = SparkSession.builder() .config(sparkConf) .config(UI_ENABLED.key, "false") - .config(HiveUtils.HIVE_METASTORE_VERSION.key, "2.3.9") + .config(HiveUtils.HIVE_METASTORE_VERSION.key, "2.3.10") // The issue described in SPARK-16901 only appear when // spark.sql.hive.metastore.jars is not set to builtin. .config(HiveUtils.HIVE_METASTORE_JARS.key, "maven") @@ -698,7 +698,7 @@ object SparkSQLConfTest extends Logging { val filteredSettings = super.getAll.filterNot(e => isMetastoreSetting(e._1)) // Always add these two metastore settings at the beginning. 
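For reference, a hedged sketch of a session wired to the bumped metastore client version, mirroring the configuration values these suites now pass:

    import org.apache.spark.sql.SparkSession

    val session = SparkSession.builder()
      .config("spark.sql.hive.metastore.version", "2.3.10")
      .config("spark.sql.hive.metastore.jars", "maven") // jars resolved from Maven, as in the tests above
      .enableHiveSupport()
      .getOrCreate()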
- (HiveUtils.HIVE_METASTORE_VERSION.key -> "2.3.9") +: + (HiveUtils.HIVE_METASTORE_VERSION.key -> "2.3.10") +: (HiveUtils.HIVE_METASTORE_JARS.key -> "maven") +: filteredSettings } @@ -726,7 +726,7 @@ object SPARK_9757 extends QueryTest { val hiveWarehouseLocation = Utils.createTempDir() val sparkContext = new SparkContext( new SparkConf() - .set(HiveUtils.HIVE_METASTORE_VERSION.key, "2.3.9") + .set(HiveUtils.HIVE_METASTORE_VERSION.key, "2.3.10") .set(HiveUtils.HIVE_METASTORE_JARS.key, "maven") .set(UI_ENABLED, false) .set(WAREHOUSE_PATH.key, hiveWarehouseLocation.toString)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 5502414629c01..7dc7fc41dc708 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -1418,10 +1418,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto checkAnswer(df, expectedAnswer) // check correctness of output - spark.sessionState.conf.settings.synchronized { - val tmp = spark.sessionState.conf.autoBroadcastJoinThreshold - - sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1""") + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { df = sql(query) bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoinExec => j } assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off") @@ -1429,10 +1426,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto val shj = df.queryExecution.sparkPlan.collect { case j: SortMergeJoinExec => j } assert(shj.size === 1, "SortMergeJoin should be planned when BroadcastHashJoin is turned off") - - sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=$tmp""") } - after() } @@ -1474,10 +1468,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto checkAnswer(df, answer) // check correctness of output - spark.sessionState.conf.settings.synchronized { - val tmp = spark.sessionState.conf.autoBroadcastJoinThreshold - - sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1") + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { df = sql(leftSemiJoinQuery) bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoinExec => j @@ -1489,10 +1480,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } assert(shj.size === 1, "SortMergeJoinExec should be planned when BroadcastHashJoin is turned off") - - sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=$tmp") } - } test("Deals with wrong Hive's statistics (zero rowCount)") { @@ -1615,7 +1603,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto Seq(tbl, ext_tbl).foreach { tblName => sql(s"INSERT INTO $tblName VALUES (1, 'a', '2019-12-13')") - val expectedSize = 657 + val expectedSize = 690 // analyze table sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS NOSCAN") var tableStats = getTableStats(tblName) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala index 0bc288501a01e..b60adfb6f4cf1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala @@ -686,6 +686,7 @@ class HiveClientSuite(version: String) 
extends HiveVersionSuite(version) { versionSpark.sql( s""" |CREATE TABLE tab(c1 string) + |USING HIVE |location '${tmpDir.toURI.toString}' """.stripMargin) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala index e0d5236e1e019..e43308f62a496 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.hive.client import java.net.URI -import scala.collection.immutable.IndexedSeq - import org.apache.hadoop.conf.Configuration import org.scalatest.Suite diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala index e076c01c08980..dcf14855a5883 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.hive.client -import scala.collection.immutable.IndexedSeq - import org.scalatest.Suite class HiveClientUserNameSuites extends Suite with HiveClientVersions { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientVersions.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientVersions.scala index 1dee9e6dcfc83..0bc6702079bdb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientVersions.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientVersions.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.hive.client -import scala.collection.immutable.IndexedSeq - private[client] trait HiveClientVersions { private val testVersions = sys.env.get("SPARK_TEST_HIVE_CLIENT_VERSIONS") protected val versions = if (testVersions.nonEmpty) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala index f10e63865423d..6e526bdc6f168 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.hive.client -import scala.collection.immutable.IndexedSeq - import org.scalatest.Suite class HivePartitionFilteringSuites extends Suite with HiveClientVersions { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 241fdd4b9ec5a..65b70ad8bcaeb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -154,10 +154,6 @@ class HiveDDLSuite fs.exists(filesystemPath) } - test("alter table: set properties") { - testSetProperties(isDatasourceTable = false) - } - test("alter table: unset properties") { testUnsetProperties(isDatasourceTable = false) } @@ -216,7 +212,7 @@ class HiveDDLSuite test("SPARK-22431: alter table tests with nested types") { withTable("t1", "t2", "t3") { - spark.sql("CREATE TABLE t1 (q STRUCT, i1 INT)") + spark.sql("CREATE TABLE t1 (q STRUCT, i1 INT) 
USING HIVE") spark.sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`col1`:STRING, col2:Int>)") val newcol = spark.sql("SELECT * FROM t1").schema.fields(2).name assert("newcol1".equals(newcol)) @@ -2614,7 +2610,7 @@ class HiveDDLSuite "msg" -> "java.lang.UnsupportedOperationException: Unknown field type: void") ) - sql("CREATE TABLE t3 AS SELECT NULL AS null_col") + sql("CREATE TABLE t3 USING HIVE AS SELECT NULL AS null_col") checkAnswer(sql("SELECT * FROM t3"), Row(null)) } @@ -2642,9 +2638,6 @@ class HiveDDLSuite sql("CREATE TABLE t3 (v VOID) USING hive") checkAnswer(sql("SELECT * FROM t3"), Seq.empty) - - sql("CREATE TABLE t4 (v VOID)") - checkAnswer(sql("SELECT * FROM t4"), Seq.empty) } // Create table with void type using spark.catalog.createTable @@ -3324,7 +3317,7 @@ class HiveDDLSuite | INTERVAL '1-1' YEAR TO MONTH AS YM, | INTERVAL '1 02:03:04.123456' DAY TO SECOND AS DT |""".stripMargin, - s"CREATE TABLE $tbl (dt INTERVAL HOUR TO MINUTE)" + s"CREATE TABLE $tbl (dt INTERVAL HOUR TO MINUTE) USING HIVE" ).foreach { sqlCmd => checkError( exception = intercept[SparkUnsupportedOperationException] { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 86e6b01cb6cae..4d23ac0639b3e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -369,7 +369,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd test("SPARK-7270: consider dynamic partition when comparing table output") { withTable("test_partition", "ptest") { - sql(s"CREATE TABLE test_partition (a STRING) PARTITIONED BY (b BIGINT, c STRING)") + sql(s"CREATE TABLE test_partition (a STRING) USING HIVE PARTITIONED BY (b BIGINT, c STRING)") sql(s"CREATE TABLE ptest (a STRING, b BIGINT, c STRING)") val analyzedPlan = sql( @@ -804,7 +804,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd } test("ADD JAR command") { - sql("CREATE TABLE alter1(a INT, b INT)") + sql("CREATE TABLE alter1(a INT, b INT) USING HIVE") checkError( exception = intercept[AnalysisException] { sql( @@ -1208,50 +1208,52 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd } test("SPARK-5592: get java.net.URISyntaxException when dynamic partitioning") { - sql(""" - |create table sc as select * - |from (select '2011-01-11', '2011-01-11+14:18:26' from src tablesample (1 rows) - |union all - |select '2011-01-11', '2011-01-11+15:18:26' from src tablesample (1 rows) - |union all - |select '2011-01-11', '2011-01-11+16:18:26' from src tablesample (1 rows) ) s + withSQLConf("hive.exec.dynamic.partition" -> "true", + "hive.exec.dynamic.partition.mode" -> "nonstrict") { + sql( + """ + |create table sc as select * + |from (select '2011-01-11', '2011-01-11+14:18:26' from src tablesample (1 rows) + |union all + |select '2011-01-11', '2011-01-11+15:18:26' from src tablesample (1 rows) + |union all + |select '2011-01-11', '2011-01-11+16:18:26' from src tablesample (1 rows) ) s """.stripMargin) - sql("create table sc_part (key string) partitioned by (ts string) stored as rcfile") - sql("set hive.exec.dynamic.partition=true") - sql("set hive.exec.dynamic.partition.mode=nonstrict") - sql("insert overwrite table sc_part partition(ts) select * from sc") - sql("drop table sc_part") + sql("create table sc_part (key string) partitioned by (ts 
string) stored as rcfile") + sql("insert overwrite table sc_part partition(ts) select * from sc") + sql("drop table sc_part") + } } test("Partition spec validation") { - sql("DROP TABLE IF EXISTS dp_test") - sql("CREATE TABLE dp_test(key INT, value STRING) PARTITIONED BY (dp INT, sp INT)") - sql("SET hive.exec.dynamic.partition.mode=strict") - - // Should throw when using strict dynamic partition mode without any static partition - checkError( - exception = intercept[AnalysisException] { - sql( - """INSERT INTO TABLE dp_test PARTITION(dp) - |SELECT key, value, key % 5 FROM src""".stripMargin) - }, - errorClass = "INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS", - parameters = Map( - "tableName" -> "`spark_catalog`.`default`.`dp_test`", - "tableColumns" -> "`key`, `value`, `dp`, `sp`", - "dataColumns" -> "`key`, `value`, `(key % 5)`")) - - sql("SET hive.exec.dynamic.partition.mode=nonstrict") - - // Should throw when a static partition appears after a dynamic partition - checkError( - exception = intercept[AnalysisException] { - sql( - """INSERT INTO TABLE dp_test PARTITION(dp, sp = 1) - |SELECT key, value, key % 5 FROM src""".stripMargin) - }, - errorClass = "_LEGACY_ERROR_TEMP_3079", - parameters = Map.empty) + withTable("dp_test") { + sql("CREATE TABLE dp_test(key INT, value STRING) USING HIVE PARTITIONED BY (dp INT, sp INT)") + withSQLConf("hive.exec.dynamic.partition.mode" -> "strict") { + // Should throw when using strict dynamic partition mode without any static partition + checkError( + exception = intercept[AnalysisException] { + sql( + """INSERT INTO TABLE dp_test PARTITION(dp) + |SELECT key, value, key % 5 FROM src""".stripMargin) + }, + errorClass = "INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS", + parameters = Map( + "tableName" -> "`spark_catalog`.`default`.`dp_test`", + "tableColumns" -> "`key`, `value`, `dp`, `sp`", + "dataColumns" -> "`key`, `value`, `(key % 5)`")) + } + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { + // Should throw when a static partition appears after a dynamic partition + checkError( + exception = intercept[AnalysisException] { + sql( + """INSERT INTO TABLE dp_test PARTITION(dp, sp = 1) + |SELECT key, value, key % 5 FROM src""".stripMargin) + }, + errorClass = "_LEGACY_ERROR_TEMP_3079", + parameters = Map.empty) + } + } } test("SPARK-3414 regression: should store analyzed logical plan when creating a temporary view") { @@ -1292,21 +1294,22 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd } test("SPARK-3810: PreprocessTableInsertion dynamic partitioning support") { - val analyzedPlan = { - loadTestTable("srcpart") - sql("DROP TABLE IF EXISTS withparts") - sql("CREATE TABLE withparts LIKE srcpart") - sql("SET hive.exec.dynamic.partition.mode=nonstrict") - - sql("CREATE TABLE IF NOT EXISTS withparts LIKE srcpart") - sql("INSERT INTO TABLE withparts PARTITION(ds, hr) SELECT key, value, '1', '2' FROM src") - .queryExecution.analyzed - } + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { + val analyzedPlan = { + loadTestTable("srcpart") + sql("DROP TABLE IF EXISTS withparts") + sql("CREATE TABLE withparts LIKE srcpart") + + sql("CREATE TABLE IF NOT EXISTS withparts LIKE srcpart") + sql("INSERT INTO TABLE withparts PARTITION(ds, hr) SELECT key, value, '1', '2' FROM src") + .queryExecution.analyzed + } - assertResult(2, "Duplicated project detected\n" + analyzedPlan) { - analyzedPlan.collect { - case i: InsertIntoHiveTable => i.query.collect { case p: Project => () }.size - 
}.sum + assertResult(2, "Duplicated project detected\n" + analyzedPlan) { + analyzedPlan.collect { + case i: InsertIntoHiveTable => i.query.collect { case p: Project => () }.size + }.sum + } } } @@ -1627,10 +1630,8 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd test("SPARK-33084: Add jar support Ivy URI in SQL") { val testData = TestHive.getHiveFile("data/files/sample.json").toURI withTable("t") { - // hive-catalog-core has some transitive dependencies which dont exist on maven central - // and hence cannot be found in the test environment or are non-jar (.pom) which cause - // failures in tests. Use transitive=false as it should be good enough to test the Ivy - // support in Hive ADD JAR + // Use transitive=false as it should be good enough to test the Ivy support + // in Hive ADD JAR sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion" + "?transitive=false") sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala index daf99886ff6d4..0c704d845a3bf 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala @@ -391,7 +391,7 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T ioschema = hiveIOSchema) SparkPlanTest.executePlan(plan, hiveContext) }.getMessage - assert(e1.contains("interval cannot be converted to Hive TypeInfo")) + assert(e1.contains("\"INTERVAL\" cannot be converted to Hive TypeInfo")) val e2 = intercept[AnalysisException] { val plan = createScriptTransformationExec( @@ -403,7 +403,7 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T ioschema = hiveIOSchema) SparkPlanTest.executePlan(plan, hiveContext) }.getMessage - assert(e2.contains("array cannot be converted to Hive TypeInfo")) + assert(e2.contains("UDT(\"ARRAY\") cannot be converted to Hive TypeInfo")) } } @@ -416,7 +416,6 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3))) ).toDF("a", "b", "c") df.createTempView("v") - val e1 = intercept[AnalysisException] { sql( """ @@ -424,7 +423,7 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T |FROM v """.stripMargin).collect() }.getMessage - assert(e1.contains("interval cannot be converted to Hive TypeInfo")) + assert(e1.contains("\"INTERVAL\" cannot be converted to Hive TypeInfo")) val e2 = intercept[AnalysisException] { sql( @@ -433,7 +432,7 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T |FROM v """.stripMargin).collect() }.getMessage - assert(e2.contains("array cannot be converted to Hive TypeInfo")) + assert(e2.contains("UDT(\"ARRAY\") cannot be converted to Hive TypeInfo")) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 3cf8d5eadb5b1..8280b9624fa2f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -262,15 +262,17 @@ class HiveTableScanSuite extends HiveComparisonTest with 
SQLTestUtils with TestH sql("INSERT INTO t VALUES(1)") val dir = new File(f.getCanonicalPath + "/data") dir.mkdir() - sql("set mapreduce.input.fileinputformat.input.dir.recursive=true") - assert(sql("select * from t").collect().head.getLong(0) == 1) - sql("set mapreduce.input.fileinputformat.input.dir.recursive=false") - val e = intercept[IOException] { - sql("SELECT * FROM t").collect() + withSQLConf("mapreduce.input.fileinputformat.input.dir.recursive" -> "true") { + assert(sql("select * from t").collect().head.getLong(0) == 1) + } + withSQLConf("mapreduce.input.fileinputformat.input.dir.recursive" -> "false") { + val e = intercept[IOException] { + sql("SELECT * FROM t").collect() + } + assert(e.getMessage.contains(s"Path: ${dir.getAbsoluteFile} is a directory, " + + s"which is not supported by the record reader " + + s"when `mapreduce.input.fileinputformat.input.dir.recursive` is false.")) } - assert(e.getMessage.contains(s"Path: ${dir.getAbsoluteFile} is a directory, " + - s"which is not supported by the record reader " + - s"when `mapreduce.input.fileinputformat.input.dir.recursive` is false.")) dir.delete() } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 0bcac639443cd..05b73e31d1156 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -178,24 +178,24 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi |PARTITIONED BY (state STRING, month INT) |STORED AS PARQUET """.stripMargin) + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { + sql("INSERT INTO TABLE orders PARTITION(state, month) SELECT * FROM orders1") + sql("INSERT INTO TABLE orderupdates PARTITION(state, month) SELECT * FROM orderupdates1") - sql("set hive.exec.dynamic.partition.mode=nonstrict") - sql("INSERT INTO TABLE orders PARTITION(state, month) SELECT * FROM orders1") - sql("INSERT INTO TABLE orderupdates PARTITION(state, month) SELECT * FROM orderupdates1") - - checkAnswer( - sql( - """ - |select orders.state, orders.month - |from orders - |join ( - | select distinct orders.state,orders.month - | from orders - | join orderupdates - | on orderupdates.id = orders.id) ao - | on ao.state = orders.state and ao.month = orders.month + checkAnswer( + sql( + """ + |select orders.state, orders.month + |from orders + |join ( + | select distinct orders.state,orders.month + | from orders + | join orderupdates + | on orderupdates.id = orders.id) ao + | on ao.state = orders.state and ao.month = orders.month """.stripMargin), - (1 to 6).map(_ => Row("CA", 20151))) + (1 to 6).map(_ => Row("CA", 20151))) + } } } } @@ -715,21 +715,23 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } test("command substitution") { - sql("set tbl=src") - checkAnswer( - sql("SELECT key FROM ${hiveconf:tbl} ORDER BY key, value limit 1"), - sql("SELECT key FROM src ORDER BY key, value limit 1").collect().toSeq) + withSQLConf("tbl" -> "src") { + checkAnswer( + sql("SELECT key FROM ${hiveconf:tbl} ORDER BY key, value limit 1"), + sql("SELECT key FROM src ORDER BY key, value limit 1").collect().toSeq) + } - sql("set spark.sql.variable.substitute=false") // disable the substitution - sql("set tbl2=src") - intercept[Exception] { - sql("SELECT key FROM ${hiveconf:tbl2} ORDER BY key, value limit 1").collect() + 
withSQLConf("tbl2" -> "src", "spark.sql.variable.substitute" -> "false") { + intercept[Exception] { + sql("SELECT key FROM ${hiveconf:tbl2} ORDER BY key, value limit 1").collect() + } } - sql("set spark.sql.variable.substitute=true") // enable the substitution - checkAnswer( - sql("SELECT key FROM ${hiveconf:tbl2} ORDER BY key, value limit 1"), - sql("SELECT key FROM src ORDER BY key, value limit 1").collect().toSeq) + withSQLConf("tbl2" -> "src", "spark.sql.variable.substitute" -> "true") { + checkAnswer( + sql("SELECT key FROM ${hiveconf:tbl2} ORDER BY key, value limit 1"), + sql("SELECT key FROM src ORDER BY key, value limit 1").collect().toSeq) + } } test("ordering not in select") { @@ -1108,35 +1110,30 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } test("dynamic partition value test") { - try { - sql("set hive.exec.dynamic.partition.mode=nonstrict") - // date - sql("drop table if exists dynparttest1") - sql("create table dynparttest1 (value int) partitioned by (pdate date)") - sql( - """ - |insert into table dynparttest1 partition(pdate) - | select count(*), cast('2015-05-21' as date) as pdate from src - """.stripMargin) - checkAnswer( - sql("select * from dynparttest1"), - Seq(Row(500, java.sql.Date.valueOf("2015-05-21")))) + withTable("dynparttest1", "dynparttest2") { + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { + // date + sql("create table dynparttest1 (value int) partitioned by (pdate date)") + sql( + """ + |insert into table dynparttest1 partition(pdate) + | select count(*), cast('2015-05-21' as date) as pdate from src + """.stripMargin) + checkAnswer( + sql("select * from dynparttest1"), + Seq(Row(500, java.sql.Date.valueOf("2015-05-21")))) - // decimal - sql("drop table if exists dynparttest2") - sql("create table dynparttest2 (value int) partitioned by (pdec decimal(5, 1))") - sql( - """ - |insert into table dynparttest2 partition(pdec) - | select count(*), cast('100.12' as decimal(5, 1)) as pdec from src - """.stripMargin) - checkAnswer( - sql("select * from dynparttest2"), - Seq(Row(500, new java.math.BigDecimal("100.1")))) - } finally { - sql("drop table if exists dynparttest1") - sql("drop table if exists dynparttest2") - sql("set hive.exec.dynamic.partition.mode=strict") + // decimal + sql("create table dynparttest2 (value int) partitioned by (pdec decimal(5, 1))") + sql( + """ + |insert into table dynparttest2 partition(pdec) + | select count(*), cast('100.12' as decimal(5, 1)) as pdec from src + """.stripMargin) + checkAnswer( + sql("select * from dynparttest2"), + Seq(Row(500, new java.math.BigDecimal("100.1")))) + } } } @@ -1911,14 +1908,14 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } test("SPARK-17354: Partitioning by dates/timestamps works with Parquet vectorized reader") { - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + "hive.exec.dynamic.partition.mode" -> "nonstrict") { sql( """CREATE TABLE order(id INT) |PARTITIONED BY (pd DATE, pt TIMESTAMP) |STORED AS PARQUET """.stripMargin) - sql("set hive.exec.dynamic.partition.mode=nonstrict") sql( """INSERT INTO TABLE order PARTITION(pd, pt) |SELECT 1 AS id, CAST('1990-02-24' AS DATE) AS pd, CAST('1990-02-24' AS TIMESTAMP) AS pt diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterNamespaceUnsetPropertiesSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterNamespaceUnsetPropertiesSuite.scala new file mode 100644 index 0000000000000..22d833649fc6c --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterNamespaceUnsetPropertiesSuite.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.execution.command + +import org.apache.spark.sql.execution.command.v1 + +/** + * The class contains tests for the `ALTER NAMESPACE ... UNSET PROPERTIES` command to check + * V1 Hive external table catalog. + */ +class AlterNamespaceUnsetPropertiesSuite extends v1.AlterNamespaceUnsetPropertiesSuiteBase + with CommandSuiteBase { + override def commandVersion: String = super[AlterNamespaceUnsetPropertiesSuiteBase].commandVersion +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableSetTblPropertiesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableSetTblPropertiesSuite.scala new file mode 100644 index 0000000000000..3926db41b7eb8 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/AlterTableSetTblPropertiesSuite.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.execution.command + +import org.apache.spark.sql.execution.command.v1 + +/** + * The class contains tests for the `ALTER TABLE .. SET TBLPROPERTIES` command to check + * V1 Hive external table catalog. 
+ */ +class AlterTableSetTblPropertiesSuite + extends v1.AlterTableSetTblPropertiesSuiteBase with CommandSuiteBase diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowCreateTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowCreateTableSuite.scala index 3098015dc7da8..3dc73e1161523 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowCreateTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowCreateTableSuite.scala @@ -43,6 +43,7 @@ class ShowCreateTableSuite extends v1.ShowCreateTableSuiteBase with CommandSuite | c1 INT COMMENT 'bla', | c2 STRING |) + |USING HIVE |TBLPROPERTIES ( | 'prop1' = 'value1', | 'prop2' = 'value2' @@ -67,6 +68,7 @@ class ShowCreateTableSuite extends v1.ShowCreateTableSuiteBase with CommandSuite | c1 INT COMMENT 'bla', | c2 STRING |) + |USING HIVE |LOCATION '${dir.toURI}' |TBLPROPERTIES ( | 'prop1' = 'value1', @@ -94,6 +96,7 @@ class ShowCreateTableSuite extends v1.ShowCreateTableSuiteBase with CommandSuite | c1 INT COMMENT 'bla', | c2 STRING |) + |USING HIVE |COMMENT 'bla' |PARTITIONED BY ( | p1 BIGINT COMMENT 'bla', @@ -193,6 +196,7 @@ class ShowCreateTableSuite extends v1.ShowCreateTableSuiteBase with CommandSuite withNamespaceAndTable(ns, table) { t => sql( s"""CREATE TABLE $t (a INT, b STRING) + |STORED AS TEXTFILE |CLUSTERED BY (a) |SORTED BY (b) |INTO 2 BUCKETS diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala index 79b1eb6c0961a..9ee3a0277c9a1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowTablesSuite.scala @@ -97,6 +97,7 @@ class ShowTablesSuite extends v1.ShowTablesSuiteBase with CommandSuiteBase { |Type: VIEW |View Text: SELECT id FROM $catalog.$namespace.$table |View Original Text: SELECT id FROM $catalog.$namespace.$table + |View Schema Mode: COMPENSATION |View Catalog and Namespace: $catalog.$namespace |View Query Output Columns: [id] |Table Properties: diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala index e52d9b639dc4f..284717739a814 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala @@ -207,10 +207,7 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } - // SPARK-28885 String value is not allowed to be stored as numeric type with - // ANSI store assignment policy. - // TODO: re-enable the test case when SPARK-29462 is fixed. 
- ignore("SPARK-23340 Empty float/double array columns raise EOFException") { + test("SPARK-23340 Empty float/double array columns raise EOFException") { withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "false") { withTable("spark_23340") { sql("CREATE TABLE spark_23340(a array, b array) STORED AS ORC") @@ -284,6 +281,43 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } + test("SPARK-47850 ORC conversation could be applied for unpartitioned table insertion") { + withTempView("single") { + val singleRowDF = Seq((0, "foo")).toDF("key", "value") + singleRowDF.createOrReplaceTempView("single") + Seq("true", "false").foreach { conversion => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true", + HiveUtils.CONVERT_INSERTING_UNPARTITIONED_TABLE.key -> conversion) { + withTable("dummy_orc_unpartitioned") { + spark.sql( + s""" + |CREATE TABLE dummy_orc_unpartitioned(key INT, value STRING) + |STORED AS ORC + """.stripMargin) + + spark.sql( + s""" + |INSERT INTO TABLE dummy_orc_unpartitioned + |SELECT key, value FROM single + """.stripMargin) + + val orcUnpartitionedTable = TableIdentifier("dummy_orc_unpartitioned", Some("default")) + if (conversion == "true") { + // if converted, we refresh the cached relation. + assert(getCachedDataSourceTable(orcUnpartitionedTable) === null) + } else { + // otherwise, not cached. + assert(getCachedDataSourceTable(orcUnpartitionedTable) === null) + } + + val df = spark.sql("SELECT key, value FROM dummy_orc_unpartitioned WHERE key=0") + checkAnswer(df, singleRowDF) + } + } + } + } + } + test("SPARK-32234 read ORC table with column names all starting with '_col'") { Seq("native", "hive").foreach { orcImpl => Seq("false", "true").foreach { vectorized => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index 870e71e17cda0..bac48f6c0c018 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -143,7 +143,7 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`testType()`", - "columnType" -> "\"INTERVAL\"", + "columnType" -> "UDT(\"INTERVAL\")", "format" -> "ORC") ) @@ -170,7 +170,7 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { errorClass = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE", parameters = Map( "columnName" -> "`a`", - "columnType" -> "\"INTERVAL\"", + "columnType" -> "UDT(\"INTERVAL\")", "format" -> "ORC") ) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala index a1095ce58a061..0334e9c441610 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -79,6 +79,12 @@ object OrcReadBenchmark extends SqlBasedBenchmark { spark.read.format(HIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("hiveOrcTable") } + private def getExpr(dataType: DataType = IntegerType): String = dataType match { + case ByteType => "cast(value % 128 as byte)" + case ShortType => "cast(value % 32768 as short)" + case _ => s"cast(value % ${Int.MaxValue} as ${dataType.sql})" + } + def numericScanBenchmark(values: Int, dataType: DataType): Unit = { val 
benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values, output = output) @@ -87,7 +93,7 @@ object OrcReadBenchmark extends SqlBasedBenchmark { import spark.implicits._ spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") - prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + prepareTable(dir, spark.sql(s"SELECT ${getExpr(dataType)} id FROM t1")) benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(id) FROM hiveOrcTable").noop() @@ -118,7 +124,7 @@ object OrcReadBenchmark extends SqlBasedBenchmark { prepareTable( dir, - spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) + spark.sql(s"SELECT ${getExpr()} AS c1, CAST(value as STRING) AS c2 FROM t1")) benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").noop() @@ -147,7 +153,8 @@ object OrcReadBenchmark extends SqlBasedBenchmark { import spark.implicits._ spark.range(values).map(_ => Random.nextLong()).createOrReplaceTempView("t1") - prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) + prepareTable(dir, + spark.sql(s"SELECT value % 2 AS p, ${getExpr()} AS id FROM t1"), Some("p")) benchmark.addCase("Data column - Hive built-in ORC") { _ => spark.sql("SELECT sum(id) FROM hiveOrcTable").noop() @@ -268,7 +275,7 @@ object OrcReadBenchmark extends SqlBasedBenchmark { withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { import spark.implicits._ val middle = width / 2 - val selectExpr = (1 to width).map(i => s"value as c$i") + val selectExpr = (1 to width).map(i => s"${getExpr()} as c$i") spark.range(values).map(_ => Random.nextLong()).toDF() .selectExpr(selectExpr: _*).createOrReplaceTempView("t1") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 53d897af5beff..f09f9caf129bd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{BACKUP_FILE, CHECKPOINT_FILE, CHECKPOINT_TIME, NUM_RETRY, PATH, TEMP_FILE} import org.apache.spark.internal.config.UI._ import org.apache.spark.io.CompressionCodec import org.apache.spark.streaming.scheduler.JobGenerator @@ -85,7 +86,7 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) } // Add Yarn proxy filter specific configurations to the recovered SparkConf - val filter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter" + val filter = "org.apache.spark.deploy.yarn.AmIpFilter" val filterPrefix = s"spark.$filter.param." 
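Looking back at the OrcReadBenchmark change a few hunks up: getExpr applies a modulo before casting, so that ByteType and ShortType benchmark columns receive in-range values instead of overflowing casts of Random.nextLong(). A self-contained illustration of the same idea; the local session below is assumed for the sketch and is not part of the patch:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("bounded-cast-sketch").getOrCreate()
spark.range(0, 5).createOrReplaceTempView("t1")

// Casting a large BIGINT straight to TINYINT overflows (an error under ANSI mode, a wrap
// otherwise); taking the value modulo 128 first keeps every row inside the byte range,
// which is what getExpr does for ByteType (and modulo 32768 for ShortType).
spark.sql("SELECT CAST(id * 1000000 % 128 AS byte) AS b FROM t1").show()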
newReloadConf.getAll.foreach { case (k, v) => if (k.startsWith(filterPrefix) && k.length > filterPrefix.length) { @@ -101,7 +102,7 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) assert(framework != null, "Checkpoint.framework is null") assert(graph != null, "Checkpoint.graph is null") assert(checkpointTime != null, "Checkpoint.checkpointTime is null") - logInfo(s"Checkpoint for time $checkpointTime validated") + logInfo(log"Checkpoint for time ${MDC(LogKeys.CHECKPOINT_TIME, checkpointTime)} validated") } } @@ -141,12 +142,12 @@ object Checkpoint extends Logging { val filtered = paths.filter(p => REGEX.findFirstIn(p.getName).nonEmpty) filtered.sortWith(sortFunc).toImmutableArraySeq } else { - logWarning(s"Listing $path returned null") + logWarning(log"Listing ${MDC(PATH, path)} returned null") Seq.empty } } catch { case _: FileNotFoundException => - logWarning(s"Checkpoint directory $path does not exist") + logWarning(log"Checkpoint directory ${MDC(PATH, path)} does not exist") Seq.empty } } @@ -241,7 +242,8 @@ class CheckpointWriter( while (attempts < MAX_ATTEMPTS && !stopped) { attempts += 1 try { - logInfo(s"Saving checkpoint for time $checkpointTime to file '$checkpointFile'") + logInfo(log"Saving checkpoint for time ${MDC(LogKeys.CHECKPOINT_TIME, checkpointTime)} " + + log"to file '${MDC(LogKeys.CHECKPOINT_FILE, checkpointFile)}'") if (fs == null) { fs = new Path(checkpointDir).getFileSystem(hadoopConf) } @@ -259,38 +261,46 @@ class CheckpointWriter( if (fs.exists(checkpointFile)) { fs.delete(backupFile, true) // just in case it exists if (!fs.rename(checkpointFile, backupFile)) { - logWarning(s"Could not rename $checkpointFile to $backupFile") + logWarning(log"Could not rename ${MDC(CHECKPOINT_FILE, checkpointFile)} to " + + log"${MDC(BACKUP_FILE, backupFile)}") } } // Rename temp file to the final checkpoint file if (!fs.rename(tempFile, checkpointFile)) { - logWarning(s"Could not rename $tempFile to $checkpointFile") + logWarning(log"Could not rename ${MDC(TEMP_FILE, tempFile)} to " + + log"${MDC(CHECKPOINT_FILE, checkpointFile)}") } // Delete old checkpoint files val allCheckpointFiles = Checkpoint.getCheckpointFiles(checkpointDir, Some(fs)) if (allCheckpointFiles.size > 10) { allCheckpointFiles.take(allCheckpointFiles.size - 10).foreach { file => - logInfo(s"Deleting $file") + logInfo(log"Deleting ${MDC(LogKeys.FILE_NAME, file)}") fs.delete(file, true) } } // All done, print success - logInfo(s"Checkpoint for time $checkpointTime saved to file '$checkpointFile'" + - s", took ${bytes.length} bytes and " + - s"${TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs)} ms") + logInfo( + log"Checkpoint for time ${MDC(LogKeys.CHECKPOINT_TIME, checkpointTime)} " + + log"saved to file " + + log"'${MDC(LogKeys.CHECKPOINT_FILE, checkpointFile)}', took " + + log"${MDC(LogKeys.BYTE_SIZE, bytes.length)} bytes and " + + log"${MDC(LogKeys.TIME, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() + - startTimeNs))} ms") jobGenerator.onCheckpointCompletion(checkpointTime, clearCheckpointDataLater) return } catch { case ioe: IOException => - val msg = s"Error in attempt $attempts of writing checkpoint to '$checkpointFile'" + val msg = log"Error in attempt ${MDC(NUM_RETRY, attempts)} of writing checkpoint " + + log"to '${MDC(CHECKPOINT_FILE, checkpointFile)}'" logWarning(msg, ioe) fs = null } } - logWarning(s"Could not write checkpoint for time $checkpointTime to file '$checkpointFile'") + logWarning(log"Could not write checkpoint for time ${MDC(CHECKPOINT_TIME, 
checkpointTime)} " + + log"to file '${MDC(CHECKPOINT_FILE, checkpointFile)}'") } } @@ -299,7 +309,8 @@ class CheckpointWriter( val bytes = Checkpoint.serialize(checkpoint, conf) executor.execute(new CheckpointWriteHandler( checkpoint.checkpointTime, bytes, clearCheckpointDataLater)) - logInfo(s"Submitted checkpoint of time ${checkpoint.checkpointTime} to writer queue") + logInfo(log"Submitted checkpoint of time ${MDC(LogKeys.CHECKPOINT_TIME, + checkpoint.checkpointTime)} to writer queue") } catch { case rej: RejectedExecutionException => logError("Could not submit checkpoint task to the thread pool executor", rej) @@ -311,8 +322,10 @@ class CheckpointWriter( val startTimeNs = System.nanoTime() ThreadUtils.shutdown(executor, FiniteDuration(10, TimeUnit.SECONDS)) - logInfo(s"CheckpointWriter executor terminated? ${executor.isTerminated}," + - s" waited for ${TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs)} ms.") + logInfo(log"CheckpointWriter executor terminated? " + + log"${MDC(LogKeys.EXECUTOR_STATE, executor.isTerminated)}, waited for " + + log"${MDC(LogKeys.DURATION, TimeUnit.NANOSECONDS.toMillis( + System.nanoTime() - startTimeNs))} ms.") stopped = true } } @@ -352,20 +365,22 @@ object CheckpointReader extends Logging { } // Try to read the checkpoint files in the order - logInfo(s"Checkpoint files found: ${checkpointFiles.mkString(",")}") + logInfo(log"Checkpoint files found: " + + log"${MDC(LogKeys.CHECKPOINT_FILE, checkpointFiles.mkString(","))}") var readError: Exception = null checkpointFiles.foreach { file => - logInfo(s"Attempting to load checkpoint from file $file") + logInfo(log"Attempting to load checkpoint from file ${MDC(LogKeys.FILE_NAME, file)}") try { val fis = fs.open(file) val cp = Checkpoint.deserialize(fis, conf) - logInfo(s"Checkpoint successfully loaded from file $file") - logInfo(s"Checkpoint was generated at time ${cp.checkpointTime}") + logInfo(log"Checkpoint successfully loaded from file ${MDC(LogKeys.FILE_NAME, file)}") + logInfo(log"Checkpoint was generated at time " + + log"${MDC(LogKeys.CHECKPOINT_TIME, cp.checkpointTime)}") return Some(cp) } catch { case e: Exception => readError = e - logWarning(s"Error reading checkpoint from file $file", e) + logWarning(log"Error reading checkpoint from file ${MDC(PATH, file)}", e) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 30bd30329283b..94b695e6452e5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -36,7 +36,7 @@ import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.input.FixedLengthBinaryInputFormat -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.SerializationDebugger @@ -725,7 +725,8 @@ class StreamingContext private[streaming] ( private def stopOnShutdown(): Unit = { val stopGracefully = conf.get(STOP_GRACEFULLY_ON_SHUTDOWN) - logInfo(s"Invoking stop(stopGracefully=$stopGracefully) from shutdown hook") + logInfo(log"Invoking stop(stopGracefully=" + + log"${MDC(LogKeys.VALUE, stopGracefully)}) from shutdown hook") // Do not stop SparkContext, let its own shutdown hook stop it 
stop(stopSparkContext = false, stopGracefully = stopGracefully) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 03fffd9cd6f20..87d6a4909fdd4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -26,7 +26,8 @@ import scala.reflect.ClassTag import scala.util.matching.Regex import org.apache.spark.{SparkContext, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{FROM_TIME, SLIDE_DURATION, TO_TIME} import org.apache.spark.internal.io.SparkHadoopWriterUtils import org.apache.spark.rdd.{BlockRDD, RDD, RDDOperationScope} import org.apache.spark.storage.StorageLevel @@ -200,7 +201,8 @@ abstract class DStream[T: ClassTag] ( // Set the checkpoint interval to be slideDuration or 10 seconds, which ever is larger if (mustCheckpoint && checkpointDuration == null) { checkpointDuration = slideDuration * math.ceil(Seconds(10) / slideDuration).toInt - logInfo(s"Checkpoint interval automatically set to $checkpointDuration") + logInfo(log"Checkpoint interval automatically set to " + + log"${MDC(LogKeys.CHECKPOINT_INTERVAL, checkpointDuration)}") } // Set the minimum value of the rememberDuration if not already set @@ -276,11 +278,11 @@ abstract class DStream[T: ClassTag] ( dependencies.foreach(_.validateAtStart()) - logInfo(s"Slide time = $slideDuration") - logInfo(s"Storage level = ${storageLevel.description}") - logInfo(s"Checkpoint interval = $checkpointDuration") - logInfo(s"Remember interval = $rememberDuration") - logInfo(s"Initialized and validated $this") + logInfo(log"Slide time = ${MDC(LogKeys.SLIDE_DURATION, slideDuration)}") + logInfo(log"Storage level = ${MDC(LogKeys.STORAGE_LEVEL, storageLevel.description)}") + logInfo(log"Checkpoint interval = ${MDC(LogKeys.CHECKPOINT_INTERVAL, checkpointDuration)}") + logInfo(log"Remember interval = ${MDC(LogKeys.INTERVAL, rememberDuration)}") + logInfo(log"Initialized and validated ${MDC(LogKeys.DSTREAM, this)}") } private[streaming] def setContext(s: StreamingContext): Unit = { @@ -288,7 +290,7 @@ abstract class DStream[T: ClassTag] ( throw new SparkException(s"Context must not be set again for $this") } ssc = s - logInfo(s"Set context for $this") + logInfo(log"Set context for ${MDC(LogKeys.STREAMING_CONTEXT, this)}") dependencies.foreach(_.setContext(ssc)) } @@ -303,7 +305,9 @@ abstract class DStream[T: ClassTag] ( private[streaming] def remember(duration: Duration): Unit = { if (duration != null && (rememberDuration == null || duration > rememberDuration)) { rememberDuration = duration - logInfo(s"Duration for remembering RDDs set to $rememberDuration for $this") + logInfo(log"Duration for remembering RDDs set to " + + log"${MDC(LogKeys.DURATION, rememberDuration)} for " + + log"${MDC(LogKeys.DSTREAM, this.toString)}") } dependencies.foreach(_.remember(parentRememberDuration)) } @@ -313,8 +317,10 @@ abstract class DStream[T: ClassTag] ( if (!isInitialized) { throw new SparkException (this.toString + " has not been initialized") } else if (time <= zeroTime || ! 
(time - zeroTime).isMultipleOf(slideDuration)) { - logInfo(s"Time $time is invalid as zeroTime is $zeroTime" + - s" , slideDuration is $slideDuration and difference is ${time - zeroTime}") + logInfo(log"Time ${MDC(LogKeys.TIME, time)} is invalid as zeroTime is " + + log"${MDC(LogKeys.ZERO_TIME, zeroTime)}, slideDuration is " + + log"${MDC(LogKeys.SLIDE_DURATION, slideDuration)} and difference is " + + log"${MDC(LogKeys.DURATION, time - zeroTime)}") false } else { logDebug(s"Time $time is valid") @@ -352,7 +358,8 @@ abstract class DStream[T: ClassTag] ( } if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) { newRDD.checkpoint() - logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing") + logInfo(log"Marking RDD ${MDC(LogKeys.RDD_ID, newRDD.id)} for time " + + log"${MDC(LogKeys.TIME, time)} for checkpointing") } generatedRDDs.put(time, newRDD) } @@ -460,7 +467,8 @@ abstract class DStream[T: ClassTag] ( // Explicitly remove blocks of BlockRDD rdd match { case b: BlockRDD[_] => - logInfo(s"Removing blocks of RDD $b of time $time") + logInfo(log"Removing blocks of RDD ${MDC(LogKeys.RDD_ID, b)} " + + log"of time ${MDC(LogKeys.TIME, time)}") b.removeBlocks() case _ => } @@ -884,19 +892,23 @@ abstract class DStream[T: ClassTag] ( val alignedToTime = if ((toTime - zeroTime).isMultipleOf(slideDuration)) { toTime } else { - logWarning(s"toTime ($toTime) is not a multiple of slideDuration ($slideDuration)") + logWarning(log"toTime (${MDC(TO_TIME, toTime)}) is not a multiple of slideDuration " + + log"(${MDC(SLIDE_DURATION, slideDuration)})") toTime.floor(slideDuration, zeroTime) } val alignedFromTime = if ((fromTime - zeroTime).isMultipleOf(slideDuration)) { fromTime } else { - logWarning(s"fromTime ($fromTime) is not a multiple of slideDuration ($slideDuration)") + logWarning(log"fromTime (${MDC(FROM_TIME, fromTime)}) is not a multiple of slideDuration " + + log"(${MDC(SLIDE_DURATION, slideDuration)})") fromTime.floor(slideDuration, zeroTime) } - logInfo(s"Slicing from $fromTime to $toTime" + - s" (aligned to $alignedFromTime and $alignedToTime)") + logInfo(log"Slicing from ${MDC(LogKeys.FROM_TIME, fromTime)} to " + + log"${MDC(LogKeys.TO_TIME, toTime)}" + + log" (aligned to ${MDC(LogKeys.ALIGNED_FROM_TIME, alignedFromTime)} and " + + log"${MDC(LogKeys.ALIGNED_TO_TIME, alignedToTime)})") alignedFromTime.to(alignedToTime, slideDuration).flatMap { time => if (time >= zeroTime) getOrCompute(time) else None diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala index 667edf3713d43..8894b3cdc2396 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala @@ -24,7 +24,8 @@ import scala.reflect.ClassTag import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{PATH, TIME} import org.apache.spark.streaming.Time import org.apache.spark.util.Utils @@ -90,12 +91,14 @@ class DStreamCheckpointData[T: ClassTag](dstream: DStream[T]) if (fileSystem.delete(path, true)) { logInfo("Deleted checkpoint file '" + file + "' for time " + time) } else { - logWarning(s"Error deleting old checkpoint file '$file' for time $time") + logWarning(log"Error deleting old checkpoint file '${MDC(PATH, 
file)}' for time " + + log"${MDC(TIME, time)}") } timeToCheckpointFile -= time } catch { case e: Exception => - logWarning("Error deleting old checkpoint file '" + file + "' for time " + time, e) + logWarning(log"Error deleting old checkpoint file '${MDC(PATH, file)}' for time " + + log"${MDC(TIME, time)}", e) fileSystem = null } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index e301311c922a2..d133454b832fc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.{ELAPSED_TIME, PATH} import org.apache.spark.internal.MDC import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming._ @@ -205,19 +205,18 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( val timeTaken = clock.getTimeMillis() - lastNewFileFindingTime logDebug(s"Finding new files took $timeTaken ms") if (timeTaken > slideDuration.milliseconds) { - logWarning( - s"Time taken to find new files $timeTaken exceeds the batch size. " + - "Consider increasing the batch size or reducing the number of " + - "files in the monitored directories." + logWarning(log"Time taken to find new files ${MDC(ELAPSED_TIME, timeTaken)} exceeds the " + + log"batch size. Consider increasing the batch size or reducing the number of files in " + + log"the monitored directories." 
) } newFiles } catch { case e: FileNotFoundException => - logWarning(s"No directory to scan: $directoryPath: $e") + logWarning(log"No directory to scan: ${MDC(PATH, directoryPath)}:", e) Array.empty case e: Exception => - logWarning(s"Error finding new files under $directoryPath", e) + logWarning(log"Error finding new files under ${MDC(PATH, directoryPath)}", e) reset() Array.empty } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala index 5a75b77659960..2deb388eb4b82 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala @@ -22,6 +22,8 @@ import java.util.Locale import scala.reflect.ClassTag import org.apache.spark.SparkContext +import org.apache.spark.internal.LogKeys.{LAST_VALID_TIME, TIME} +import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDDOperationScope import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.scheduler.RateController @@ -91,8 +93,8 @@ abstract class InputDStream[T: ClassTag](_ssc: StreamingContext) } else { // Time is valid, but check it is more than lastValidTime if (lastValidTime != null && time < lastValidTime) { - logWarning(s"isTimeValid called with $time whereas the last valid time " + - s"is $lastValidTime") + logWarning(log"isTimeValid called with ${MDC(TIME, time)} whereas the last valid time " + + log"is ${MDC(LAST_VALID_TIME, lastValidTime)}") } lastValidTime = time true diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala index 883d56c012f61..34b079219c993 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala @@ -24,7 +24,7 @@ import java.nio.charset.StandardCharsets import scala.reflect.ClassTag import scala.util.control.NonFatal -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver @@ -56,7 +56,7 @@ class SocketReceiver[T: ClassTag]( def onStart(): Unit = { - logInfo(s"Connecting to $host:$port") + logInfo(log"Connecting to ${MDC(LogKeys.HOST, host)}:${MDC(LogKeys.PORT, port)}") try { socket = new Socket(host, port) } catch { @@ -64,7 +64,7 @@ class SocketReceiver[T: ClassTag]( restart(s"Error connecting to $host:$port", e) return } - logInfo(s"Connected to $host:$port") + logInfo(log"Connected to ${MDC(LogKeys.HOST, host)}:${MDC(LogKeys.PORT, port)}") // Start the thread that receives data over a connection new Thread("Socket Receiver") { @@ -79,7 +79,7 @@ class SocketReceiver[T: ClassTag]( if (socket != null) { socket.close() socket = null - logInfo(s"Closed socket to $host:$port") + logInfo(log"Closed socket to ${MDC(LogKeys.HOST, host)}:${MDC(LogKeys.PORT, port)}") } } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala index 9c461f0d4270e..12c6c95f7d8d3 100644 --- 
a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala @@ -24,6 +24,7 @@ import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark._ +import org.apache.spark.internal.{LogKeys, MDC} import org.apache.spark.rdd.BlockRDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.util._ @@ -156,8 +157,8 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag]( s"Could not read data from write ahead log record ${partition.walRecordHandle}, " + s"read returned null") } - logInfo(s"Read partition data of $this from write ahead log, record handle " + - partition.walRecordHandle) + logInfo(log"Read partition data of ${MDC(LogKeys.RDD, this)} from write ahead log, " + + log"record handle ${MDC(LogKeys.WRITE_AHEAD_LOG_RECORD_HANDLE, partition.walRecordHandle)}") if (storeInBlockManager) { blockManager.putBytes(blockId, new ChunkedByteBuffer(dataRead.duplicate()), storageLevel) logDebug(s"Stored partition data of $this into block manager with level $storageLevel") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index d641f55fa7f6f..33995e6ad0786 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -22,7 +22,8 @@ import java.util.concurrent.{ArrayBlockingQueue, TimeUnit} import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.STATUS import org.apache.spark.storage.StreamBlockId import org.apache.spark.streaming.StreamingConf.BLOCK_INTERVAL import org.apache.spark.streaming.util.RecurringTimer @@ -140,7 +141,8 @@ private[streaming] class BlockGenerator( if (state == Active) { state = StoppedAddingData } else { - logWarning(s"Cannot stop BlockGenerator as its not in the Active state [state = $state]") + logWarning(log"Cannot stop BlockGenerator as its not in the Active state " + + log"[state = ${MDC(STATUS, state)}]") return } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala index e5b98dd714b3d..e513a75b69903 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala @@ -24,7 +24,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.{EFFECTIVE_STORAGE_LEVEL, STORAGE_LEVEL, STORAGE_LEVEL_DESERIALIZED, STORAGE_LEVEL_REPLICATION} import org.apache.spark.serializer.SerializerManager import org.apache.spark.storage._ import org.apache.spark.streaming.receiver.WriteAheadLogBasedBlockHandler._ @@ -137,20 +138,23 @@ private[streaming] class WriteAheadLogBasedBlockHandler( private val effectiveStorageLevel = { if (storageLevel.deserialized) { - logWarning(s"Storage level serialization ${storageLevel.deserialized} is 
not supported when" + - s" write ahead log is enabled, change to serialization false") + logWarning(log"Storage level serialization " + + log"${MDC(STORAGE_LEVEL_DESERIALIZED, storageLevel.deserialized)} is not " + + log"supported when write ahead log is enabled, change to serialization false") } if (storageLevel.replication > 1) { - logWarning(s"Storage level replication ${storageLevel.replication} is unnecessary when " + - s"write ahead log is enabled, change to replication 1") + logWarning(log"Storage level replication " + + log"${MDC(STORAGE_LEVEL_REPLICATION, storageLevel.replication)} is unnecessary when " + + log"write ahead log is enabled, change to replication 1") } StorageLevel(storageLevel.useDisk, storageLevel.useMemory, storageLevel.useOffHeap, false, 1) } if (storageLevel != effectiveStorageLevel) { - logWarning(s"User defined storage level $storageLevel is changed to effective storage level " + - s"$effectiveStorageLevel when write ahead log is enabled") + logWarning(log"User defined storage level ${MDC(STORAGE_LEVEL, storageLevel)} is changed to " + + log"effective storage level ${MDC(EFFECTIVE_STORAGE_LEVEL, effectiveStorageLevel)} when " + + log"write ahead log is enabled") } // Write ahead log manages diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala index 15f3464848646..7cc08b421f780 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala @@ -25,8 +25,8 @@ import scala.concurrent._ import scala.util.control.NonFatal import org.apache.spark.SparkConf -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, STREAM_ID} +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{DELAY, ERROR, MESSAGE, STREAM_ID} import org.apache.spark.storage.StreamBlockId import org.apache.spark.util.{ThreadUtils, Utils} @@ -145,10 +145,10 @@ private[streaming] abstract class ReceiverSupervisor( def startReceiver(): Unit = synchronized { try { if (onReceiverStart()) { - logInfo(s"Starting receiver $streamId") + logInfo(log"Starting receiver ${MDC(LogKeys.STREAM_ID, streamId)}") receiverState = Started receiver.onStart() - logInfo(s"Called receiver $streamId onStart") + logInfo(log"Called receiver ${MDC(LogKeys.STREAM_ID, streamId)} onStart") } else { // The driver refused us stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None) @@ -162,7 +162,8 @@ private[streaming] abstract class ReceiverSupervisor( /** Stop receiver */ def stopReceiver(message: String, error: Option[Throwable]): Unit = synchronized { try { - logInfo("Stopping receiver with message: " + message + ": " + error.getOrElse("")) + logInfo(log"Stopping receiver with message: ${MDC(LogKeys.MESSAGE, message)}: " + + log"${MDC(LogKeys.ERROR, error.getOrElse(""))}") receiverState match { case Initialized => logWarning("Skip stopping receiver because it has not yet stared") @@ -191,8 +192,8 @@ private[streaming] abstract class ReceiverSupervisor( Future { // This is a blocking action so we should use "futureExecutionContext" which is a cached // thread pool. 
- logWarning("Restarting receiver with delay " + delay + " ms: " + message, - error.orNull) + logWarning(log"Restarting receiver with delay ${MDC(DELAY, delay)} ms: " + + log"${MDC(MESSAGE, message)}", error.orNull) stopReceiver("Restarting receiver with delay " + delay + "ms: " + message, error) logDebug("Sleeping for " + delay) Thread.sleep(delay) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala index daaf7ed7eb2b6..aafa99bd5285d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala @@ -28,7 +28,8 @@ import com.google.common.base.Throwables import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkEnv, SparkException} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{ERROR, MESSAGE} import org.apache.spark.rpc.{RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.StreamBlockId import org.apache.spark.streaming.Time @@ -84,7 +85,7 @@ private[streaming] class ReceiverSupervisorImpl( logDebug("Received delete old batch signal") cleanupOldBlocks(threshTime) case UpdateRateLimit(eps) => - logInfo(s"Received a new rate limit: $eps.") + logInfo(log"Received a new rate limit: ${MDC(LogKeys.RATE_LIMIT, eps)}.") registeredBlockGenerators.asScala.foreach { bg => bg.updateRate(eps) } @@ -169,7 +170,7 @@ private[streaming] class ReceiverSupervisorImpl( def reportError(message: String, error: Throwable): Unit = { val errorString = Option(error).map(Throwables.getStackTraceAsString).getOrElse("") trackerEndpoint.send(ReportError(streamId, message, errorString)) - logWarning("Reported error " + message + " - " + error) + logWarning(log"Reported error ${MDC(MESSAGE, message)} - ${MDC(ERROR, error)}") } override protected def onStart(): Unit = { @@ -194,10 +195,10 @@ private[streaming] class ReceiverSupervisorImpl( } override protected def onReceiverStop(message: String, error: Option[Throwable]): Unit = { - logInfo("Deregistering receiver " + streamId) + logInfo(log"Deregistering receiver ${MDC(LogKeys.STREAM_ID, streamId)}") val errorString = error.map(Throwables.getStackTraceAsString).getOrElse("") trackerEndpoint.askSync[Boolean](DeregisterReceiver(streamId, message, errorString)) - logInfo("Stopped receiver " + streamId) + logInfo(log"Stopped receiver ${MDC(LogKeys.STREAM_ID, streamId)}") } override def createBlockGenerator( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala index 5aa2a9df3ba87..903cde8082db7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala @@ -21,7 +21,7 @@ package org.apache.spark.streaming.scheduler import scala.util.Random import org.apache.spark.{ExecutorAllocationClient, SparkConf} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.config.DECOMMISSION_ENABLED import org.apache.spark.internal.config.Streaming._ import org.apache.spark.resource.ResourceProfile @@ -75,8 +75,10 @@ 
private[streaming] class ExecutorAllocationManager( def start(): Unit = { timer.start() - logInfo(s"ExecutorAllocationManager started with " + - s"ratios = [$scalingUpRatio, $scalingDownRatio] and interval = $scalingIntervalSecs sec") + logInfo(log"ExecutorAllocationManager started with ratios = " + + log"[${MDC(LogKeys.SCALING_UP_RATIO, scalingUpRatio)}, " + + log"${MDC(LogKeys.SCALING_DOWN_RATIO, scalingDownRatio)}] and interval = " + + log"${MDC(LogKeys.INTERVAL, scalingIntervalSecs)} sec") } def stop(): Unit = { @@ -89,11 +91,14 @@ private[streaming] class ExecutorAllocationManager( * batch statistics. */ private def manageAllocation(): Unit = synchronized { - logInfo(s"Managing executor allocation with ratios = [$scalingUpRatio, $scalingDownRatio]") + logInfo(log"Managing executor allocation with ratios = [" + + log"${MDC(LogKeys.SCALING_UP_RATIO, scalingUpRatio)}, " + + log"${MDC(LogKeys.SCALING_DOWN_RATIO, scalingDownRatio)}]") if (batchProcTimeCount > 0) { val averageBatchProcTime = batchProcTimeSum / batchProcTimeCount val ratio = averageBatchProcTime.toDouble / batchDurationMs - logInfo(s"Average: $averageBatchProcTime, ratio = $ratio" ) + logInfo(log"Average: ${MDC(LogKeys.AVG_BATCH_PROC_TIME, averageBatchProcTime)}, " + + log"ratio = ${MDC(LogKeys.RATIO, ratio)}") if (ratio >= scalingUpRatio) { logDebug("Requesting executors") val numNewExecutors = math.max(math.round(ratio).toInt, 1) @@ -119,7 +124,8 @@ private[streaming] class ExecutorAllocationManager( Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> targetTotalExecutors), Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), Map.empty) - logInfo(s"Requested total $targetTotalExecutors executors") + logInfo(log"Requested total ${MDC(LogKeys.NUM_EXECUTORS, + targetTotalExecutors)} executors") } /** Kill an executor that is not running any receiver, if possible */ @@ -129,7 +135,9 @@ private[streaming] class ExecutorAllocationManager( if (allExecIds.nonEmpty && allExecIds.size > minNumExecutors) { val execIdsWithReceivers = receiverTracker.allocatedExecutors().values.flatten.toSeq - logInfo(s"Executors with receivers (${execIdsWithReceivers.size}): ${execIdsWithReceivers}") + logInfo(log"Executors with receivers (${MDC(LogKeys.NUM_EXECUTORS, + execIdsWithReceivers.size)}): " + + log"${MDC(LogKeys.EXECUTOR_IDS, execIdsWithReceivers)}") val removableExecIds = allExecIds.diff(execIdsWithReceivers) logDebug(s"Removable executors (${removableExecIds.size}): ${removableExecIds}") @@ -142,7 +150,7 @@ private[streaming] class ExecutorAllocationManager( } else { client.killExecutor(execIdToRemove) } - logInfo(s"Requested to kill executor $execIdToRemove") + logInfo(log"Requested to kill executor ${MDC(LogKeys.EXECUTOR_ID, execIdToRemove)}") } else { logInfo(s"No non-receiver executors to kill") } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala index 639ac6de4f5d3..bd9ea7b5a2688 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala @@ -20,7 +20,7 @@ package org.apache.spark.streaming.scheduler import scala.collection.mutable import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.streaming.{StreamingContext, Time} /** @@ -82,7 +82,8 @@ 
private[streaming] class InputInfoTracker(ssc: StreamingContext) extends Logging /** Cleanup the tracked input information older than threshold batch time */ def cleanup(batchThreshTime: Time): Unit = synchronized { val timesToCleanup = batchTimeToInputInfos.keys.filter(_ < batchThreshTime) - logInfo(s"remove old batch metadata: ${timesToCleanup.mkString(" ")}") + logInfo(log"remove old batch metadata: " + + log"${MDC(LogKeys.DURATION, timesToCleanup.mkString(" "))}") batchTimeToInputInfos --= timesToCleanup } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 282946dd8ef4b..c0636af690382 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -21,7 +21,8 @@ import java.util.concurrent.TimeUnit import scala.util.{Failure, Success, Try} -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, MDC} +import org.apache.spark.internal.LogKeys.TIMEOUT import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Checkpoint, CheckpointWriter, StreamingConf, Time} import org.apache.spark.streaming.api.python.PythonDStream @@ -123,7 +124,8 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { val diff = TimeUnit.NANOSECONDS.toMillis((System.nanoTime() - timeWhenStopStarted)) val timedOut = diff > stopTimeoutMs if (timedOut) { - logWarning("Timed out while stopping the job generator (timeout = " + stopTimeoutMs + ")") + logWarning(log"Timed out while stopping the job generator " + + log"(timeout = ${MDC(TIMEOUT, stopTimeoutMs)})") } timedOut } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index bdb9103372410..7fb35a04be6da 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -27,8 +27,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.RECEIVED_BLOCK_INFO +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{RECEIVED_BLOCK_INFO, RECEIVED_BLOCK_TRACKER_LOG_EVENT} import org.apache.spark.network.util.JavaUtils import org.apache.spark.streaming.Time import org.apache.spark.streaming.util.{WriteAheadLog, WriteAheadLogUtils} @@ -127,7 +127,9 @@ private[streaming] class ReceivedBlockTracker( timeToAllocatedBlocks.put(batchTime, allocatedBlocks) lastAllocatedBatchTime = batchTime } else { - logInfo(s"Possibly processed batch $batchTime needs to be processed again in WAL recovery") + logInfo(log"Possibly processed batch ${MDC(LogKeys.BATCH_TIMESTAMP, + batchTime)} needs to be " + + log"processed again in WAL recovery") } } else { // This situation occurs when: @@ -137,7 +139,9 @@ private[streaming] class ReceivedBlockTracker( // 2. Slow checkpointing makes recovered batch time older than WAL recovered // lastAllocatedBatchTime. // This situation will only occurs in recovery time. 
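The ReceivedBlockTracker changes above sit in the WAL-recovery path, which rebuilds the tracker's in-memory state by replaying the events it previously wrote to the write-ahead log. The general shape of that replay, reduced to a self-contained sketch; the field types and the println bodies are stand-ins, only the three event names mirror the real ReceivedBlockTrackerLogEvent hierarchy:

sealed trait TrackerLogEvent
case class BlockAdditionEvent(blockId: String) extends TrackerLogEvent
case class BatchAllocationEvent(batchTime: Long, blockIds: Seq[String]) extends TrackerLogEvent
case class BatchCleanupEvent(batchTimes: Seq[Long]) extends TrackerLogEvent

// On driver restart, every recovered record is deserialized and folded back into memory.
def replay(events: Iterator[TrackerLogEvent]): Unit = events.foreach {
  case BlockAdditionEvent(id)       => println(s"re-queue unallocated block $id")
  case BatchAllocationEvent(t, ids) => println(s"re-allocate ${ids.size} blocks to batch $t")
  case BatchCleanupEvent(ts)        => println(s"drop metadata for batches ${ts.mkString(", ")}")
}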
- logInfo(s"Possibly processed batch $batchTime needs to be processed again in WAL recovery") + logInfo(log"Possibly processed batch ${MDC(LogKeys.BATCH_TIMESTAMP, + batchTime)} needs to be processed " + + log"again in WAL recovery") } } @@ -175,7 +179,7 @@ private[streaming] class ReceivedBlockTracker( def cleanupOldBatches(cleanupThreshTime: Time, waitForCompletion: Boolean): Unit = synchronized { require(cleanupThreshTime.milliseconds < clock.getTimeMillis()) val timesToCleanup = timeToAllocatedBlocks.keys.filter { _ < cleanupThreshTime }.toSeq - logInfo(s"Deleting batches: ${timesToCleanup.mkString(" ")}") + logInfo(log"Deleting batches: ${MDC(LogKeys.DURATION, timesToCleanup.mkString(" "))}") if (writeToLog(BatchCleanupEvent(timesToCleanup))) { timeToAllocatedBlocks --= timesToCleanup writeAheadLogOption.foreach(_.clean(cleanupThreshTime.milliseconds, waitForCompletion)) @@ -221,9 +225,10 @@ private[streaming] class ReceivedBlockTracker( } writeAheadLogOption.foreach { writeAheadLog => - logInfo(s"Recovering from write ahead logs in ${checkpointDirOption.get}") + logInfo(log"Recovering from write ahead logs in " + + log"${MDC(LogKeys.PATH, checkpointDirOption.get)}") writeAheadLog.readAll().asScala.foreach { byteBuffer => - logInfo("Recovering record " + byteBuffer) + logInfo(log"Recovering record ${MDC(LogKeys.BYTE_BUFFER, byteBuffer)}") Utils.deserialize[ReceivedBlockTrackerLogEvent]( JavaUtils.bufferToArray(byteBuffer), Thread.currentThread().getContextClassLoader) match { case BlockAdditionEvent(receivedBlockInfo) => @@ -247,7 +252,8 @@ private[streaming] class ReceivedBlockTracker( true } catch { case NonFatal(e) => - logWarning(s"Exception thrown while writing record: $record to the WriteAheadLog.", e) + logWarning(log"Exception thrown while writing record: " + + log"${MDC(RECEIVED_BLOCK_TRACKER_LOG_EVENT, record)} to the WriteAheadLog.", e) false } } else { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 81c6264234f44..a37ba04c10123 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -24,8 +24,8 @@ import scala.concurrent.ExecutionContext import scala.util.{Failure, Success} import org.apache.spark._ -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.{ERROR, STREAM_ID} +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{ERROR, MESSAGE, RECEIVER_ID, RECEIVER_IDS, STREAM_ID} import org.apache.spark.rdd.RDD import org.apache.spark.rpc._ import org.apache.spark.scheduler.{ExecutorCacheTaskLocation, TaskLocation} @@ -186,7 +186,8 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false // Check if all the receivers have been deregistered or not val receivers = endpoint.askSync[Seq[Int]](AllReceiverIds) if (receivers.nonEmpty) { - logWarning("Not all of the receivers have deregistered, " + receivers) + logWarning(log"Not all of the receivers have deregistered, " + + log"${MDC(RECEIVER_IDS, receivers)}") } else { logInfo("All of the receivers have deregistered successfully") } @@ -231,7 +232,8 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false // Signal the receivers to delete old block data if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) { - logInfo(s"Cleanup 
old received batch data: $cleanupThreshTime") + logInfo(log"Cleanup old received batch data: " + + log"${MDC(LogKeys.CLEANUP_LOCAL_DIRS, cleanupThreshTime)}") synchronized { if (isTrackerStarted) { endpoint.send(CleanupOldBlocks(cleanupThreshTime)) @@ -305,7 +307,8 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false endpoint = Some(receiverEndpoint)) receiverTrackingInfos.put(streamId, receiverTrackingInfo) listenerBus.post(StreamingListenerReceiverStarted(receiverTrackingInfo.toReceiverInfo)) - logInfo("Registered receiver for stream " + streamId + " from " + senderAddress) + logInfo(log"Registered receiver for stream ${MDC(LogKeys.STREAM_ID, streamId)} " + + log"from ${MDC(LogKeys.RPC_ADDRESS, senderAddress)}") true } } @@ -365,11 +368,12 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false receiverTrackingInfos(streamId) = newReceiverTrackingInfo listenerBus.post(StreamingListenerReceiverError(newReceiverTrackingInfo.toReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { - s"$message - $error" + log"${MDC(MESSAGE, message)} - ${MDC(ERROR, error)}" } else { - s"$message" + log"${MDC(MESSAGE, message)}" } - logWarning(s"Error reported by receiver for stream $streamId: $messageWithError") + logWarning(log"Error reported by receiver for stream ${MDC(STREAM_ID, streamId)}: " + + messageWithError) } private def scheduleReceiver(receiverId: Int): Seq[TaskLocation] = { @@ -445,7 +449,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false runDummySparkJob() - logInfo("Starting " + receivers.length + " receivers") + logInfo(log"Starting ${MDC(LogKeys.NUM_RECEIVERS, receivers.length)} receivers") endpoint.send(StartAllReceivers(receivers.toImmutableArraySeq)) } @@ -623,7 +627,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { - logInfo(s"Restarting Receiver $receiverId") + logInfo(log"Restarting Receiver ${MDC(LogKeys.STREAM_ID, receiverId)}") self.send(RestartReceiver(receiver)) } case Failure(e) => @@ -631,11 +635,11 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false onReceiverJobFinish(receiverId) } else { logError("Receiver has been stopped. Try to restart it.", e) - logInfo(s"Restarting Receiver $receiverId") + logInfo(log"Restarting Receiver ${MDC(LogKeys.STREAM_ID, receiverId)}") self.send(RestartReceiver(receiver)) } }(ThreadUtils.sameThread) - logInfo(s"Receiver ${receiver.streamId} started") + logInfo(log"Receiver ${MDC(LogKeys.STREAM_ID, receiver.streamId)} started") } override def onStop(): Unit = { @@ -650,7 +654,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false receiverJobExitLatch.countDown() receiverTrackingInfos.remove(receiverId).foreach { receiverTrackingInfo => if (receiverTrackingInfo.state == ReceiverState.ACTIVE) { - logWarning(s"Receiver $receiverId exited but didn't deregister") + logWarning(log"Receiver ${MDC(RECEIVER_ID, receiverId)} exited but didn't deregister") } } } @@ -658,7 +662,8 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false /** Send stop signal to the receivers. 
*/ private def stopReceivers(): Unit = { receiverTrackingInfos.values.flatMap(_.endpoint).foreach { _.send(StopReceiver) } - logInfo("Sent stop signal to all " + receiverTrackingInfos.size + " receivers") + logInfo(log"Sent stop signal to all " + + log"${MDC(LogKeys.NUM_RECEIVERS, receiverTrackingInfos.size)} receivers") } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala index dc02062b9eb44..1b05a6ac30cc4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.scheduler.rate -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} /** * Implements a proportional-integral-derivative (PID) controller which acts on @@ -74,8 +74,11 @@ private[streaming] class PIDRateEstimator( minRate > 0, s"Minimum rate in PIDRateEstimator should be > 0") - logInfo(s"Created PIDRateEstimator with proportional = $proportional, integral = $integral, " + - s"derivative = $derivative, min rate = $minRate") + logInfo(log"Created PIDRateEstimator with proportional = " + + log"${MDC(LogKeys.PROPORTIONAL, proportional)}, integral = " + + log"${MDC(LogKeys.INTEGRAL, integral)}, derivative = " + + log"${MDC(LogKeys.DERIVATIVE, derivative)}, min rate = " + + log"${MDC(LogKeys.MIN_RATE, minRate)}") def compute( time: Long, // in milliseconds diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/BatchedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/BatchedWriteAheadLog.scala index a73cde1f99aa9..8befe53efffa7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/BatchedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/BatchedWriteAheadLog.scala @@ -29,7 +29,8 @@ import scala.jdk.CollectionConverters._ import scala.util.control.NonFatal import org.apache.spark.SparkConf -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.RECORDS import org.apache.spark.network.util.JavaUtils import org.apache.spark.util.{ThreadUtils, Utils} @@ -121,7 +122,8 @@ private[util] class BatchedWriteAheadLog(val wrappedLog: WriteAheadLog, conf: Sp * Stop the batched writer thread, fulfill promises with failures and close the wrapped WAL. 
*/ override def close(): Unit = { - logInfo(s"BatchedWriteAheadLog shutting down at time: ${System.currentTimeMillis()}.") + logInfo(log"BatchedWriteAheadLog shutting down at time: " + + log"${MDC(LogKeys.TIME, System.currentTimeMillis())}.") if (!active.getAndSet(false)) return batchedWriterThread.interrupt() batchedWriterThread.join() @@ -178,7 +180,7 @@ private[util] class BatchedWriteAheadLog(val wrappedLog: WriteAheadLog, conf: Sp logWarning("BatchedWriteAheadLog Writer queue interrupted.", e) buffer.foreach(_.promise.failure(e)) case NonFatal(e) => - logWarning(s"BatchedWriteAheadLog Writer failed to write $buffer", e) + logWarning(log"BatchedWriteAheadLog Writer failed to write ${MDC(RECORDS, buffer)}", e) buffer.foreach(_.promise.failure(e)) } finally { buffer.clear() diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index 3d93f045a5ec4..d90095c73785a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -31,8 +31,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf -import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.RETRY_COUNT +import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.internal.LogKeys.{NUM_RETRY, WRITE_AHEAD_LOG_INFO} import org.apache.spark.util.{CompletionIterator, ThreadUtils} import org.apache.spark.util.ArrayImplicits._ @@ -107,7 +107,7 @@ private[streaming] class FileBasedWriteAheadLog( } } if (fileSegment == null) { - logError(log"Failed to write to write ahead log after ${MDC(RETRY_COUNT, failures)} failures") + logError(log"Failed to write to write ahead log after ${MDC(NUM_RETRY, failures)} failures") throw lastException } fileSegment @@ -137,7 +137,8 @@ private[streaming] class FileBasedWriteAheadLog( */ def readAll(): JIterator[ByteBuffer] = synchronized { val logFilesToRead = pastLogs.map{ _.path} ++ currentLogPath - logInfo("Reading from the logs:\n" + logFilesToRead.mkString("\n")) + logInfo(log"Reading from the logs:\n" + + log"${MDC(LogKeys.PATHS, logFilesToRead.mkString("\n"))}") def readFile(file: String): Iterator[ByteBuffer] = { logDebug(s"Creating log reader with $file") val reader = new FileBasedWriteAheadLogReader(file, hadoopConf) @@ -170,8 +171,11 @@ private[streaming] class FileBasedWriteAheadLog( pastLogs --= expiredLogs expiredLogs } - logInfo(s"Attempting to clear ${oldLogFiles.size} old log files in $logDirectory " + - s"older than $threshTime: ${oldLogFiles.map { _.path }.mkString("\n")}") + logInfo(log"Attempting to clear ${MDC(LogKeys.NUM_RECORDS_READ, oldLogFiles.size)} " + + log"old log files in " + + log"${MDC(LogKeys.PATH, logDirectory)} older than " + + log"${MDC(LogKeys.THRESHOLD, threshTime)}: " + + log"${MDC(LogKeys.FILES, oldLogFiles.map(_.path).mkString("\n"))}") def deleteFile(walInfo: LogInfo): Unit = { try { @@ -181,9 +185,11 @@ private[streaming] class FileBasedWriteAheadLog( logDebug(s"Cleared log file $walInfo") } catch { case ex: Exception => - logWarning(s"Error clearing write ahead log file $walInfo", ex) + logWarning(log"Error clearing write ahead log file " + + log"${MDC(WRITE_AHEAD_LOG_INFO, walInfo)}", ex) } - logInfo(s"Cleared log files in $logDirectory older than $threshTime") + logInfo(log"Cleared log files in 
${MDC(LogKeys.PATH, logDirectory)} older than " + + log"${MDC(LogKeys.THRESH_TIME, threshTime)}") } oldLogFiles.foreach { logInfo => if (!executionContext.isShutdown) { @@ -251,7 +257,9 @@ private[streaming] class FileBasedWriteAheadLog( fileSystem.listStatus(logDirectoryPath).map { _.getPath }.toImmutableArraySeq) pastLogs.clear() pastLogs ++= logFileInfo - logInfo(s"Recovered ${logFileInfo.size} write ahead log files from $logDirectory") + logInfo(log"Recovered ${MDC(LogKeys.NUM_FILES, logFileInfo.size)} " + + log"write ahead log files from " + + log"${MDC(LogKeys.PATH, logDirectory)}") logDebug(s"Recovered files are:\n${logFileInfo.map(_.path).mkString("\n")}") } } catch { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala index afe17936043a2..771e65ed40b51 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala @@ -33,7 +33,7 @@ import org.apache.hadoop.fs.Path import org.scalatest.Assertions._ import org.apache.spark.internal.{Logging, MDC} -import org.apache.spark.internal.LogKey.PATH +import org.apache.spark.internal.LogKeys.PATH import org.apache.spark.streaming.dstream.DStream import org.apache.spark.util.Utils diff --git a/ui-test/package-lock.json b/ui-test/package-lock.json index 23ff8ede65159..ec870dfa4801c 100644 --- a/ui-test/package-lock.json +++ b/ui-test/package-lock.json @@ -1392,12 +1392,12 @@ } }, "node_modules/braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dev": true, "dependencies": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" }, "engines": { "node": ">=8" @@ -1911,9 +1911,9 @@ } }, "node_modules/fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dev": true, "dependencies": { "to-regex-range": "^5.0.1"